[Bio] / FigKernelScripts / split_sequences_into_sets.pl Repository:
ViewVC logotype

View of /FigKernelScripts/split_sequences_into_sets.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Tue Aug 25 16:30:16 2009 UTC (10 years, 3 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.4: +1 -1 lines
handle asym connections

########################################################################
# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

use strict;
use FIG;

use ProtSims;
use gjoseqlib;

my $min_sz  = 30;
my $fullF   = "/tmp/seqs.$$";

my $usage = "usage: split_sequences_into_sets OutputDirectory [SimilarityCutoff FracCoverage] < full_seqs";

my $output;
(
 ($output = shift @ARGV)
)
    || die $usage;

(! -e $output) || die "$output already exists -- check it (and delete it, if that is what you really wish)";
&FIG::verify_dir($output);

my @seqs = &gjoseqlib::read_fasta();
my %seq_of = map { $_->[0] => $_->[2] } @seqs;

my($sim_cutoff,$frac_cov);
if (@ARGV == 2)
{
    $sim_cutoff = shift @ARGV;
    $frac_cov   = shift @ARGV;
}
else
{
    $sim_cutoff = 1.0e-30;
    $frac_cov   = 0.70;
}

my $min_size = (@seqs > 20) ? (@seqs / 10) : (@seqs / 2);
my @all = map { my($id1,$id2,undef,undef,undef,undef,$b1,$e1,$b2,$e2,$psc,undef,$ln1,$ln2) = @$_;
	        [$id1,$id2,(($e1+1)-$b1)/$ln1,(($e2+1)-$b2)/$ln2,$psc] 
	      } &ProtSims::blastP(\@seqs,\@seqs,$min_size);


my @sims = grep { my($id1,$id2,$frac1,$frac2,$psc) = @$_; 
	          ($id1 ne $id2) && ($frac1 >= $frac_cov) && ($frac2 >= $frac_cov) } @all;

my(%seen,%conn);
foreach my $sim (@sims)
{
    my($id1,$id2,$frac1,$frac2,$psc) = @$sim;
    my $key = join("\t",sort ($id1,$id2));
    if (! $seen{$key})
    {
	$seen{$key} = 1;
	push(@{$conn{$id1}},$id2);
	push(@{$conn{$id2}},$id1);
    }
}
undef %seen;
my @ids = sort { my $x = $conn{$b} ? @{$conn{$b}} : 0;
		 my $y = $conn{$a} ? @{$conn{$a}} : 0;
		 ($x <=> $y) 
	       } keys(%seq_of);

my $n = 1;
my @sets = ();

open(SZ,">$output/set.sizes") || die "could not open $output/set.sizes";

foreach my $central_id (@ids)
{
    if (! $seen{$central_id})
    {

	my $set = &extract_set($central_id,\%conn,\%seen,\%seq_of);
	print SZ join("\t",($n,scalar @$set)),"\n";

	push(@sets,[$n,$set]);

	open(OUT,">$output/$n") || die "could not open $output/$n";
	$n++;
	foreach my $peg (@$set)
	{
	    $seen{$peg} = 1;
	    print OUT ">$peg\n$seq_of{$peg}\n";
	}
	close(OUT);
    }
}
close(SZ);

my @distances = &construct_distance_matrix(\@sets,\@all); # entries are [n1,n2,sc]
open(DIST,">$output/distance.matrix\n") || die "could not open $output/distance.matrix";
foreach my $tuple (@distances)
{
    print DIST join("\t",@$tuple),"\n";
}
close(DIST);

sub run {
    my($cmd) = @_;

#   my @tmp = `date`; chomp @tmp; print STDERR "$tmp[0]: running $cmd\n";
    (system($cmd) == 0) || confess "FAILED: $cmd";
}

sub extract_set {
    my($central_id,$conn,$seen,$seq_of) = @_;

    my $center_length = length($seq_of->{$central_id});
    my $cluster = [$central_id];
    my %in      = ($central_id,1);

    my $i;
    while ($i < @$cluster)
    {
	my $x = $conn->{$cluster->[$i]};
	$i++;
	foreach my $peg (@$x)
	{
	    if ((! $in{$peg}) && (! $seen->{$peg}))
	    {
		my $len = length($seq_of->{$peg});
		if ((abs($center_length - $len) / $center_length) <= (1 - $frac_cov))
		{
		    push(@$cluster,$peg);
		    $in{$peg} = 1;
		}
	    }
	}
    }
    return $cluster;
}

sub construct_distance_matrix {
    my($sets,$sims) = @_;
    my($tuple,$n,$set,$id1,$id2,$sc,$sim,%best,$key,$n1,$n2);
    my(@distances,$sofar,%in);

    foreach my $tuple (@$sets)
    {
	my($n,$set) = @$tuple;
	foreach my $id (@$set)
	{
	    $in{$id} = $n;
	}
    }

    my %best;
    foreach my $sim (@$sims)
    {
	my($id1,$id2,undef,undef,$sc) = @$sim;
	if ($in{$id1} && $in{$id2} && ($in{$id1} != $in{$id2}))
	{
	    my $key = join("\t",($in{$id1},$in{$id2}));
	    if ((! ($sofar = $best{$key})) || ($sofar < $sc))
	    {
		$best{$key} = $sc;
	    }

	    $key = join("\t",($in{$id2},$in{$id1}));
	    if ((! ($sofar = $best{$key})) || ($sofar > $sc))
	    {
		$best{$key} = $sc;
	    }
	}
    }

    my @distances = ();
    foreach my $key (keys(%best))
    {
	my($n1,$n2) = split(/\t/,$key);
	push(@distances,[$n1,$n2,$best{$key}]);
    }
    return sort { ($a->[0] <=> $b->[0]) or ($a->[1] <=> $b->[1]) } @distances;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3