Parent Directory
|
Revision Log
redone entirely
######################################################################## # -*- perl -*- # # Copyright (c) 2003-2006 University of Chicago and Fellowship # for Interpretations of Genomes. All Rights Reserved. # # This file is part of the SEED Toolkit. # # The SEED Toolkit is free software. You can redistribute # it and/or modify it under the terms of the SEED Toolkit # Public License. # # You should have received a copy of the SEED Toolkit Public License # along with this program; if not write to the University of Chicago # at info@ci.uchicago.edu or the Fellowship for Interpretation of # Genomes at veronika@thefig.info or download a copy from # http://www.theseed.org/LICENSE.TXT. # use strict; use FIG; use ProtSims; use gjoseqlib; my $min_sz = 30; my $fullF = "/tmp/seqs.$$"; my $usage = "usage: split_sequences_into_sets OutputDirectory [SimilarityCutoff FracCoverage] < full_seqs"; my $output; ( ($output = shift @ARGV) ) || die $usage; (! -e $output) || die "$output already exists -- check it (and delete it, if that is what you really wish)"; &FIG::verify_dir($output); my @seqs = &gjoseqlib::read_fasta(); my %seq_of = map { $_->[0] => $_->[2] } @seqs; my($sim_cutoff,$frac_cov); if (@ARGV == 2) { $sim_cutoff = shift @ARGV; $frac_cov = shift @ARGV; } else { $sim_cutoff = 1.0e-30; $frac_cov = 0.70; } my $min_size = (@seqs > 20) ? (@seqs / 10) : (@seqs / 2); my @all = map { my($id1,$id2,undef,undef,undef,undef,$b1,$e1,$b2,$e2,$psc,undef,$ln1,$ln2) = @$_; [$id1,$id2,(($e1+1)-$b1)/$ln1,(($e2+1)-$b2)/$ln2,$psc] } &ProtSims::blastP(\@seqs,\@seqs,$min_size); my @sims = grep { my($id1,$id2,$frac1,$frac2,$psc) = @$_; ($id1 ne $id2) && ($frac1 >= $frac_cov) && ($frac2 >= $frac_cov) } @all; my(%seen,%conn); foreach my $sim (@sims) { my($id1,$id2,$frac1,$frac2,$psc) = @$sim; my $key = join("\t",sort ($id1,$id2)); if (! $seen{$key}) { $seen{$key} = 1; push(@{$conn{$id1}},$id2); push(@{$conn{$id2}},$id1); } } undef %seen; my @ids = sort { my $x = $conn{$b} ? @{$conn{$b}} : 0; my $y = $conn{$a} ? @{$conn{$a}} : 0; ($x <=> $y) } keys(%seq_of); my $n = 1; my @sets = (); open(SZ,">$output/set.sizes") || die "could not open $output/set.sizes"; foreach my $central_id (@ids) { if (! $seen{$central_id}) { my $set = &extract_set($central_id,\%conn,\%seen); print SZ join("\t",($n,scalar @$set)),"\n"; push(@sets,[$n,$set]); open(OUT,">$output/$n") || die "could not open $output/$n"; $n++; foreach my $peg (@$set) { $seen{$peg} = 1; print OUT ">$peg\n$seq_of{$peg}\n"; } close(OUT); } } close(SZ); my @distances = &construct_distance_matrix(\@sets,\@all); # entries are [n1,n2,sc] open(DIST,">$output/distance.matrix\n") || die "could not open $output/distance.matrix"; foreach my $tuple (@distances) { print DIST join("\t",@$tuple),"\n"; } close(DIST); sub run { my($cmd) = @_; # my @tmp = `date`; chomp @tmp; print STDERR "$tmp[0]: running $cmd\n"; (system($cmd) == 0) || confess "FAILED: $cmd"; } sub extract_set { my($central_id,$conn,$seen) = @_; my $cluster = [$central_id]; my %in = ($central_id,1); my $i; while ($i < @$cluster) { my $x = $conn->{$cluster->[$i]}; $i++; foreach my $peg (@$x) { if (! $in{$peg}) { push(@$cluster,$peg); $in{$peg} = 1; } } } return $cluster; } sub construct_distance_matrix { my($sets,$sims) = @_; my($tuple,$n,$set,$id1,$id2,$sc,$sim,%best,$key,$n1,$n2); my(@distances,$sofar,%in); foreach my $tuple (@$sets) { my($n,$set) = @$tuple; foreach my $id (@$set) { $in{$id} = $n; } } my %best; foreach my $sim (@$sims) { my($id1,$id2,undef,undef,$sc) = @$sim; if ($in{$id1} && $in{$id2} && ($in{$id1} != $in{$id2})) { my $key = join("\t",($in{$id1},$in{$id2})); if ((! ($sofar = $best{$key})) || ($sofar < $sc)) { $best{$key} = $sc; } $key = join("\t",($in{$id2},$in{$id1})); if ((! ($sofar = $best{$key})) || ($sofar > $sc)) { $best{$key} = $sc; } } } my @distances = (); foreach my $key (keys(%best)) { my($n1,$n2) = split(/\t/,$key); push(@distances,[$n1,$n2,$best{$key}]); } return sort { ($a->[0] <=> $b->[0]) or ($a->[1] <=> $b->[1]) } @distances; }
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |