[Bio] / FigKernelScripts / split_sequences_into_sets.pl Repository:
ViewVC logotype

Diff of /FigKernelScripts/split_sequences_into_sets.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Mon Aug 3 19:48:53 2009 UTC revision 1.2, Wed Aug 19 16:11:28 2009 UTC
# Line 17  Line 17 
17  # http://www.theseed.org/LICENSE.TXT.  # http://www.theseed.org/LICENSE.TXT.
18  #  #
19    
20    use strict;
21  use FIG;  use FIG;
 $min_sz  = 30;  
 $fullF   = "/tmp/seqs.$$";  
22    
23  $usage = "usage: split_sequences_into_sets OutputDirectory [SimilarityCutoff FracCoverage] [blastout=BlastFile] < full_seqs";  use ProtSims;
24    use gjoseqlib;
25    
26  for ($i= $#ARGV; ($i >= 0); $i--)  my $min_sz  = 30;
27  {  my $fullF   = "/tmp/seqs.$$";
     if ($ARGV[$i] =~ /blastout=(\S+)$/)  
     {  
         $blastout = $1;  
         splice(@ARGV,$i,1);  
     }  
 }  
28    
29    my $usage = "usage: split_sequences_into_sets OutputDirectory [SimilarityCutoff FracCoverage] < full_seqs";
30    
31    my $output;
32  (  (
33   ($output = shift @ARGV)   ($output = shift @ARGV)
34  )  )
# Line 40  Line 37 
37  (! -e $output) || die "$output already exists -- check it (and delete it, if that is what you really wish)";  (! -e $output) || die "$output already exists -- check it (and delete it, if that is what you really wish)";
38  &FIG::verify_dir($output);  &FIG::verify_dir($output);
39    
40  open(TMP,">$fullF") || die "could not open $fullF";  my @seqs = &gjoseqlib::read_fasta();
41  $/ = "\n>";  my %seq_of = map { $_->[0] => $_->[2] } @seqs;
 while (defined($_ = <STDIN>))  
 {  
     chomp;  
     if ($_ =~ /^>?(\S+)[^\n]*\n(.*)/s)  
     {  
         $id  =  $1;  
         $seq =  $2;  
         $seq =~ s/\s//gs;  
         $seq =~ s/[uU]/x/g;  
         $seq_of{$id} = $seq;  
         print TMP ">$id\n$seq\n";  
     }  
 }  
 close(TMP);  
 $/ = "\n";  
42    
43    my($sim_cutoff,$frac_cov);
44  if (@ARGV == 2)  if (@ARGV == 2)
45  {  {
46      $sim_cutoff = shift @ARGV;      $sim_cutoff = shift @ARGV;
# Line 69  Line 52 
52      $frac_cov   = 0.70;      $frac_cov   = 0.70;
53  }  }
54    
55  &run("$FIG_Config::ext_bin/formatdb -i $fullF -p T");  my $min_size = (@seqs > 20) ? (@seqs / 10) : (@seqs / 2);
56    my @all = &ProtSims::blastP(\@seqs,\@seqs,$min_size);
57    # &print_sims(\@all);
58    
59  open(BLASTOUT,"$FIG_Config::ext_bin/blastall -i $fullF -d $fullF -FF -m 8 -p blastp -b 20000 -v 20000 |")  # $_ = @all; print STDERR "computed $_ similarities\n";
     || die "could not run blastall -i $fullF -d $fullF -FF -m 8 -p blastp -b 20000 -v 20000";  
 my @all = ();  
 while (defined($_ = <BLASTOUT>))  
 {  
     if (($_ =~ /^(\S+)\t(\S+)\t(\S+\t){4}(\S+)\t(\S+)\t(\S+)\t(\S+)\s+(\S+)/) &&  
         ($1 ne $2) &&  
         ($8 <= $sim_cutoff) &&  
         ((@all == 0) || (($1 ne $all[$#all]->[0]) || ($2 ne $all[$#all]->[1]))))  
     {  
         push(@all,[$1,$2,$4,$5,$6,$7,$8]);  
     }  
 }  
 close(BLASTOUT);  
   
 if (defined($blastout))  
 {  
     open(BLASTOUT,">$blastout") || die "could not open $blastout";  
     foreach $_ (@all)  
     {  
         print BLASTOUT join("\t",(@$_,length($seq_of{$_->[0]}),length($seq_of{$_->[1]}))),"\n";  
     }  
     close(BLASTOUT);  
 }  
60    
61  @sims = grep { ($_->[6] <= $sim_cutoff) }  my @sims = grep { ($_->id1 ne $_->id2) &&
62          grep { ($_->[0] ne $_->[1]) &&                    ($_->psc <= $sim_cutoff) &&
63                  ((($_->[3] - $_->[2]) / length($seq_of{$_->[0]})) >= $frac_cov) &&                    ((($_->e1 - $_->b1) / $_->ln1) >= $frac_cov) &&
64                  ((($_->[5] - $_->[4]) / length($seq_of{$_->[1]})) >= $frac_cov)                    ((($_->e2 - $_->b2) / $_->ln2) >= $frac_cov)
65               }                  } @all;
         @all;  
66    
67  if (@sims == 0)  # print STDERR "=======\n";
68  {  # &print_sims(\@sims);
     print STDERR "No similarities worth processing:\n"  
         , join("\n", map { join(", ", @ { $_ }) } @all), "\n"  
         , "--- deleting $output\n";  
     system("rm -fR $output") && warn "Could not delete directory $output";  
     exit(0);  
 }  
69    
70  foreach $sim (@sims)  my(%seen,%conn);
71    foreach my $sim (@sims)
72  {  {
73      if (! $seen{"$sim->[0]\t$sim->[1]"})  # pick only the best similarity between two ids      if (! $seen{"$sim->id1\t$sim->id2"})  # pick only the best similarity between two ids
74      {      {
75          push(@{$conn{$sim->[0]}},$sim);          push(@{$conn{$sim->id1}},$sim);
76          $seen{"$sim->[0]\t$sim->[1]"} = 1;          $seen{"$sim->id1\t$sim->id2"} = 1;
77      }      }
78  }  }
79  undef %seen;  undef %seen;
80    # print STDERR "built connections\n";
81    
82  if (0)      # This is just for debugging -- display the similarities  my @ids = sort { &cov($conn{$b}) <=> &cov($conn{$a}) }
 {  
     foreach $id (sort keys(%seq_of))  
     {  
         &print_sims($id,\@sims);  
     }  
 }  
   
 # @ids = sort { (@{$conn{$b}} <=> @{$conn{$a}}) or (&cov($conn{$b}) <=> &cov($conn{$a})) }  
 # sort by number of connections; within that on sum of coverage  
   
 @ids = sort { &cov($conn{$b}) <=> &cov($conn{$a}) }  
83         keys(%seq_of);         keys(%seq_of);
84  # sort by sum of coverage  # sort by sum of coverage
85    
86  @tmp = map { $x = $_; [$x,scalar @{$conn{$x}},&cov($conn{$x})] } @ids;  my $n = 1;
87  $n = 1;  my @sets = ();
 @sets = ();  
88    
89  open(SZ,">$output/set.sizes") || die "could not open $output/set.sizes";  open(SZ,">$output/set.sizes") || die "could not open $output/set.sizes";
90    
91  foreach $central_id (@ids)  foreach my $central_id (@ids)
92  {  {
93      if (! $seen{$central_id})      if (! $seen{$central_id})
94      {      {
 #       print STDERR "========\n";  
 #       print STDERR "picking central_id = $central_id\n";  
   
         $set = &extract_set($central_id,$conn{$central_id},\%seq_of,\%seen);  
 #       print STDERR "========\n";  
 #       print STDERR "calculated set:", &Dumper($et);  
95    
96            my $set = &extract_set($central_id,\%conn,\%seen);
97          print SZ join("\t",($n,scalar @$set)),"\n";          print SZ join("\t",($n,scalar @$set)),"\n";
98    
99          push(@sets,[$n,$set]);          push(@sets,[$n,$set]);
100    
101          open(OUT,">$output/$n") || die "could not open $output/$n";          open(OUT,">$output/$n") || die "could not open $output/$n";
102          $n++;          $n++;
103          foreach $tuple (@$set)          foreach my $peg (@$set)
104          {          {
105              ($id,$b,$e,$seq) = @$tuple;              $seen{$peg} = 1;
106              $seen{$id} = 1;              print OUT ">$peg\n$seq_of{$peg}\n";
             print OUT ">$id\n$seq\n";  
107          }          }
108          close(OUT);          close(OUT);
109      }      }
110  }  }
111  close(SZ);  close(SZ);
112    # print STDERR "built set sizes\n";
113    
114  @distances = &construct_distance_matrix(\@sets,\@all); # entries are [n1,n2,sc]  my @distances = &construct_distance_matrix(\@sets,\@all); # entries are [n1,n2,sc]
115  open(DIST,">$output/distance.matrix\n") || die "could not open $output/distance.matrix";  open(DIST,">$output/distance.matrix\n") || die "could not open $output/distance.matrix";
116  foreach $tuple (@distances)  foreach my $tuple (@distances)
117  {  {
118      print DIST join("\t",@$tuple),"\n";      print DIST join("\t",@$tuple),"\n";
119  }  }
120  close(DIST);  close(DIST);
121    # print STDERR "built distance matrix\n";
122    
123    sub print_sims {
124        my($sims) = @_;
125    
126  system("rm $fullF\*");      foreach my $sim (@$sims)
   
 sub possible_region {  
     my($r1,$r2) = @_;  
   
     $r2 = $r2 ? $r2 : [1,100000];  
   
     my($b1,$e1) = @$r1;  
     my($b2,$e2) = @$r2;  
   
     my $b = ($b1 > $b2) ? $b1 : $b2;  
     my $e = ($e1 < $e2) ? $e1 : $e2;  
   
     if ((($e-$b)+1) >= $min_sz)  
127      {      {
128          return [$b,$e];          print STDERR join("\t",($sim->id1,$sim->id2,$sim->iden,$sim->b1,$sim->e1,$sim->b2,$sim->e2,$sim->psc,$sim->ln1,$sim->ln2)),"\n";
129      }      }
     return undef;  
130  }  }
131    
132  sub cov {  sub cov {
# Line 207  Line 136 
136      $n = 0;      $n = 0;
137      foreach $x (@$sims)      foreach $x (@$sims)
138      {      {
139          $n += ($x->[3] - $x->[2]);          $n += ($x->e1 - $x->b1);
140      }      }
141      return $n;      return $n;
142  }  }
# Line 219  Line 148 
148      (system($cmd) == 0) || confess "FAILED: $cmd";      (system($cmd) == 0) || confess "FAILED: $cmd";
149  }  }
150    
 sub print_sims {  
     my($id,$sims) = @_;  
     my($sim,@tmp);  
   
     print STDERR "$id\n";  
     foreach $sim (@$sims)  
     {  
         if ($sim->[0] eq $id)  
         {  
             @tmp = @$sim;  
             $tmp[0] = "";  
             print STDERR join("\t",@tmp),"\n";  
         }  
     }  
 }  
   
151  sub extract_set {  sub extract_set {
152      my($central_id,$sims,$seq_of,$seen) = @_;      my($central_id,$conn,$seen) = @_;
     my($sim,$r1,$id1,$id2,$b1,$e1,$b2,$e2);  
153    
154      my(@tuples) = ();      my $cluster = [$central_id];
155        my %in      = ($central_id,1);
156    
157      my @sims = sort { (($b->[3] - $b->[2]) <=> ($a->[3] - $a->[2])) } @$sims;      my $i;
158      my $region = [1,length($seq_of->{$central_id})];      while ($i < @$cluster)
   
     foreach $sim (@sims)  
159      {      {
160          ($id1,$id2,$b1,$e1,$b2,$e2) = @$sim;          my $x = $conn->{$cluster->[$i]};
161          if ((! $seen->{$id2}) && ($r1 = &possible_region([$b1,$e1],$region)))          $i++;
162            foreach my $peg (map { $_->id2} @$x)
163          {          {
164              $region = $r1;              if (! $in{$peg})
         }  
     }  
   
     my($b1F,$e1F) = @$region;  
     my $seq = substr($seq_of->{$central_id},$b1F-1,($e1F-$b1F)+1);  
     my $core_ln = length($seq);  
   
     my $singleF = "/tmp/seq.$$";  
   
     open(TMP,">$singleF") || die "could not open $singleF";  
     print TMP ">$central_id\n$seq\n";  
     close(TMP);  
   
     @sims =  grep { ($_->[6] <= 1.0e-5) && (($_->[3] - $_->[2]) > (0.8 * $core_ln)) }  
              map { $_ =~ /^(\S+)\t(\S+)\t(\S+\t){4}(\S+)\t(\S+)\t(\S+)\t(\S+)\s+(\S+)/;  
                    [$1,$2,$4,$5,$6,$7,$8] }  
              `$FIG_Config::ext_bin/blastall -i $singleF -d $fullF -FF -m 8 -p blastp -b 20000 -v 20000`;  
     system "rm $singleF\*";  
     undef %seen1;  
   
     foreach $sim (@sims)  
165      {      {
166          ($id1,$id2,$b1,$e1,$b2,$e2) = @$sim;                  push(@$cluster,$peg);
167          if ((! $seen->{$id2}) && (! $seen1{$id2}) && ((($e2-$b2)+1) >= $min_sz))                  $in{$peg} = 1;
168          {              }
             $seq = $seq_of{$id2};  
 ##          $seq = substr($seq_of{$id2},$b2-1,($e2-$b2)+1);  <<<  
             push(@tuples,[$id2,$b2,$e2,$seq]);  
             $seen1{$id2} = 1;  
169          }          }
170      }      }
171      return [@tuples];      return $cluster;
172  }  }
173    
174  sub construct_distance_matrix {  sub construct_distance_matrix {
175      my($sets,$sims) = @_;      my($sets,$sims) = @_;
176      my($tuple1,$tuple2,$n,$set,$id1,$id2,$sc,$sim,%best,$key,$n1,$n2);      my($tuple,$n,$set,$id1,$id2,$sc,$sim,%best,$key,$n1,$n2);
177      my(@distances,$sofar);      my(@distances,$sofar,%in);
178    
179      foreach $tuple1 (@$sets)      foreach my $tuple (@$sets)
180      {      {
181          ($n,$set) = @$tuple1;          my($n,$set) = @$tuple;
182          foreach $tuple2 (@$set)          foreach my $id (@$set)
183          {          {
             ($id,$b,$e,$seq) = @$tuple2;  
184              $in{$id} = $n;              $in{$id} = $n;
185          }          }
186      }      }
187      foreach $sim (@$sims)  
188        my %best;
189        foreach my $sim (@$sims)
190      {      {
191          ($id1,$id2,undef,undef,undef,undef,$sc) = @$sim;          my $id1 = $sim->id1;
192            my $id2 = $sim->id2;
193            my $sc = $sim->psc;
194          if ($in{$id1} && $in{$id2} && ($in{$id1} != $in{$id2}))          if ($in{$id1} && $in{$id2} && ($in{$id1} != $in{$id2}))
195          {          {
196              $key = join("\t",($in{$id1},$in{$id2}));              my $key = join("\t",($in{$id1},$in{$id2}));
197              if ((! ($sofar = $best{$key})) || ($sofar > $sc))              if ((! ($sofar = $best{$key})) || ($sofar > $sc))
198              {              {
199                  $best{$key} = $sc;                  $best{$key} = $sc;
# Line 317  Line 207 
207          }          }
208      }      }
209    
210      @distances = ();      my @distances = ();
211      foreach $key (keys(%best))      foreach my $key (keys(%best))
212      {      {
213          ($n1,$n2) = split(/\t/,$key);          my($n1,$n2) = split(/\t/,$key);
214          push(@distances,[$n1,$n2,$best{$key}]);          push(@distances,[$n1,$n2,$best{$key}]);
215      }      }
216      return sort { ($a->[0] <=> $b->[0]) or ($a->[1] <=> $b->[1]) } @distances;      return sort { ($a->[0] <=> $b->[0]) or ($a->[1] <=> $b->[1]) } @distances;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.2

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3