[Bio] / FigKernelPackages / PinnedRegions.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/PinnedRegions.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.24, Thu Aug 13 20:34:15 2009 UTC revision 1.25, Mon Aug 17 21:19:54 2009 UTC
# Line 30  Line 30 
30    
31  sub pinned_regions {  sub pinned_regions {
32      my($fig, $pin_desc, $fast_color, $sims_from, $map_sz, $add_features, $extended) = @_;      my($fig, $pin_desc, $fast_color, $sims_from, $map_sz, $add_features, $extended) = @_;
   
     # The variable $fast_color is not used any more  
   
     # if blat is found then use it, if not then use blast  
     my $aln_tool;  
     if ( `which blat` ) {  
         $aln_tool = 'blat';  
     } else {  
         $aln_tool = 'blast';  
         # Set blast environment variables  
         $ENV{"BLASTMAT"} ||= "$FIG_Config::blastmat";  
         if ($ENV{"PATH"} !~ /fig\/bin/) { $ENV{"PATH"} = "$FIG_Config::bin:" . $ENV{"PATH"}; }  
     }  
   
33      Trace("Pinned regions method called.") if T(3);      Trace("Pinned regions method called.") if T(3);
34      # Get list of pegs required by the description in $pin_desc      # Get list of pegs required by the description in $pin_desc
35      my $pinned_pegs = &expand_peg_list($fig, $pin_desc, $aln_tool);      my $pinned_pegs = &expand_peg_list($fig, $pin_desc);
36      Trace("Pinned pegs are " . join(", ", @$pinned_pegs) . ".") if T(3);      Trace("Pinned pegs are " . join(", ", @$pinned_pegs) . ".") if T(3);
37      # Filter out the pegs that don't exist.      # Filter out the pegs that don't exist.
38      # Get regions around pinned pegs -- boundaries, features, etc.      # Get regions around pinned pegs -- boundaries, features, etc.
# Line 65  Line 51 
51    
52      Trace("Coloring pegs.") if T(3);      Trace("Coloring pegs.") if T(3);
53      # Assign a set number to some PEGs through transitive closure based on similarity, from blast scores      # Assign a set number to some PEGs through transitive closure based on similarity, from blast scores
54      &color_pegs($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $sims_from, $aln_tool);      &color_pegs($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from);
55    
56      # Filter out regions which have only a single PEG (the pinned one) colored.      # Filter out regions which have only a single PEG (the pinned one) colored.
57      # $regions = &filter_regions_2($pin_desc, $regions, $feature_data);      # $regions = &filter_regions_2($pin_desc, $regions, $feature_data);
# Line 81  Line 67 
67  }  }
68    
69  sub expand_peg_list {  sub expand_peg_list {
70      my($fig, $pin_desc, $aln_tool) = @_;      my($fig, $pin_desc) = @_;
71      my @pegs = ();      my @pegs = ();
72    
73      # Filter for legitimate genome IDS -- should handle deleted organisms and (in NMPDR) non-NMPDR orgs      # Filter for legitimate genome IDS -- should handle deleted organisms and (in NMPDR) non-NMPDR orgs
# Line 137  Line 123 
123          elsif ( $pin_desc->{'sort_by'} eq 'similarity' )          elsif ( $pin_desc->{'sort_by'} eq 'similarity' )
124          {          {
125              # Sort by similarity to input peg (first peg in list of arguments)              # Sort by similarity to input peg (first peg in list of arguments)
126              @pegs = &sort_pegs_by_similarity($fig, $peg, [@$sim_pegs, @$pch_pinned_pegs], $aln_tool);              @pegs = &sort_pegs_by_similarity($fig, $peg, @$sim_pegs, @$pch_pinned_pegs);
127          }          }
128       }       }
129    
# Line 254  Line 240 
240  }  }
241    
242  sub sort_pegs_by_similarity {  sub sort_pegs_by_similarity {
243      my($fig, $input_peg, $other_pegs, $aln_tool) = @_;      my($fig, $input_peg, @other_pegs) = @_;
244    
245        # Set blast environment variables
246        $ENV{"BLASTMAT"} ||= "$FIG_Config::blastmat";
247        if ($ENV{"PATH"} !~ /fig\/bin/) { $ENV{"PATH"} = "$FIG_Config::bin:" . $ENV{"PATH"}; }
248    
249      # Get amino acid sequences      # Get amino acid sequences
250      my $sequences = &get_peg_sequences($fig, [$input_peg, @$other_pegs]);      my $sequences = &get_peg_sequences($fig, [$input_peg, @other_pegs]);
251    
252      # Write the input peg sequence to a fasta file      # Write the input peg sequence to a fasta file
253      my $input_seq = {$input_peg => $sequences->{$input_peg}};      my $input_seq = {$input_peg => $sequences->{$input_peg}};
# Line 269  Line 259 
259      my $other_seq_file = "$FIG_Config::temp/other_seq.$$.tmp.fasta";      my $other_seq_file = "$FIG_Config::temp/other_seq.$$.tmp.fasta";
260      &write_seqs_to_file($sequences, $other_seq_file);      &write_seqs_to_file($sequences, $other_seq_file);
261    
262      # set high evalue cutoff so all hits get reported, all pegs in the pin should be similar anyway      # Run formatdb
     my $cutoff = 10;  
     my $hits = [];  
   
     if ( $aln_tool eq 'blat' )  
     {  
         # blat input peg sequence against other peg sequences  
         $hits = &blat_files($fig, $input_seq_file, $other_seq_file, $cutoff);  
     }  
     else  
     {  
         # BLAST input peg sequence against other peg sequences  
263          &formatdb_file($fig, $other_seq_file);          &formatdb_file($fig, $other_seq_file);
         $hits = &blast_files($fig, $input_seq_file, $other_seq_file, $cutoff);  
     }  
264    
265      unlink $input_seq_file;      # BLAST input peg sequence against other peg sequences
266      unlink $other_seq_file;      # set high evalue cutoff so all hits get reported
267        my $cutoff = 10;
268        my $hits = &blast_files($fig, $input_seq_file, $other_seq_file, $cutoff);
269    
     my %seen;  
270      my @sorted = ($input_peg);      my @sorted = ($input_peg);
271      foreach my $hit ( @$hits )      foreach my $hit ( @$hits )
272      {      {
273          # hits are already sorted by similarity          # BLAST output is already sorted by similarity
274          my($peg2) = (split(/\s+/, $hit))[1];          my($peg2) = (split(/\s+/, $hit))[1];
         if ( ! $seen{$peg2} )  
         {  
275              push @sorted, $peg2;              push @sorted, $peg2;
276          }          }
         $seen{$peg2} = 1;  
     }  
277    
278      return @sorted;      return @sorted;
279  }  }
# Line 554  Line 528 
528                     'offset'     => $offset,                     'offset'     => $offset,
529                     'offset_beg' => $offset_beg,                     'offset_beg' => $offset_beg,
530                     'offset_end' => $offset_end,                     'offset_end' => $offset_end,
531                     'function'   => $func, };                     'function'   => $func };
   
532      if ($extended) {      if ($extended) {
533          my $aliases = $fig->feature_aliases($fid) || '';          my $aliases = $fig->feature_aliases($fid) || '';
534          $retval->{aliases} = $aliases;          $retval->{aliases} = $aliases;
# Line 706  Line 679 
679  }  }
680    
681  sub color_pegs {  sub color_pegs {
682      my($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $sims_from, $aln_tool) = @_;      my($fig, $pin_desc, $pinned_pegs, $regions, $feature_data, $fast_color, $sims_from) = @_;
683    
684      # Run blastp and get connected pegs      # Run blastp and get connected pegs
685      my $blast_hit = {};      my $blast_hit = {};
# Line 714  Line 687 
687    
688      if ( $sims_from eq 'blast' )      if ( $sims_from eq 'blast' )
689      {      {
690          # actually blat or blast gets used depending on the variable $aln_tool          $blast_hit = &blast_hits($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff);
         $blast_hit = &blast_hits($fig, $regions, $feature_data, $color_sim_cutoff, $aln_tool);  
691      }      }
692      else      else
693      {      {
694          $blast_hit = &blast_hits_from_sims($fig, $regions, $feature_data, $color_sim_cutoff);          $blast_hit = &blast_hits_from_sims($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff);
695      }      }
696    
697      # Assign set numbers to pegs based on blast scores      # Assign set numbers to pegs based on blast scores
# Line 888  Line 860 
860  }  }
861    
862  sub blast_hits {  sub blast_hits {
863      my($fig, $regions, $feature_data, $color_sim_cutoff, $aln_tool) = @_;      my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff) = @_;
864    
865        # Set blast environment variables
866        $ENV{"BLASTMAT"} ||= "$FIG_Config::blastmat";
867        if ($ENV{"PATH"} !~ /fig\/bin/) { $ENV{"PATH"} = "$FIG_Config::bin:" . $ENV{"PATH"}; }
868    
869      # Get amino acid sequences      # Get amino acid sequences
870      my @pegs = grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;      my @pegs = grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;
# Line 898  Line 874 
874      my $all_seqs_file = "$FIG_Config::temp/all_seqs.$$.tmp.fasta";      my $all_seqs_file = "$FIG_Config::temp/all_seqs.$$.tmp.fasta";
875      &write_seqs_to_file($sequences, $all_seqs_file);      &write_seqs_to_file($sequences, $all_seqs_file);
876    
877      my $hits = [];      # Run formatdb
878      if ( $aln_tool eq 'blat' )      &formatdb_file($fig, $all_seqs_file);
879    
880        # If $fast_color == 0, the complete blast of all vs. all sequences is performed
881        # Otherwise, instead of all vs. all, the sequences from a single pinned peg region
882        # is blasted against all sequences. If any of the hits are better than a given cutoff
883        # ($cutoff_2), the pegs hit are deemed to be 'done', i.e. it is assumed that blasting this
884        # peg will not yield additional information for forming the peg sets. These 'done' pegs
885        # are then omitted from the blast when the sequences from the region they belong to
886        # is blasted.
887        my %blast_hit;
888        if ( $fast_color )
889        {
890            my $cutoff_2 = $color_sim_cutoff * 1e-20;
891            my %done_with;
892    
893            foreach my $region ( @$regions )
894      {      {
895          # Run blat              # Iterate through each region
896          $hits = &blat_files($fig, $all_seqs_file, $all_seqs_file, $color_sim_cutoff);              my $fids = $region->{'features'};
897    
898                # Build up a hash of peg sequences which are not 'done_with'
899                my %region_seqs;
900                foreach my $peg ( grep {$feature_data->{$_}{'type'} eq 'peg'} @$fids )
901                {
902                    if ( $sequences->{$peg} and not $done_with{$peg} )
903                    {
904                        $region_seqs{$peg} = $sequences->{$peg};
905                    }
906                }
907    
908                if ( scalar keys %region_seqs )
909                {
910                    # Write the region sequences to a file
911                    my $region_seqs_file = "$FIG_Config::temp/region_seqs.$$.tmp.fasta";
912                    &write_seqs_to_file(\%region_seqs, $region_seqs_file);
913    
914                    # BLAST the region sequences against all other sequences
915                    my $hits = &blast_files($fig, $region_seqs_file, $all_seqs_file, $color_sim_cutoff);
916    
917                    # Build up hash of blast hits
918                    foreach my $hit ( @$hits )
919                    {
920                        my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];
921    
922                        if ( $peg1 ne $peg2 )
923                        {
924                            $blast_hit{$peg1}{$peg2} = 1;
925                            $blast_hit{$peg2}{$peg1} = 1;
926    
927                            # If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so
928                            # that it will not be blasted with it's region sequences
929                            if ( $sc <= $cutoff_2 )
930                            {
931                                $done_with{$peg2} = 1;
932                            }
933                        }
934                    }
935                }
936            }
937      }      }
938      else      else
939      {      {
940          # BLAST sequence file against itself          # BLAST sequence file against itself
941          &formatdb_file($fig, $all_seqs_file);          my $hits = &blast_files($fig, $all_seqs_file, $all_seqs_file, $color_sim_cutoff);
         $hits = &blast_files($fig, $all_seqs_file, $all_seqs_file, $color_sim_cutoff);  
     }  
   
     unlink($all_seqs_file);  
942    
943      # Build up hash of blast hits      # Build up hash of blast hits
     my %blast_hit;  
944      foreach my $hit ( @$hits )      foreach my $hit ( @$hits )
945      {      {
946          my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];          my($peg1, $peg2, $sc) = (split(/\s+/, $hit))[0,1,10];
# Line 925  Line 951 
951              $blast_hit{$peg2}{$peg1} = 1;              $blast_hit{$peg2}{$peg1} = 1;
952          }          }
953      }      }
954        }
955    
956      return \%blast_hit;      return \%blast_hit;
957  }  }
958    
959  sub blast_hits_from_sims {  sub blast_hits_from_sims {
960      my($fig, $regions, $feature_data, $color_sim_cutoff) = @_;      my($fig, $regions, $feature_data, $fast_color, $color_sim_cutoff) = @_;
961    
962      my %blast_hit;      my %blast_hit;
963    
# Line 943  Line 970 
970      # Create a hash of all pegs      # Create a hash of all pegs
971      my %region_peg = map {$_ => 1} grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;      my %region_peg = map {$_ => 1} grep {$feature_data->{$_}{'type'} eq 'peg'} keys %$feature_data;
972    
973        if ( $fast_color == 1 )
974        {
975            my $cutoff_2 = $color_sim_cutoff * 1e-20;
976            my %done_with;
977    
978      # Iterate through each peg      # Iterate through each peg
979      foreach my $peg1 ( keys %region_peg )      foreach my $peg1 ( keys %region_peg )
980      {      {
981                # Skip the 'done_with' pegs
982                next if ($done_with{$peg1});
983    
984          my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);          my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);
985    
986          foreach my $peg2 ( map {$_->[1]} grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )              foreach my $sim ( grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
987          {          {
988                    my($peg2, $sc) = @$sim[1,10];
989    
990              $blast_hit{$peg1}{$peg2} = 1;              $blast_hit{$peg1}{$peg2} = 1;
991              $blast_hit{$peg2}{$peg1} = 1;              $blast_hit{$peg2}{$peg1} = 1;
992    
993                    # If the blast score is less than the chosen cutoff, mark $peg2 as 'done_with' so
994                    # that it will not be blasted with it's region sequences
995                    if ( $sc <= $cutoff_2 )
996                    {
997                        $done_with{$peg2} = 1;
998          }          }
999      }      }
   
     return \%blast_hit;  
1000  }  }
1001        }
1002  sub blat_files {      else
     my($fig, $input, $database, $cutoff) = @_;  
   
     my $out_file = "$FIG_Config::temp/blat.out.$$";  
     my $cmd = "blat -prot -out=blast8 $database $input $out_file 2> /dev/null > /dev/null";  
     $fig->run($cmd);  
   
     open(OUT, "<$out_file") or die "could not open file '$out_file': $!";  
     my @blat_out_all = <OUT>;  
     close(OUT);  
     unlink $out_file;  
   
     my @blat_out;  
     foreach my $hit ( @blat_out_all )  
1003      {      {
1004          my($evalue) = (split(/\s+/, $hit))[10];          # Iterate through each peg
1005          if ( $evalue <= $cutoff )          foreach my $peg1 ( keys %region_peg )
1006          {          {
1007              push @blat_out, $hit;              my @sims = $fig->sims($peg1, $maxN, $maxP, $select, $max_expand, $filters);
1008    
1009                foreach my $peg2 ( map {$_->[1]} grep {$region_peg{$_->[1]} and $_->[1] ne $peg1} @sims )
1010                {
1011                    $blast_hit{$peg1}{$peg2} = 1;
1012                    $blast_hit{$peg2}{$peg1} = 1;
1013                }
1014          }          }
1015      }      }
1016      return \@blat_out;  
1017        return \%blast_hit;
1018  }  }
1019    
1020  sub blast_files {  sub blast_files {
# Line 988  Line 1023 
1023      my $cmd  = "$FIG_Config::ext_bin/blastall";      my $cmd  = "$FIG_Config::ext_bin/blastall";
1024      my @args = ('-p', 'blastp',  '-i', $input, '-d', $database, '-m', 8, '-e', $cutoff);      my @args = ('-p', 'blastp',  '-i', $input, '-d', $database, '-m', 8, '-e', $cutoff);
1025      my @blast_out = $fig->run_gathering_output($cmd, @args);      my @blast_out = $fig->run_gathering_output($cmd, @args);
     return \@blast_out;  
 }  
1026    
1027  sub unique_hits {      return \@blast_out;
     my($hits) = @_;  
     my %seen;  
   
     # Return the best hit between the ordered pair (id1,id2)  
     # The input needs to be sorted (i.e. unchanged output from BLAST -m8 or blat -out=blast8)  
     # otherwise the hit returned may not be the best hit.  
     my @unique = grep {not $seen{$_->[0]}{$_->[1]}++} @$hits;  
     return \@unique;  
1028  }  }
1029    
1030  sub formatdb_file {  sub formatdb_file {

Legend:
Removed from v.1.24  
changed lines
  Added in v.1.25

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3