[Bio] / FigKernelPackages / FIGO.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FIGO.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.28, Sat Oct 13 15:10:06 2007 UTC revision 1.31, Sat Dec 1 22:17:03 2007 UTC
# Line 17  Line 17 
17  #  #
18  ########################################################################  ########################################################################
19    
20    =head1 TODO
21    
22    =over 4
23    
24    =item Null arg to ContigO::dna_seq() should return entire contig seq.
25    
26    =item Add method to access "FIG::crude_estimate_of_distance()"
27    
28    =back
29    
30    =cut
31    
32  =head1 Overview  =head1 Overview
33    
34  This module is a set of packages encapsulating the SEED's core methods  This module is a set of packages encapsulating the SEED's core methods
35  using an "OOP-like" style.  using an "OOP-like" style.
36    
37  There are several modules clearly related to "individual genomes:"  There are several modules clearly related to "individual genomes:"
38  FIGO, GenomeO, ContigO, FeatureO (and I<maybe> AnnotationO).  GenomeO, ContigO, FeatureO (and I<maybe> AnnotationO).
39    
40  There are also modules that deal with complex relationships between  There are also modules that deal with complex relationships between
41  pairs or sets of features in one, two, or more genomes,  pairs or sets of features in one, two, or more genomes,
# Line 32  Line 44 
44    
45  Finally, the methods in "Attribute" might in principle attach  Finally, the methods in "Attribute" might in principle attach
46  "atributes" to any type of object.  "atributes" to any type of object.
47  (Likewise, in principle one might like to attach an "annotation"  (Likewise, in principle one might also want to attach an "annotation"
48  to any type of object  to any type of object,
49    although currently we only support annotations of "features.")
50    
51  The four modules dealing with "genomes" have a reasonable clear  The three modules that act on "individual genomes" have a reasonable clear
52  "implied heirarchy:"  "implied heirarchy" relative to FIGO:
53    
54  =over 4  =over 4
55    
# Line 693  Line 706 
706    
707  =item RETURNS:  =item RETURNS:
708    
709      string of DNA sequence running from $beg to $end  String containing DNA subsequence running from $beg to $end
710      (NOTE: if $beg > $end, returns reverse complement of DNA subsequence.)      (NOTE: if $beg > $end, returns reverse complement of DNA subsequence.)
711    
712  =back  =back
# Line 1205  Line 1218 
1218      my($self) = @_;      my($self) = @_;
1219      my $figO = $self->{_figO};      my $figO = $self->{_figO};
1220      my $fig = $figO->{_fig};      my $fig = $figO->{_fig};
     my($tmp_dir) = $figO->{_tmp_dir};  
1221    
1222      my $tmp_dna  = "$tmp_dir/tmp_dna.$$.fasta";      return $fig->possible_frameshift($self->id);
     my $tmp_prot = "$tmp_dir/tmp_prot.$$.fasta";  
   
     #...Skip tests and return '0' if truncated...  
     if (! $self->possibly_truncated)  
     {  
         #...Get best precomputed BLAST hit if E-value < 1.0e-20:  
         my @sims = $self->sims( -max => 5, -cutoff => 1.0e-20);  
         while ((@sims > 0) && $fig->possibly_truncated($sims[0]->id2)) { shift @sims }  
   
         #...If a sim was returned:  
         if (my $sim = shift @sims)  
         {  
             #...Get best hit FID and boundaries:  
             my $peg2 = $sim->id2;  
             my $ln1  = $sim->ln1;  
             my $ln2  = $sim->ln2;  
             my $b2   = $sim->b2;  
             my $e2   = $sim->e2;  
   
             #...Convert from AA to BP, and pad out w/ 100 bp guard region:  
             my $adjL = 100 + (($b2-1) * 3);  
             my $adjR = 100 + (($ln2 - $e2) * 3);  
   
             if ($ENV{DEBUG}) { print STDERR "adjL = $adjL adjR = $adjR ln1 = $ln1 peg2 = $peg2 ln2 = $ln2\n" }  
             #...If hit is more than 20% longer than query:  
             if ($ln2 > (1.2 * $ln1))  
             {  
                 #...Get and parse query location:  
                 my $loc = $self->location;  
                 if ($loc =~ /^(\S+)_(\d+)_(\d+)/)  
                 {  
                     my $contig = $1;  
                     my $beg    = $2;  
                     my $end    = $3;  
   
                     #...Create new ContigO object:  
                     my $contigO = new ContigO($figO, $self->genome->id, $contig);  
   
                     #...Extract DNA subsequence, including guard regions:  
                     my($begA,$endA,$dna);  
                     if ($beg < $end)  
                     {  
                         $begA = &max(1, $beg - $adjL);  
                         $endA = &min($end+$adjR, $contigO->contig_length);  
                         $dna  = $contigO->dna_seq($begA,$endA);  
                     }  
                     else  
                     {  
                         $endA = &max(1, $beg - $adjL);  
                         $begA = &min($end+$adjR, $contigO->contig_length);  
                         $dna  = $contigO->dna_seq($begA,$endA);  
                     }  
   
                     if (defined($dna) && (length($dna) > 90))  
                     {  
                         #...Open tmp-file and write FASTA containing DNA subregion to be BLASTed:  
                         open( TMP, ">$tmp_dna") || die "could not open $tmp_dna";  
                         print TMP  ">dna\n$dna\n";  
                         close(TMP);  
   
                         #...Create new FeatureO object corresponding tp $peg2:  
                         my $pegO2 = new FeatureO($figO,$peg2);  
   
                         #...Fetch its translation, and print to tmp FASTA file for BLASTing:  
                         my $prot  = $pegO2->prot_seq;  
                         if (defined($prot) && (length($prot) > 30))  
                         {  
                             open( TMP, ">$tmp_prot") || die "could not open $tmp_prot";  
                             print TMP  ">tmp_prot\n$prot\n";  
                             close(TMP);  
   
                             #...Build BLAST nucleotide database for extracted DNA region,  
                             #   and TBLASTN $peg2 against the DNA:  
                             &run("formatdb -i $tmp_dna -pF");  
                             open(BLAST,"blastall -i $tmp_prot -d $tmp_dna -p tblastn -FF -e 1.0e-20 |")  
                                 || die "could not blast";  
   
                             #...Parse the TBLASTN output; find and sort HSPs by left boundary:  
                             my $db_seq_out = &gjoparseblast::next_blast_subject(\*BLAST,1);  
                             if ($ENV{DEBUG}) { print STDERR &Dumper(['blast output',$db_seq_out]) }  
                             my @hsps       = sort { $a->[0] <=> $b->[0] }  
                                               map { [$_->[9], $_->[10], $_->[12], $_->[13]] }  
                                               grep { $_->[1] < 1.0e-20 }  
                                               @ { $db_seq_out->[6] };  
   
                             #...Extract HSP boundary pairs:  
                             my @prot = map { [$_->[0], $_->[1]] } @hsps;  
                             my @dna  = map { [$_->[2], $_->[3]] } @hsps;  
                             if ($ENV{DEBUG}) { print STDERR &Dumper(\@prot,\@dna) }  
   
                             #...If the "cover" of the HSPs covers more than 90% of $peg2 w gaps < 3 AA,  
                             #   and the "cover" of the HPSs cover more than 90% of the extracted DNA  
                             #   w/ gaps < 9 bp (but not a multiple of 3), suspect a possible frameshift:  
                             if (&covers(\@prot,length($prot),3,0) && &covers(\@dna,3*length($prot),9,1))  
                             {  
                                 unlink($tmp_dna,$tmp_prot);  
                                 return 1;  
                             }  
                         }  
                     }  
                 }  
             }  
         }  
     }  
     unlink($tmp_dna,$tmp_prot);  
     return 0;  
1223  }  }
1224    
1225    
# Line 1404  Line 1310 
1310      return ($x < $y) ? $x : $y;      return ($x < $y) ? $x : $y;
1311  }  }
1312    
   
   
 =head3 covers  
   
 (Question: Should this function be considered "PRIVATE" ???)  
   
 USAGE:  
     C<< if (&covers(\@hits, $len, $diff, $must_shift)) { #...Do stuff } >>  
   
 Returns boolean C<TRUE> if a set of BLAST HSPs "cover" more than 90%  
 of the database sequence(?).  
   
 =cut  
   
 sub covers {  
     my($hsps,$ln,$diff,$must_shift) = @_;  
   
     if ($ENV{DEBUG}) { print STDERR &Dumper(['hsps',$hsps,'ln',$ln,'diff',$diff,'must_shift',$must_shift]) }  
     my $hsp1 = shift @$hsps;  
     my $hsp2;  
     my $merged = 0;  
     while ($hsp1 && ($hsp2 = shift @$hsps) &&  
            ($must_shift ? &diff_frames($hsp1,$hsp2) : 1) &&  
            ($hsp1 = &merge($hsp1,$hsp2,$diff)))  
     {  
         $merged = 1;  
         if ($ENV{DEBUG}) { print STDERR &Dumper(['merged',$hsp1]) }  
     }  
     return ($merged && $hsp1 && (($hsp1->[1] - $hsp1->[0]) > (0.9 * $ln)));  
 }  
   
 sub diff_frames {  
     my($hsp1,$hsp2) = @_;  
     return ((($hsp1->[1]+1) % 3) != ($hsp2->[0] % 3));  
 }  
   
   
   
 =head3 merge  
   
 Merge two HSPs unless their overlap or separation is too large.  
   
 RETURNS: Merged boundaries if merger succeeds, and C<undef> if merger fails.  
   
 =cut  
   
 sub merge {  
     my($hsp1,$hsp2,$diff) = @_;  
   
     my($b1,$e1) = @$hsp1;  
     my($b2,$e2) = @$hsp2;  
     return (($e2 > $e1) && (($b2-$e1) <= $diff)) ? [$b1,$e2] : undef;  
 }  
   
   
   
1313  =head3 sims  =head3 sims
1314    
1315  =over 4  =over 4

Legend:
Removed from v.1.28  
changed lines
  Added in v.1.31

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3