[Bio] / FigKernelPackages / FIGO.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FIGO.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.27, Thu Oct 4 00:58:56 2007 UTC revision 1.32, Thu Dec 6 13:59:34 2007 UTC
# Line 17  Line 17 
17  #  #
18  ########################################################################  ########################################################################
19    
20    =head1 TODO
21    
22    =over 4
23    
24    =item Null arg to ContigO::dna_seq() should return entire contig seq.
25    
26    =item Add method to access "FIG::crude_estimate_of_distance()"
27    
28    =back
29    
30    =cut
31    
32  =head1 Overview  =head1 Overview
33    
34  This module is a set of packages encapsulating the SEED's core methods  This module is a set of packages encapsulating the SEED's core methods
35  using an "OOP-like" style.  using an "OOP-like" style.
36    
37  There are several modules clearly related to "individual genomes:"  There are several modules clearly related to "individual genomes:"
38  FIGO, GenomeO, ContigO, FeatureO (and I<maybe> AnnotationO).  GenomeO, ContigO, FeatureO (and I<maybe> AnnotationO).
39    
40  There are also modules that deal with complex relationships between  There are also modules that deal with complex relationships between
41  pairs or sets of features in one, two, or more genomes,  pairs or sets of features in one, two, or more genomes,
# Line 32  Line 44 
44    
45  Finally, the methods in "Attribute" might in principle attach  Finally, the methods in "Attribute" might in principle attach
46  "atributes" to any type of object.  "atributes" to any type of object.
47  (Likewise, in principle one might like to attach an "annotation"  (Likewise, in principle one might also want to attach an "annotation"
48  to any type of object  to any type of object,
49    although currently we only support annotations of "features.")
50    
51  The four modules dealing with "genomes" have a reasonable clear  The three modules that act on "individual genomes" have a reasonable clear
52  "implied heirarchy:"  "implied heirarchy" relative to FIGO:
53    
54  =over 4  =over 4
55    
# Line 92  Line 105 
105    
106  =item USAGE:  =item USAGE:
107    
108  C<< my $figo = FIGO->new();           #...Subclass defaults to FIG >>      my $figo = FIGO->new();           #...Subclass defaults to FIG
109    
110  C<< my $figo = FIGO->new('SPROUT');   #...Subclass is a SPROUT object >>      my $figo = FIGO->new('SPROUT');   #...Subclass is a SPROUT object
111    
112  =back  =back
113    
# Line 137  Line 150 
150    
151  =item USAGE:  =item USAGE:
152    
153  C<< my @tax_ids = $figo->genomes(); >>      my @tax_ids = $figo->genomes();
154    
155  C<< my @tax_ids = $figo->genomes( @constraints ); >>      my @tax_ids = $figo->genomes( @constraints );
156    
157  =item @constraints  =item @constraints
158    
# Line 258  Line 271 
271    
272  =item USAGE:  =item USAGE:
273    
274  C<< foreach $fam ($figO->all_figfams) { #...Do something } >>      foreach $fam ($figO->all_figfams) { #...Do something }
275    
276  =item RETURNS:  =item RETURNS:
277    
# Line 287  Line 300 
300    
301  =item USAGE:  =item USAGE:
302    
303  C<< my ($fam, $sims) = $figO->family_containing($seq); >>      my ($fam, $sims) = $figO->family_containing($seq);
304    
305  =item $seq:  =item $seq:
306    
# Line 327  Line 340 
340    
341  =item USAGE:  =item USAGE:
342    
343  C<< my $fam = $figO->figfam($family_id); >>      my $fam = $figO->figfam($family_id);
344    
345  =item $family_id;  =item $family_id;
346    
# Line 366  Line 379 
379    
380  =item USAGE:  =item USAGE:
381    
382  C<< my $orgO = GenomeO->new($figO, $tax_id); >>      my $orgO = GenomeO->new($figO, $tax_id);
383    
384  =item RETURNS:  =item RETURNS:
385    
# Line 393  Line 406 
406    
407  =item USAGE:  =item USAGE:
408    
409  C<< my $tax_id = $orgO->id(); >>      my $tax_id = $orgO->id();
410    
411  =item RETURNS:  =item RETURNS:
412    
# Line 417  Line 430 
430    
431  =item USAGE:  =item USAGE:
432    
433  C<< $gs = $orgO->genus_species(); >>      $gs = $orgO->genus_species();
434    
435  =item RETURNS:  =item RETURNS:
436    
# Line 447  Line 460 
460    
461  =item USAGE:  =item USAGE:
462    
463  C<< my $taxonomy = $orgO->taxonomy_of(); >>      my $taxonomy = $orgO->taxonomy_of();
464    
465  =item RETURNS:  =item RETURNS:
466    
# Line 503  Line 516 
516    
517  =item USAGE:  =item USAGE:
518    
519  C<< my @featureOs = $orgO->features_of();        #...Fetch all features >>      my @featureOs = $orgO->features_of();        #...Fetch all features
520    
521  or  or
522    
523  C<< my @featureOs = $orgO->features_of('peg');   #...Fetch only PEGs >>      my @featureOs = $orgO->features_of('peg');   #...Fetch only PEGs
524    
525  =item RETURNS:  =item RETURNS:
526    
# Line 535  Line 548 
548    
549  =item USAGE:  =item USAGE:
550    
551  C<< $genome->display(); >>      $genome->display();
552    
553  =item RETURNS:  =item RETURNS:
554    
# Line 572  Line 585 
585    
586  =item USAGE:  =item USAGE:
587    
588  C<< $contig = ContigO->new( $figO, $genomeId, $contigId); >>      $contig = ContigO->new( $figO, $genomeId, $contigId);
589    
590  =item $figO:  =item $figO:
591    
# Line 631  Line 644 
644    
645  =item USAGE:  =item USAGE:
646    
647  C<< my $tax_id = $contig->genome->id(); >>      my $tax_id = $contig->genome->id();
648    
649  =item RETURNS:  =item RETURNS:
650    
# Line 656  Line 669 
669    
670  =item USAGE:  =item USAGE:
671    
672  C<< my $len = $contig->contig_length(); >>      my $len = $contig->contig_length();
673    
674  =item RETURNS:  =item RETURNS:
675    
# Line 681  Line 694 
694    
695  =item USAGE:  =item USAGE:
696    
697  C<< my $seq = $contig->dna_seq(beg, $end); >>      my $seq = $contig->dna_seq(beg, $end);
698    
699  =item $beg:  =item $beg:
700    
# Line 693  Line 706 
706    
707  =item RETURNS:  =item RETURNS:
708    
709      string of DNA sequence running from $beg to $end  String containing DNA subsequence running from $beg to $end
710      (NOTE: if $beg > $end, returns reverse complement of DNA subsequence.)      (NOTE: if $beg > $end, returns reverse complement of DNA subsequence.)
711    
712  =back  =back
# Line 775  Line 788 
788    
789  =item USAGE:  =item USAGE:
790    
791  C<< my $feature = FeatureO->new( $figO, $fid ); >>      my $feature = FeatureO->new( $figO, $fid );
792    
793  =item C<$figO>:  =item C<$figO>:
794    
# Line 811  Line 824 
824    
825  =item USAGE:  =item USAGE:
826    
827  C<< my $fid = $feature->id(); >>      my $fid = $feature->id();
828    
829  =item RETURNS:  =item RETURNS:
830    
# Line 835  Line 848 
848    
849  =item USAGE:  =item USAGE:
850    
851  C<< my $taxid = $feature->genome(); >>      my $taxid = $feature->genome();
852    
853  =item RETURNS:  =item RETURNS:
854    
# Line 860  Line 873 
873    
874  =item USAGE:  =item USAGE:
875    
876  C<< my $feature_type = $feature->type(); >>      my $feature_type = $feature->type();
877    
878  =item RETURNS:  =item RETURNS:
879    
# Line 885  Line 898 
898    
899  =item USAGE:  =item USAGE:
900    
901  C<< my $loc = $feature->location(); >>      my $loc = $feature->location();
902    
903  =item RETURNS:  =item RETURNS:
904    
# Line 910  Line 923 
923    
924  =item USAGE:  =item USAGE:
925    
926  C<< my $contig = $feature->contig(); >>      my $contig = $feature->contig();
927    
928  =item RETURNS:  =item RETURNS:
929    
# Line 938  Line 951 
951    
952  =item USAGE:  =item USAGE:
953    
954  C<< my $beg = $feature->begin(); >>      my $beg = $feature->begin();
955    
956  =item RETURNS:  =item RETURNS:
957    
# Line 963  Line 976 
976    
977  =item USAGE:  =item USAGE:
978    
979  C<< my $end = $feature->end(); >>      my $end = $feature->end();
980    
981  =item RETURNS:  =item RETURNS:
982    
# Line 988  Line 1001 
1001    
1002  =item USAGE:  =item USAGE:
1003    
1004  C<< my $dna_seq = $feature->dna_seq(); >>      my $dna_seq = $feature->dna_seq();
1005    
1006  =item RETURNS:  =item RETURNS:
1007    
# Line 1018  Line 1031 
1031    
1032  =item USAGE:  =item USAGE:
1033    
1034  C<< my $dna_seq = $feature->prot_seq(); >>      my $dna_seq = $feature->prot_seq();
1035    
1036  =item RETURNS:  =item RETURNS:
1037    
# Line 1046  Line 1059 
1059    
1060  =item USAGE:  =item USAGE:
1061    
1062  C<< my $func = $feature->function_of(); >>      my $func = $feature->function_of();
1063    
1064  =item RETURNS:  =item RETURNS:
1065    
# Line 1073  Line 1086 
1086    
1087  =item USAGE:  =item USAGE:
1088    
1089  C<< my @coupled_features = $feature->coupled_to(); >>      my @coupled_features = $feature->coupled_to();
1090    
1091  =item RETURNS:  =item RETURNS:
1092    
# Line 1108  Line 1121 
1121    
1122  =item USAGE:  =item USAGE:
1123    
1124  C<< my @annot_list = $feature->annotations(); >>      my @annot_list = $feature->annotations();
1125    
1126  =item RETURNS:  =item RETURNS:
1127    
# Line 1134  Line 1147 
1147    
1148  =item USAGE:  =item USAGE:
1149    
1150  C<< my @subsys_list = $feature->in_subsystems(); >>      my @subsys_list = $feature->in_subsystems();
1151    
1152  =item RETURNS:  =item RETURNS:
1153    
# Line 1160  Line 1173 
1173    
1174  =item USAGE:  =item USAGE:
1175    
1176  C<< my $trunc = $feature->possibly_truncated(); >>      my $trunc = $feature->possibly_truncated();
1177    
1178  =item RETURNS:  =item RETURNS:
1179    
# Line 1187  Line 1200 
1200    
1201  =item USAGE:  =item USAGE:
1202    
1203  C<< my $fs = $feature->possible_frameshift(); >>      my $fs = $feature->possible_frameshift();
1204    
1205  =item RETURNS:  =item RETURNS:
1206    
# Line 1205  Line 1218 
1218      my($self) = @_;      my($self) = @_;
1219      my $figO = $self->{_figO};      my $figO = $self->{_figO};
1220      my $fig = $figO->{_fig};      my $fig = $figO->{_fig};
     my($tmp_dir) = $figO->{_tmp_dir};  
1221    
1222      #...Skip tests and return '0' if truncated...      return $fig->possible_frameshift($self->id);
     if (! $self->possibly_truncated)  
     {  
         #...Get best precomputed BLAST hit if E-value < 1.0e-20:  
         my @sims = $self->sims( -max => 5, -cutoff => 1.0e-20);  
         while ((@sims > 0) && $fig->possibly_truncated($sims[0]->id2)) { shift @sims }  
   
         #...If a sim was returned:  
         if (my $sim = shift @sims)  
         {  
             #...Get best hit FID and boundaries:  
             my $peg2 = $sim->id2;  
             my $ln1  = $sim->ln1;  
             my $ln2  = $sim->ln2;  
             my $b2   = $sim->b2;  
             my $e2   = $sim->e2;  
   
             #...Convert from AA to BP, and pad out w/ 100 bp guard region:  
             my $adjL = 100 + (($b2-1) * 3);  
             my $adjR = 100 + (($ln2 - $e2) * 3);  
   
             if ($ENV{DEBUG}) { print STDERR "adjL = $adjL adjR = $adjR ln1 = $ln1 peg2 = $peg2 ln2 = $ln2\n" }  
             #...If hit is more than 20% longer than query:  
             if ($ln2 > (1.2 * $ln1))  
             {  
                 #...Get and parse query location:  
                 my $loc = $self->location;  
                 if ($loc =~ /^(\S+)_(\d+)_(\d+)/)  
                 {  
                     my $contig = $1;  
                     my $beg    = $2;  
                     my $end    = $3;  
   
                     #...Create new ContigO object:  
                     my $contigO = new ContigO($figO, $self->genome->id, $contig);  
   
                     #...Extract DNA subsequence, including guard regions:  
                     my($begA,$endA,$dna);  
                     if ($beg < $end)  
                     {  
                         $begA = &max(1, $beg - $adjL);  
                         $endA = &min($end+$adjR, $contigO->contig_length);  
                         $dna  = $contigO->dna_seq($begA,$endA);  
                     }  
                     else  
                     {  
                         $endA = &max(1, $beg - $adjL);  
                         $begA = &min($end+$adjR, $contigO->contig_length);  
                         $dna  = $contigO->dna_seq($begA,$endA);  
                     }  
   
                     if (defined($dna) && (length($dna) > 90))  
                     {  
                         #...Open tmp-file and write FASTA containing DNA subregion to be BLASTed:  
                         open( TMP, ">$tmp_dir/tmp_dna") || die "could not open tmp_dna";  
                         print TMP  ">dna\n$dna\n";  
                         close(TMP);  
   
                         #...Create new FeatureO object corresponding tp $peg2:  
                         my $pegO2 = new FeatureO($figO,$peg2);  
   
                         #...Fetch its translation, and print to tmp FASTA file for BLASTing:  
                         my $prot  = $pegO2->prot_seq;  
                         if (defined($prot) && (length($prot) > 30))  
                         {  
                             open( TMP, ">$tmp_dir/tmp_prot") || die "could not open tmp_prot";  
                             print TMP  ">tmp_prot\n$prot\n";  
                             close(TMP);  
   
                             #...Build BLAST nucleotide database for extracted DNA region,  
                             #   and TBLASTN $peg2 against the DNA:  
                             &run("formatdb -i $tmp_dir/tmp_dna -pF");  
                             open(BLAST,"blastall -i $tmp_dir/tmp_prot -d $tmp_dir/tmp_dna -p tblastn -FF -e 1.0e-20 |")  
                                 || die "could not blast";  
   
                             #...Parse the TBLASTN output; find and sort HSPs by left boundary:  
                             my $db_seq_out = &gjoparseblast::next_blast_subject(\*BLAST,1);  
                             if ($ENV{DEBUG}) { print STDERR &Dumper(['blast output',$db_seq_out]) }  
                             my @hsps       = sort { $a->[0] <=> $b->[0] }  
                                               map { [$_->[9], $_->[10], $_->[12], $_->[13]] }  
                                               grep { $_->[1] < 1.0e-20 }  
                                               @ { $db_seq_out->[6] };  
   
                             #...Extract HSP boundary pairs:  
                             my @prot = map { [$_->[0], $_->[1]] } @hsps;  
                             my @dna  = map { [$_->[2], $_->[3]] } @hsps;  
                             if ($ENV{DEBUG}) { print STDERR &Dumper(\@prot,\@dna) }  
   
                             #...If the "cover" of the HSPs covers more than 90% of $peg2 w gaps < 3 AA,  
                             #   and the "cover" of the HPSs cover more than 90% of the extracted DNA  
                             #   w/ gaps < 9 bp (but not a multiple of 3), suspect a possible frameshift:  
                             if (&covers(\@prot,length($prot),3,0) && &covers(\@dna,3*length($prot),9,1))  
                             {  
                                 return 1;  
                             }  
                         }  
                     }  
                 }  
             }  
         }  
     }  
     return 0;  
1223  }  }
1224    
1225    
# Line 1325  Line 1236 
1236    
1237  =item USAGE:  =item USAGE:
1238    
1239  C<< $feature->run($cmd); >>      $feature->run($cmd);
1240    
1241  =item RETURNS:  =item RETURNS:
1242    
# Line 1351  Line 1262 
1262    
1263  =item USAGE:  =item USAGE:
1264    
1265  C<< my $max = $feature->max($x, $y); >>      my $max = $feature->max($x, $y);
1266    
1267  =item C<$x> and  C<$y>  =item C<$x> and  C<$y>
1268    
# Line 1380  Line 1291 
1291    
1292  =item USAGE:  =item USAGE:
1293    
1294  C<< my $min = $feature->min($x, $y); >>      my $min = $feature->min($x, $y);
1295    
1296  =item C<$x> and C<$y>  =item C<$x> and C<$y>
1297    
# Line 1399  Line 1310 
1310      return ($x < $y) ? $x : $y;      return ($x < $y) ? $x : $y;
1311  }  }
1312    
   
   
 =head3 covers  
   
 (Question: Should this function be considered "PRIVATE" ???)  
   
 USAGE:  
     C<< if (&covers(\@hits, $len, $diff, $must_shift)) { #...Do stuff } >>  
   
 Returns boolean C<TRUE> if a set of BLAST HSPs "cover" more than 90%  
 of the database sequence(?).  
   
 =cut  
   
 sub covers {  
     my($hsps,$ln,$diff,$must_shift) = @_;  
   
     if ($ENV{DEBUG}) { print STDERR &Dumper(['hsps',$hsps,'ln',$ln,'diff',$diff,'must_shift',$must_shift]) }  
     my $hsp1 = shift @$hsps;  
     my $hsp2;  
     my $merged = 0;  
     while ($hsp1 && ($hsp2 = shift @$hsps) &&  
            ($must_shift ? &diff_frames($hsp1,$hsp2) : 1) &&  
            ($hsp1 = &merge($hsp1,$hsp2,$diff)))  
     {  
         $merged = 1;  
         if ($ENV{DEBUG}) { print STDERR &Dumper(['merged',$hsp1]) }  
     }  
     return ($merged && $hsp1 && (($hsp1->[1] - $hsp1->[0]) > (0.9 * $ln)));  
 }  
   
 sub diff_frames {  
     my($hsp1,$hsp2) = @_;  
     return ((($hsp1->[1]+1) % 3) != ($hsp2->[0] % 3));  
 }  
   
   
   
 =head3 merge  
   
 Merge two HSPs unless their overlap or separation is too large.  
   
 RETURNS: Merged boundaries if merger succeeds, and C<undef> if merger fails.  
   
 =cut  
   
 sub merge {  
     my($hsp1,$hsp2,$diff) = @_;  
   
     my($b1,$e1) = @$hsp1;  
     my($b2,$e2) = @$hsp2;  
     return (($e2 > $e1) && (($b2-$e1) <= $diff)) ? [$b1,$e2] : undef;  
 }  
   
   
   
1313  =head3 sims  =head3 sims
1314    
1315  =over 4  =over 4
# Line 1465  Line 1320 
1320    
1321  =item USAGE:  =item USAGE:
1322    
1323  C<< my @sims = $pegO->sims( -all, -cutoff => 1.0e-10); >>      my @sims = $pegO->sims( -all, -cutoff => 1.0e-10);
1324    
1325  C<< my @sims = $pegO->sims( -max => 50, -cutoff => 1.0e-10); >>      my @sims = $pegO->sims( -max => 50, -cutoff => 1.0e-10);
1326    
1327  =item RETURNS: List of sim objects.  =item RETURNS: List of sim objects.
1328    
# Line 1513  Line 1368 
1368    
1369  =item USAGE:  =item USAGE:
1370    
1371  C<< my @bbhs = $pegO->bbhs(); >>      my @bbhs = $pegO->bbhs();
1372    
1373  =item RETURNS:  =item RETURNS:
1374    
# Line 1549  Line 1404 
1404    
1405  =item USAGE:  =item USAGE:
1406    
1407  C<< $pegO->display(); >>      $pegO->display();
1408    
1409  =item RETURNS;  =item RETURNS;
1410    
# Line 1608  Line 1463 
1463    
1464  =item USAGE:  =item USAGE:
1465    
1466  C<< my $peg1 = $bbh->peg1(); >>      my $peg1 = $bbh->peg1();
1467    
1468  =item RETURNS:  =item RETURNS:
1469    
# Line 1632  Line 1487 
1487    
1488  =item USAGE:  =item USAGE:
1489    
1490  C<< my $peg2 = $bbh->peg2(); >>      my $peg2 = $bbh->peg2();
1491    
1492  =item RETURNS:  =item RETURNS:
1493    
# Line 1658  Line 1513 
1513    
1514  =item USAGE:  =item USAGE:
1515    
1516  C<< my $psc = $bbh->psc(); >>      my $psc = $bbh->psc();
1517    
1518  =item RETURNS:  =item RETURNS:
1519    
# Line 1683  Line 1538 
1538    
1539  =item USAGE:  =item USAGE:
1540    
1541  C<< my $bsc = $bbh->norm_bitscore(); >>      my $bsc = $bbh->norm_bitscore();
1542    
1543  =item RETURNS:  =item RETURNS:
1544    
# Line 1723  Line 1578 
1578    
1579  =item USAGE:  =item USAGE:
1580    
1581  C<< my $annotO = AnnotationO->new( $fid, $timestamp, $who, $text); >>      my $annotO = AnnotationO->new( $fid, $timestamp, $who, $text);
1582    
1583  =item C<$fid>  =item C<$fid>
1584    
# Line 1772  Line 1627 
1627    
1628  =item USAGE:  =item USAGE:
1629    
1630  C<< my $fid = $annotO->fid(); >>      my $fid = $annotO->fid();
1631    
1632  =item RETURNS;  =item RETURNS;
1633    
# Line 1800  Line 1655 
1655    
1656  =item USAGE:  =item USAGE:
1657    
1658  C<< my $fid = $annotO->timestamp(); >>      my $fid = $annotO->timestamp();
1659    
1660  =item RETURNS;  =item RETURNS;
1661    
# Line 1835  Line 1690 
1690    
1691  =item USAGE:  =item USAGE:
1692    
1693  C<< my $fid = $annotO->made_by(); >>      my $fid = $annotO->made_by();
1694    
1695  =item RETURNS;  =item RETURNS;
1696    
# Line 1865  Line 1720 
1720    
1721  =item USGAE:  =item USGAE:
1722    
1723  C<< my $text = $annotO->text(); >>      my $text = $annotO->text();
1724    
1725  =item RETURNS:  =item RETURNS:
1726    
# Line 1894  Line 1749 
1749    
1750  =item USAGE:  =item USAGE:
1751    
1752  C<< my $annotO->display(); >>      my $annotO->display();
1753    
1754  =item RETURNS:  =item RETURNS:
1755    
# Line 1938  Line 1793 
1793    
1794  =item USAGE:  =item USAGE:
1795    
1796  C<< $couplingO = CouplingO->new($figO, $fid1, $fid2, $sc); >>      $couplingO = CouplingO->new($figO, $fid1, $fid2, $sc);
1797    
1798  =item C<$figO>  =item C<$figO>
1799    
# Line 1985  Line 1840 
1840    
1841  =item USAGE:  =item USAGE:
1842    
1843  C<< my $peg1 = $couplingO->peg1(); >>      my $peg1 = $couplingO->peg1();
1844    
1845  =item RETURNS:  =item RETURNS:
1846    
# Line 2014  Line 1869 
1869    
1870  =item USAGE:  =item USAGE:
1871    
1872  C<< my $peg2 = $couplingO->peg2(); >>      my $peg2 = $couplingO->peg2();
1873    
1874  =item RETURNS:  =item RETURNS:
1875    
# Line 2043  Line 1898 
1898    
1899  =item USAGE:  =item USAGE:
1900    
1901  C<< my $sc = $couplingO->sc(); >>      my $sc = $couplingO->sc();
1902    
1903  =item RETURNS:  =item RETURNS:
1904    
# Line 2073  Line 1928 
1928    
1929  =item USAGE:  =item USAGE:
1930    
1931  C<< my $evidence = $couplingO->evidence(); >>      my $evidence = $couplingO->evidence();
1932    
1933  =item RETURNS  =item RETURNS
1934    
# Line 2111  Line 1966 
1966    
1967  =item USAGE:  =item USAGE:
1968    
1969  C<< $couplingO->display(); >>      $couplingO->display();
1970    
1971  =item RETURNS:  =item RETURNS:
1972    

Legend:
Removed from v.1.27  
changed lines
  Added in v.1.32

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3