[Bio] / FigKernelPackages / FIGO.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FIGO.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.25, Wed Sep 26 03:40:10 2007 UTC revision 1.30, Fri Nov 30 21:35:51 2007 UTC
# Line 17  Line 17 
17  #  #
18  ########################################################################  ########################################################################
19    
20    =head1 TODO
21    
22    =over 4
23    
24    =item Null arg to ContigO::dna_seq() should return entire contig seq.
25    
26    =item Add method to access "FIG::crude_estimate_of_distance()"
27    
28    =back
29    
30    =cut
31    
32  =head1 Overview  =head1 Overview
33    
34  This module is a set of packages encapsulating the SEED's core methods  This module is a set of packages encapsulating the SEED's core methods
35  using an "OOP-like" style.  using an "OOP-like" style.
36    
37  There are several modules clearly related to "individual genomes:"  There are several modules clearly related to "individual genomes:"
38  FIGO, GenomeO, ContigO, FeatureO (and I<maybe> AnnotationO).  GenomeO, ContigO, FeatureO (and I<maybe> AnnotationO).
39    
40  There are also modules that deal with complex relationships between  There are also modules that deal with complex relationships between
41  pairs or sets of features in one, two, or more genomes,  pairs or sets of features in one, two, or more genomes,
# Line 32  Line 44 
44    
45  Finally, the methods in "Attribute" might in principle attach  Finally, the methods in "Attribute" might in principle attach
46  "atributes" to any type of object.  "atributes" to any type of object.
47  (Likewise, in principle one might like to attach an "annotation"  (Likewise, in principle one might also want to attach an "annotation"
48  to any type of object  to any type of object,
49    although currently we only support annotations of "features.")
50    
51  The four modules dealing with "genomes" have a reasonable clear  The three modules that act on "individual genomes" have a reasonable clear
52  "implied heirarchy:"  "implied heirarchy" relative to FIGO:
53    
54  =over 4  =over 4
55    
# Line 366  Line 379 
379    
380  =item USAGE:  =item USAGE:
381    
382  C<< my $org = GenomeO->new($figO, $tax_id); >>  C<< my $orgO = GenomeO->new($figO, $tax_id); >>
383    
384  =item RETURNS:  =item RETURNS:
385    
386      A new GenomeO object.  A new "GenomeO" object.
387    
388  =back  =back
389    
# Line 393  Line 406 
406    
407  =item USAGE:  =item USAGE:
408    
409  C<< my $tax_id = $org->id(); >>  C<< my $tax_id = $orgO->id(); >>
410    
411  =item RETURNS:  =item RETURNS:
412    
413      Taxonomy-ID of GenomeO object.  Taxonomy-ID of "GenomeO" object.
414    
415  =back  =back
416    
# Line 417  Line 430 
430    
431  =item USAGE:  =item USAGE:
432    
433  C<< $gs = $genome->genus_species(); >>  C<< $gs = $orgO->genus_species(); >>
434    
435  =item RETURNS:  =item RETURNS:
436    
# Line 439  Line 452 
452    
453  =head3 taxonomy_of  =head3 taxonomy_of
454    
455    =over 4
456    
457    =item FUNCTION:
458    
459    Return the TAXONOMY string of a "GenomeO" object.
460    
461    =item USAGE:
462    
463    C<< my $taxonomy = $orgO->taxonomy_of(); >>
464    
465    =item RETURNS:
466    
467    TAXONOMY string.
468    
469    =back
470    
471  =cut  =cut
472    
473  sub taxonomy_of {  sub taxonomy_of {
# Line 447  Line 476 
476      my $figO = $self->{_figO};      my $figO = $self->{_figO};
477      my $fig  = $figO->{_fig};      my $fig  = $figO->{_fig};
478    
479      return $fig->taxonomy_of($self->id());      return $fig->taxonomy_of($self->{_id});
480  }  }
481    
482    
# Line 479  Line 508 
508    
509  =head3 features_of  =head3 features_of
510    
511    =over 4
512    
513    =item FUNCTION:
514    
515    Returns a list of "FeatureO" objects contained in a "GenomeO" object.
516    
517    =item USAGE:
518    
519    C<< my @featureOs = $orgO->features_of();        #...Fetch all features >>
520    
521    or
522    
523    C<< my @featureOs = $orgO->features_of('peg');   #...Fetch only PEGs >>
524    
525    =item RETURNS:
526    
527    List of "FeatureO" objects.
528    
529    =back
530    
531  =cut  =cut
532    
533  sub features_of {  sub features_of {
# Line 657  Line 706 
706    
707  =item RETURNS:  =item RETURNS:
708    
709      string of DNA sequence running from $beg to $end  String containing DNA subsequence running from $beg to $end
710      (NOTE: if $beg > $end, returns reverse complement of DNA subsequence.)      (NOTE: if $beg > $end, returns reverse complement of DNA subsequence.)
711    
712  =back  =back
# Line 1168  Line 1217 
1217  sub possible_frameshift {  sub possible_frameshift {
1218      my($self) = @_;      my($self) = @_;
1219      my $figO = $self->{_figO};      my $figO = $self->{_figO};
1220        my $fig = $figO->{_fig};
1221      my($tmp_dir) = $figO->{_tmp_dir};      my($tmp_dir) = $figO->{_tmp_dir};
1222    
1223        my $tmp_dna  = "$tmp_dir/tmp_dna.$$.fasta";
1224        my $tmp_prot = "$tmp_dir/tmp_prot.$$.fasta";
1225    
1226      #...Skip tests and return '0' if truncated...      #...Skip tests and return '0' if truncated...
1227      if (! $self->possibly_truncated)      if (! $self->possibly_truncated)
1228      {      {
1229          #...Get best precomputed BLAST hit if E-value < 1.0e-50:          #...Get best precomputed BLAST hit if E-value < 1.0e-20:
1230          my @sims = $self->sims( -max => 1, -cutoff => 1.0e-50);          my @sims = $self->sims( -max => 5, -cutoff => 1.0e-20);
1231            while ((@sims > 0) && $fig->possibly_truncated($sims[0]->id2)) { shift @sims }
1232    
1233          #...If a sim was returned:          #...If a sim was returned:
1234          if (my $sim = shift @sims)          if (my $sim = shift @sims)
# Line 1190  Line 1244 
1244              my $adjL = 100 + (($b2-1) * 3);              my $adjL = 100 + (($b2-1) * 3);
1245              my $adjR = 100 + (($ln2 - $e2) * 3);              my $adjR = 100 + (($ln2 - $e2) * 3);
1246    
1247                if ($ENV{DEBUG}) { print STDERR "adjL = $adjL adjR = $adjR ln1 = $ln1 peg2 = $peg2 ln2 = $ln2\n" }
1248              #...If hit is more than 20% longer than query:              #...If hit is more than 20% longer than query:
1249              if ($ln2 > (1.2 * $ln1))              if ($ln2 > (1.2 * $ln1))
1250              {              {
# Line 1205  Line 1260 
1260                      my $contigO = new ContigO($figO, $self->genome->id, $contig);                      my $contigO = new ContigO($figO, $self->genome->id, $contig);
1261    
1262                      #...Extract DNA subsequence, including guard regions:                      #...Extract DNA subsequence, including guard regions:
1263                      my $begA = &max(1, $beg - $adjL);                      my($begA,$endA,$dna);
1264                      my $endA = &min($end+$adjR, $contigO->contig_length);                      if ($beg < $end)
1265                      my $dna  = $contigO->dna_seq($begA,$endA);                      {
1266                            $begA = &max(1, $beg - $adjL);
1267                            $endA = &min($end+$adjR, $contigO->contig_length);
1268                            $dna  = $contigO->dna_seq($begA,$endA);
1269                        }
1270                        else
1271                        {
1272                            $endA = &max(1, $beg - $adjL);
1273                            $begA = &min($end+$adjR, $contigO->contig_length);
1274                            $dna  = $contigO->dna_seq($begA,$endA);
1275                        }
1276    
1277                      if (defined($dna) && (length($dna) > 90))                      if (defined($dna) && (length($dna) > 90))
1278                      {                      {
1279                          #...Open tmp-file and write FASTA containing DNA subregion to be BLASTed:                          #...Open tmp-file and write FASTA containing DNA subregion to be BLASTed:
1280                          open( TMP, ">$tmp_dir/tmp_dna") || die "could not open tmp_dna";                          open( TMP, ">$tmp_dna") || die "could not open $tmp_dna";
1281                          print TMP  ">dna\n$dna\n";                          print TMP  ">dna\n$dna\n";
1282                          close(TMP);                          close(TMP);
1283    
# Line 1222  Line 1288 
1288                          my $prot  = $pegO2->prot_seq;                          my $prot  = $pegO2->prot_seq;
1289                          if (defined($prot) && (length($prot) > 30))                          if (defined($prot) && (length($prot) > 30))
1290                          {                          {
1291                              open( TMP, ">$tmp_dir/tmp_prot") || die "could not open tmp_prot";                              open( TMP, ">$tmp_prot") || die "could not open $tmp_prot";
1292                              print TMP  ">tmp_prot\n$prot\n";                              print TMP  ">tmp_prot\n$prot\n";
1293                              close(TMP);                              close(TMP);
1294    
1295                              #...Build BLAST nucleotide database for extracted DNA region,                              #...Build BLAST nucleotide database for extracted DNA region,
1296                              #   and TBLASTN $peg2 against the DNA:                              #   and TBLASTN $peg2 against the DNA:
1297                              &run("formatdb -i $tmp_dir/tmp_dna -pF");                              &run("formatdb -i $tmp_dna -pF");
1298                              open(BLAST,"blastall -i $tmp_dir/tmp_prot -d $tmp_dir/tmp_dna -p tblastn -FF -e 1.0e-50 |")                              open(BLAST,"blastall -i $tmp_prot -d $tmp_dna -p tblastn -FF -e 1.0e-20 |")
1299                                  || die "could not blast";                                  || die "could not blast";
1300    
1301                              #...Parse the TBLASTN output; find and sort HSPs by left boundary:                              #...Parse the TBLASTN output; find and sort HSPs by left boundary:
1302                              my $db_seq_out = &gjoparseblast::next_blast_subject(\*BLAST,1);                              my $db_seq_out = &gjoparseblast::next_blast_subject(\*BLAST,1);
1303                                if ($ENV{DEBUG}) { print STDERR &Dumper(['blast output',$db_seq_out]) }
1304                              my @hsps       = sort { $a->[0] <=> $b->[0] }                              my @hsps       = sort { $a->[0] <=> $b->[0] }
1305                                                map { [$_->[9], $_->[10], $_->[12], $_->[13]] }                                                map { [$_->[9], $_->[10], $_->[12], $_->[13]] }
1306                                                grep { $_->[1] < 1.0e-50 }                                                grep { $_->[1] < 1.0e-20 }
1307                                                @ { $db_seq_out->[6] };                                                @ { $db_seq_out->[6] };
1308    
1309                              #...Extract HSP boundary pairs:                              #...Extract HSP boundary pairs:
1310                              my @prot = map { [$_->[0], $_->[1]] } @hsps;                              my @prot = map { [$_->[0], $_->[1]] } @hsps;
1311                              my @dna  = map { [$_->[2], $_->[3]] } @hsps;                              my @dna  = map { [$_->[2], $_->[3]] } @hsps;
1312                                if ($ENV{DEBUG}) { print STDERR &Dumper(\@prot,\@dna) }
1313    
1314                              #...If the "cover" of the HSPs covers more than 90% of $peg2 w gaps < 3 AA,                              #...If the "cover" of the HSPs covers more than 90% of $peg2 w gaps < 3 AA,
1315                              #   and the "cover" of the HPSs cover more than 90% of the extracted DNA                              #   and the "cover" of the HPSs cover more than 90% of the extracted DNA
1316                              #   w/ gaps < 9 bp (but not a multiple of 3), suspect a possible frameshift:                              #   w/ gaps < 9 bp (but not a multiple of 3), suspect a possible frameshift:
1317                              if (&covers(\@prot,length($prot),3,0) && &covers(\@dna,3*length($prot),9,1))                              if (&covers(\@prot,length($prot),3,0) && &covers(\@dna,3*length($prot),9,1))
1318                              {                              {
1319                                  return 1;                                  unlink($tmp_dna,$tmp_prot);
1320                                    return [$contig,$begA,$endA,$dna,$peg2];
1321                              }                              }
1322                          }                          }
1323                      }                      }
# Line 1256  Line 1325 
1325              }              }
1326          }          }
1327      }      }
1328        unlink($tmp_dna,$tmp_prot);
1329      return 0;      return 0;
1330  }  }
1331    
# Line 1364  Line 1434 
1434  sub covers {  sub covers {
1435      my($hsps,$ln,$diff,$must_shift) = @_;      my($hsps,$ln,$diff,$must_shift) = @_;
1436    
1437        if ($ENV{DEBUG}) { print STDERR &Dumper(['hsps',$hsps,'ln',$ln,'diff',$diff,'must_shift',$must_shift]) }
1438      my $hsp1 = shift @$hsps;      my $hsp1 = shift @$hsps;
1439      my $hsp2;      my $hsp2;
1440      my $merged = 0;      my $merged = 0;
1441      while ($hsp1 && ($hsp2 = shift @$hsps) &&      while ($hsp1 && ($hsp2 = shift @$hsps) &&
1442             ($must_shift ? &diff_frames($hsp1,$hsp2) : 1) &&             ($must_shift ? &diff_frames($hsp1,$hsp2) : 1) &&
1443             ($hsp1 = &merge($hsp1,$hsp2,$diff))) { $merged = 1 }             ($hsp1 = &merge($hsp1,$hsp2,$diff)))
1444        {
1445            $merged = 1;
1446            if ($ENV{DEBUG}) { print STDERR &Dumper(['merged',$hsp1]) }
1447        }
1448      return ($merged && $hsp1 && (($hsp1->[1] - $hsp1->[0]) > (0.9 * $ln)));      return ($merged && $hsp1 && (($hsp1->[1] - $hsp1->[0]) > (0.9 * $ln)));
1449  }  }
1450    
1451  sub diff_frames {  sub diff_frames {
1452      my($hsp1,$hsp2) = @_;      my($hsp1,$hsp2) = @_;
1453      return (($hsp1->[0] % 3) != ($hsp2->[0] % 3));      return ((($hsp1->[1]+1) % 3) != ($hsp2->[0] % 3));
1454  }  }
1455    
1456    
# Line 1393  Line 1468 
1468    
1469      my($b1,$e1) = @$hsp1;      my($b1,$e1) = @$hsp1;
1470      my($b2,$e2) = @$hsp2;      my($b2,$e2) = @$hsp2;
1471      return (($e2 > $e1) && (abs($b2-$e1) <= $diff)) ? [$b1,$e2] : undef;      return (($e2 > $e1) && (($b2-$e1) <= $diff)) ? [$b1,$e2] : undef;
1472  }  }
1473    
1474    

Legend:
Removed from v.1.25  
changed lines
  Added in v.1.30

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3