[Bio] / FigKernelPackages / SeedUtils.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/SeedUtils.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.20, Mon Dec 14 02:11:58 2009 UTC revision 1.21, Mon Dec 14 22:22:28 2009 UTC
# Line 26  Line 26 
26      use base qw(Exporter);      use base qw(Exporter);
27      our @EXPORT = qw(hypo boundaries_of parse_fasta_record create_fasta_record      our @EXPORT = qw(hypo boundaries_of parse_fasta_record create_fasta_record
28                       rev_comp genome_of min max sims verify_dir between translate                       rev_comp genome_of min max sims verify_dir between translate
29                       standard_genetic_code parse_location roles_in_function                       standard_genetic_code parse_location roles_of_function
30                       strip_ec location_string location_cmp strand_of);                       strip_ec location_string location_cmp strand_of);
31    
32  =head1 SEED Utility Methods  =head1 SEED Utility Methods
# Line 38  Line 38 
38    
39  =head2 Public Methods  =head2 Public Methods
40    
41    =head3 abbrev
42    
43        my $abbrev = SeedUtils::abbrev($genome_name);
44    
45    Return an abbreviation of the specified genome name. This method is used to create
46    a reasonably indicative genome name that fits in 10 characters.
47    
48    =over 4
49    
50    =item genome_name
51    
52    Genome name to abbreviate.
53    
54    =item RETURN
55    
56    Returns a shortened version of the genome name that is 10 characters or less in
57    length.
58    
59    =back
60    
61    =cut
62    
63    sub abbrev {
64        my($genome_name) = @_;
65    
66        $genome_name =~ s/^(\S{3})\S+/$1./;
67        $genome_name =~ s/^(\S+)\s+(\S{3})\S+/$1$2./;
68        $genome_name =~ s/ //g;
69        if (length($genome_name) > 10) {
70            $genome_name = substr($genome_name,0,10);
71        }
72        return $genome_name;
73    }
74    
75    =head3 between
76    
77        my $flag = between($x, $y, $z);
78    
79    Determine whether or not $y is between $x and $z.
80    
81    =over 4
82    
83    =item x
84    
85    First edge number.
86    
87    =item y
88    
89    Number to examine.
90    
91    =item z
92    
93    Second edge number.
94    
95    =item RETURN
96    
97    Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check
98    is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns
99    TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than
100    I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower,
101    then the return is TRUE if I<$x> >= I$<$y> >= I<$z>.
102    
103    =back
104    
105    =cut
106    #: Return Type $;
107    sub between {
108        shift if UNIVERSAL::isa($_[0],__PACKAGE__);
109        my($x,$y,$z) = @_;
110    
111        if ($x < $z) {
112            return (($x <= $y) && ($y <= $z));
113        } else {
114            return (($x >= $y) && ($y >= $z));
115        }
116    }
117    
118    
119  =head3 boundaries_of  =head3 boundaries_of
120    
121      my ($contig, $min, $max) = boundaries_of($locs);      my ($contig, $min, $max, $dir) = boundaries_of($locs);
122    
123  Return the boundaries of a set of locations. The contig, the leftmost  Return the boundaries of a set of locations. The contig, the leftmost
124  location, and the rightmost location will be returned to the caller. If  location, and the rightmost location will be returned to the caller. If
# Line 58  Line 136 
136    
137  =item RETURN  =item RETURN
138    
139  Returns a 3-element list. The first element is the contig ID from all the locations,  Returns a 4-element list. The first element is the contig ID from all the locations,
140  the second is the offset of leftmost base pair represented in the locations, and the  the second is the offset of leftmost base pair represented in the locations, the
141  third is the offset of the rightmost base pair represented in the locations.  third is the offset of the rightmost base pair represented in the locations, and
142    the fourth is the dominant strand.
143    
144  =back  =back
145    
# Line 73  Line 152 
152      my ($contig, $min, $max);      my ($contig, $min, $max);
153      # We'll put all the starting and ending offsets found in here.      # We'll put all the starting and ending offsets found in here.
154      my @offsets;      my @offsets;
155        # This will be used to count the orientations.
156        my %dirs = ('+' => 0, '-' => 0);
157      # This will count the number of errors found.      # This will count the number of errors found.
158      my $error = 0;      my $error = 0;
159      # Loop through the locations.      # Loop through the locations.
# Line 87  Line 168 
168              } else {              } else {
169                  # Save the contig.                  # Save the contig.
170                  $contig = $newContig;                  $contig = $newContig;
171                    # Count the orientation.
172                    $dirs{$dir}++;
173                  # Compute the ending offset.                  # Compute the ending offset.
174                  my $end = ($dir eq '+' ? $begin + $len - 1 : $begin - $len + 1);                  my $end = ($dir eq '+' ? $begin + $len - 1 : $begin - $len + 1);
175                  # Save both offsets.                  # Save both offsets.
# Line 104  Line 187 
187      # Compute the min and max from the offsets collected.      # Compute the min and max from the offsets collected.
188      $min = min(@offsets);      $min = min(@offsets);
189      $max = max(@offsets);      $max = max(@offsets);
190        # Save the dominant orientation.
191        my $dir = ($dirs{'-'} > $dirs{'+'} ? '-' : '+');
192      # Return the results.      # Return the results.
193      return ($contig, $min, $max);      return ($contig, $min, $max, $dir);
194  }  }
195    
196    =head3 boundary_loc
197    
198  =head3 roles_in_function      my $singleLoc = SeedUtils::boundary_loc($locations);
199    
200      my @roles = roles_in_function($assignment);  Return a single location string (see L<SAP/Location Strings>) that covers
201    the incoming list of locations. NOTE that if the locations listed span
202    more than one contig, this method may return an unexpected result.
203    
204  Return a list of the functional roles in the specified assignment string.  This method is useful for converting the output of L<SAP/fid_locations> to
205  A single assignment may contain multiple roles as well as comments; this  location strings.
 method separates them out.  
206    
207  =over 4  =over 4
208    
209  =item assignment  =item locations
210    
211  Functional assignment to parse for roles.  A set of location strings formatted as a comma-separated list or as a reference
212    to a list of location strings.
213    
214  =item RETURN  =item RETURN
215    
216  Returns a list of the individual roles in the assignment.  Returns a single location string that covers as best as possible the list of
217    incoming locations.
218    
219  =back  =back
220    
221  =cut  =cut
222    
223  sub roles_in_function {  sub boundary_loc {
224      # Get the parameters.      # Get the parameters.
225      my ($assignment) = @_;      my ($locations) = @_;
226      # Remove any comment.      # Convert the incoming locations to a list.
227      my $commentFree = ($assignment =~ /(.+?)\s*#/ ? $1 : $assignment);      my @locs;
228      # Split out the roles.      if (ref $locations eq 'ARRAY') {
229      my @retVal = split /\s+[\/@;]\s+/, $commentFree;          @locs = @$locations;
230        } else {
231            @locs = split /\s*,\s*/, $locations;
232        }
233        # Get the boundary information for the listed locations.
234        my ($contig, $min, $max, $dir) = boundaries_of(\@locs);
235        # Compute the indicated location string.
236        my $retVal = $contig . "_" . ($dir eq '+' ? $min : $max) . $dir .
237                    ($max + 1 - $min);
238      # Return the result.      # Return the result.
239      return @retVal;      return $retVal;
240  }  }
241    
242    =head3 by_fig_id
243    
244  =head3 parse_location      my @sorted_by_fig_id = sort { by_fig_id($a,$b) } @fig_ids;
245    
246      my ($contig, $begin, $end, $strand) = parse_location($locString);  Compare two feature IDs.
247    
248  Return the contigID, start offset, and end offset for a specified  This function is designed to assist in sorting features by ID. The sort is by
249  location string (see L<SAP/Location Strings>).  genome ID followed by feature type and then feature number.
250    
251  =over 4  =over 4
252    
253  =item locString  =item a
254    
255  Location string to parse.  First feature ID.
256    
257    =item b
258    
259    Second feature ID.
260    
261  =item RETURN  =item RETURN
262    
263  Returns a four-element list containing the contig ID from the location  Returns a negative number if the first parameter is smaller, zero if both parameters
264  string, the starting offset of the location, the ending offset, and the  are equal, and a positive number if the first parameter is greater.
 strand. If the location string is not valid, the values returned will be  
 C<undef>.  
265    
266  =back  =back
267    
268  =cut  =cut
269    
270  sub parse_location {  sub by_fig_id {
271      # Get the parameters.      my($a,$b) = @_;
272      my ($locString) = @_;      my($g1,$g2,$t1,$t2,$n1,$n2);
273      # Declare the return variables.      if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
274      my ($contig, $begin, $end, $strand);           ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {
275      # Parse the location string.          ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
     if ($locString =~ /^(.+)_(\d+)([+-])(\d+)$/) {  
         # Pull out the contig ID, strand, and begin location.  
         $contig = $1;  
         $begin = $2;  
         $strand = $3;  
         # Compute the ending location from the direction and length.  
         if ($3 eq '+') {  
             $end = $begin + $4 - 1;  
276          } else {          } else {
277              $end = $begin - $4 + 1;          $a cmp $b;
         }  
278      }      }
     # Return the results.  
     return ($contig, $begin, $end, $strand);  
279  }  }
280    
281  =head3 location_string  =head3 create_fasta_record
282    
283      my $locString = location_string($contig, $beg, $end);      my $fastaString = create_fasta_record($id, $comment, $sequence, $stripped);
284    
285  Form a location string for the specified contig that starts at the  Create a FASTA record from the specified DNA or protein sequence. The
286  indicated begin location and stops at the indicated end location. A  sequence will be split into 60-character lines, and the record will
287  single-base location will automatically be put on the forward strand.  include an identifier line.
288    
289  =over 4  =over 4
290    
291  =item contig  =item id
292    
293  ID of the contig to contain this location.  ID for the sequence, to be placed at the beginning of the identifier
294    line.
295    
296  =item beg  =item comment (optional)
297    
298  Beginning offset of the location.  Comment text to place after the ID on the identifier line. If this parameter
299    is empty, undefined, or 0, no comment will be placed.
300    
301  =item end  =item sequence
302    
303  Ending offset of the location.  Sequence of letters to form into FASTA. For purposes of convenience, whitespace
304    characters in the sequence will be removed automatically.
305    
306    =item stripped (optional)
307    
308    If TRUE, then the sequence will be returned unmodified instead of converted
309    to FASTA format. The default is FALSE.
310    
311  =item RETURN  =item RETURN
312    
313  Returns a location string (see L<SAP/Location String>) for the specified  Returns the desired sequence in FASTA format.
 location.  
314    
315  =back  =back
316    
317  =cut  =cut
318    
319  sub location_string {  sub create_fasta_record {
320      # Get the parameters.      # Get the parameters.
321      my ($contig, $beg, $end) = @_;      my ($id, $comment, $sequence, $stripped) = @_;
322      # Compute the strand and length.      # Declare the return variable.
323      my ($strand, $len);      my $retVal;
324      if ($beg <= $end) {      # If we're in stripped mode, we just return the sequence.
325          $strand = '+';      if ($stripped) {
326          $len = $end + 1 - $beg;          $retVal = $sequence;
327      } else {      } else {
328          $strand = '-';          # Here we have to do the FASTA conversion. Start with the ID.
329          $len = $beg + 1 - $end;          my $header = ">$id";
330            # Add a comment, if any.
331            if ($comment) {
332                $header .= " $comment";
333            }
334            # Clean up the sequence.
335            $sequence =~ s/\s+//g;
336            # We need to format the sequence into 60-byte chunks. We use the infamous
337            # grep-split trick. The split, because of the presence of the parentheses,
338            # includes the matched delimiters in the output list. The grep strips out
339            # the empty list items that appear between the so-called delimiters, since
340            # the delimiters are what we want.
341            my @chunks = grep { $_ } split /(.{1,60})/, $sequence;
342            # Add the chunks and the trailer.
343            $retVal = join("\n", $header, @chunks) . "\n";
344      }      }
345      # Return the result.      # Return the result.
346      return $contig . "_$beg$strand$len";      return $retVal;
347  }  }
348    
349    =head3 display_id_and_seq
350    
351  =head3 location_cmp      SeedUtils::display_id_and_seq($id_and_comment, $seqP, $fh);
   
     my $cmp = location_cmp($loc1, $loc2);  
352    
353  Compare two location strings (see L<SAP/Location Strings>).  Display a fasta ID and sequence to the specified open file. This method is designed
354    to work well with L</read_fasta_sequence> and L</rev_comp>, because it takes as
355    input a string pointer rather than a string. If the file handle is omitted it
356    defaults to STDOUT.
357    
358  The ordering principle for locations is that they are sorted first by contig ID, then by  The output is formatted into a FASTA record. The first line of the output is
359  leftmost position, in reverse order by length, and then by direction. The effect is that  preceded by a C<< > >> symbol, and the sequence is split into 60-character
360  within a contig, the locations are ordered first and foremost in the way they would  chunks displayed one per line. Thus, this method can be used to produce
361  appear when displayed in a picture of the contig and second in such a way that embedded  FASTA files from data gathered by the rest of the system.
 locations come after the locations in which they are embedded. In the case of two  
 locations that represent the exact same base pairs, the forward (C<+>) location is  
 arbitrarily placed first.  
362    
363  =over 4  =over 4
364    
365  =item loc1  =item id_and_comment
   
 First location string to compare.  
366    
367  =item loc2  The sequence ID and (optionally) the comment from the sequence's FASTA record.
368    The ID
369    
370  Second location string to compare.  =item seqP
371    
372  =item RETURN  Reference to a string containing the sequence. The sequence is automatically
373    formatted into 60-character chunks displayed one per line.
374    
375  Returns a negative number if the B<loc1> location sorts first, a positive number if the  =item fh
 B<loc2> location sorts first, and zero if the two locations are the same.  
376    
377    Open file handle to which the ID and sequence should be output. If omitted,
378    C<\*STDOUT> is assumed.
379    
380  =back  =back
381    
382  =cut  =cut
383    
384  sub location_cmp {  sub display_id_and_seq {
385      # Get the parameters.  
386      my ($loc1, $loc2) = @_;      my( $id, $seqP, $fh ) = @_;
387      # Parse the locations.  
388      my ($contig1, $beg1, $strand1, $len1) = $loc1 =~ /^(.+)_(\d+)([+-])(\d+)$/;      if (! defined($fh) )  { $fh = \*STDOUT; }
389      my $left1 = ($strand1 eq '+' ? $beg1 : $beg1 - $len1 + 1);  
390      my ($contig2, $beg2, $strand2, $len2) = $loc2 =~ /^(.+)_(\d+)([+-])(\d+)$/;      print $fh ">$id\n";
391      my $left2 = ($strand2 eq '+' ? $beg2 : $beg2 - $len2 + 1);      &display_seq($seqP, $fh);
392      # Declare the return variable. We compare the indicative parts of the location  }
393      # in order. Note that we sort in reverse order by length, so the last comparison  
394      # puts 2 before 1.  =head3 display_seq
395      my $retVal = ($contig1 cmp $contig2) || ($left1 <=> $left2) ||  
396                   ($len2 <=> $len1);      SeedUtils::display_seq(\$seqP, $fh);
397      # If everything matches to this point, check the strands.  
398      if (! $retVal) {  Display a fasta sequence to the specified open file. If the file handle is
399          if ($strand1 eq '+') {  omitted it defaults to STDOUT.
400              # First location is positive, so if the locations are unequal, it  
401              # sorts first.  The sequence is split into 60-character chunks displayed one per line for
402              $retVal = ($strand2 eq '+' ? 0 : -1);  readability.
403    
404    =over 4
405    
406    =item seqP
407    
408    Reference to a string containing the sequence.
409    
410    =item fh
411    
412    Open file handle to which the sequence should be output. If omitted,
413    C<STDOUT> is assumed.
414    
415    =back
416    
417    =cut
418    
419    sub display_seq {
420    
421        my ( $seqP, $fh ) = @_;
422        my ( $i, $n, $ln );
423    
424        if (! defined($fh) )  { $fh = \*STDOUT; }
425    
426        $n = length($$seqP);
427    #   confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
428        for ($i=0; ($i < $n); $i += 60) {
429            if (($i + 60) <= $n) {
430                $ln = substr($$seqP,$i,60);
431          } else {          } else {
432              # First location is negative, so if the locations are unequal, it              $ln = substr($$seqP,$i,($n-$i));
             # sorts second.  
             $retVal = ($strand1 eq '-' ? 0 : 1);  
433          }          }
434            print $fh "$ln\n";
435      }      }
     # Return the result.  
     return $retVal;  
436  }  }
437    
 =head3 strand_of  
438    
439      my $plusOrMinus = strand_of($loc);  =head3 genome_of
440    
441  Return the strand (C<+> or C<->) from the specified location string.      my $genomeID = genome_of($fid);
442    
443    Return the Genome ID embedded in the specified FIG feature ID.
444    
445  =over 4  =over 4
446    
447  =item loc  =item fid
448    
449  Location string to parse (see L<SAP/Location Strings>).  Feature ID of interest.
450    
451  =item RETURN  =item RETURN
452    
453  Returns C<+> if the location is on the forward strand, else C<->.  Returns the genome ID in the middle portion of the FIG feature ID. If the
454    feature ID is invalid, this method returns an undefined value.
455    
456  =back  =back
457    
458  =cut  =cut
459    
460  sub strand_of {  sub genome_of {
461      # Get the parameters.      # Get the parameters.
462      my ($loc) = @_;      my ($fid) = @_;
463      # Declare the return variable.      # Declare the return variable.
464      my $retVal;      my $retVal;
465      # Parse the strand indicator from the location.      # Parse the feature ID.
466      if ($loc =~ /\d+([+-])\d+/) {      if ($fid =~ /^fig\|(\d+\.\d+)\./) {
467          $retVal = $1;          $retVal = $1;
468      }      }
469      # Return the result.      # Return the result.
470      return $retVal;      return $retVal;
471  }  }
472    
473    =head3 hypo
474    
475        my $flag = hypo($func);
476    
477    Return TRUE if the specified functional role is hypothetical, else FALSE.
478    Hypothetical functional roles are identified by key words in the text,
479    such as I<hypothesis>, I<predicted>, or I<glimmer> (among others).
480    
481    =over 4
482    
483    =item func
484    
485    Text of the functional role whose nature is to be determined.
486    
487    =item RETURN
488    
489    Returns TRUE if the role is hypothetical, else FALSE.
490    
491    =back
492    
493    =cut
494    
495    sub hypo {
496        my ($func) = @_;
497        if (! $func)                             { return 1 }
498        if ($func =~ /lmo\d+ protein/i)          { return 1 }
499        if ($func =~ /hypoth/i)                  { return 1 }
500        if ($func =~ /conserved protein/i)       { return 1 }
501        if ($func =~ /gene product/i)            { return 1 }
502        if ($func =~ /interpro/i)                { return 1 }
503        if ($func =~ /B[sl][lr]\d/i)             { return 1 }
504        if ($func =~ /^U\d/)                     { return 1 }
505        if ($func =~ /^orf[^_]/i)                { return 1 }
506        if ($func =~ /uncharacterized/i)         { return 1 }
507        if ($func =~ /pseudogene/i)              { return 1 }
508        if ($func =~ /^predicted/i)              { return 1 }
509        if ($func =~ /AGR_/)                     { return 1 }
510        if ($func =~ /similar to/i)              { return 1 }
511        if ($func =~ /similarity/i)              { return 1 }
512        if ($func =~ /glimmer/i)                 { return 1 }
513        if ($func =~ /unknown/i)                 { return 1 }
514        if (($func =~ /domain/i) ||
515            ($func =~ /^y[a-z]{2,4}\b/i) ||
516            ($func =~ /complete/i) ||
517            ($func =~ /ensang/i) ||
518            ($func =~ /unnamed/i) ||
519            ($func =~ /EG:/) ||
520            ($func =~ /orf\d+/i) ||
521            ($func =~ /RIKEN/) ||
522            ($func =~ /Expressed/i) ||
523            ($func =~ /[a-zA-Z]{2,3}\|/) ||
524            ($func =~ /predicted by Psort/) ||
525            ($func =~ /^bh\d+/i) ||
526            ($func =~ /cds_/i) ||
527            ($func =~ /^[a-z]{2,3}\d+[^:\+\-0-9]/i) ||
528            ($func =~ /similar to/i) ||
529            ($func =~ / identi/i) ||
530            ($func =~ /ortholog of/i) ||
531            ($func =~ /structural feature/i))    { return 1 }
532        return 0;
533    
534    }
535    
536    =head3 location_cmp
537    
538        my $cmp = location_cmp($loc1, $loc2);
539    
540    Compare two location strings (see L<SAP/Location Strings>).
541    
542    The ordering principle for locations is that they are sorted first by contig ID, then by
543    leftmost position, in reverse order by length, and then by direction. The effect is that
544    within a contig, the locations are ordered first and foremost in the way they would
545    appear when displayed in a picture of the contig and second in such a way that embedded
546    locations come after the locations in which they are embedded. In the case of two
547    locations that represent the exact same base pairs, the forward (C<+>) location is
548    arbitrarily placed first.
549    
550    =over 4
551    
552    =item loc1
553    
554    First location string to compare.
555    
556    =item loc2
557    
558    Second location string to compare.
559    
560    =item RETURN
561    
562    Returns a negative number if the B<loc1> location sorts first, a positive number if the
563    B<loc2> location sorts first, and zero if the two locations are the same.
564    
565    
566    =back
567    
568    =cut
569    
570    sub location_cmp {
571        # Get the parameters.
572        my ($loc1, $loc2) = @_;
573        # Parse the locations.
574        my ($contig1, $beg1, $strand1, $len1) = $loc1 =~ /^(.+)_(\d+)([+-])(\d+)$/;
575        my $left1 = ($strand1 eq '+' ? $beg1 : $beg1 - $len1 + 1);
576        my ($contig2, $beg2, $strand2, $len2) = $loc2 =~ /^(.+)_(\d+)([+-])(\d+)$/;
577        my $left2 = ($strand2 eq '+' ? $beg2 : $beg2 - $len2 + 1);
578        # Declare the return variable. We compare the indicative parts of the location
579        # in order. Note that we sort in reverse order by length, so the last comparison
580        # puts 2 before 1.
581        my $retVal = ($contig1 cmp $contig2) || ($left1 <=> $left2) ||
582                     ($len2 <=> $len1);
583        # If everything matches to this point, check the strands.
584        if (! $retVal) {
585            if ($strand1 eq '+') {
586                # First location is positive, so if the locations are unequal, it
587                # sorts first.
588                $retVal = ($strand2 eq '+' ? 0 : -1);
589            } else {
590                # First location is negative, so if the locations are unequal, it
591                # sorts second.
592                $retVal = ($strand1 eq '-' ? 0 : 1);
593            }
594        }
595        # Return the result.
596        return $retVal;
597    }
598    
599    =head3 location_string
600    
601        my $locString = location_string($contig, $beg, $end);
602    
603    Form a location string for the specified contig that starts at the
604    indicated begin location and stops at the indicated end location. A
605    single-base location will automatically be put on the forward strand.
606    
607    =over 4
608    
609    =item contig
610    
611    ID of the contig to contain this location.
612    
613    =item beg
614    
615    Beginning offset of the location.
616    
617    =item end
618    
619    Ending offset of the location.
620    
621    =item RETURN
622    
623    Returns a location string (see L<SAP/Location String>) for the specified
624    location.
625    
626    =back
627    
628    =cut
629    
630    sub location_string {
631        # Get the parameters.
632        my ($contig, $beg, $end) = @_;
633        # Compute the strand and length.
634        my ($strand, $len);
635        if ($beg <= $end) {
636            $strand = '+';
637            $len = $end + 1 - $beg;
638        } else {
639            $strand = '-';
640            $len = $beg + 1 - $end;
641        }
642        # Return the result.
643        return $contig . "_$beg$strand$len";
644    }
645    
646  =head3 max  =head3 max
647    
# Line 396  Line 705 
705      return $retVal;      return $retVal;
706  }  }
707    
708  =head3 create_fasta_record  =head3 parse_fasta_record
   
     my $fastaString = create_fasta_record($id, $comment, $sequence, $stripped);  
   
 Create a FASTA record from the specified DNA or protein sequence. The  
 sequence will be split into 60-character lines, and the record will  
 include an identifier line.  
709    
710  =over 4      my ($id, $comment, $seq) = parse_fasta_record($string);
711    
712  =item id  Extract the ID, comment, and sequence from a single FASTA record. For
713    backward compatability, instead of a FASTA record the ID and sequence can
714    be specified separated by a comma. In this case, the returned comment
715    will be empty.
716    
717  ID for the sequence, to be placed at the beginning of the identifier  =over 4
 line.  
718    
719  =item comment (optional)  =item string
720    
721  Comment text to place after the ID on the identifier line. If this parameter  A single FASTA record, or an ID and sequence separated by a single comma,
722  is empty, undefined, or 0, no comment will be placed.  an unadorned sequence, a 2-element list consisting of an ID and a sequence,
723    or a 3-element list consisting of an ID, a comment, and a sequence.
724    
725  =item sequence  =item RETURN
726    
727  Sequence of letters to form into FASTA. For purposes of convenience, whitespace  Returns a three-element list consisting of the incoming ID, the associated
728  characters in the sequence will be removed automatically.  comment, and the specified DNA or protein sequence. If the incoming string is
729    invalid, all three list elements will come back undefined. If no ID is
730    specified, an MD5 will be provided.
731    
732  =item stripped (optional)  =back
733    
734  If TRUE, then the sequence will be returned unmodified instead of converted  =cut
735  to FASTA format. The default is FALSE.  
736    sub parse_fasta_record {
737        # Get the parameters.
738        my ($string) = @_;
739        # Declare the return variables.
740        my ($id, $comment, $seq);
741        # Check the type of input string.
742        if (! defined $string) {
743            # Do nothing if no string was passed in. This extra check prevents a
744            # warning at runtime.
745        } elsif ($string =~ /^>(\S+)([\t ]+[^\r\n]*)?[\r\n]+(.+)/s) {
746            # Here we have a standard FASTA string.
747            ($id, $comment, $seq) = ($1, $2, $3);
748            # Remove white space from the sequence string.
749            $seq =~ s/\s+//sg;
750            # Trim front of comment.
751            $comment =~ s/^s+//;
752        } elsif ($string =~ /(.+?)\s*,\s*(.+)/) {
753            ($id, $comment, $seq) = ($1, '', $2);
754        } elsif (ref $string eq 'ARRAY') {
755            # Here the data came in pre-formatted as a list reference.
756            ($id, $comment, $seq) = @$string;
757            # If there's no comment, we need to adjust.
758            if (! defined $seq) {
759                $seq = $comment;
760                $comment = '';
761            }
762        } else {
763            # Here we have only a sequence. We need to construct the ID.
764            $seq = $string;
765            require Digest::MD5;
766            $id = "md5|" . Digest::MD5::md5_base64($seq);
767            $comment = "";
768        }
769        # Return the results.
770        return ($id, $comment, $seq);
771    }
772    
773    =head3 parse_location
774    
775        my ($contig, $begin, $end, $strand) = parse_location($locString);
776    
777    Return the contigID, start offset, and end offset for a specified
778    location string (see L<SAP/Location Strings>).
779    
780    =over 4
781    
782    =item locString
783    
784    Location string to parse.
785    
786  =item RETURN  =item RETURN
787    
788  Returns the desired sequence in FASTA format.  Returns a four-element list containing the contig ID from the location
789    string, the starting offset of the location, the ending offset, and the
790    strand. If the location string is not valid, the values returned will be
791    C<undef>.
792    
793  =back  =back
794    
795  =cut  =cut
796    
797  sub create_fasta_record {  sub parse_location {
798      # Get the parameters.      # Get the parameters.
799      my ($id, $comment, $sequence, $stripped) = @_;      my ($locString) = @_;
800      # Declare the return variable.      # Declare the return variables.
801      my $retVal;      my ($contig, $begin, $end, $strand);
802      # If we're in stripped mode, we just return the sequence.      # Parse the location string.
803      if ($stripped) {      if ($locString =~ /^(.+)_(\d+)([+-])(\d+)$/) {
804          $retVal = $sequence;          # Pull out the contig ID, strand, and begin location.
805            $contig = $1;
806            $begin = $2;
807            $strand = $3;
808            # Compute the ending location from the direction and length.
809            if ($3 eq '+') {
810                $end = $begin + $4 - 1;
811      } else {      } else {
812          # Here we have to do the FASTA conversion. Start with the ID.              $end = $begin - $4 + 1;
         my $header = ">$id";  
         # Add a comment, if any.  
         if ($comment) {  
             $header .= " $comment";  
813          }          }
         # Clean up the sequence.  
         $sequence =~ s/\s+//g;  
         # We need to format the sequence into 60-byte chunks. We use the infamous  
         # grep-split trick. The split, because of the presence of the parentheses,  
         # includes the matched delimiters in the output list. The grep strips out  
         # the empty list items that appear between the so-called delimiters, since  
         # the delimiters are what we want.  
         my @chunks = grep { $_ } split /(.{1,60})/, $sequence;  
         # Add the chunks and the trailer.  
         $retVal = join("\n", $header, @chunks) . "\n";  
814      }      }
815      # Return the result.      # Return the results.
816      return $retVal;      return ($contig, $begin, $end, $strand);
817  }  }
818    
819  =head3 rev_comp  =head3 rev_comp
# Line 496  Line 848 
848      # Determine how we were called.      # Determine how we were called.
849      my ($retVal, $refMode);      my ($retVal, $refMode);
850      if (ref $dna eq 'SCALAR') {      if (ref $dna eq 'SCALAR') {
851          $retVal = lc reverse $dna;          $retVal = reverse $dna;
852          $refMode = 0;          $refMode = 0;
853      } else {      } else {
854          $retVal = lc reverse $$dna;          $retVal = reverse $$dna;
855          $refMode = 1;          $refMode = 1;
856      }      }
857      # Now $retVal contains the reversed DNA string in all lower case, and      # Now $retVal contains the reversed DNA string, and $refMode is TRUE iff the
858      # $refMode is TRUE iff the user passed in a reference. The following      # user passed in a reference. The following translation step complements the
859      # translation step complements the string.      # string.
860      $retVal =~ tr/acgtumrwsykbdhv/tgcaakywsrmvhdb/;      $retVal =~ tr/acgtumrwsykbdhvACGTUMRWSYKBDHV/tgcaakywsrmvhdbTGCAAKYWSRMVHDB/;
861      # Return the result in the method corresponding to the way it came in.      # Return the result in the method corresponding to the way it came in.
862      if ($refMode) {      if ($refMode) {
863          $$dna = $retVal;          $$dna = $retVal;
# Line 515  Line 867 
867      }      }
868  }  }
869    
870    # Synonym of rev_comp, for backward compatibility.
871  =head3 by_fig_id  sub reverse_comp {
872        return rev_comp($_[0]);
     my @sorted_by_fig_id = sort { FIG::by_fig_id($a,$b) } @fig_ids;  
   
 Compare two feature IDs.  
   
 This function is designed to assist in sorting features by ID. The sort is by  
 genome ID followed by feature type and then feature number.  
   
 =over 4  
   
 =item a  
   
 First feature ID.  
   
 =item b  
   
 Second feature ID.  
   
 =item RETURN  
   
 Returns a negative number if the first parameter is smaller, zero if both parameters  
 are equal, and a positive number if the first parameter is greater.  
   
 =back  
   
 =cut  
   
 sub by_fig_id {  
     my($a,$b) = @_;  
     my($g1,$g2,$t1,$t2,$n1,$n2);  
     if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&  
          ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {  
         ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);  
     } else {  
         $a cmp $b;  
     }  
873  }  }
874    
875    =head3 roles_of_function
876    
877  =head3 genome_of      my @roles = roles_of_function($assignment);
   
     my $genomeID = genome_of($fid);  
878    
879  Return the Genome ID embedded in the specified FIG feature ID.  Return a list of the functional roles in the specified assignment string.
880    A single assignment may contain multiple roles as well as comments; this
881    method separates them out.
882    
883  =over 4  =over 4
884    
885  =item fid  =item assignment
886    
887  Feature ID of interest.  Functional assignment to parse for roles.
888    
889  =item RETURN  =item RETURN
890    
891  Returns the genome ID in the middle portion of the FIG feature ID. If the  Returns a list of the individual roles in the assignment.
 feature ID is invalid, this method returns an undefined value.  
892    
893  =back  =back
894    
895  =cut  =cut
896    
897  sub genome_of {  sub roles_of_function {
898      # Get the parameters.      # Get the parameters.
899      my ($fid) = @_;      my ($assignment) = @_;
900      # Declare the return variable.      # Remove any comment.
901      my $retVal;      my $commentFree = ($assignment =~ /(.+?)\s*#/ ? $1 : $assignment);
902      # Parse the feature ID.      # Split out the roles.
903      if ($fid =~ /^fig\|(\d+\.\d+)\./) {      my @retVal = split /\s+[\/@;]\s+/, $commentFree;
         $retVal = $1;  
     }  
904      # Return the result.      # Return the result.
905      return $retVal;      return @retVal;
906  }  }
907    
908  =head3 sims  =head3 sims
# Line 759  Line 1074 
1074      return @retVal;      return @retVal;
1075  }  }
1076    
1077  =head3 verify_dir  =head3 standard_genetic_code
   
     verify_dir($dirName);  
   
 Insure that the specified directory exists. If the directory does not  
 exist, it will be created.  
   
 =over 4  
   
 =item dirName  
1078    
1079  Name of the relevant directory.      my $code = standard_genetic_code();
1080    
1081  =back  Return a hash containing the standard translation of nucleotide triples to proteins.
1082    Methods such as L</translate> can take a translation scheme as a parameter. This method
1083    returns the default translation scheme. The scheme is implemented as a reference to a
1084    hash that contains nucleotide triplets as keys and has protein letters as values.
1085    
1086  =cut  =cut
1087    
1088  sub verify_dir {  sub standard_genetic_code {
     # Get the parameters.  
     my ($dirName) = @_;  
     # Strip off the final slash, if any.  
     $dirName =~ s#/$##;  
     # Only proceed if the directory does NOT already exist.  
     if (! -d $dirName) {  
         # If there is a parent directory, recursively insure it is there.  
         if ($dirName =~ m#(.+)/[^/]+$#) {  
             verify_dir($1);  
         }  
         # Create this particular directory with full permissions.  
         mkdir $dirName, 0777;  
     }  
 }  
   
 =head3 parse_fasta_record  
   
     my ($id, $comment, $seq) = parse_fasta_record($string);  
   
 Extract the ID, comment, and sequence from a single FASTA record. For  
 backward compatability, instead of a FASTA record the ID and sequence can  
 be specified separated by a comma. In this case, the returned comment  
 will be empty.  
   
 =over 4  
   
 =item string  
   
 A single FASTA record, or an ID and sequence separated by a single comma,  
 an unadorned sequence, a 2-element list consisting of an ID and a sequence,  
 or a 3-element list consisting of an ID, a comment, and a sequence.  
   
 =item RETURN  
   
 Returns a three-element list consisting of the incoming ID, the associated  
 comment, and the specified DNA or protein sequence. If the incoming string is  
 invalid, all three list elements will come back undefined. If no ID is  
 specified, an MD5 will be provided.  
   
 =back  
1089    
1090  =cut      my $code = {};
   
 sub parse_fasta_record {  
     # Get the parameters.  
     my ($string) = @_;  
     # Declare the return variables.  
     my ($id, $comment, $seq);  
     # Check the type of input string.  
     if (! defined $string) {  
         # Do nothing if no string was passed in. This extra check prevents a  
         # warning at runtime.  
     } elsif ($string =~ /^>(\S+)([\t ]+[^\r\n]*)?[\r\n]+(.+)/s) {  
         # Here we have a standard FASTA string.  
         ($id, $comment, $seq) = ($1, $2, $3);  
         # Remove white space from the sequence string.  
         $seq =~ s/\s+//sg;  
         # Trim front of comment.  
         $comment =~ s/^s+//;  
     } elsif ($string =~ /(.+?)\s*,\s*(.+)/) {  
         ($id, $comment, $seq) = ($1, '', $2);  
     } elsif (ref $string eq 'ARRAY') {  
         # Here the data came in pre-formatted as a list reference.  
         ($id, $comment, $seq) = @$string;  
         # If there's no comment, we need to adjust.  
         if (! defined $seq) {  
             $seq = $comment;  
             $comment = '';  
         }  
     } else {  
         # Here we have only a sequence. We need to construct the ID.  
         $seq = $string;  
         require Digest::MD5;  
         $id = "md5|" . Digest::MD5::md5_base64($seq);  
         $comment = "";  
     }  
     # Return the results.  
     return ($id, $comment, $seq);  
 }  
   
   
 =head3 hypo  
   
     my $flag = hypo($func);  
   
 Return TRUE if the specified functional role is hypothetical, else FALSE.  
 Hypothetical functional roles are identified by key words in the text,  
 such as I<hypothesis>, I<predicted>, or I<glimmer> (among others).  
   
 =over 4  
   
 =item func  
   
 Text of the functional role whose nature is to be determined.  
   
 =item RETURN  
   
 Returns TRUE if the role is hypothetical, else FALSE.  
   
 =back  
   
 =cut  
   
 sub hypo {  
     my ($func) = @_;  
     if (! $func)                             { return 1 }  
     if ($func =~ /lmo\d+ protein/i)          { return 1 }  
     if ($func =~ /hypoth/i)                  { return 1 }  
     if ($func =~ /conserved protein/i)       { return 1 }  
     if ($func =~ /gene product/i)            { return 1 }  
     if ($func =~ /interpro/i)                { return 1 }  
     if ($func =~ /B[sl][lr]\d/i)             { return 1 }  
     if ($func =~ /^U\d/)                     { return 1 }  
     if ($func =~ /^orf[^_]/i)                { return 1 }  
     if ($func =~ /uncharacterized/i)         { return 1 }  
     if ($func =~ /pseudogene/i)              { return 1 }  
     if ($func =~ /^predicted/i)              { return 1 }  
     if ($func =~ /AGR_/)                     { return 1 }  
     if ($func =~ /similar to/i)              { return 1 }  
     if ($func =~ /similarity/i)              { return 1 }  
     if ($func =~ /glimmer/i)                 { return 1 }  
     if ($func =~ /unknown/i)                 { return 1 }  
     if (($func =~ /domain/i) ||  
         ($func =~ /^y[a-z]{2,4}\b/i) ||  
         ($func =~ /complete/i) ||  
         ($func =~ /ensang/i) ||  
         ($func =~ /unnamed/i) ||  
         ($func =~ /EG:/) ||  
         ($func =~ /orf\d+/i) ||  
         ($func =~ /RIKEN/) ||  
         ($func =~ /Expressed/i) ||  
         ($func =~ /[a-zA-Z]{2,3}\|/) ||  
         ($func =~ /predicted by Psort/) ||  
         ($func =~ /^bh\d+/i) ||  
         ($func =~ /cds_/i) ||  
         ($func =~ /^[a-z]{2,3}\d+[^:\+\-0-9]/i) ||  
         ($func =~ /similar to/i) ||  
         ($func =~ / identi/i) ||  
         ($func =~ /ortholog of/i) ||  
         ($func =~ /structural feature/i))    { return 1 }  
     return 0;  
   
 }  
   
 =head3 between  
   
     my $flag = between($x, $y, $z);  
   
 Determine whether or not $y is between $x and $z.  
   
 =over 4  
   
 =item x  
   
 First edge number.  
   
 =item y  
   
 Number to examine.  
   
 =item z  
   
 Second edge number.  
   
 =item RETURN  
   
 Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check  
 is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns  
 TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than  
 I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower,  
 then the return is TRUE if I<$x> >= I$<$y> >= I<$z>.  
   
 =back  
   
 =cut  
 #: Return Type $;  
 sub between {  
     shift if UNIVERSAL::isa($_[0],__PACKAGE__);  
     my($x,$y,$z) = @_;  
   
     if ($x < $z) {  
         return (($x <= $y) && ($y <= $z));  
     } else {  
         return (($x >= $y) && ($y >= $z));  
     }  
 }  
   
 =head3 standard_genetic_code  
   
     my $code = standard_genetic_code();  
   
 Return a hash containing the standard translation of nucleotide triples to proteins.  
 Methods such as L</translate> can take a translation scheme as a parameter. This method  
 returns the default translation scheme. The scheme is implemented as a reference to a  
 hash that contains nucleotide triplets as keys and has protein letters as values.  
   
 =cut  
   
 sub standard_genetic_code {  
   
     my $code = {};  
1091    
1092      $code->{"AAA"} = "K";      $code->{"AAA"} = "K";
1093      $code->{"AAC"} = "N";      $code->{"AAC"} = "N";
# Line 1047  Line 1157 
1157      return $code;      return $code;
1158  }  }
1159    
1160  =head3 translate  =head3 strand_of
1161    
1162      my $aa_seq = translate($dna_seq, $code, $fix_start);      my $plusOrMinus = strand_of($loc);
1163    
1164  Translate a DNA sequence to a protein sequence using the specified genetic code.  Return the strand (C<+> or C<->) from the specified location string.
 If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to  
 C<M>. (In the standard genetic code, these two combinations normally translate  
 to C<V> and C<L>, respectively.)  
1165    
1166  =over 4  =over 4
1167    
1168  =item dna_seq  =item loc
   
 DNA sequence to translate. Note that the DNA sequence can only contain  
 known nucleotides.  
   
 =item code  
   
 Reference to a hash specifying the translation code. The hash is keyed by  
 nucleotide triples, and the value for each key is the corresponding protein  
 letter. If this parameter is omitted, the L</standard_genetic_code> will be  
 used.  
   
 =item fix_start  
1169    
1170  TRUE if the first triple is to get special treatment, else FALSE. If TRUE,  Location string to parse (see L<SAP/Location Strings>).
 then a value of C<TTG> or C<GTG> in the first position will be translated to  
 C<M> instead of the value specified in the translation code.  
1171    
1172  =item RETURN  =item RETURN
1173    
1174  Returns a string resulting from translating each nucleotide triple into a  Returns C<+> if the location is on the forward strand, else C<->.
 protein letter.  
1175    
1176  =back  =back
1177    
1178  =cut  =cut
 #: Return Type $;  
 sub translate {  
     shift if UNIVERSAL::isa($_[0],__PACKAGE__);  
   
     my( $dna,$code,$start ) = @_;  
     my( $i,$j,$ln );  
     my( $x,$y );  
     my( $prot );  
   
     if (! defined($code)) {  
         $code = &FIG::standard_genetic_code;  
     }  
     $ln = length($dna);  
     $prot = "X" x ($ln/3);  
     $dna =~ tr/a-z/A-Z/;  
1179    
1180      for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) {  sub strand_of {
1181          $x = substr($dna,$i,3);      # Get the parameters.
1182          if ($y = $code->{$x}) {      my ($loc) = @_;
1183              substr($prot,$j,1) = $y;      # Declare the return variable.
1184          }      my $retVal;
1185      }      # Parse the strand indicator from the location.
1186        if ($loc =~ /\d+([+-])\d+/) {
1187      if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) {          $retVal = $1;
         substr($prot,0,1) = 'M';  
1188      }      }
1189      return $prot;      # Return the result.
1190        return $retVal;
1191  }  }
1192    
1193  =head3 strip_ec  =head3 strip_ec
# Line 1147  Line 1224 
1224      return $retVal;      return $retVal;
1225  }  }
1226    
1227  =head3 display_id_and_seq  =head3 translate
   
     SeedUtils::display_id_and_seq($id_and_comment, $seqP, $fh);  
   
   
1228    
1229  Display a fasta ID and sequence to the specified open file. This method is designed      my $aa_seq = translate($dna_seq, $code, $fix_start);
 to work well with L</read_fasta_sequence> and L</rev_comp>, because it takes as  
 input a string pointer rather than a string. If the file handle is omitted it  
 defaults to STDOUT.  
1230    
1231  The output is formatted into a FASTA record. The first line of the output is  Translate a DNA sequence to a protein sequence using the specified genetic code.
1232  preceded by a C<< > >> symbol, and the sequence is split into 60-character  If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to
1233  chunks displayed one per line. Thus, this method can be used to produce  C<M>. (In the standard genetic code, these two combinations normally translate
1234  FASTA files from data gathered by the rest of the system.  to C<V> and C<L>, respectively.)
1235    
1236  =over 4  =over 4
1237    
1238  =item id_and_comment  =item dna_seq
   
 The sequence ID and (optionally) the comment from the sequence's FASTA record.  
 The ID  
   
 =item seqP  
   
 Reference to a string containing the sequence. The sequence is automatically  
 formatted into 60-character chunks displayed one per line.  
   
 =item fh  
   
 Open file handle to which the ID and sequence should be output. If omitted,  
 C<\*STDOUT> is assumed.  
   
 =back  
   
 =cut  
   
 sub display_id_and_seq {  
   
     if (UNIVERSAL::isa($_[0],__PACKAGE__)) {  
         shift @_;  
         #Trace("Invalid call to display_id_and_seq.");  
     }  
   
     my( $id, $seqP, $fh ) = @_;  
   
     if (! defined($fh) )  { $fh = \*STDOUT; }  
   
     print $fh ">$id\n";  
     &display_seq($seqP, $fh);  
 }  
   
 =head3 display_seq  
   
     SeedUtils::display_seq(\$seqP, $fh);  
1239    
1240  Display a fasta sequence to the specified open file. This method is designed  DNA sequence to translate. Note that the DNA sequence can only contain
1241  to work well with L</read_fasta_sequence> and L</rev_comp>, because it takes as  known nucleotides.
 input a string pointer rather than a string. If the file handle is omitted it  
 defaults to STDOUT.  
1242    
1243  The sequence is split into 60-character chunks displayed one per line for  =item code
 readability.  
1244    
1245  =over 4  Reference to a hash specifying the translation code. The hash is keyed by
1246    nucleotide triples, and the value for each key is the corresponding protein
1247    letter. If this parameter is omitted, the L</standard_genetic_code> will be
1248    used.
1249    
1250  =item seqP  =item fix_start
1251    
1252  Reference to a string containing the sequence.  TRUE if the first triple is to get special treatment, else FALSE. If TRUE,
1253    then a value of C<TTG> or C<GTG> in the first position will be translated to
1254    C<M> instead of the value specified in the translation code.
1255    
1256  =item fh  =item RETURN
1257    
1258  Open file handle to which the sequence should be output. If omitted,  Returns a string resulting from translating each nucleotide triple into a
1259  C<STDOUT> is assumed.  protein letter.
1260    
1261  =back  =back
1262    
1263  =cut  =cut
1264    #: Return Type $;
1265  sub display_seq {  sub translate {
   
1266      shift if UNIVERSAL::isa($_[0],__PACKAGE__);      shift if UNIVERSAL::isa($_[0],__PACKAGE__);
1267    
1268      my ( $seqP, $fh ) = @_;      my( $dna,$code,$start ) = @_;
1269      my ( $i, $n, $ln );      my( $i,$j,$ln );
1270        my( $x,$y );
1271      if (! defined($fh) )  { $fh = \*STDOUT; }      my( $prot );
1272    
1273      $n = length($$seqP);      if (! defined($code)) {
1274  #   confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );          $code = &FIG::standard_genetic_code;
     for ($i=0; ($i < $n); $i += 60) {  
         if (($i + 60) <= $n) {  
             $ln = substr($$seqP,$i,60);  
         } else {  
             $ln = substr($$seqP,$i,($n-$i));  
1275          }          }
1276          print $fh "$ln\n";      $ln = length($dna);
1277        $prot = "X" x ($ln/3);
1278        $dna =~ tr/a-z/A-Z/;
1279    
1280        for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) {
1281            $x = substr($dna,$i,3);
1282            if ($y = $code->{$x}) {
1283                substr($prot,$j,1) = $y;
1284      }      }
1285  }  }
1286    
1287        if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) {
1288  =head3 roles_of_function          substr($prot,0,1) = 'M';
   
     my @roles = $fig->roles_of_function($func);  
   
 Returns a list of the functional roles implemented by the specified function. This method  
 parses the role data out of the function name, and does not require access to the database.  
   
 =over 4  
   
 =item func  
   
 Name of the function whose roles are to be parsed out.  
   
 =item RETURN  
   
 Returns a list of the roles performed by the specified function.  
   
 =back  
   
 =cut  
   
 sub roles_of_function {  
     shift if UNIVERSAL::isa($_[0],__PACKAGE__);  
     my $func = (@_ == 1) ? $_[0] : $_[1];  
   
     $func =~ s/\s*[\!\#].*$//;  
     my %roles = map { $_ => 1 } (split(/\s*;\s+|\s+[\@\/]\s+/,$func),($func =~ /\d+\.\d+\.\d+\.\d+/g),$func);  
     return sort keys(%roles);  
1289  }  }
1290        return $prot;
   
 =head3 reverse_comp  
   
     my $dnaR = FIG::reverse_comp($dna);  
   
 or  
   
     my $dnaR = $fig->reverse_comp($dna);  
   
 Return the reverse complement os the specified DNA sequence.  
   
 NOTE: for extremely long DNA strings, use L</rev_comp>, which allows you to  
 pass the strings around in the form of pointers.  
   
 =over 4  
   
 =item dna  
   
 DNA sequence whose reverse complement is desired.  
   
 =item RETURN  
   
 Returns the reverse complement of the incoming DNA sequence.  
   
 =back  
   
 =cut  
 #: Return Type $;  
 sub reverse_comp {  
     shift if UNIVERSAL::isa($_[0],__PACKAGE__);  
     my($seq) = @_;  
   
     return ${&rev_comp(\$seq)};  
1291  }  }
1292    
1293  =head3 rev_comp  =head3 verify_dir
   
     my $dnaRP = FIG::rev_comp(\$dna);  
   
 or  
1294    
1295      my $dnaRP = $fig->rev_comp(\$dna);      verify_dir($dirName);
1296    
1297  Return the reverse complement of the specified DNA sequence. The DNA sequence  Insure that the specified directory exists. If the directory does not
1298  is passed in as a string reference rather than a raw string for performance  exist, it will be created.
 reasons. If this is unnecessary, use L</reverse_comp>, which processes strings  
 instead of references to strings.  
1299    
1300  =over 4  =over 4
1301    
1302  =item dna  =item dirName
   
 Reference to the DNA sequence whose reverse complement is desired.  
   
 =item RETURN  
1303    
1304  Returns a reference to the reverse complement of the incoming DNA sequence.  Name of the relevant directory.
1305    
1306  =back  =back
1307    
1308  =cut  =cut
 #: Return Type $;  
 sub rev_comp {  
     shift if UNIVERSAL::isa($_[0],__PACKAGE__);  
     my( $seqP ) = @_;  
     my( $rev  );  
1309    
1310      $rev =  reverse( $$seqP );  sub verify_dir {
1311      $rev =~ tr/A-Z/a-z/;      # Get the parameters.
1312      $rev =~ tr/acgtumrwsykbdhv/tgcaakywsrmvhdb/;      my ($dirName) = @_;
1313      return \$rev;      # Strip off the final slash, if any.
1314        $dirName =~ s#/$##;
1315        # Only proceed if the directory does NOT already exist.
1316        if (! -d $dirName) {
1317            # If there is a parent directory, recursively insure it is there.
1318            if ($dirName =~ m#(.+)/[^/]+$#) {
1319                verify_dir($1);
1320  }  }
1321            # Create this particular directory with full permissions.
1322  sub abbrev {          mkdir $dirName, 0777;
     my($genome_name) = @_;  
   
     $genome_name =~ s/^(\S{3})\S+/$1./;  
     $genome_name =~ s/^(\S+)\s+(\S{3})\S+/$1$2./;  
     $genome_name =~ s/ //g;  
     if (length($genome_name) > 10) {  
         $genome_name = substr($genome_name,0,10);  
1323      }      }
     return $genome_name;  
1324  }  }
1325    
1326    
1327  1;  1;

Legend:
Removed from v.1.20  
changed lines
  Added in v.1.21

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3