[Bio] / FigKernelPackages / SeedUtils.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/SeedUtils.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.18 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 : parrello 1.11 # This is a SAS component.
4 :    
5 : parrello 1.1 #
6 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
7 :     # for Interpretations of Genomes. All Rights Reserved.
8 :     #
9 :     # This file is part of the SEED Toolkit.
10 :     #
11 :     # The SEED Toolkit is free software. You can redistribute
12 :     # it and/or modify it under the terms of the SEED Toolkit
13 :     # Public License.
14 :     #
15 :     # You should have received a copy of the SEED Toolkit Public License
16 :     # along with this program; if not write to the University of Chicago
17 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
18 :     # Genomes at veronika@thefig.info or download a copy from
19 :     # http://www.theseed.org/LICENSE.TXT.
20 :     #
21 :    
22 :     package SeedUtils;
23 :    
24 :     use strict;
25 : parrello 1.12 no warnings qw(once);
26 : parrello 1.1 use base qw(Exporter);
27 : parrello 1.15 our @EXPORT = qw(hypo boundaries_of parse_fasta_record create_fasta_record
28 :     rev_comp genome_of min max sims verify_dir between translate
29 : parrello 1.18 standard_genetic_code parse_location roles_in_function
30 :     strip_ec);
31 : parrello 1.1
32 :     =head1 SEED Utility Methods
33 :    
34 :     =head2 Introduction
35 :    
36 :     This is a simple utility package that performs functions useful for
37 :     bioinformatics, but that do not require access to the databases.
38 :    
39 :     =head2 Public Methods
40 :    
41 : parrello 1.13 =head3 boundaries_of
42 :    
43 :     my ($contig, $min, $max) = boundaries_of($locs);
44 :    
45 :     Return the boundaries of a set of locations. The contig, the leftmost
46 :     location, and the rightmost location will be returned to the caller. If
47 :     more than one contig is represented, the method will return an undefined
48 :     value for the contig (indicating failure).
49 :    
50 :     =over 4
51 :    
52 :     =item locs
53 :    
54 :     Reference to a list of location strings. A location string contains a contig ID,
55 :     and underscore (C<_>), a starting offset, a strand identifier (C<+> or C<->), and
56 :     a length (e.g. C<360108.3:NC_10023P_1000+2000> begins at offset 1000 of contig
57 :     B<360108.3:NC_10023P> and covers 2000 base pairs on the C<+> strand).
58 :    
59 :     =item RETURN
60 :    
61 :     Returns a 3-element list. The first element is the contig ID from all the locations,
62 :     the second is the offset of leftmost base pair represented in the locations, and the
63 :     third is the offset of the rightmost base pair represented in the locations.
64 :    
65 :     =back
66 :    
67 :     =cut
68 :    
69 :     sub boundaries_of {
70 :     # Get the parameters.
71 :     my ($locs) = @_;
72 :     # Declare the return variables.
73 :     my ($contig, $min, $max);
74 :     # We'll put all the starting and ending offsets found in here.
75 :     my @offsets;
76 :     # This will count the number of errors found.
77 :     my $error = 0;
78 :     # Loop through the locations.
79 :     for my $loc (@$locs) {
80 :     # Parse this location.
81 :     if ($loc =~ /^(.+)_(\d+)(\+|\-)(\d+)$/) {
82 :     my ($newContig, $begin, $dir, $len) = ($1, $2, $3, $4);
83 :     # Is this contig valid?
84 :     if ($contig && $newContig ne $contig) {
85 :     # No, skip this location.
86 :     $error++;
87 :     } else {
88 :     # Save the contig.
89 :     $contig = $newContig;
90 :     # Compute the ending offset.
91 :     my $end = ($dir eq '+' ? $begin + $len - 1 : $begin - $len + 1);
92 :     # Save both offsets.
93 :     push @offsets, $begin, $end;
94 :     }
95 :     } else {
96 :     # The location is invalid, so it's an error,
97 :     $error++;
98 :     }
99 :     }
100 :     # If there's an error, clear the contig ID.
101 :     if ($error) {
102 :     $contig = undef;
103 :     }
104 :     # Compute the min and max from the offsets collected.
105 :     $min = min(@offsets);
106 :     $max = max(@offsets);
107 :     # Return the results.
108 :     return ($contig, $min, $max);
109 :     }
110 :    
111 : parrello 1.17
112 :     =head3 roles_in_function
113 :    
114 :     my @roles = roles_in_function($assignment);
115 :    
116 :     Return a list of the functional roles in the specified assignment string.
117 :     A single assignment may contain multiple roles as well as comments; this
118 :     method separates them out.
119 :    
120 :     =over 4
121 :    
122 :     =item assignment
123 :    
124 :     Functional assignment to parse for roles.
125 :    
126 :     =item RETURN
127 :    
128 :     Returns a list of the individual roles in the assignment.
129 :    
130 :     =back
131 :    
132 :     =cut
133 :    
134 :     sub roles_in_function {
135 :     # Get the parameters.
136 :     my ($assignment) = @_;
137 :     # Remove any comment.
138 : parrello 1.18 my $commentFree = ($assignment =~ /(.+?)\s*#/ ? $1 : $assignment);
139 : parrello 1.17 # Split out the roles.
140 : parrello 1.18 my @retVal = split /\s+[\/@;]\s+/, $commentFree;
141 : parrello 1.17 # Return the result.
142 :     return @retVal;
143 :     }
144 :    
145 :    
146 : parrello 1.16 =head3 parse_location
147 :    
148 :     my ($contig, $begin, $end) = parse_location($locString);
149 :    
150 :     Return the contigID, start offset, and end offset for a specified
151 :     location string (see L<SAP/Location Strings>).
152 :    
153 :     =over 4
154 :    
155 :     =item locString
156 :    
157 :     Location string to parse.
158 :    
159 :     =item RETURN
160 :    
161 :     Returns a three-element list containing the contig ID from the location
162 :     string, the starting offset of the location, and the ending offset. If
163 :     the location string is not valid, the values returned will be C<undef>.
164 :    
165 :     =back
166 :    
167 :     =cut
168 :    
169 :     sub parse_location {
170 :     # Get the parameters.
171 :     my ($locString) = @_;
172 :     # Declare the return variables.
173 :     my ($contig, $begin, $end);
174 :     # Parse the location string.
175 :     if ($locString =~ /^(.+)_(\d+)([+-])(\d+)$/) {
176 :     # Pull out the contig ID and the begin location.
177 :     $contig = $1;
178 :     $begin = $2;
179 :     # Compute the ending location from the direction and length.
180 :     if ($3 eq '+') {
181 :     $end = $begin + $4 - 1;
182 :     } else {
183 :     $end = $begin - $4 + 1;
184 :     }
185 :     }
186 :     # Return the results.
187 :     return ($contig, $begin, $end);
188 :     }
189 :    
190 :    
191 : parrello 1.5 =head3 max
192 :    
193 :     my $max = max(@nums);
194 :    
195 :     Return the maximum number from all the values in the specified list.
196 :    
197 :     =over 4
198 :    
199 :     =item nums
200 :    
201 :     List of numbers to examine.
202 :    
203 :     =item RETURN
204 :    
205 :     Returns the maximum numeric value from the specified parameters, or
206 :     an undefined value if an empty list is passed in.
207 :    
208 :     =back
209 :    
210 :     =cut
211 :    
212 :     sub max {
213 :     my ($retVal, @nums) = @_;
214 :     for my $num (@nums) {
215 :     if ($num > $retVal) {
216 :     $retVal = $num;
217 :     }
218 :     }
219 :     return $retVal;
220 :     }
221 :    
222 :     =head3 min
223 :    
224 :     my $min = min(@nums);
225 :    
226 :     Return the minimum number from all the values in the specified list.
227 :    
228 :     =over 4
229 :    
230 :     =item nums
231 :    
232 :     List of numbers to examine.
233 :    
234 :     =item RETURN
235 :    
236 :     Returns the minimum numeric value from the specified parameters, or
237 :     an undefined value if an empty list is passed in.
238 :    
239 :     =back
240 :    
241 :     =cut
242 :    
243 :     sub min {
244 :     my ($retVal, @nums) = @_;
245 :     for my $num (@nums) {
246 :     if ($num < $retVal) {
247 :     $retVal = $num;
248 :     }
249 :     }
250 :     return $retVal;
251 :     }
252 :    
253 : parrello 1.1 =head3 create_fasta_record
254 :    
255 : parrello 1.7 my $fastaString = create_fasta_record($id, $comment, $sequence, $stripped);
256 : parrello 1.1
257 :     Create a FASTA record from the specified DNA or protein sequence. The
258 :     sequence will be split into 60-character lines, and the record will
259 : parrello 1.4 include an identifier line.
260 : parrello 1.1
261 :     =over 4
262 :    
263 :     =item id
264 :    
265 :     ID for the sequence, to be placed at the beginning of the identifier
266 :     line.
267 :    
268 :     =item comment (optional)
269 :    
270 :     Comment text to place after the ID on the identifier line. If this parameter
271 :     is empty, undefined, or 0, no comment will be placed.
272 :    
273 :     =item sequence
274 :    
275 :     Sequence of letters to form into FASTA. For purposes of convenience, whitespace
276 :     characters in the sequence will be removed automatically.
277 :    
278 : parrello 1.7 =item stripped (optional)
279 :    
280 :     If TRUE, then the sequence will be returned unmodified instead of converted
281 :     to FASTA format. The default is FALSE.
282 :    
283 : parrello 1.1 =item RETURN
284 :    
285 :     Returns the desired sequence in FASTA format.
286 :    
287 :     =back
288 :    
289 :     =cut
290 :    
291 :     sub create_fasta_record {
292 :     # Get the parameters.
293 : parrello 1.7 my ($id, $comment, $sequence, $stripped) = @_;
294 :     # Declare the return variable.
295 :     my $retVal;
296 :     # If we're in stripped mode, we just return the sequence.
297 :     if ($stripped) {
298 :     $retVal = $sequence;
299 :     } else {
300 :     # Here we have to do the FASTA conversion. Start with the ID.
301 :     my $header = ">$id";
302 :     # Add a comment, if any.
303 :     if ($comment) {
304 :     $header .= " $comment";
305 :     }
306 :     # Clean up the sequence.
307 :     $sequence =~ s/\s+//g;
308 :     # We need to format the sequence into 60-byte chunks. We use the infamous
309 :     # grep-split trick. The split, because of the presence of the parentheses,
310 :     # includes the matched delimiters in the output list. The grep strips out
311 :     # the empty list items that appear between the so-called delimiters, since
312 :     # the delimiters are what we want.
313 :     my @chunks = grep { $_ } split /(.{1,60})/, $sequence;
314 :     # Add the chunks and the trailer.
315 : parrello 1.10 $retVal = join("\n", $header, @chunks) . "\n";
316 : parrello 1.1 }
317 :     # Return the result.
318 :     return $retVal;
319 :     }
320 :    
321 : parrello 1.2 =head3 rev_comp
322 :    
323 :     my $revcmp = rev_comp($dna);
324 :    
325 :     or
326 :    
327 :     rev_comp(\$dna);
328 :    
329 :     Return the reverse complement of a DNA string.
330 :    
331 :     =over 4
332 :    
333 :     =item dna
334 :    
335 :     Either a DNA string, or a reference to a DNA string.
336 :    
337 :     =item RETURN
338 :    
339 :     If the input is a DNA string, returns the reverse complement. If the
340 :     input is a reference to a DNA string, the string itself is reverse
341 :     complemented.
342 :    
343 :     =back
344 :    
345 :     =cut
346 :    
347 :     sub rev_comp {
348 :     # Get the parameters.
349 :     my ($dna) = @_;
350 :     # Determine how we were called.
351 :     my ($retVal, $refMode);
352 :     if (ref $dna eq 'SCALAR') {
353 :     $retVal = lc reverse $dna;
354 :     $refMode = 0;
355 :     } else {
356 :     $retVal = lc reverse $$dna;
357 :     $refMode = 1;
358 :     }
359 :     # Now $retVal contains the reversed DNA string in all lower case, and
360 :     # $refMode is TRUE iff the user passed in a reference. The following
361 :     # translation step complements the string.
362 :     $retVal =~ tr/acgtumrwsykbdhv/tgcaakywsrmvhdb/;
363 :     # Return the result in the method corresponding to the way it came in.
364 :     if ($refMode) {
365 :     $$dna = $retVal;
366 :     return;
367 :     } else {
368 :     return $retVal;
369 :     }
370 :     }
371 :    
372 : parrello 1.15
373 :     =head3 by_fig_id
374 :    
375 :     my @sorted_by_fig_id = sort { FIG::by_fig_id($a,$b) } @fig_ids;
376 :    
377 :     Compare two feature IDs.
378 :    
379 :     This function is designed to assist in sorting features by ID. The sort is by
380 :     genome ID followed by feature type and then feature number.
381 :    
382 :     =over 4
383 :    
384 :     =item a
385 :    
386 :     First feature ID.
387 :    
388 :     =item b
389 :    
390 :     Second feature ID.
391 :    
392 :     =item RETURN
393 :    
394 :     Returns a negative number if the first parameter is smaller, zero if both parameters
395 :     are equal, and a positive number if the first parameter is greater.
396 :    
397 :     =back
398 :    
399 :     =cut
400 :    
401 :     sub by_fig_id {
402 :     my($a,$b) = @_;
403 :     my($g1,$g2,$t1,$t2,$n1,$n2);
404 :     if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
405 :     ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {
406 :     ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
407 :     } else {
408 :     $a cmp $b;
409 :     }
410 :     }
411 :    
412 :    
413 : parrello 1.3 =head3 genome_of
414 :    
415 :     my $genomeID = genome_of($fid);
416 :    
417 :     Return the Genome ID embedded in the specified FIG feature ID.
418 :    
419 :     =over 4
420 :    
421 :     =item fid
422 :    
423 :     Feature ID of interest.
424 :    
425 :     =item RETURN
426 :    
427 :     Returns the genome ID in the middle portion of the FIG feature ID. If the
428 :     feature ID is invalid, this method returns an undefined value.
429 :    
430 :     =back
431 :    
432 :     =cut
433 :    
434 :     sub genome_of {
435 :     # Get the parameters.
436 :     my ($fid) = @_;
437 :     # Declare the return variable.
438 :     my $retVal;
439 :     # Parse the feature ID.
440 :     if ($fid =~ /^fig\|(\d+\.\d+)\./) {
441 :     $retVal = $1;
442 :     }
443 :     # Return the result.
444 :     return $retVal;
445 :     }
446 :    
447 : parrello 1.6 =head3 sims
448 :    
449 :     my $sims = sims($id, \%seen, $maxN, $maxP, $select, $max_expand, $filters);
450 :    
451 :     Retrieve similarities from the network similarity server. The similarity retrieval
452 :     is performed using an HTTP user agent that returns similarity data in multiple
453 :     chunks. An anonymous subroutine is passed to the user agent that parses and
454 :     reformats the chunks as they come in. The similarites themselves are returned
455 :     as B<Sim> objects. Sim objects are actually list references with 15 elements.
456 :     The Sim object methods allow access to the elements by name.
457 :    
458 :     Similarities can be either raw or expanded. The raw similarities are basic
459 :     hits between features with similar DNA. Expanding a raw similarity drags in any
460 :     features considered substantially identical. So, for example, if features B<A1>,
461 :     B<A2>, and B<A3> are all substatially identical to B<A>, then a raw similarity
462 :     B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>.
463 :    
464 :     =over 4
465 :    
466 :     =item id
467 :    
468 :     ID of the feature whose similarities are desired, or reference to a list
469 :     of the IDs of the features whose similarities are desired.
470 :    
471 :     =item maxN
472 :    
473 :     Maximum number of similarities to return.
474 :    
475 :     =item maxP
476 :    
477 :     The maximum allowable similarity score.
478 :    
479 :     =item select
480 :    
481 :     Selection criterion: C<raw> means only raw similarities are returned; C<fig>
482 :     means only similarities to FIG features are returned; C<all> means all expanded
483 :     similarities are returned; and C<figx> means similarities are expanded until the
484 :     number of FIG features equals the maximum.
485 :    
486 :     =item max_expand
487 :    
488 :     The maximum number of features to expand.
489 :    
490 :     =item filters
491 :    
492 :     Reference to a hash containing filter information, or a subroutine that can be
493 :     used to filter the sims.
494 :    
495 :     =item RETURN
496 :    
497 :     Returns a list of L<Sim> objects.
498 :    
499 :     =back
500 :    
501 :     =cut
502 :    
503 :     sub sims {
504 :     # Get the parameters.
505 :     my($id, $maxN, $maxP, $select, $max_expand, $filters) = @_;
506 :     # Get the URL for submitting to the sims server.
507 :     my $url = $FIG_Config::sim_server_url || "http://bioseed.mcs.anl.gov/simserver/perl/sims.pl";
508 :     # Get a list of the IDs to process.
509 :     my @ids;
510 :     if (ref($id) eq "ARRAY") {
511 :     @ids = @$id;
512 :     } else {
513 :     @ids = ($id);
514 :     }
515 :     # Form a list of the parameters to pass to the server.
516 :     my %args = ();
517 :     $args{id} = \@ids;
518 :     $args{maxN} = $maxN if defined($maxN);
519 :     $args{maxP} = $maxP if defined($maxP);
520 :     $args{select} = $select if defined($select);
521 :     $args{max_expand} = $max_expand if defined($max_expand);
522 :     # If the filter is a hash, put the filters in the argument list.
523 :     if (ref($filters) eq 'HASH') {
524 :     for my $k (keys(%$filters))
525 :     {
526 :     $args{"filter_$k"}= $filters->{$k};
527 :     }
528 :     }
529 :     # Get the user agent.
530 :     require LWP::UserAgent;
531 :     my $ua = LWP::UserAgent->new();
532 :     # Insure we have the Sim module.
533 :     require Sim;
534 :     #
535 :     # Our next task is to create the anonymous subroutine that will process the
536 :     # chunks that come back from the server. We require three global variables:
537 :     # @sims to hold the similarities found, $tail to remember the unprocessed
538 :     # data from the previous chunk, and $chunks to count the chunks.
539 :     #
540 :     my @retVal;
541 :     my $tail;
542 :     my $chunks = 0;
543 :     #
544 :     # ANONYMOUS SUBROUTINE
545 :     #
546 :     my $cb = sub {
547 :     eval {
548 :     # Get the parameters.
549 :     my ($data, $command) = @_;
550 :     # Check for a reset command. If we get one, we discard any data
551 :     # in progress.
552 :     if ($command && $command eq 'reset') {
553 :     $tail = '';
554 :     } else {
555 :     $chunks++;
556 :     # Get the data to process. Note we concatenate it to the incoming
557 :     # tail from last time.
558 :     my $c = $tail . $data;
559 :     # Make sure the caller hasn't messed up the new-line character.
560 :     # FASTA readers in particular are notorious for doing things
561 :     # like that.
562 :     local $/ = "\n";
563 :     # Split the input into lines.
564 :     my @lines = split(/\n/, $c);
565 :     # If the input does not end with a new-line, we have a partial
566 :     # chunk and need to put it in the tail for next time. If not,
567 :     # there is no tail for next time.
568 :     if (substr($c, -1, 1) ne "\n") {
569 :     $tail = pop @lines;
570 :     } else {
571 :     $tail = '';
572 :     }
573 :     # Loop through the lines. Note there's no need to chomp because
574 :     # the SPLIT took out the new-line characters.
575 :     for my $l (@lines) {
576 :     # Split the line into fields.
577 :     my @s = split(/\t/, $l);
578 :     # Insure we have all the fields we need.
579 : parrello 1.12 if (@s >= 9) {
580 : parrello 1.6 # Check to see if we've seen this SIM before.
581 :     my $id1 = $s[0];
582 :     my $id2 = $s[1];
583 :     # Add it to the result list.
584 :     push(@retVal, bless \@s, 'Sim');
585 :     }
586 :     }
587 :     }
588 :     };
589 :     };
590 :     #
591 :     # END OF ANONYMOUS SUBROUTINE
592 :     #
593 :     # Now we're ready to start. Because networking is an iffy thing, we set up
594 :     # to try our request multiple times.
595 :     my $n_retries = 10;
596 :     my $attempts = 0;
597 :     # Set the timeout value, in seconds.
598 :     $ua->timeout(180);
599 :     # Loop until we succeed or run out of retries.
600 :     my $done = 0;
601 :     while (! $done && $attempts++ < $n_retries) {
602 :     # Reset the content processor. This clears the tail.
603 :     &$cb(undef, 'reset');
604 :     my $resp = $ua->post($url, \%args, ':content_cb' => $cb);
605 :     if ($resp->is_success) {
606 :     # If the response was successful, get the content. This triggers
607 :     # the anonymous subroutine.
608 :     my $x = $resp->content;
609 :     # Denote we've been successful.
610 :     $done = 1;
611 :     }
612 :     }
613 :     return @retVal;
614 :     }
615 :    
616 : parrello 1.8 =head3 verify_dir
617 :    
618 :     verify_dir($dirName);
619 :    
620 :     Insure that the specified directory exists. If the directory does not
621 :     exist, it will be created.
622 :    
623 :     =over 4
624 :    
625 :     =item dirName
626 :    
627 :     Name of the relevant directory.
628 :    
629 :     =back
630 :    
631 :     =cut
632 :    
633 :     sub verify_dir {
634 :     # Get the parameters.
635 :     my ($dirName) = @_;
636 :     # Strip off the final slash, if any.
637 :     $dirName =~ s#/$##;
638 :     # Only proceed if the directory does NOT already exist.
639 :     if (! -d $dirName) {
640 :     # If there is a parent directory, recursively insure it is there.
641 :     if ($dirName =~ m#(.+)/[^/]+$#) {
642 :     verify_dir($1);
643 :     }
644 :     # Create this particular directory with full permissions.
645 :     mkdir $dirName, 0777;
646 :     }
647 :     }
648 :    
649 : parrello 1.9 =head3 parse_fasta_record
650 :    
651 :     my ($id, $comment, $seq) = parse_fasta_record($string);
652 :    
653 :     Extract the ID, comment, and sequence from a single FASTA record. For
654 :     backward compatability, instead of a FASTA record the ID and sequence can
655 :     be specified separated by a comma. In this case, the returned comment
656 :     will be empty.
657 :    
658 :     =over 4
659 :    
660 :     =item string
661 :    
662 :     A single FASTA record, or an ID and sequence separated by a single comma,
663 :     an unadorned sequence, a 2-element list consisting of an ID and a sequence,
664 :     or a 3-element list consisting of an ID, a comment, and a sequence.
665 :    
666 :     =item RETURN
667 :    
668 :     Returns a three-element list consisting of the incoming ID, the associated
669 :     comment, and the specified DNA or protein sequence. If the incoming string is
670 :     invalid, all three list elements will come back undefined. If no ID is
671 :     specified, an MD5 will be provided.
672 :    
673 :     =back
674 :    
675 :     =cut
676 :    
677 :     sub parse_fasta_record {
678 :     # Get the parameters.
679 :     my ($string) = @_;
680 :     # Declare the return variables.
681 :     my ($id, $comment, $seq);
682 :     # Check the type of input string.
683 :     if (! defined $string) {
684 :     # Do nothing if no string was passed in. This extra check prevents a
685 :     # warning at runtime.
686 :     } elsif ($string =~ /^>(\S+)([\t ]+[^\r\n]*)?[\r\n]+(.+)/s) {
687 :     # Here we have a standard FASTA string.
688 :     ($id, $comment, $seq) = ($1, $2, $3);
689 :     # Remove white space from the sequence string.
690 :     $seq =~ s/\s+//sg;
691 :     # Trim front of comment.
692 :     $comment =~ s/^s+//;
693 :     } elsif ($string =~ /(.+?)\s*,\s*(.+)/) {
694 :     ($id, $comment, $seq) = ($1, '', $2);
695 :     } elsif (ref $string eq 'ARRAY') {
696 :     # Here the data came in pre-formatted as a list reference.
697 :     ($id, $comment, $seq) = @$string;
698 :     # If there's no comment, we need to adjust.
699 :     if (! defined $seq) {
700 :     $seq = $comment;
701 :     $comment = '';
702 :     }
703 :     } else {
704 :     # Here we have only a sequence. We need to construct the ID.
705 :     $seq = $string;
706 : parrello 1.12 require Digest::MD5;
707 :     $id = "md5|" . Digest::MD5::md5_base64($seq);
708 : parrello 1.9 $comment = "";
709 :     }
710 :     # Return the results.
711 :     return ($id, $comment, $seq);
712 :     }
713 :    
714 :    
715 : parrello 1.11 =head3 hypo
716 :    
717 :     my $flag = hypo($func);
718 :    
719 :     Return TRUE if the specified functional role is hypothetical, else FALSE.
720 :     Hypothetical functional roles are identified by key words in the text,
721 :     such as I<hypothesis>, I<predicted>, or I<glimmer> (among others).
722 :    
723 :     =over 4
724 :    
725 :     =item func
726 :    
727 :     Text of the functional role whose nature is to be determined.
728 :    
729 :     =item RETURN
730 :    
731 :     Returns TRUE if the role is hypothetical, else FALSE.
732 :    
733 :     =back
734 :    
735 :     =cut
736 :    
737 :     sub hypo {
738 :     my ($func) = @_;
739 :     if (! $func) { return 1 }
740 :     if ($func =~ /lmo\d+ protein/i) { return 1 }
741 :     if ($func =~ /hypoth/i) { return 1 }
742 :     if ($func =~ /conserved protein/i) { return 1 }
743 :     if ($func =~ /gene product/i) { return 1 }
744 :     if ($func =~ /interpro/i) { return 1 }
745 :     if ($func =~ /B[sl][lr]\d/i) { return 1 }
746 :     if ($func =~ /^U\d/) { return 1 }
747 :     if ($func =~ /^orf[^_]/i) { return 1 }
748 :     if ($func =~ /uncharacterized/i) { return 1 }
749 :     if ($func =~ /pseudogene/i) { return 1 }
750 :     if ($func =~ /^predicted/i) { return 1 }
751 :     if ($func =~ /AGR_/) { return 1 }
752 :     if ($func =~ /similar to/i) { return 1 }
753 :     if ($func =~ /similarity/i) { return 1 }
754 :     if ($func =~ /glimmer/i) { return 1 }
755 :     if ($func =~ /unknown/i) { return 1 }
756 :     if (($func =~ /domain/i) ||
757 :     ($func =~ /^y[a-z]{2,4}\b/i) ||
758 :     ($func =~ /complete/i) ||
759 :     ($func =~ /ensang/i) ||
760 :     ($func =~ /unnamed/i) ||
761 :     ($func =~ /EG:/) ||
762 :     ($func =~ /orf\d+/i) ||
763 :     ($func =~ /RIKEN/) ||
764 :     ($func =~ /Expressed/i) ||
765 :     ($func =~ /[a-zA-Z]{2,3}\|/) ||
766 :     ($func =~ /predicted by Psort/) ||
767 :     ($func =~ /^bh\d+/i) ||
768 :     ($func =~ /cds_/i) ||
769 :     ($func =~ /^[a-z]{2,3}\d+[^:\+\-0-9]/i) ||
770 :     ($func =~ /similar to/i) ||
771 :     ($func =~ / identi/i) ||
772 :     ($func =~ /ortholog of/i) ||
773 :     ($func =~ /structural feature/i)) { return 1 }
774 :     return 0;
775 :    
776 :     }
777 : parrello 1.9
778 : overbeek 1.14 =head3 between
779 :    
780 : parrello 1.15 my $flag = between($x, $y, $z);
781 : overbeek 1.14
782 :     Determine whether or not $y is between $x and $z.
783 :    
784 :     =over 4
785 :    
786 :     =item x
787 :    
788 :     First edge number.
789 :    
790 :     =item y
791 :    
792 :     Number to examine.
793 :    
794 :     =item z
795 :    
796 :     Second edge number.
797 :    
798 :     =item RETURN
799 :    
800 :     Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check
801 :     is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns
802 :     TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than
803 :     I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower,
804 :     then the return is TRUE if I<$x> >= I$<$y> >= I<$z>.
805 :    
806 :     =back
807 :    
808 :     =cut
809 :     #: Return Type $;
810 :     sub between {
811 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
812 :     my($x,$y,$z) = @_;
813 :    
814 :     if ($x < $z) {
815 :     return (($x <= $y) && ($y <= $z));
816 :     } else {
817 :     return (($x >= $y) && ($y >= $z));
818 :     }
819 :     }
820 : parrello 1.1
821 : parrello 1.15 =head3 standard_genetic_code
822 :    
823 :     my $code = standard_genetic_code();
824 :    
825 :     Return a hash containing the standard translation of nucleotide triples to proteins.
826 :     Methods such as L</translate> can take a translation scheme as a parameter. This method
827 :     returns the default translation scheme. The scheme is implemented as a reference to a
828 :     hash that contains nucleotide triplets as keys and has protein letters as values.
829 :    
830 :     =cut
831 :    
832 :     sub standard_genetic_code {
833 :    
834 :     my $code = {};
835 :    
836 :     $code->{"AAA"} = "K";
837 :     $code->{"AAC"} = "N";
838 :     $code->{"AAG"} = "K";
839 :     $code->{"AAT"} = "N";
840 :     $code->{"ACA"} = "T";
841 :     $code->{"ACC"} = "T";
842 :     $code->{"ACG"} = "T";
843 :     $code->{"ACT"} = "T";
844 :     $code->{"AGA"} = "R";
845 :     $code->{"AGC"} = "S";
846 :     $code->{"AGG"} = "R";
847 :     $code->{"AGT"} = "S";
848 :     $code->{"ATA"} = "I";
849 :     $code->{"ATC"} = "I";
850 :     $code->{"ATG"} = "M";
851 :     $code->{"ATT"} = "I";
852 :     $code->{"CAA"} = "Q";
853 :     $code->{"CAC"} = "H";
854 :     $code->{"CAG"} = "Q";
855 :     $code->{"CAT"} = "H";
856 :     $code->{"CCA"} = "P";
857 :     $code->{"CCC"} = "P";
858 :     $code->{"CCG"} = "P";
859 :     $code->{"CCT"} = "P";
860 :     $code->{"CGA"} = "R";
861 :     $code->{"CGC"} = "R";
862 :     $code->{"CGG"} = "R";
863 :     $code->{"CGT"} = "R";
864 :     $code->{"CTA"} = "L";
865 :     $code->{"CTC"} = "L";
866 :     $code->{"CTG"} = "L";
867 :     $code->{"CTT"} = "L";
868 :     $code->{"GAA"} = "E";
869 :     $code->{"GAC"} = "D";
870 :     $code->{"GAG"} = "E";
871 :     $code->{"GAT"} = "D";
872 :     $code->{"GCA"} = "A";
873 :     $code->{"GCC"} = "A";
874 :     $code->{"GCG"} = "A";
875 :     $code->{"GCT"} = "A";
876 :     $code->{"GGA"} = "G";
877 :     $code->{"GGC"} = "G";
878 :     $code->{"GGG"} = "G";
879 :     $code->{"GGT"} = "G";
880 :     $code->{"GTA"} = "V";
881 :     $code->{"GTC"} = "V";
882 :     $code->{"GTG"} = "V";
883 :     $code->{"GTT"} = "V";
884 :     $code->{"TAA"} = "*";
885 :     $code->{"TAC"} = "Y";
886 :     $code->{"TAG"} = "*";
887 :     $code->{"TAT"} = "Y";
888 :     $code->{"TCA"} = "S";
889 :     $code->{"TCC"} = "S";
890 :     $code->{"TCG"} = "S";
891 :     $code->{"TCT"} = "S";
892 :     $code->{"TGA"} = "*";
893 :     $code->{"TGC"} = "C";
894 :     $code->{"TGG"} = "W";
895 :     $code->{"TGT"} = "C";
896 :     $code->{"TTA"} = "L";
897 :     $code->{"TTC"} = "F";
898 :     $code->{"TTG"} = "L";
899 :     $code->{"TTT"} = "F";
900 :    
901 :     return $code;
902 :     }
903 :    
904 :     =head3 translate
905 :    
906 :     my $aa_seq = translate($dna_seq, $code, $fix_start);
907 :    
908 :     Translate a DNA sequence to a protein sequence using the specified genetic code.
909 :     If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to
910 :     C<M>. (In the standard genetic code, these two combinations normally translate
911 :     to C<V> and C<L>, respectively.)
912 :    
913 :     =over 4
914 :    
915 :     =item dna_seq
916 :    
917 :     DNA sequence to translate. Note that the DNA sequence can only contain
918 :     known nucleotides.
919 :    
920 :     =item code
921 :    
922 :     Reference to a hash specifying the translation code. The hash is keyed by
923 :     nucleotide triples, and the value for each key is the corresponding protein
924 :     letter. If this parameter is omitted, the L</standard_genetic_code> will be
925 :     used.
926 :    
927 :     =item fix_start
928 :    
929 :     TRUE if the first triple is to get special treatment, else FALSE. If TRUE,
930 :     then a value of C<TTG> or C<GTG> in the first position will be translated to
931 :     C<M> instead of the value specified in the translation code.
932 :    
933 :     =item RETURN
934 :    
935 :     Returns a string resulting from translating each nucleotide triple into a
936 :     protein letter.
937 :    
938 :     =back
939 :    
940 :     =cut
941 :     #: Return Type $;
942 :     sub translate {
943 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
944 :    
945 :     my( $dna,$code,$start ) = @_;
946 :     my( $i,$j,$ln );
947 :     my( $x,$y );
948 :     my( $prot );
949 :    
950 :     if (! defined($code)) {
951 :     $code = &FIG::standard_genetic_code;
952 :     }
953 :     $ln = length($dna);
954 :     $prot = "X" x ($ln/3);
955 :     $dna =~ tr/a-z/A-Z/;
956 :    
957 :     for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) {
958 :     $x = substr($dna,$i,3);
959 :     if ($y = $code->{$x}) {
960 :     substr($prot,$j,1) = $y;
961 :     }
962 :     }
963 :    
964 :     if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) {
965 :     substr($prot,0,1) = 'M';
966 :     }
967 :     return $prot;
968 :     }
969 :    
970 : parrello 1.18 =head3 strip_ec
971 :    
972 :     my $role = strip_ec($rawRole);
973 :    
974 :     Strip the EC number (if any) from the specified role or functional
975 :     assignment.
976 :    
977 :     =over 4
978 :    
979 :     =item rawRole
980 :    
981 :     Role or functional assignment from which the EC numbers are to be stripped.
982 :    
983 :     =item RETURN
984 :    
985 :     Returns the incoming string with any EC numbers removed. The EC numbers must
986 :     be formatted in the standard format used by the SEED (with the C<EC> prefix
987 :     and surrounding parentheses).
988 :    
989 :     =back
990 :    
991 :     =cut
992 :    
993 :     sub strip_ec {
994 :     # Get the parameters.
995 :     my ($rawRole) = @_;
996 :     # Declare the return variable.
997 :     my $retVal = $rawRole;
998 :     # Remove the EC numbers.
999 :     $retVal =~ s/\s*\(EC\s+[0-9.\-]+\)//g;
1000 :     # Return the result.
1001 :     return $retVal;
1002 :     }
1003 :    
1004 : parrello 1.15
1005 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3