[Bio] / FigKernelPackages / SeedUtils.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/SeedUtils.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.17 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 : parrello 1.11 # This is a SAS component.
4 :    
5 : parrello 1.1 #
6 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
7 :     # for Interpretations of Genomes. All Rights Reserved.
8 :     #
9 :     # This file is part of the SEED Toolkit.
10 :     #
11 :     # The SEED Toolkit is free software. You can redistribute
12 :     # it and/or modify it under the terms of the SEED Toolkit
13 :     # Public License.
14 :     #
15 :     # You should have received a copy of the SEED Toolkit Public License
16 :     # along with this program; if not write to the University of Chicago
17 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
18 :     # Genomes at veronika@thefig.info or download a copy from
19 :     # http://www.theseed.org/LICENSE.TXT.
20 :     #
21 :    
22 :     package SeedUtils;
23 :    
24 :     use strict;
25 : parrello 1.12 no warnings qw(once);
26 : parrello 1.1 use base qw(Exporter);
27 : parrello 1.15 our @EXPORT = qw(hypo boundaries_of parse_fasta_record create_fasta_record
28 :     rev_comp genome_of min max sims verify_dir between translate
29 : parrello 1.17 standard_genetic_code parse_location roles_in_function);
30 : parrello 1.1
31 :     =head1 SEED Utility Methods
32 :    
33 :     =head2 Introduction
34 :    
35 :     This is a simple utility package that performs functions useful for
36 :     bioinformatics, but that do not require access to the databases.
37 :    
38 :     =head2 Public Methods
39 :    
40 : parrello 1.13 =head3 boundaries_of
41 :    
42 :     my ($contig, $min, $max) = boundaries_of($locs);
43 :    
44 :     Return the boundaries of a set of locations. The contig, the leftmost
45 :     location, and the rightmost location will be returned to the caller. If
46 :     more than one contig is represented, the method will return an undefined
47 :     value for the contig (indicating failure).
48 :    
49 :     =over 4
50 :    
51 :     =item locs
52 :    
53 :     Reference to a list of location strings. A location string contains a contig ID,
54 :     and underscore (C<_>), a starting offset, a strand identifier (C<+> or C<->), and
55 :     a length (e.g. C<360108.3:NC_10023P_1000+2000> begins at offset 1000 of contig
56 :     B<360108.3:NC_10023P> and covers 2000 base pairs on the C<+> strand).
57 :    
58 :     =item RETURN
59 :    
60 :     Returns a 3-element list. The first element is the contig ID from all the locations,
61 :     the second is the offset of leftmost base pair represented in the locations, and the
62 :     third is the offset of the rightmost base pair represented in the locations.
63 :    
64 :     =back
65 :    
66 :     =cut
67 :    
68 :     sub boundaries_of {
69 :     # Get the parameters.
70 :     my ($locs) = @_;
71 :     # Declare the return variables.
72 :     my ($contig, $min, $max);
73 :     # We'll put all the starting and ending offsets found in here.
74 :     my @offsets;
75 :     # This will count the number of errors found.
76 :     my $error = 0;
77 :     # Loop through the locations.
78 :     for my $loc (@$locs) {
79 :     # Parse this location.
80 :     if ($loc =~ /^(.+)_(\d+)(\+|\-)(\d+)$/) {
81 :     my ($newContig, $begin, $dir, $len) = ($1, $2, $3, $4);
82 :     # Is this contig valid?
83 :     if ($contig && $newContig ne $contig) {
84 :     # No, skip this location.
85 :     $error++;
86 :     } else {
87 :     # Save the contig.
88 :     $contig = $newContig;
89 :     # Compute the ending offset.
90 :     my $end = ($dir eq '+' ? $begin + $len - 1 : $begin - $len + 1);
91 :     # Save both offsets.
92 :     push @offsets, $begin, $end;
93 :     }
94 :     } else {
95 :     # The location is invalid, so it's an error,
96 :     $error++;
97 :     }
98 :     }
99 :     # If there's an error, clear the contig ID.
100 :     if ($error) {
101 :     $contig = undef;
102 :     }
103 :     # Compute the min and max from the offsets collected.
104 :     $min = min(@offsets);
105 :     $max = max(@offsets);
106 :     # Return the results.
107 :     return ($contig, $min, $max);
108 :     }
109 :    
110 : parrello 1.17
111 :     =head3 roles_in_function
112 :    
113 :     my @roles = roles_in_function($assignment);
114 :    
115 :     Return a list of the functional roles in the specified assignment string.
116 :     A single assignment may contain multiple roles as well as comments; this
117 :     method separates them out.
118 :    
119 :     =over 4
120 :    
121 :     =item assignment
122 :    
123 :     Functional assignment to parse for roles.
124 :    
125 :     =item RETURN
126 :    
127 :     Returns a list of the individual roles in the assignment.
128 :    
129 :     =back
130 :    
131 :     =cut
132 :    
133 :     sub roles_in_function {
134 :     # Get the parameters.
135 :     my ($assignment) = @_;
136 :     # Remove any comment.
137 :     my $commentFree = ($assignment =~ /(.+)\s*#/ ? $1 : $assignment);
138 :     # Split out the roles.
139 :     my @retVal = split /\s*[\/@;]\s*/, $commentFree;
140 :     # Return the result.
141 :     return @retVal;
142 :     }
143 :    
144 :    
145 : parrello 1.16 =head3 parse_location
146 :    
147 :     my ($contig, $begin, $end) = parse_location($locString);
148 :    
149 :     Return the contigID, start offset, and end offset for a specified
150 :     location string (see L<SAP/Location Strings>).
151 :    
152 :     =over 4
153 :    
154 :     =item locString
155 :    
156 :     Location string to parse.
157 :    
158 :     =item RETURN
159 :    
160 :     Returns a three-element list containing the contig ID from the location
161 :     string, the starting offset of the location, and the ending offset. If
162 :     the location string is not valid, the values returned will be C<undef>.
163 :    
164 :     =back
165 :    
166 :     =cut
167 :    
168 :     sub parse_location {
169 :     # Get the parameters.
170 :     my ($locString) = @_;
171 :     # Declare the return variables.
172 :     my ($contig, $begin, $end);
173 :     # Parse the location string.
174 :     if ($locString =~ /^(.+)_(\d+)([+-])(\d+)$/) {
175 :     # Pull out the contig ID and the begin location.
176 :     $contig = $1;
177 :     $begin = $2;
178 :     # Compute the ending location from the direction and length.
179 :     if ($3 eq '+') {
180 :     $end = $begin + $4 - 1;
181 :     } else {
182 :     $end = $begin - $4 + 1;
183 :     }
184 :     }
185 :     # Return the results.
186 :     return ($contig, $begin, $end);
187 :     }
188 :    
189 :    
190 : parrello 1.5 =head3 max
191 :    
192 :     my $max = max(@nums);
193 :    
194 :     Return the maximum number from all the values in the specified list.
195 :    
196 :     =over 4
197 :    
198 :     =item nums
199 :    
200 :     List of numbers to examine.
201 :    
202 :     =item RETURN
203 :    
204 :     Returns the maximum numeric value from the specified parameters, or
205 :     an undefined value if an empty list is passed in.
206 :    
207 :     =back
208 :    
209 :     =cut
210 :    
211 :     sub max {
212 :     my ($retVal, @nums) = @_;
213 :     for my $num (@nums) {
214 :     if ($num > $retVal) {
215 :     $retVal = $num;
216 :     }
217 :     }
218 :     return $retVal;
219 :     }
220 :    
221 :     =head3 min
222 :    
223 :     my $min = min(@nums);
224 :    
225 :     Return the minimum number from all the values in the specified list.
226 :    
227 :     =over 4
228 :    
229 :     =item nums
230 :    
231 :     List of numbers to examine.
232 :    
233 :     =item RETURN
234 :    
235 :     Returns the minimum numeric value from the specified parameters, or
236 :     an undefined value if an empty list is passed in.
237 :    
238 :     =back
239 :    
240 :     =cut
241 :    
242 :     sub min {
243 :     my ($retVal, @nums) = @_;
244 :     for my $num (@nums) {
245 :     if ($num < $retVal) {
246 :     $retVal = $num;
247 :     }
248 :     }
249 :     return $retVal;
250 :     }
251 :    
252 : parrello 1.1 =head3 create_fasta_record
253 :    
254 : parrello 1.7 my $fastaString = create_fasta_record($id, $comment, $sequence, $stripped);
255 : parrello 1.1
256 :     Create a FASTA record from the specified DNA or protein sequence. The
257 :     sequence will be split into 60-character lines, and the record will
258 : parrello 1.4 include an identifier line.
259 : parrello 1.1
260 :     =over 4
261 :    
262 :     =item id
263 :    
264 :     ID for the sequence, to be placed at the beginning of the identifier
265 :     line.
266 :    
267 :     =item comment (optional)
268 :    
269 :     Comment text to place after the ID on the identifier line. If this parameter
270 :     is empty, undefined, or 0, no comment will be placed.
271 :    
272 :     =item sequence
273 :    
274 :     Sequence of letters to form into FASTA. For purposes of convenience, whitespace
275 :     characters in the sequence will be removed automatically.
276 :    
277 : parrello 1.7 =item stripped (optional)
278 :    
279 :     If TRUE, then the sequence will be returned unmodified instead of converted
280 :     to FASTA format. The default is FALSE.
281 :    
282 : parrello 1.1 =item RETURN
283 :    
284 :     Returns the desired sequence in FASTA format.
285 :    
286 :     =back
287 :    
288 :     =cut
289 :    
290 :     sub create_fasta_record {
291 :     # Get the parameters.
292 : parrello 1.7 my ($id, $comment, $sequence, $stripped) = @_;
293 :     # Declare the return variable.
294 :     my $retVal;
295 :     # If we're in stripped mode, we just return the sequence.
296 :     if ($stripped) {
297 :     $retVal = $sequence;
298 :     } else {
299 :     # Here we have to do the FASTA conversion. Start with the ID.
300 :     my $header = ">$id";
301 :     # Add a comment, if any.
302 :     if ($comment) {
303 :     $header .= " $comment";
304 :     }
305 :     # Clean up the sequence.
306 :     $sequence =~ s/\s+//g;
307 :     # We need to format the sequence into 60-byte chunks. We use the infamous
308 :     # grep-split trick. The split, because of the presence of the parentheses,
309 :     # includes the matched delimiters in the output list. The grep strips out
310 :     # the empty list items that appear between the so-called delimiters, since
311 :     # the delimiters are what we want.
312 :     my @chunks = grep { $_ } split /(.{1,60})/, $sequence;
313 :     # Add the chunks and the trailer.
314 : parrello 1.10 $retVal = join("\n", $header, @chunks) . "\n";
315 : parrello 1.1 }
316 :     # Return the result.
317 :     return $retVal;
318 :     }
319 :    
320 : parrello 1.2 =head3 rev_comp
321 :    
322 :     my $revcmp = rev_comp($dna);
323 :    
324 :     or
325 :    
326 :     rev_comp(\$dna);
327 :    
328 :     Return the reverse complement of a DNA string.
329 :    
330 :     =over 4
331 :    
332 :     =item dna
333 :    
334 :     Either a DNA string, or a reference to a DNA string.
335 :    
336 :     =item RETURN
337 :    
338 :     If the input is a DNA string, returns the reverse complement. If the
339 :     input is a reference to a DNA string, the string itself is reverse
340 :     complemented.
341 :    
342 :     =back
343 :    
344 :     =cut
345 :    
346 :     sub rev_comp {
347 :     # Get the parameters.
348 :     my ($dna) = @_;
349 :     # Determine how we were called.
350 :     my ($retVal, $refMode);
351 :     if (ref $dna eq 'SCALAR') {
352 :     $retVal = lc reverse $dna;
353 :     $refMode = 0;
354 :     } else {
355 :     $retVal = lc reverse $$dna;
356 :     $refMode = 1;
357 :     }
358 :     # Now $retVal contains the reversed DNA string in all lower case, and
359 :     # $refMode is TRUE iff the user passed in a reference. The following
360 :     # translation step complements the string.
361 :     $retVal =~ tr/acgtumrwsykbdhv/tgcaakywsrmvhdb/;
362 :     # Return the result in the method corresponding to the way it came in.
363 :     if ($refMode) {
364 :     $$dna = $retVal;
365 :     return;
366 :     } else {
367 :     return $retVal;
368 :     }
369 :     }
370 :    
371 : parrello 1.15
372 :     =head3 by_fig_id
373 :    
374 :     my @sorted_by_fig_id = sort { FIG::by_fig_id($a,$b) } @fig_ids;
375 :    
376 :     Compare two feature IDs.
377 :    
378 :     This function is designed to assist in sorting features by ID. The sort is by
379 :     genome ID followed by feature type and then feature number.
380 :    
381 :     =over 4
382 :    
383 :     =item a
384 :    
385 :     First feature ID.
386 :    
387 :     =item b
388 :    
389 :     Second feature ID.
390 :    
391 :     =item RETURN
392 :    
393 :     Returns a negative number if the first parameter is smaller, zero if both parameters
394 :     are equal, and a positive number if the first parameter is greater.
395 :    
396 :     =back
397 :    
398 :     =cut
399 :    
400 :     sub by_fig_id {
401 :     my($a,$b) = @_;
402 :     my($g1,$g2,$t1,$t2,$n1,$n2);
403 :     if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
404 :     ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {
405 :     ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
406 :     } else {
407 :     $a cmp $b;
408 :     }
409 :     }
410 :    
411 :    
412 : parrello 1.3 =head3 genome_of
413 :    
414 :     my $genomeID = genome_of($fid);
415 :    
416 :     Return the Genome ID embedded in the specified FIG feature ID.
417 :    
418 :     =over 4
419 :    
420 :     =item fid
421 :    
422 :     Feature ID of interest.
423 :    
424 :     =item RETURN
425 :    
426 :     Returns the genome ID in the middle portion of the FIG feature ID. If the
427 :     feature ID is invalid, this method returns an undefined value.
428 :    
429 :     =back
430 :    
431 :     =cut
432 :    
433 :     sub genome_of {
434 :     # Get the parameters.
435 :     my ($fid) = @_;
436 :     # Declare the return variable.
437 :     my $retVal;
438 :     # Parse the feature ID.
439 :     if ($fid =~ /^fig\|(\d+\.\d+)\./) {
440 :     $retVal = $1;
441 :     }
442 :     # Return the result.
443 :     return $retVal;
444 :     }
445 :    
446 : parrello 1.6 =head3 sims
447 :    
448 :     my $sims = sims($id, \%seen, $maxN, $maxP, $select, $max_expand, $filters);
449 :    
450 :     Retrieve similarities from the network similarity server. The similarity retrieval
451 :     is performed using an HTTP user agent that returns similarity data in multiple
452 :     chunks. An anonymous subroutine is passed to the user agent that parses and
453 :     reformats the chunks as they come in. The similarites themselves are returned
454 :     as B<Sim> objects. Sim objects are actually list references with 15 elements.
455 :     The Sim object methods allow access to the elements by name.
456 :    
457 :     Similarities can be either raw or expanded. The raw similarities are basic
458 :     hits between features with similar DNA. Expanding a raw similarity drags in any
459 :     features considered substantially identical. So, for example, if features B<A1>,
460 :     B<A2>, and B<A3> are all substatially identical to B<A>, then a raw similarity
461 :     B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>.
462 :    
463 :     =over 4
464 :    
465 :     =item id
466 :    
467 :     ID of the feature whose similarities are desired, or reference to a list
468 :     of the IDs of the features whose similarities are desired.
469 :    
470 :     =item maxN
471 :    
472 :     Maximum number of similarities to return.
473 :    
474 :     =item maxP
475 :    
476 :     The maximum allowable similarity score.
477 :    
478 :     =item select
479 :    
480 :     Selection criterion: C<raw> means only raw similarities are returned; C<fig>
481 :     means only similarities to FIG features are returned; C<all> means all expanded
482 :     similarities are returned; and C<figx> means similarities are expanded until the
483 :     number of FIG features equals the maximum.
484 :    
485 :     =item max_expand
486 :    
487 :     The maximum number of features to expand.
488 :    
489 :     =item filters
490 :    
491 :     Reference to a hash containing filter information, or a subroutine that can be
492 :     used to filter the sims.
493 :    
494 :     =item RETURN
495 :    
496 :     Returns a list of L<Sim> objects.
497 :    
498 :     =back
499 :    
500 :     =cut
501 :    
502 :     sub sims {
503 :     # Get the parameters.
504 :     my($id, $maxN, $maxP, $select, $max_expand, $filters) = @_;
505 :     # Get the URL for submitting to the sims server.
506 :     my $url = $FIG_Config::sim_server_url || "http://bioseed.mcs.anl.gov/simserver/perl/sims.pl";
507 :     # Get a list of the IDs to process.
508 :     my @ids;
509 :     if (ref($id) eq "ARRAY") {
510 :     @ids = @$id;
511 :     } else {
512 :     @ids = ($id);
513 :     }
514 :     # Form a list of the parameters to pass to the server.
515 :     my %args = ();
516 :     $args{id} = \@ids;
517 :     $args{maxN} = $maxN if defined($maxN);
518 :     $args{maxP} = $maxP if defined($maxP);
519 :     $args{select} = $select if defined($select);
520 :     $args{max_expand} = $max_expand if defined($max_expand);
521 :     # If the filter is a hash, put the filters in the argument list.
522 :     if (ref($filters) eq 'HASH') {
523 :     for my $k (keys(%$filters))
524 :     {
525 :     $args{"filter_$k"}= $filters->{$k};
526 :     }
527 :     }
528 :     # Get the user agent.
529 :     require LWP::UserAgent;
530 :     my $ua = LWP::UserAgent->new();
531 :     # Insure we have the Sim module.
532 :     require Sim;
533 :     #
534 :     # Our next task is to create the anonymous subroutine that will process the
535 :     # chunks that come back from the server. We require three global variables:
536 :     # @sims to hold the similarities found, $tail to remember the unprocessed
537 :     # data from the previous chunk, and $chunks to count the chunks.
538 :     #
539 :     my @retVal;
540 :     my $tail;
541 :     my $chunks = 0;
542 :     #
543 :     # ANONYMOUS SUBROUTINE
544 :     #
545 :     my $cb = sub {
546 :     eval {
547 :     # Get the parameters.
548 :     my ($data, $command) = @_;
549 :     # Check for a reset command. If we get one, we discard any data
550 :     # in progress.
551 :     if ($command && $command eq 'reset') {
552 :     $tail = '';
553 :     } else {
554 :     $chunks++;
555 :     # Get the data to process. Note we concatenate it to the incoming
556 :     # tail from last time.
557 :     my $c = $tail . $data;
558 :     # Make sure the caller hasn't messed up the new-line character.
559 :     # FASTA readers in particular are notorious for doing things
560 :     # like that.
561 :     local $/ = "\n";
562 :     # Split the input into lines.
563 :     my @lines = split(/\n/, $c);
564 :     # If the input does not end with a new-line, we have a partial
565 :     # chunk and need to put it in the tail for next time. If not,
566 :     # there is no tail for next time.
567 :     if (substr($c, -1, 1) ne "\n") {
568 :     $tail = pop @lines;
569 :     } else {
570 :     $tail = '';
571 :     }
572 :     # Loop through the lines. Note there's no need to chomp because
573 :     # the SPLIT took out the new-line characters.
574 :     for my $l (@lines) {
575 :     # Split the line into fields.
576 :     my @s = split(/\t/, $l);
577 :     # Insure we have all the fields we need.
578 : parrello 1.12 if (@s >= 9) {
579 : parrello 1.6 # Check to see if we've seen this SIM before.
580 :     my $id1 = $s[0];
581 :     my $id2 = $s[1];
582 :     # Add it to the result list.
583 :     push(@retVal, bless \@s, 'Sim');
584 :     }
585 :     }
586 :     }
587 :     };
588 :     };
589 :     #
590 :     # END OF ANONYMOUS SUBROUTINE
591 :     #
592 :     # Now we're ready to start. Because networking is an iffy thing, we set up
593 :     # to try our request multiple times.
594 :     my $n_retries = 10;
595 :     my $attempts = 0;
596 :     # Set the timeout value, in seconds.
597 :     $ua->timeout(180);
598 :     # Loop until we succeed or run out of retries.
599 :     my $done = 0;
600 :     while (! $done && $attempts++ < $n_retries) {
601 :     # Reset the content processor. This clears the tail.
602 :     &$cb(undef, 'reset');
603 :     my $resp = $ua->post($url, \%args, ':content_cb' => $cb);
604 :     if ($resp->is_success) {
605 :     # If the response was successful, get the content. This triggers
606 :     # the anonymous subroutine.
607 :     my $x = $resp->content;
608 :     # Denote we've been successful.
609 :     $done = 1;
610 :     }
611 :     }
612 :     return @retVal;
613 :     }
614 :    
615 : parrello 1.8 =head3 verify_dir
616 :    
617 :     verify_dir($dirName);
618 :    
619 :     Insure that the specified directory exists. If the directory does not
620 :     exist, it will be created.
621 :    
622 :     =over 4
623 :    
624 :     =item dirName
625 :    
626 :     Name of the relevant directory.
627 :    
628 :     =back
629 :    
630 :     =cut
631 :    
632 :     sub verify_dir {
633 :     # Get the parameters.
634 :     my ($dirName) = @_;
635 :     # Strip off the final slash, if any.
636 :     $dirName =~ s#/$##;
637 :     # Only proceed if the directory does NOT already exist.
638 :     if (! -d $dirName) {
639 :     # If there is a parent directory, recursively insure it is there.
640 :     if ($dirName =~ m#(.+)/[^/]+$#) {
641 :     verify_dir($1);
642 :     }
643 :     # Create this particular directory with full permissions.
644 :     mkdir $dirName, 0777;
645 :     }
646 :     }
647 :    
648 : parrello 1.9 =head3 parse_fasta_record
649 :    
650 :     my ($id, $comment, $seq) = parse_fasta_record($string);
651 :    
652 :     Extract the ID, comment, and sequence from a single FASTA record. For
653 :     backward compatability, instead of a FASTA record the ID and sequence can
654 :     be specified separated by a comma. In this case, the returned comment
655 :     will be empty.
656 :    
657 :     =over 4
658 :    
659 :     =item string
660 :    
661 :     A single FASTA record, or an ID and sequence separated by a single comma,
662 :     an unadorned sequence, a 2-element list consisting of an ID and a sequence,
663 :     or a 3-element list consisting of an ID, a comment, and a sequence.
664 :    
665 :     =item RETURN
666 :    
667 :     Returns a three-element list consisting of the incoming ID, the associated
668 :     comment, and the specified DNA or protein sequence. If the incoming string is
669 :     invalid, all three list elements will come back undefined. If no ID is
670 :     specified, an MD5 will be provided.
671 :    
672 :     =back
673 :    
674 :     =cut
675 :    
676 :     sub parse_fasta_record {
677 :     # Get the parameters.
678 :     my ($string) = @_;
679 :     # Declare the return variables.
680 :     my ($id, $comment, $seq);
681 :     # Check the type of input string.
682 :     if (! defined $string) {
683 :     # Do nothing if no string was passed in. This extra check prevents a
684 :     # warning at runtime.
685 :     } elsif ($string =~ /^>(\S+)([\t ]+[^\r\n]*)?[\r\n]+(.+)/s) {
686 :     # Here we have a standard FASTA string.
687 :     ($id, $comment, $seq) = ($1, $2, $3);
688 :     # Remove white space from the sequence string.
689 :     $seq =~ s/\s+//sg;
690 :     # Trim front of comment.
691 :     $comment =~ s/^s+//;
692 :     } elsif ($string =~ /(.+?)\s*,\s*(.+)/) {
693 :     ($id, $comment, $seq) = ($1, '', $2);
694 :     } elsif (ref $string eq 'ARRAY') {
695 :     # Here the data came in pre-formatted as a list reference.
696 :     ($id, $comment, $seq) = @$string;
697 :     # If there's no comment, we need to adjust.
698 :     if (! defined $seq) {
699 :     $seq = $comment;
700 :     $comment = '';
701 :     }
702 :     } else {
703 :     # Here we have only a sequence. We need to construct the ID.
704 :     $seq = $string;
705 : parrello 1.12 require Digest::MD5;
706 :     $id = "md5|" . Digest::MD5::md5_base64($seq);
707 : parrello 1.9 $comment = "";
708 :     }
709 :     # Return the results.
710 :     return ($id, $comment, $seq);
711 :     }
712 :    
713 :    
714 : parrello 1.11 =head3 hypo
715 :    
716 :     my $flag = hypo($func);
717 :    
718 :     Return TRUE if the specified functional role is hypothetical, else FALSE.
719 :     Hypothetical functional roles are identified by key words in the text,
720 :     such as I<hypothesis>, I<predicted>, or I<glimmer> (among others).
721 :    
722 :     =over 4
723 :    
724 :     =item func
725 :    
726 :     Text of the functional role whose nature is to be determined.
727 :    
728 :     =item RETURN
729 :    
730 :     Returns TRUE if the role is hypothetical, else FALSE.
731 :    
732 :     =back
733 :    
734 :     =cut
735 :    
736 :     sub hypo {
737 :     my ($func) = @_;
738 :     if (! $func) { return 1 }
739 :     if ($func =~ /lmo\d+ protein/i) { return 1 }
740 :     if ($func =~ /hypoth/i) { return 1 }
741 :     if ($func =~ /conserved protein/i) { return 1 }
742 :     if ($func =~ /gene product/i) { return 1 }
743 :     if ($func =~ /interpro/i) { return 1 }
744 :     if ($func =~ /B[sl][lr]\d/i) { return 1 }
745 :     if ($func =~ /^U\d/) { return 1 }
746 :     if ($func =~ /^orf[^_]/i) { return 1 }
747 :     if ($func =~ /uncharacterized/i) { return 1 }
748 :     if ($func =~ /pseudogene/i) { return 1 }
749 :     if ($func =~ /^predicted/i) { return 1 }
750 :     if ($func =~ /AGR_/) { return 1 }
751 :     if ($func =~ /similar to/i) { return 1 }
752 :     if ($func =~ /similarity/i) { return 1 }
753 :     if ($func =~ /glimmer/i) { return 1 }
754 :     if ($func =~ /unknown/i) { return 1 }
755 :     if (($func =~ /domain/i) ||
756 :     ($func =~ /^y[a-z]{2,4}\b/i) ||
757 :     ($func =~ /complete/i) ||
758 :     ($func =~ /ensang/i) ||
759 :     ($func =~ /unnamed/i) ||
760 :     ($func =~ /EG:/) ||
761 :     ($func =~ /orf\d+/i) ||
762 :     ($func =~ /RIKEN/) ||
763 :     ($func =~ /Expressed/i) ||
764 :     ($func =~ /[a-zA-Z]{2,3}\|/) ||
765 :     ($func =~ /predicted by Psort/) ||
766 :     ($func =~ /^bh\d+/i) ||
767 :     ($func =~ /cds_/i) ||
768 :     ($func =~ /^[a-z]{2,3}\d+[^:\+\-0-9]/i) ||
769 :     ($func =~ /similar to/i) ||
770 :     ($func =~ / identi/i) ||
771 :     ($func =~ /ortholog of/i) ||
772 :     ($func =~ /structural feature/i)) { return 1 }
773 :     return 0;
774 :    
775 :     }
776 : parrello 1.9
777 : overbeek 1.14 =head3 between
778 :    
779 : parrello 1.15 my $flag = between($x, $y, $z);
780 : overbeek 1.14
781 :     Determine whether or not $y is between $x and $z.
782 :    
783 :     =over 4
784 :    
785 :     =item x
786 :    
787 :     First edge number.
788 :    
789 :     =item y
790 :    
791 :     Number to examine.
792 :    
793 :     =item z
794 :    
795 :     Second edge number.
796 :    
797 :     =item RETURN
798 :    
799 :     Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check
800 :     is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns
801 :     TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than
802 :     I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower,
803 :     then the return is TRUE if I<$x> >= I$<$y> >= I<$z>.
804 :    
805 :     =back
806 :    
807 :     =cut
808 :     #: Return Type $;
809 :     sub between {
810 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
811 :     my($x,$y,$z) = @_;
812 :    
813 :     if ($x < $z) {
814 :     return (($x <= $y) && ($y <= $z));
815 :     } else {
816 :     return (($x >= $y) && ($y >= $z));
817 :     }
818 :     }
819 : parrello 1.1
820 : parrello 1.15 =head3 standard_genetic_code
821 :    
822 :     my $code = standard_genetic_code();
823 :    
824 :     Return a hash containing the standard translation of nucleotide triples to proteins.
825 :     Methods such as L</translate> can take a translation scheme as a parameter. This method
826 :     returns the default translation scheme. The scheme is implemented as a reference to a
827 :     hash that contains nucleotide triplets as keys and has protein letters as values.
828 :    
829 :     =cut
830 :    
831 :     sub standard_genetic_code {
832 :    
833 :     my $code = {};
834 :    
835 :     $code->{"AAA"} = "K";
836 :     $code->{"AAC"} = "N";
837 :     $code->{"AAG"} = "K";
838 :     $code->{"AAT"} = "N";
839 :     $code->{"ACA"} = "T";
840 :     $code->{"ACC"} = "T";
841 :     $code->{"ACG"} = "T";
842 :     $code->{"ACT"} = "T";
843 :     $code->{"AGA"} = "R";
844 :     $code->{"AGC"} = "S";
845 :     $code->{"AGG"} = "R";
846 :     $code->{"AGT"} = "S";
847 :     $code->{"ATA"} = "I";
848 :     $code->{"ATC"} = "I";
849 :     $code->{"ATG"} = "M";
850 :     $code->{"ATT"} = "I";
851 :     $code->{"CAA"} = "Q";
852 :     $code->{"CAC"} = "H";
853 :     $code->{"CAG"} = "Q";
854 :     $code->{"CAT"} = "H";
855 :     $code->{"CCA"} = "P";
856 :     $code->{"CCC"} = "P";
857 :     $code->{"CCG"} = "P";
858 :     $code->{"CCT"} = "P";
859 :     $code->{"CGA"} = "R";
860 :     $code->{"CGC"} = "R";
861 :     $code->{"CGG"} = "R";
862 :     $code->{"CGT"} = "R";
863 :     $code->{"CTA"} = "L";
864 :     $code->{"CTC"} = "L";
865 :     $code->{"CTG"} = "L";
866 :     $code->{"CTT"} = "L";
867 :     $code->{"GAA"} = "E";
868 :     $code->{"GAC"} = "D";
869 :     $code->{"GAG"} = "E";
870 :     $code->{"GAT"} = "D";
871 :     $code->{"GCA"} = "A";
872 :     $code->{"GCC"} = "A";
873 :     $code->{"GCG"} = "A";
874 :     $code->{"GCT"} = "A";
875 :     $code->{"GGA"} = "G";
876 :     $code->{"GGC"} = "G";
877 :     $code->{"GGG"} = "G";
878 :     $code->{"GGT"} = "G";
879 :     $code->{"GTA"} = "V";
880 :     $code->{"GTC"} = "V";
881 :     $code->{"GTG"} = "V";
882 :     $code->{"GTT"} = "V";
883 :     $code->{"TAA"} = "*";
884 :     $code->{"TAC"} = "Y";
885 :     $code->{"TAG"} = "*";
886 :     $code->{"TAT"} = "Y";
887 :     $code->{"TCA"} = "S";
888 :     $code->{"TCC"} = "S";
889 :     $code->{"TCG"} = "S";
890 :     $code->{"TCT"} = "S";
891 :     $code->{"TGA"} = "*";
892 :     $code->{"TGC"} = "C";
893 :     $code->{"TGG"} = "W";
894 :     $code->{"TGT"} = "C";
895 :     $code->{"TTA"} = "L";
896 :     $code->{"TTC"} = "F";
897 :     $code->{"TTG"} = "L";
898 :     $code->{"TTT"} = "F";
899 :    
900 :     return $code;
901 :     }
902 :    
903 :     =head3 translate
904 :    
905 :     my $aa_seq = translate($dna_seq, $code, $fix_start);
906 :    
907 :     Translate a DNA sequence to a protein sequence using the specified genetic code.
908 :     If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to
909 :     C<M>. (In the standard genetic code, these two combinations normally translate
910 :     to C<V> and C<L>, respectively.)
911 :    
912 :     =over 4
913 :    
914 :     =item dna_seq
915 :    
916 :     DNA sequence to translate. Note that the DNA sequence can only contain
917 :     known nucleotides.
918 :    
919 :     =item code
920 :    
921 :     Reference to a hash specifying the translation code. The hash is keyed by
922 :     nucleotide triples, and the value for each key is the corresponding protein
923 :     letter. If this parameter is omitted, the L</standard_genetic_code> will be
924 :     used.
925 :    
926 :     =item fix_start
927 :    
928 :     TRUE if the first triple is to get special treatment, else FALSE. If TRUE,
929 :     then a value of C<TTG> or C<GTG> in the first position will be translated to
930 :     C<M> instead of the value specified in the translation code.
931 :    
932 :     =item RETURN
933 :    
934 :     Returns a string resulting from translating each nucleotide triple into a
935 :     protein letter.
936 :    
937 :     =back
938 :    
939 :     =cut
940 :     #: Return Type $;
941 :     sub translate {
942 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
943 :    
944 :     my( $dna,$code,$start ) = @_;
945 :     my( $i,$j,$ln );
946 :     my( $x,$y );
947 :     my( $prot );
948 :    
949 :     if (! defined($code)) {
950 :     $code = &FIG::standard_genetic_code;
951 :     }
952 :     $ln = length($dna);
953 :     $prot = "X" x ($ln/3);
954 :     $dna =~ tr/a-z/A-Z/;
955 :    
956 :     for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) {
957 :     $x = substr($dna,$i,3);
958 :     if ($y = $code->{$x}) {
959 :     substr($prot,$j,1) = $y;
960 :     }
961 :     }
962 :    
963 :     if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) {
964 :     substr($prot,0,1) = 'M';
965 :     }
966 :     return $prot;
967 :     }
968 :    
969 :    
970 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3