[Bio] / FigKernelPackages / SeedUtils.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/SeedUtils.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.19 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 : parrello 1.11 # This is a SAS component.
4 :    
5 : parrello 1.1 #
6 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
7 :     # for Interpretations of Genomes. All Rights Reserved.
8 :     #
9 :     # This file is part of the SEED Toolkit.
10 :     #
11 :     # The SEED Toolkit is free software. You can redistribute
12 :     # it and/or modify it under the terms of the SEED Toolkit
13 :     # Public License.
14 :     #
15 :     # You should have received a copy of the SEED Toolkit Public License
16 :     # along with this program; if not write to the University of Chicago
17 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
18 :     # Genomes at veronika@thefig.info or download a copy from
19 :     # http://www.theseed.org/LICENSE.TXT.
20 :     #
21 :    
22 :     package SeedUtils;
23 :    
24 :     use strict;
25 : parrello 1.12 no warnings qw(once);
26 : parrello 1.1 use base qw(Exporter);
27 : parrello 1.15 our @EXPORT = qw(hypo boundaries_of parse_fasta_record create_fasta_record
28 :     rev_comp genome_of min max sims verify_dir between translate
29 : parrello 1.18 standard_genetic_code parse_location roles_in_function
30 : parrello 1.19 strip_ec location_string location_cmp strand_of);
31 : parrello 1.1
32 :     =head1 SEED Utility Methods
33 :    
34 :     =head2 Introduction
35 :    
36 :     This is a simple utility package that performs functions useful for
37 :     bioinformatics, but that do not require access to the databases.
38 :    
39 :     =head2 Public Methods
40 :    
41 : parrello 1.13 =head3 boundaries_of
42 :    
43 :     my ($contig, $min, $max) = boundaries_of($locs);
44 :    
45 :     Return the boundaries of a set of locations. The contig, the leftmost
46 :     location, and the rightmost location will be returned to the caller. If
47 :     more than one contig is represented, the method will return an undefined
48 :     value for the contig (indicating failure).
49 :    
50 :     =over 4
51 :    
52 :     =item locs
53 :    
54 :     Reference to a list of location strings. A location string contains a contig ID,
55 :     and underscore (C<_>), a starting offset, a strand identifier (C<+> or C<->), and
56 :     a length (e.g. C<360108.3:NC_10023P_1000+2000> begins at offset 1000 of contig
57 :     B<360108.3:NC_10023P> and covers 2000 base pairs on the C<+> strand).
58 :    
59 :     =item RETURN
60 :    
61 :     Returns a 3-element list. The first element is the contig ID from all the locations,
62 :     the second is the offset of leftmost base pair represented in the locations, and the
63 :     third is the offset of the rightmost base pair represented in the locations.
64 :    
65 :     =back
66 :    
67 :     =cut
68 :    
69 :     sub boundaries_of {
70 :     # Get the parameters.
71 :     my ($locs) = @_;
72 :     # Declare the return variables.
73 :     my ($contig, $min, $max);
74 :     # We'll put all the starting and ending offsets found in here.
75 :     my @offsets;
76 :     # This will count the number of errors found.
77 :     my $error = 0;
78 :     # Loop through the locations.
79 :     for my $loc (@$locs) {
80 :     # Parse this location.
81 :     if ($loc =~ /^(.+)_(\d+)(\+|\-)(\d+)$/) {
82 :     my ($newContig, $begin, $dir, $len) = ($1, $2, $3, $4);
83 :     # Is this contig valid?
84 :     if ($contig && $newContig ne $contig) {
85 :     # No, skip this location.
86 :     $error++;
87 :     } else {
88 :     # Save the contig.
89 :     $contig = $newContig;
90 :     # Compute the ending offset.
91 :     my $end = ($dir eq '+' ? $begin + $len - 1 : $begin - $len + 1);
92 :     # Save both offsets.
93 :     push @offsets, $begin, $end;
94 :     }
95 :     } else {
96 :     # The location is invalid, so it's an error,
97 :     $error++;
98 :     }
99 :     }
100 :     # If there's an error, clear the contig ID.
101 :     if ($error) {
102 :     $contig = undef;
103 :     }
104 :     # Compute the min and max from the offsets collected.
105 :     $min = min(@offsets);
106 :     $max = max(@offsets);
107 :     # Return the results.
108 :     return ($contig, $min, $max);
109 :     }
110 :    
111 : parrello 1.17
112 :     =head3 roles_in_function
113 :    
114 :     my @roles = roles_in_function($assignment);
115 :    
116 :     Return a list of the functional roles in the specified assignment string.
117 :     A single assignment may contain multiple roles as well as comments; this
118 :     method separates them out.
119 :    
120 :     =over 4
121 :    
122 :     =item assignment
123 :    
124 :     Functional assignment to parse for roles.
125 :    
126 :     =item RETURN
127 :    
128 :     Returns a list of the individual roles in the assignment.
129 :    
130 :     =back
131 :    
132 :     =cut
133 :    
134 :     sub roles_in_function {
135 :     # Get the parameters.
136 :     my ($assignment) = @_;
137 :     # Remove any comment.
138 : parrello 1.18 my $commentFree = ($assignment =~ /(.+?)\s*#/ ? $1 : $assignment);
139 : parrello 1.17 # Split out the roles.
140 : parrello 1.18 my @retVal = split /\s+[\/@;]\s+/, $commentFree;
141 : parrello 1.17 # Return the result.
142 :     return @retVal;
143 :     }
144 :    
145 :    
146 : parrello 1.16 =head3 parse_location
147 :    
148 : parrello 1.19 my ($contig, $begin, $end, $strand) = parse_location($locString);
149 : parrello 1.16
150 :     Return the contigID, start offset, and end offset for a specified
151 :     location string (see L<SAP/Location Strings>).
152 :    
153 :     =over 4
154 :    
155 :     =item locString
156 :    
157 :     Location string to parse.
158 :    
159 :     =item RETURN
160 :    
161 : parrello 1.19 Returns a four-element list containing the contig ID from the location
162 :     string, the starting offset of the location, the ending offset, and the
163 :     strand. If the location string is not valid, the values returned will be
164 :     C<undef>.
165 : parrello 1.16
166 :     =back
167 :    
168 :     =cut
169 :    
170 :     sub parse_location {
171 :     # Get the parameters.
172 :     my ($locString) = @_;
173 :     # Declare the return variables.
174 : parrello 1.19 my ($contig, $begin, $end, $strand);
175 : parrello 1.16 # Parse the location string.
176 :     if ($locString =~ /^(.+)_(\d+)([+-])(\d+)$/) {
177 : parrello 1.19 # Pull out the contig ID, strand, and begin location.
178 : parrello 1.16 $contig = $1;
179 :     $begin = $2;
180 : parrello 1.19 $strand = $3;
181 : parrello 1.16 # Compute the ending location from the direction and length.
182 :     if ($3 eq '+') {
183 :     $end = $begin + $4 - 1;
184 :     } else {
185 :     $end = $begin - $4 + 1;
186 :     }
187 :     }
188 :     # Return the results.
189 : parrello 1.19 return ($contig, $begin, $end, $strand);
190 :     }
191 :    
192 :     =head3 location_string
193 :    
194 :     my $locString = location_string($contig, $beg, $end);
195 :    
196 :     Form a location string for the specified contig that starts at the
197 :     indicated begin location and stops at the indicated end location. A
198 :     single-base location will automatically be put on the forward strand.
199 :    
200 :     =over 4
201 :    
202 :     =item contig
203 :    
204 :     ID of the contig to contain this location.
205 :    
206 :     =item beg
207 :    
208 :     Beginning offset of the location.
209 :    
210 :     =item end
211 :    
212 :     Ending offset of the location.
213 :    
214 :     =item RETURN
215 :    
216 :     Returns a location string (see L<SAP/Location String>) for the specified
217 :     location.
218 :    
219 :     =back
220 :    
221 :     =cut
222 :    
223 :     sub location_string {
224 :     # Get the parameters.
225 :     my ($contig, $beg, $end) = @_;
226 :     # Compute the strand and length.
227 :     my ($strand, $len);
228 :     if ($beg <= $end) {
229 :     $strand = '+';
230 :     $len = $end + 1 - $beg;
231 :     } else {
232 :     $strand = '-';
233 :     $len = $beg + 1 - $end;
234 :     }
235 :     # Return the result.
236 :     return $contig . "_$beg$strand$len";
237 :     }
238 :    
239 :    
240 :     =head3 location_cmp
241 :    
242 :     my $cmp = location_cmp($loc1, $loc2);
243 :    
244 :     Compare two location strings (see L<SAP/Location Strings>).
245 :    
246 :     The ordering principle for locations is that they are sorted first by contig ID, then by
247 :     leftmost position, in reverse order by length, and then by direction. The effect is that
248 :     within a contig, the locations are ordered first and foremost in the way they would
249 :     appear when displayed in a picture of the contig and second in such a way that embedded
250 :     locations come after the locations in which they are embedded. In the case of two
251 :     locations that represent the exact same base pairs, the forward (C<+>) location is
252 :     arbitrarily placed first.
253 :    
254 :     =over 4
255 :    
256 :     =item loc1
257 :    
258 :     First location string to compare.
259 :    
260 :     =item loc2
261 :    
262 :     Second location string to compare.
263 :    
264 :     =item RETURN
265 :    
266 :     Returns a negative number if the B<loc1> location sorts first, a positive number if the
267 :     B<loc2> location sorts first, and zero if the two locations are the same.
268 :    
269 :    
270 :     =back
271 :    
272 :     =cut
273 :    
274 :     sub location_cmp {
275 :     # Get the parameters.
276 :     my ($loc1, $loc2) = @_;
277 :     # Parse the locations.
278 :     my ($contig1, $beg1, $strand1, $len1) = $loc1 =~ /^(.+)_(\d+)([+-])(\d+)$/;
279 :     my $left1 = ($strand1 eq '+' ? $beg1 : $beg1 - $len1 + 1);
280 :     my ($contig2, $beg2, $strand2, $len2) = $loc2 =~ /^(.+)_(\d+)([+-])(\d+)$/;
281 :     my $left2 = ($strand2 eq '+' ? $beg2 : $beg2 - $len2 + 1);
282 :     # Declare the return variable. We compare the indicative parts of the location
283 :     # in order. Note that we sort in reverse order by length, so the last comparison
284 :     # puts 2 before 1.
285 :     my $retVal = ($contig1 cmp $contig2) || ($left1 <=> $left2) ||
286 :     ($len2 <=> $len1);
287 :     # If everything matches to this point, check the strands.
288 :     if (! $retVal) {
289 :     if ($strand1 eq '+') {
290 :     # First location is positive, so if the locations are unequal, it
291 :     # sorts first.
292 :     $retVal = ($strand2 eq '+' ? 0 : -1);
293 :     } else {
294 :     # First location is negative, so if the locations are unequal, it
295 :     # sorts second.
296 :     $retVal = ($strand1 eq '-' ? 0 : 1);
297 :     }
298 :     }
299 :     # Return the result.
300 :     return $retVal;
301 :     }
302 :    
303 :     =head3 strand_of
304 :    
305 :     my $plusOrMinus = strand_of($loc);
306 :    
307 :     Return the strand (C<+> or C<->) from the specified location string.
308 :    
309 :     =over 4
310 :    
311 :     =item loc
312 :    
313 :     Location string to parse (see L<SAP/Location Strings>).
314 :    
315 :     =item RETURN
316 :    
317 :     Returns C<+> if the location is on the forward strand, else C<->.
318 :    
319 :     =back
320 :    
321 :     =cut
322 :    
323 :     sub strand_of {
324 :     # Get the parameters.
325 :     my ($loc) = @_;
326 :     # Declare the return variable.
327 :     my $retVal;
328 :     # Parse the strand indicator from the location.
329 :     if ($loc =~ /\d+([+-])\d+/) {
330 :     $retVal = $1;
331 :     }
332 :     # Return the result.
333 :     return $retVal;
334 : parrello 1.16 }
335 :    
336 :    
337 : parrello 1.5 =head3 max
338 :    
339 :     my $max = max(@nums);
340 :    
341 :     Return the maximum number from all the values in the specified list.
342 :    
343 :     =over 4
344 :    
345 :     =item nums
346 :    
347 :     List of numbers to examine.
348 :    
349 :     =item RETURN
350 :    
351 :     Returns the maximum numeric value from the specified parameters, or
352 :     an undefined value if an empty list is passed in.
353 :    
354 :     =back
355 :    
356 :     =cut
357 :    
358 :     sub max {
359 :     my ($retVal, @nums) = @_;
360 :     for my $num (@nums) {
361 :     if ($num > $retVal) {
362 :     $retVal = $num;
363 :     }
364 :     }
365 :     return $retVal;
366 :     }
367 :    
368 :     =head3 min
369 :    
370 :     my $min = min(@nums);
371 :    
372 :     Return the minimum number from all the values in the specified list.
373 :    
374 :     =over 4
375 :    
376 :     =item nums
377 :    
378 :     List of numbers to examine.
379 :    
380 :     =item RETURN
381 :    
382 :     Returns the minimum numeric value from the specified parameters, or
383 :     an undefined value if an empty list is passed in.
384 :    
385 :     =back
386 :    
387 :     =cut
388 :    
389 :     sub min {
390 :     my ($retVal, @nums) = @_;
391 :     for my $num (@nums) {
392 :     if ($num < $retVal) {
393 :     $retVal = $num;
394 :     }
395 :     }
396 :     return $retVal;
397 :     }
398 :    
399 : parrello 1.1 =head3 create_fasta_record
400 :    
401 : parrello 1.7 my $fastaString = create_fasta_record($id, $comment, $sequence, $stripped);
402 : parrello 1.1
403 :     Create a FASTA record from the specified DNA or protein sequence. The
404 :     sequence will be split into 60-character lines, and the record will
405 : parrello 1.4 include an identifier line.
406 : parrello 1.1
407 :     =over 4
408 :    
409 :     =item id
410 :    
411 :     ID for the sequence, to be placed at the beginning of the identifier
412 :     line.
413 :    
414 :     =item comment (optional)
415 :    
416 :     Comment text to place after the ID on the identifier line. If this parameter
417 :     is empty, undefined, or 0, no comment will be placed.
418 :    
419 :     =item sequence
420 :    
421 :     Sequence of letters to form into FASTA. For purposes of convenience, whitespace
422 :     characters in the sequence will be removed automatically.
423 :    
424 : parrello 1.7 =item stripped (optional)
425 :    
426 :     If TRUE, then the sequence will be returned unmodified instead of converted
427 :     to FASTA format. The default is FALSE.
428 :    
429 : parrello 1.1 =item RETURN
430 :    
431 :     Returns the desired sequence in FASTA format.
432 :    
433 :     =back
434 :    
435 :     =cut
436 :    
437 :     sub create_fasta_record {
438 :     # Get the parameters.
439 : parrello 1.7 my ($id, $comment, $sequence, $stripped) = @_;
440 :     # Declare the return variable.
441 :     my $retVal;
442 :     # If we're in stripped mode, we just return the sequence.
443 :     if ($stripped) {
444 :     $retVal = $sequence;
445 :     } else {
446 :     # Here we have to do the FASTA conversion. Start with the ID.
447 :     my $header = ">$id";
448 :     # Add a comment, if any.
449 :     if ($comment) {
450 :     $header .= " $comment";
451 :     }
452 :     # Clean up the sequence.
453 :     $sequence =~ s/\s+//g;
454 :     # We need to format the sequence into 60-byte chunks. We use the infamous
455 :     # grep-split trick. The split, because of the presence of the parentheses,
456 :     # includes the matched delimiters in the output list. The grep strips out
457 :     # the empty list items that appear between the so-called delimiters, since
458 :     # the delimiters are what we want.
459 :     my @chunks = grep { $_ } split /(.{1,60})/, $sequence;
460 :     # Add the chunks and the trailer.
461 : parrello 1.10 $retVal = join("\n", $header, @chunks) . "\n";
462 : parrello 1.1 }
463 :     # Return the result.
464 :     return $retVal;
465 :     }
466 :    
467 : parrello 1.2 =head3 rev_comp
468 :    
469 :     my $revcmp = rev_comp($dna);
470 :    
471 :     or
472 :    
473 :     rev_comp(\$dna);
474 :    
475 :     Return the reverse complement of a DNA string.
476 :    
477 :     =over 4
478 :    
479 :     =item dna
480 :    
481 :     Either a DNA string, or a reference to a DNA string.
482 :    
483 :     =item RETURN
484 :    
485 :     If the input is a DNA string, returns the reverse complement. If the
486 :     input is a reference to a DNA string, the string itself is reverse
487 :     complemented.
488 :    
489 :     =back
490 :    
491 :     =cut
492 :    
493 :     sub rev_comp {
494 :     # Get the parameters.
495 :     my ($dna) = @_;
496 :     # Determine how we were called.
497 :     my ($retVal, $refMode);
498 :     if (ref $dna eq 'SCALAR') {
499 :     $retVal = lc reverse $dna;
500 :     $refMode = 0;
501 :     } else {
502 :     $retVal = lc reverse $$dna;
503 :     $refMode = 1;
504 :     }
505 :     # Now $retVal contains the reversed DNA string in all lower case, and
506 :     # $refMode is TRUE iff the user passed in a reference. The following
507 :     # translation step complements the string.
508 :     $retVal =~ tr/acgtumrwsykbdhv/tgcaakywsrmvhdb/;
509 :     # Return the result in the method corresponding to the way it came in.
510 :     if ($refMode) {
511 :     $$dna = $retVal;
512 :     return;
513 :     } else {
514 :     return $retVal;
515 :     }
516 :     }
517 :    
518 : parrello 1.15
519 :     =head3 by_fig_id
520 :    
521 :     my @sorted_by_fig_id = sort { FIG::by_fig_id($a,$b) } @fig_ids;
522 :    
523 :     Compare two feature IDs.
524 :    
525 :     This function is designed to assist in sorting features by ID. The sort is by
526 :     genome ID followed by feature type and then feature number.
527 :    
528 :     =over 4
529 :    
530 :     =item a
531 :    
532 :     First feature ID.
533 :    
534 :     =item b
535 :    
536 :     Second feature ID.
537 :    
538 :     =item RETURN
539 :    
540 :     Returns a negative number if the first parameter is smaller, zero if both parameters
541 :     are equal, and a positive number if the first parameter is greater.
542 :    
543 :     =back
544 :    
545 :     =cut
546 :    
547 :     sub by_fig_id {
548 :     my($a,$b) = @_;
549 :     my($g1,$g2,$t1,$t2,$n1,$n2);
550 :     if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
551 :     ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {
552 :     ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
553 :     } else {
554 :     $a cmp $b;
555 :     }
556 :     }
557 :    
558 :    
559 : parrello 1.3 =head3 genome_of
560 :    
561 :     my $genomeID = genome_of($fid);
562 :    
563 :     Return the Genome ID embedded in the specified FIG feature ID.
564 :    
565 :     =over 4
566 :    
567 :     =item fid
568 :    
569 :     Feature ID of interest.
570 :    
571 :     =item RETURN
572 :    
573 :     Returns the genome ID in the middle portion of the FIG feature ID. If the
574 :     feature ID is invalid, this method returns an undefined value.
575 :    
576 :     =back
577 :    
578 :     =cut
579 :    
580 :     sub genome_of {
581 :     # Get the parameters.
582 :     my ($fid) = @_;
583 :     # Declare the return variable.
584 :     my $retVal;
585 :     # Parse the feature ID.
586 :     if ($fid =~ /^fig\|(\d+\.\d+)\./) {
587 :     $retVal = $1;
588 :     }
589 :     # Return the result.
590 :     return $retVal;
591 :     }
592 :    
593 : parrello 1.6 =head3 sims
594 :    
595 :     my $sims = sims($id, \%seen, $maxN, $maxP, $select, $max_expand, $filters);
596 :    
597 :     Retrieve similarities from the network similarity server. The similarity retrieval
598 :     is performed using an HTTP user agent that returns similarity data in multiple
599 :     chunks. An anonymous subroutine is passed to the user agent that parses and
600 :     reformats the chunks as they come in. The similarites themselves are returned
601 :     as B<Sim> objects. Sim objects are actually list references with 15 elements.
602 :     The Sim object methods allow access to the elements by name.
603 :    
604 :     Similarities can be either raw or expanded. The raw similarities are basic
605 :     hits between features with similar DNA. Expanding a raw similarity drags in any
606 :     features considered substantially identical. So, for example, if features B<A1>,
607 :     B<A2>, and B<A3> are all substatially identical to B<A>, then a raw similarity
608 :     B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>.
609 :    
610 :     =over 4
611 :    
612 :     =item id
613 :    
614 :     ID of the feature whose similarities are desired, or reference to a list
615 :     of the IDs of the features whose similarities are desired.
616 :    
617 :     =item maxN
618 :    
619 :     Maximum number of similarities to return.
620 :    
621 :     =item maxP
622 :    
623 :     The maximum allowable similarity score.
624 :    
625 :     =item select
626 :    
627 :     Selection criterion: C<raw> means only raw similarities are returned; C<fig>
628 :     means only similarities to FIG features are returned; C<all> means all expanded
629 :     similarities are returned; and C<figx> means similarities are expanded until the
630 :     number of FIG features equals the maximum.
631 :    
632 :     =item max_expand
633 :    
634 :     The maximum number of features to expand.
635 :    
636 :     =item filters
637 :    
638 :     Reference to a hash containing filter information, or a subroutine that can be
639 :     used to filter the sims.
640 :    
641 :     =item RETURN
642 :    
643 :     Returns a list of L<Sim> objects.
644 :    
645 :     =back
646 :    
647 :     =cut
648 :    
649 :     sub sims {
650 :     # Get the parameters.
651 :     my($id, $maxN, $maxP, $select, $max_expand, $filters) = @_;
652 :     # Get the URL for submitting to the sims server.
653 :     my $url = $FIG_Config::sim_server_url || "http://bioseed.mcs.anl.gov/simserver/perl/sims.pl";
654 :     # Get a list of the IDs to process.
655 :     my @ids;
656 :     if (ref($id) eq "ARRAY") {
657 :     @ids = @$id;
658 :     } else {
659 :     @ids = ($id);
660 :     }
661 :     # Form a list of the parameters to pass to the server.
662 :     my %args = ();
663 :     $args{id} = \@ids;
664 :     $args{maxN} = $maxN if defined($maxN);
665 :     $args{maxP} = $maxP if defined($maxP);
666 :     $args{select} = $select if defined($select);
667 :     $args{max_expand} = $max_expand if defined($max_expand);
668 :     # If the filter is a hash, put the filters in the argument list.
669 :     if (ref($filters) eq 'HASH') {
670 :     for my $k (keys(%$filters))
671 :     {
672 :     $args{"filter_$k"}= $filters->{$k};
673 :     }
674 :     }
675 :     # Get the user agent.
676 :     require LWP::UserAgent;
677 :     my $ua = LWP::UserAgent->new();
678 :     # Insure we have the Sim module.
679 :     require Sim;
680 :     #
681 :     # Our next task is to create the anonymous subroutine that will process the
682 :     # chunks that come back from the server. We require three global variables:
683 :     # @sims to hold the similarities found, $tail to remember the unprocessed
684 :     # data from the previous chunk, and $chunks to count the chunks.
685 :     #
686 :     my @retVal;
687 :     my $tail;
688 :     my $chunks = 0;
689 :     #
690 :     # ANONYMOUS SUBROUTINE
691 :     #
692 :     my $cb = sub {
693 :     eval {
694 :     # Get the parameters.
695 :     my ($data, $command) = @_;
696 :     # Check for a reset command. If we get one, we discard any data
697 :     # in progress.
698 :     if ($command && $command eq 'reset') {
699 :     $tail = '';
700 :     } else {
701 :     $chunks++;
702 :     # Get the data to process. Note we concatenate it to the incoming
703 :     # tail from last time.
704 :     my $c = $tail . $data;
705 :     # Make sure the caller hasn't messed up the new-line character.
706 :     # FASTA readers in particular are notorious for doing things
707 :     # like that.
708 :     local $/ = "\n";
709 :     # Split the input into lines.
710 :     my @lines = split(/\n/, $c);
711 :     # If the input does not end with a new-line, we have a partial
712 :     # chunk and need to put it in the tail for next time. If not,
713 :     # there is no tail for next time.
714 :     if (substr($c, -1, 1) ne "\n") {
715 :     $tail = pop @lines;
716 :     } else {
717 :     $tail = '';
718 :     }
719 :     # Loop through the lines. Note there's no need to chomp because
720 :     # the SPLIT took out the new-line characters.
721 :     for my $l (@lines) {
722 :     # Split the line into fields.
723 :     my @s = split(/\t/, $l);
724 :     # Insure we have all the fields we need.
725 : parrello 1.12 if (@s >= 9) {
726 : parrello 1.6 # Check to see if we've seen this SIM before.
727 :     my $id1 = $s[0];
728 :     my $id2 = $s[1];
729 :     # Add it to the result list.
730 :     push(@retVal, bless \@s, 'Sim');
731 :     }
732 :     }
733 :     }
734 :     };
735 :     };
736 :     #
737 :     # END OF ANONYMOUS SUBROUTINE
738 :     #
739 :     # Now we're ready to start. Because networking is an iffy thing, we set up
740 :     # to try our request multiple times.
741 :     my $n_retries = 10;
742 :     my $attempts = 0;
743 :     # Set the timeout value, in seconds.
744 :     $ua->timeout(180);
745 :     # Loop until we succeed or run out of retries.
746 :     my $done = 0;
747 :     while (! $done && $attempts++ < $n_retries) {
748 :     # Reset the content processor. This clears the tail.
749 :     &$cb(undef, 'reset');
750 :     my $resp = $ua->post($url, \%args, ':content_cb' => $cb);
751 :     if ($resp->is_success) {
752 :     # If the response was successful, get the content. This triggers
753 :     # the anonymous subroutine.
754 :     my $x = $resp->content;
755 :     # Denote we've been successful.
756 :     $done = 1;
757 :     }
758 :     }
759 :     return @retVal;
760 :     }
761 :    
762 : parrello 1.8 =head3 verify_dir
763 :    
764 :     verify_dir($dirName);
765 :    
766 :     Insure that the specified directory exists. If the directory does not
767 :     exist, it will be created.
768 :    
769 :     =over 4
770 :    
771 :     =item dirName
772 :    
773 :     Name of the relevant directory.
774 :    
775 :     =back
776 :    
777 :     =cut
778 :    
779 :     sub verify_dir {
780 :     # Get the parameters.
781 :     my ($dirName) = @_;
782 :     # Strip off the final slash, if any.
783 :     $dirName =~ s#/$##;
784 :     # Only proceed if the directory does NOT already exist.
785 :     if (! -d $dirName) {
786 :     # If there is a parent directory, recursively insure it is there.
787 :     if ($dirName =~ m#(.+)/[^/]+$#) {
788 :     verify_dir($1);
789 :     }
790 :     # Create this particular directory with full permissions.
791 :     mkdir $dirName, 0777;
792 :     }
793 :     }
794 :    
795 : parrello 1.9 =head3 parse_fasta_record
796 :    
797 :     my ($id, $comment, $seq) = parse_fasta_record($string);
798 :    
799 :     Extract the ID, comment, and sequence from a single FASTA record. For
800 :     backward compatability, instead of a FASTA record the ID and sequence can
801 :     be specified separated by a comma. In this case, the returned comment
802 :     will be empty.
803 :    
804 :     =over 4
805 :    
806 :     =item string
807 :    
808 :     A single FASTA record, or an ID and sequence separated by a single comma,
809 :     an unadorned sequence, a 2-element list consisting of an ID and a sequence,
810 :     or a 3-element list consisting of an ID, a comment, and a sequence.
811 :    
812 :     =item RETURN
813 :    
814 :     Returns a three-element list consisting of the incoming ID, the associated
815 :     comment, and the specified DNA or protein sequence. If the incoming string is
816 :     invalid, all three list elements will come back undefined. If no ID is
817 :     specified, an MD5 will be provided.
818 :    
819 :     =back
820 :    
821 :     =cut
822 :    
823 :     sub parse_fasta_record {
824 :     # Get the parameters.
825 :     my ($string) = @_;
826 :     # Declare the return variables.
827 :     my ($id, $comment, $seq);
828 :     # Check the type of input string.
829 :     if (! defined $string) {
830 :     # Do nothing if no string was passed in. This extra check prevents a
831 :     # warning at runtime.
832 :     } elsif ($string =~ /^>(\S+)([\t ]+[^\r\n]*)?[\r\n]+(.+)/s) {
833 :     # Here we have a standard FASTA string.
834 :     ($id, $comment, $seq) = ($1, $2, $3);
835 :     # Remove white space from the sequence string.
836 :     $seq =~ s/\s+//sg;
837 :     # Trim front of comment.
838 :     $comment =~ s/^s+//;
839 :     } elsif ($string =~ /(.+?)\s*,\s*(.+)/) {
840 :     ($id, $comment, $seq) = ($1, '', $2);
841 :     } elsif (ref $string eq 'ARRAY') {
842 :     # Here the data came in pre-formatted as a list reference.
843 :     ($id, $comment, $seq) = @$string;
844 :     # If there's no comment, we need to adjust.
845 :     if (! defined $seq) {
846 :     $seq = $comment;
847 :     $comment = '';
848 :     }
849 :     } else {
850 :     # Here we have only a sequence. We need to construct the ID.
851 :     $seq = $string;
852 : parrello 1.12 require Digest::MD5;
853 :     $id = "md5|" . Digest::MD5::md5_base64($seq);
854 : parrello 1.9 $comment = "";
855 :     }
856 :     # Return the results.
857 :     return ($id, $comment, $seq);
858 :     }
859 :    
860 :    
861 : parrello 1.11 =head3 hypo
862 :    
863 :     my $flag = hypo($func);
864 :    
865 :     Return TRUE if the specified functional role is hypothetical, else FALSE.
866 :     Hypothetical functional roles are identified by key words in the text,
867 :     such as I<hypothesis>, I<predicted>, or I<glimmer> (among others).
868 :    
869 :     =over 4
870 :    
871 :     =item func
872 :    
873 :     Text of the functional role whose nature is to be determined.
874 :    
875 :     =item RETURN
876 :    
877 :     Returns TRUE if the role is hypothetical, else FALSE.
878 :    
879 :     =back
880 :    
881 :     =cut
882 :    
883 :     sub hypo {
884 :     my ($func) = @_;
885 :     if (! $func) { return 1 }
886 :     if ($func =~ /lmo\d+ protein/i) { return 1 }
887 :     if ($func =~ /hypoth/i) { return 1 }
888 :     if ($func =~ /conserved protein/i) { return 1 }
889 :     if ($func =~ /gene product/i) { return 1 }
890 :     if ($func =~ /interpro/i) { return 1 }
891 :     if ($func =~ /B[sl][lr]\d/i) { return 1 }
892 :     if ($func =~ /^U\d/) { return 1 }
893 :     if ($func =~ /^orf[^_]/i) { return 1 }
894 :     if ($func =~ /uncharacterized/i) { return 1 }
895 :     if ($func =~ /pseudogene/i) { return 1 }
896 :     if ($func =~ /^predicted/i) { return 1 }
897 :     if ($func =~ /AGR_/) { return 1 }
898 :     if ($func =~ /similar to/i) { return 1 }
899 :     if ($func =~ /similarity/i) { return 1 }
900 :     if ($func =~ /glimmer/i) { return 1 }
901 :     if ($func =~ /unknown/i) { return 1 }
902 :     if (($func =~ /domain/i) ||
903 :     ($func =~ /^y[a-z]{2,4}\b/i) ||
904 :     ($func =~ /complete/i) ||
905 :     ($func =~ /ensang/i) ||
906 :     ($func =~ /unnamed/i) ||
907 :     ($func =~ /EG:/) ||
908 :     ($func =~ /orf\d+/i) ||
909 :     ($func =~ /RIKEN/) ||
910 :     ($func =~ /Expressed/i) ||
911 :     ($func =~ /[a-zA-Z]{2,3}\|/) ||
912 :     ($func =~ /predicted by Psort/) ||
913 :     ($func =~ /^bh\d+/i) ||
914 :     ($func =~ /cds_/i) ||
915 :     ($func =~ /^[a-z]{2,3}\d+[^:\+\-0-9]/i) ||
916 :     ($func =~ /similar to/i) ||
917 :     ($func =~ / identi/i) ||
918 :     ($func =~ /ortholog of/i) ||
919 :     ($func =~ /structural feature/i)) { return 1 }
920 :     return 0;
921 :    
922 :     }
923 : parrello 1.9
924 : overbeek 1.14 =head3 between
925 :    
926 : parrello 1.15 my $flag = between($x, $y, $z);
927 : overbeek 1.14
928 :     Determine whether or not $y is between $x and $z.
929 :    
930 :     =over 4
931 :    
932 :     =item x
933 :    
934 :     First edge number.
935 :    
936 :     =item y
937 :    
938 :     Number to examine.
939 :    
940 :     =item z
941 :    
942 :     Second edge number.
943 :    
944 :     =item RETURN
945 :    
946 :     Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check
947 :     is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns
948 :     TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than
949 :     I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower,
950 :     then the return is TRUE if I<$x> >= I$<$y> >= I<$z>.
951 :    
952 :     =back
953 :    
954 :     =cut
955 :     #: Return Type $;
956 :     sub between {
957 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
958 :     my($x,$y,$z) = @_;
959 :    
960 :     if ($x < $z) {
961 :     return (($x <= $y) && ($y <= $z));
962 :     } else {
963 :     return (($x >= $y) && ($y >= $z));
964 :     }
965 :     }
966 : parrello 1.1
967 : parrello 1.15 =head3 standard_genetic_code
968 :    
969 :     my $code = standard_genetic_code();
970 :    
971 :     Return a hash containing the standard translation of nucleotide triples to proteins.
972 :     Methods such as L</translate> can take a translation scheme as a parameter. This method
973 :     returns the default translation scheme. The scheme is implemented as a reference to a
974 :     hash that contains nucleotide triplets as keys and has protein letters as values.
975 :    
976 :     =cut
977 :    
978 :     sub standard_genetic_code {
979 :    
980 :     my $code = {};
981 :    
982 :     $code->{"AAA"} = "K";
983 :     $code->{"AAC"} = "N";
984 :     $code->{"AAG"} = "K";
985 :     $code->{"AAT"} = "N";
986 :     $code->{"ACA"} = "T";
987 :     $code->{"ACC"} = "T";
988 :     $code->{"ACG"} = "T";
989 :     $code->{"ACT"} = "T";
990 :     $code->{"AGA"} = "R";
991 :     $code->{"AGC"} = "S";
992 :     $code->{"AGG"} = "R";
993 :     $code->{"AGT"} = "S";
994 :     $code->{"ATA"} = "I";
995 :     $code->{"ATC"} = "I";
996 :     $code->{"ATG"} = "M";
997 :     $code->{"ATT"} = "I";
998 :     $code->{"CAA"} = "Q";
999 :     $code->{"CAC"} = "H";
1000 :     $code->{"CAG"} = "Q";
1001 :     $code->{"CAT"} = "H";
1002 :     $code->{"CCA"} = "P";
1003 :     $code->{"CCC"} = "P";
1004 :     $code->{"CCG"} = "P";
1005 :     $code->{"CCT"} = "P";
1006 :     $code->{"CGA"} = "R";
1007 :     $code->{"CGC"} = "R";
1008 :     $code->{"CGG"} = "R";
1009 :     $code->{"CGT"} = "R";
1010 :     $code->{"CTA"} = "L";
1011 :     $code->{"CTC"} = "L";
1012 :     $code->{"CTG"} = "L";
1013 :     $code->{"CTT"} = "L";
1014 :     $code->{"GAA"} = "E";
1015 :     $code->{"GAC"} = "D";
1016 :     $code->{"GAG"} = "E";
1017 :     $code->{"GAT"} = "D";
1018 :     $code->{"GCA"} = "A";
1019 :     $code->{"GCC"} = "A";
1020 :     $code->{"GCG"} = "A";
1021 :     $code->{"GCT"} = "A";
1022 :     $code->{"GGA"} = "G";
1023 :     $code->{"GGC"} = "G";
1024 :     $code->{"GGG"} = "G";
1025 :     $code->{"GGT"} = "G";
1026 :     $code->{"GTA"} = "V";
1027 :     $code->{"GTC"} = "V";
1028 :     $code->{"GTG"} = "V";
1029 :     $code->{"GTT"} = "V";
1030 :     $code->{"TAA"} = "*";
1031 :     $code->{"TAC"} = "Y";
1032 :     $code->{"TAG"} = "*";
1033 :     $code->{"TAT"} = "Y";
1034 :     $code->{"TCA"} = "S";
1035 :     $code->{"TCC"} = "S";
1036 :     $code->{"TCG"} = "S";
1037 :     $code->{"TCT"} = "S";
1038 :     $code->{"TGA"} = "*";
1039 :     $code->{"TGC"} = "C";
1040 :     $code->{"TGG"} = "W";
1041 :     $code->{"TGT"} = "C";
1042 :     $code->{"TTA"} = "L";
1043 :     $code->{"TTC"} = "F";
1044 :     $code->{"TTG"} = "L";
1045 :     $code->{"TTT"} = "F";
1046 :    
1047 :     return $code;
1048 :     }
1049 :    
1050 :     =head3 translate
1051 :    
1052 :     my $aa_seq = translate($dna_seq, $code, $fix_start);
1053 :    
1054 :     Translate a DNA sequence to a protein sequence using the specified genetic code.
1055 :     If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to
1056 :     C<M>. (In the standard genetic code, these two combinations normally translate
1057 :     to C<V> and C<L>, respectively.)
1058 :    
1059 :     =over 4
1060 :    
1061 :     =item dna_seq
1062 :    
1063 :     DNA sequence to translate. Note that the DNA sequence can only contain
1064 :     known nucleotides.
1065 :    
1066 :     =item code
1067 :    
1068 :     Reference to a hash specifying the translation code. The hash is keyed by
1069 :     nucleotide triples, and the value for each key is the corresponding protein
1070 :     letter. If this parameter is omitted, the L</standard_genetic_code> will be
1071 :     used.
1072 :    
1073 :     =item fix_start
1074 :    
1075 :     TRUE if the first triple is to get special treatment, else FALSE. If TRUE,
1076 :     then a value of C<TTG> or C<GTG> in the first position will be translated to
1077 :     C<M> instead of the value specified in the translation code.
1078 :    
1079 :     =item RETURN
1080 :    
1081 :     Returns a string resulting from translating each nucleotide triple into a
1082 :     protein letter.
1083 :    
1084 :     =back
1085 :    
1086 :     =cut
1087 :     #: Return Type $;
1088 :     sub translate {
1089 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
1090 :    
1091 :     my( $dna,$code,$start ) = @_;
1092 :     my( $i,$j,$ln );
1093 :     my( $x,$y );
1094 :     my( $prot );
1095 :    
1096 :     if (! defined($code)) {
1097 :     $code = &FIG::standard_genetic_code;
1098 :     }
1099 :     $ln = length($dna);
1100 :     $prot = "X" x ($ln/3);
1101 :     $dna =~ tr/a-z/A-Z/;
1102 :    
1103 :     for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) {
1104 :     $x = substr($dna,$i,3);
1105 :     if ($y = $code->{$x}) {
1106 :     substr($prot,$j,1) = $y;
1107 :     }
1108 :     }
1109 :    
1110 :     if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) {
1111 :     substr($prot,0,1) = 'M';
1112 :     }
1113 :     return $prot;
1114 :     }
1115 :    
1116 : parrello 1.18 =head3 strip_ec
1117 :    
1118 :     my $role = strip_ec($rawRole);
1119 :    
1120 :     Strip the EC number (if any) from the specified role or functional
1121 :     assignment.
1122 :    
1123 :     =over 4
1124 :    
1125 :     =item rawRole
1126 :    
1127 :     Role or functional assignment from which the EC numbers are to be stripped.
1128 :    
1129 :     =item RETURN
1130 :    
1131 :     Returns the incoming string with any EC numbers removed. The EC numbers must
1132 :     be formatted in the standard format used by the SEED (with the C<EC> prefix
1133 :     and surrounding parentheses).
1134 :    
1135 :     =back
1136 :    
1137 :     =cut
1138 :    
1139 :     sub strip_ec {
1140 :     # Get the parameters.
1141 :     my ($rawRole) = @_;
1142 :     # Declare the return variable.
1143 :     my $retVal = $rawRole;
1144 :     # Remove the EC numbers.
1145 :     $retVal =~ s/\s*\(EC\s+[0-9.\-]+\)//g;
1146 :     # Return the result.
1147 :     return $retVal;
1148 :     }
1149 :    
1150 : parrello 1.15
1151 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3