[Bio] / FigKernelPackages / SeedUtils.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/SeedUtils.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.34 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 : parrello 1.11 # This is a SAS component.
4 :    
5 : parrello 1.1 #
6 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
7 :     # for Interpretations of Genomes. All Rights Reserved.
8 :     #
9 :     # This file is part of the SEED Toolkit.
10 :     #
11 :     # The SEED Toolkit is free software. You can redistribute
12 :     # it and/or modify it under the terms of the SEED Toolkit
13 :     # Public License.
14 :     #
15 :     # You should have received a copy of the SEED Toolkit Public License
16 :     # along with this program; if not write to the University of Chicago
17 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
18 :     # Genomes at veronika@thefig.info or download a copy from
19 :     # http://www.theseed.org/LICENSE.TXT.
20 :     #
21 :    
22 :     package SeedUtils;
23 :    
24 : olson 1.32 #
25 :     # In case we are running in a SEED, pull in the FIG_Config
26 :     #
27 :     BEGIN
28 :     {
29 :     eval {
30 :     require FIG_Config;
31 :     };
32 :     }
33 :    
34 : parrello 1.1 use strict;
35 : parrello 1.12 no warnings qw(once);
36 : parrello 1.1 use base qw(Exporter);
37 : parrello 1.15 our @EXPORT = qw(hypo boundaries_of parse_fasta_record create_fasta_record
38 :     rev_comp genome_of min max sims verify_dir between translate
39 : parrello 1.21 standard_genetic_code parse_location roles_of_function
40 : olson 1.33 strip_ec location_string location_cmp strand_of by_fig_id
41 : parrello 1.34 verify_db bbh_data);
42 : parrello 1.1
43 :     =head1 SEED Utility Methods
44 :    
45 :     =head2 Introduction
46 :    
47 :     This is a simple utility package that performs functions useful for
48 :     bioinformatics, but that do not require access to the databases.
49 :    
50 :     =head2 Public Methods
51 :    
52 : parrello 1.21 =head3 abbrev
53 :    
54 :     my $abbrev = SeedUtils::abbrev($genome_name);
55 :    
56 :     Return an abbreviation of the specified genome name. This method is used to create
57 :     a reasonably indicative genome name that fits in 10 characters.
58 :    
59 :     =over 4
60 :    
61 :     =item genome_name
62 :    
63 :     Genome name to abbreviate.
64 :    
65 :     =item RETURN
66 :    
67 :     Returns a shortened version of the genome name that is 10 characters or less in
68 :     length.
69 :    
70 :     =back
71 :    
72 :     =cut
73 :    
74 :     sub abbrev {
75 :     my($genome_name) = @_;
76 :    
77 :     $genome_name =~ s/^(\S{3})\S+/$1./;
78 :     $genome_name =~ s/^(\S+)\s+(\S{3})\S+/$1$2./;
79 :     $genome_name =~ s/ //g;
80 :     if (length($genome_name) > 10) {
81 :     $genome_name = substr($genome_name,0,10);
82 :     }
83 :     return $genome_name;
84 :     }
85 :    
86 : parrello 1.34 =head3 bbh_data
87 :    
88 :     my $bbhList = FIGRules::bbh_data($peg, $cutoff);
89 :    
90 :     Return a list of the bi-directional best hits relevant to the specified PEG.
91 :    
92 :     =over 4
93 :    
94 :     =item peg
95 :    
96 :     ID of the feature whose bidirectional best hits are desired.
97 :    
98 :     =item cutoff
99 :    
100 :     Similarity cutoff. If omitted, 1e-10 is used.
101 :    
102 :     =item RETURN
103 :    
104 :     Returns a reference to a list of 3-tuples. The first element of the list is the best-hit
105 :     PEG; the second element is the score. A lower score indicates a better match. The third
106 :     element is the normalized bit score for the pair; it is normalized to the length
107 :     of the protein.
108 :    
109 :     =back
110 :    
111 :     =cut
112 :     #: Return Type @@;
113 :     sub bbh_data {
114 :     my ($peg, $cutoff) = @_;
115 :     my @retVal = ();
116 :     require LWP::UserAgent;
117 :     my $ua = LWP::UserAgent->new();
118 :     my $url = "http://bioseed.mcs.anl.gov/simserver/perl/bbhs.pl";
119 :     my $retries = 5;
120 :     my $done = 0;
121 :     my $resp;
122 :     while ($retries > 0 && ! $done) {
123 :     $resp = $ua->post($url, { id => $peg, cutoff => $cutoff });
124 :     if ($resp->is_success) {
125 :     my $dat = $resp->content;
126 :     while ($dat =~ /([^\n]+)\n/g) {
127 :     my @l = split(/\t/, $1);
128 :     push @retVal, \@l;
129 :     }
130 :     $done = 1;
131 :     } else {
132 :     $retries--;
133 :     }
134 :     }
135 :     if (! $done) {
136 :     die("Failure retrieving bbh data for $peg: " . $resp->status_line);
137 :     }
138 :     return \@retVal;
139 :     }
140 :    
141 :    
142 : parrello 1.21 =head3 between
143 :    
144 :     my $flag = between($x, $y, $z);
145 :    
146 :     Determine whether or not $y is between $x and $z.
147 :    
148 :     =over 4
149 :    
150 :     =item x
151 :    
152 :     First edge number.
153 :    
154 :     =item y
155 :    
156 :     Number to examine.
157 :    
158 :     =item z
159 :    
160 :     Second edge number.
161 :    
162 :     =item RETURN
163 :    
164 :     Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check
165 :     is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns
166 :     TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than
167 :     I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower,
168 :     then the return is TRUE if I<$x> >= I$<$y> >= I<$z>.
169 :    
170 :     =back
171 :    
172 :     =cut
173 :     #: Return Type $;
174 :     sub between {
175 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
176 :     my($x,$y,$z) = @_;
177 :    
178 :     if ($x < $z) {
179 :     return (($x <= $y) && ($y <= $z));
180 :     } else {
181 :     return (($x >= $y) && ($y >= $z));
182 :     }
183 :     }
184 :    
185 :    
186 : parrello 1.13 =head3 boundaries_of
187 :    
188 : parrello 1.21 my ($contig, $min, $max, $dir) = boundaries_of($locs);
189 : parrello 1.13
190 :     Return the boundaries of a set of locations. The contig, the leftmost
191 :     location, and the rightmost location will be returned to the caller. If
192 :     more than one contig is represented, the method will return an undefined
193 :     value for the contig (indicating failure).
194 :    
195 :     =over 4
196 :    
197 :     =item locs
198 :    
199 :     Reference to a list of location strings. A location string contains a contig ID,
200 :     and underscore (C<_>), a starting offset, a strand identifier (C<+> or C<->), and
201 :     a length (e.g. C<360108.3:NC_10023P_1000+2000> begins at offset 1000 of contig
202 :     B<360108.3:NC_10023P> and covers 2000 base pairs on the C<+> strand).
203 :    
204 :     =item RETURN
205 :    
206 : parrello 1.21 Returns a 4-element list. The first element is the contig ID from all the locations,
207 :     the second is the offset of leftmost base pair represented in the locations, the
208 :     third is the offset of the rightmost base pair represented in the locations, and
209 :     the fourth is the dominant strand.
210 : parrello 1.13
211 :     =back
212 :    
213 :     =cut
214 :    
215 :     sub boundaries_of {
216 :     # Get the parameters.
217 :     my ($locs) = @_;
218 :     # Declare the return variables.
219 :     my ($contig, $min, $max);
220 :     # We'll put all the starting and ending offsets found in here.
221 :     my @offsets;
222 : parrello 1.21 # This will be used to count the orientations.
223 :     my %dirs = ('+' => 0, '-' => 0);
224 : parrello 1.13 # This will count the number of errors found.
225 :     my $error = 0;
226 :     # Loop through the locations.
227 :     for my $loc (@$locs) {
228 :     # Parse this location.
229 :     if ($loc =~ /^(.+)_(\d+)(\+|\-)(\d+)$/) {
230 : parrello 1.23 # This is a valid location string.
231 : parrello 1.13 my ($newContig, $begin, $dir, $len) = ($1, $2, $3, $4);
232 :     # Is this contig valid?
233 :     if ($contig && $newContig ne $contig) {
234 :     # No, skip this location.
235 :     $error++;
236 :     } else {
237 :     # Save the contig.
238 :     $contig = $newContig;
239 : parrello 1.21 # Count the orientation.
240 :     $dirs{$dir}++;
241 : parrello 1.13 # Compute the ending offset.
242 :     my $end = ($dir eq '+' ? $begin + $len - 1 : $begin - $len + 1);
243 :     # Save both offsets.
244 :     push @offsets, $begin, $end;
245 :     }
246 : parrello 1.23 } elsif ($loc =~ /^(.+)_(\d+)_(\d+)/) {
247 :     # Here we have an old-style location string.
248 :     my($newContig, $start, $stop) = ($1, $2, $3);
249 :     # Is this contig valid?
250 : olson 1.22 if ($contig && $newContig ne $contig) {
251 :     # No, skip this location.
252 :     $error++;
253 :     } else {
254 :     # Save the contig.
255 :     $contig = $newContig;
256 : parrello 1.23 # Compute the orientation.
257 :     my $dir;
258 :     if ($start > $stop) {
259 :     $dir = '-';
260 :     } else {
261 :     $dir = '+';
262 :     }
263 :     # Count it.
264 : olson 1.22 $dirs{$dir}++;
265 : parrello 1.23 # Save both offsets.
266 : olson 1.22 push @offsets, $start, $stop;
267 :     }
268 : parrello 1.13 } else {
269 :     # The location is invalid, so it's an error,
270 :     $error++;
271 :     }
272 :     }
273 :     # If there's an error, clear the contig ID.
274 :     if ($error) {
275 :     $contig = undef;
276 :     }
277 :     # Compute the min and max from the offsets collected.
278 :     $min = min(@offsets);
279 :     $max = max(@offsets);
280 : parrello 1.21 # Save the dominant orientation.
281 :     my $dir = ($dirs{'-'} > $dirs{'+'} ? '-' : '+');
282 : parrello 1.13 # Return the results.
283 : parrello 1.21 return ($contig, $min, $max, $dir);
284 : parrello 1.13 }
285 :    
286 : parrello 1.21 =head3 boundary_loc
287 : parrello 1.17
288 : parrello 1.21 my $singleLoc = SeedUtils::boundary_loc($locations);
289 : parrello 1.17
290 : parrello 1.21 Return a single location string (see L<SAP/Location Strings>) that covers
291 :     the incoming list of locations. NOTE that if the locations listed span
292 :     more than one contig, this method may return an unexpected result.
293 : parrello 1.17
294 : parrello 1.21 This method is useful for converting the output of L<SAP/fid_locations> to
295 :     location strings.
296 : parrello 1.17
297 :     =over 4
298 :    
299 : parrello 1.21 =item locations
300 : parrello 1.17
301 : parrello 1.21 A set of location strings formatted as a comma-separated list or as a reference
302 :     to a list of location strings.
303 : parrello 1.17
304 :     =item RETURN
305 :    
306 : parrello 1.21 Returns a single location string that covers as best as possible the list of
307 :     incoming locations.
308 : parrello 1.17
309 :     =back
310 :    
311 :     =cut
312 :    
313 : parrello 1.21 sub boundary_loc {
314 : parrello 1.17 # Get the parameters.
315 : parrello 1.21 my ($locations) = @_;
316 :     # Convert the incoming locations to a list.
317 :     my @locs;
318 :     if (ref $locations eq 'ARRAY') {
319 :     @locs = @$locations;
320 :     } else {
321 :     @locs = split /\s*,\s*/, $locations;
322 :     }
323 :     # Get the boundary information for the listed locations.
324 :     my ($contig, $min, $max, $dir) = boundaries_of(\@locs);
325 :     # Compute the indicated location string.
326 :     my $retVal = $contig . "_" . ($dir eq '+' ? $min : $max) . $dir .
327 :     ($max + 1 - $min);
328 : parrello 1.17 # Return the result.
329 : parrello 1.21 return $retVal;
330 : parrello 1.17 }
331 :    
332 : parrello 1.21 =head3 by_fig_id
333 : parrello 1.17
334 : parrello 1.21 my @sorted_by_fig_id = sort { by_fig_id($a,$b) } @fig_ids;
335 : parrello 1.16
336 : parrello 1.21 Compare two feature IDs.
337 : parrello 1.16
338 : parrello 1.21 This function is designed to assist in sorting features by ID. The sort is by
339 :     genome ID followed by feature type and then feature number.
340 : parrello 1.16
341 :     =over 4
342 :    
343 : parrello 1.21 =item a
344 :    
345 :     First feature ID.
346 :    
347 :     =item b
348 : parrello 1.16
349 : parrello 1.21 Second feature ID.
350 : parrello 1.16
351 :     =item RETURN
352 :    
353 : parrello 1.21 Returns a negative number if the first parameter is smaller, zero if both parameters
354 :     are equal, and a positive number if the first parameter is greater.
355 : parrello 1.16
356 :     =back
357 :    
358 :     =cut
359 :    
360 : parrello 1.21 sub by_fig_id {
361 :     my($a,$b) = @_;
362 :     my($g1,$g2,$t1,$t2,$n1,$n2);
363 :     if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
364 :     ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {
365 :     ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
366 :     } else {
367 :     $a cmp $b;
368 : parrello 1.16 }
369 : parrello 1.19 }
370 :    
371 : parrello 1.21 =head3 create_fasta_record
372 : parrello 1.19
373 : parrello 1.21 my $fastaString = create_fasta_record($id, $comment, $sequence, $stripped);
374 : parrello 1.19
375 : parrello 1.21 Create a FASTA record from the specified DNA or protein sequence. The
376 :     sequence will be split into 60-character lines, and the record will
377 :     include an identifier line.
378 : parrello 1.19
379 :     =over 4
380 :    
381 : parrello 1.21 =item id
382 :    
383 :     ID for the sequence, to be placed at the beginning of the identifier
384 :     line.
385 :    
386 :     =item comment (optional)
387 : parrello 1.19
388 : parrello 1.21 Comment text to place after the ID on the identifier line. If this parameter
389 :     is empty, undefined, or 0, no comment will be placed.
390 : parrello 1.19
391 : parrello 1.21 =item sequence
392 : parrello 1.19
393 : parrello 1.21 Sequence of letters to form into FASTA. For purposes of convenience, whitespace
394 :     characters in the sequence will be removed automatically.
395 : parrello 1.19
396 : parrello 1.21 =item stripped (optional)
397 : parrello 1.19
398 : parrello 1.21 If TRUE, then the sequence will be returned unmodified instead of converted
399 :     to FASTA format. The default is FALSE.
400 : parrello 1.19
401 :     =item RETURN
402 :    
403 : parrello 1.21 Returns the desired sequence in FASTA format.
404 : parrello 1.19
405 :     =back
406 :    
407 :     =cut
408 :    
409 : parrello 1.21 sub create_fasta_record {
410 : parrello 1.19 # Get the parameters.
411 : parrello 1.21 my ($id, $comment, $sequence, $stripped) = @_;
412 :     # Declare the return variable.
413 :     my $retVal;
414 :     # If we're in stripped mode, we just return the sequence.
415 :     if ($stripped) {
416 :     $retVal = $sequence;
417 : parrello 1.19 } else {
418 : parrello 1.21 # Here we have to do the FASTA conversion. Start with the ID.
419 :     my $header = ">$id";
420 :     # Add a comment, if any.
421 :     if ($comment) {
422 :     $header .= " $comment";
423 :     }
424 :     # Clean up the sequence.
425 :     $sequence =~ s/\s+//g;
426 :     # We need to format the sequence into 60-byte chunks. We use the infamous
427 :     # grep-split trick. The split, because of the presence of the parentheses,
428 :     # includes the matched delimiters in the output list. The grep strips out
429 :     # the empty list items that appear between the so-called delimiters, since
430 :     # the delimiters are what we want.
431 :     my @chunks = grep { $_ } split /(.{1,60})/, $sequence;
432 :     # Add the chunks and the trailer.
433 :     $retVal = join("\n", $header, @chunks) . "\n";
434 : parrello 1.19 }
435 :     # Return the result.
436 : parrello 1.21 return $retVal;
437 : parrello 1.19 }
438 :    
439 : parrello 1.21 =head3 display_id_and_seq
440 : parrello 1.19
441 : parrello 1.21 SeedUtils::display_id_and_seq($id_and_comment, $seqP, $fh);
442 : parrello 1.19
443 : parrello 1.21 Display a fasta ID and sequence to the specified open file. This method is designed
444 :     to work well with L</read_fasta_sequence> and L</rev_comp>, because it takes as
445 :     input a string pointer rather than a string. If the file handle is omitted it
446 :     defaults to STDOUT.
447 : parrello 1.19
448 : parrello 1.21 The output is formatted into a FASTA record. The first line of the output is
449 :     preceded by a C<< > >> symbol, and the sequence is split into 60-character
450 :     chunks displayed one per line. Thus, this method can be used to produce
451 :     FASTA files from data gathered by the rest of the system.
452 : parrello 1.19
453 :     =over 4
454 :    
455 : parrello 1.21 =item id_and_comment
456 : parrello 1.19
457 : parrello 1.21 The sequence ID and (optionally) the comment from the sequence's FASTA record.
458 :     The ID
459 : parrello 1.19
460 : parrello 1.21 =item seqP
461 : parrello 1.19
462 : parrello 1.21 Reference to a string containing the sequence. The sequence is automatically
463 :     formatted into 60-character chunks displayed one per line.
464 : parrello 1.19
465 : parrello 1.21 =item fh
466 :    
467 :     Open file handle to which the ID and sequence should be output. If omitted,
468 :     C<\*STDOUT> is assumed.
469 :    
470 :     =back
471 :    
472 :     =cut
473 :    
474 :     sub display_id_and_seq {
475 :    
476 :     my( $id, $seqP, $fh ) = @_;
477 :    
478 :     if (! defined($fh) ) { $fh = \*STDOUT; }
479 :    
480 :     print $fh ">$id\n";
481 :     &display_seq($seqP, $fh);
482 :     }
483 :    
484 :     =head3 display_seq
485 :    
486 :     SeedUtils::display_seq(\$seqP, $fh);
487 :    
488 :     Display a fasta sequence to the specified open file. If the file handle is
489 :     omitted it defaults to STDOUT.
490 :    
491 :     The sequence is split into 60-character chunks displayed one per line for
492 :     readability.
493 :    
494 :     =over 4
495 :    
496 :     =item seqP
497 :    
498 :     Reference to a string containing the sequence.
499 :    
500 :     =item fh
501 :    
502 :     Open file handle to which the sequence should be output. If omitted,
503 :     C<STDOUT> is assumed.
504 :    
505 :     =back
506 :    
507 :     =cut
508 :    
509 :     sub display_seq {
510 :    
511 :     my ( $seqP, $fh ) = @_;
512 :     my ( $i, $n, $ln );
513 :    
514 :     if (! defined($fh) ) { $fh = \*STDOUT; }
515 :    
516 :     $n = length($$seqP);
517 :     # confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
518 :     for ($i=0; ($i < $n); $i += 60) {
519 :     if (($i + 60) <= $n) {
520 :     $ln = substr($$seqP,$i,60);
521 :     } else {
522 :     $ln = substr($$seqP,$i,($n-$i));
523 :     }
524 :     print $fh "$ln\n";
525 :     }
526 :     }
527 :    
528 :    
529 :     =head3 genome_of
530 :    
531 :     my $genomeID = genome_of($fid);
532 :    
533 :     Return the Genome ID embedded in the specified FIG feature ID.
534 :    
535 :     =over 4
536 :    
537 :     =item fid
538 :    
539 :     Feature ID of interest.
540 :    
541 :     =item RETURN
542 :    
543 :     Returns the genome ID in the middle portion of the FIG feature ID. If the
544 :     feature ID is invalid, this method returns an undefined value.
545 :    
546 :     =back
547 :    
548 :     =cut
549 :    
550 :     sub genome_of {
551 :     # Get the parameters.
552 :     my ($fid) = @_;
553 :     # Declare the return variable.
554 :     my $retVal;
555 :     # Parse the feature ID.
556 :     if ($fid =~ /^fig\|(\d+\.\d+)\./) {
557 :     $retVal = $1;
558 :     }
559 :     # Return the result.
560 :     return $retVal;
561 :     }
562 :    
563 :     =head3 hypo
564 :    
565 :     my $flag = hypo($func);
566 :    
567 :     Return TRUE if the specified functional role is hypothetical, else FALSE.
568 :     Hypothetical functional roles are identified by key words in the text,
569 :     such as I<hypothesis>, I<predicted>, or I<glimmer> (among others).
570 :    
571 :     =over 4
572 :    
573 :     =item func
574 :    
575 :     Text of the functional role whose nature is to be determined.
576 :    
577 :     =item RETURN
578 :    
579 :     Returns TRUE if the role is hypothetical, else FALSE.
580 :    
581 :     =back
582 :    
583 :     =cut
584 :    
585 :     sub hypo {
586 :     my ($func) = @_;
587 :     if (! $func) { return 1 }
588 :     if ($func =~ /lmo\d+ protein/i) { return 1 }
589 :     if ($func =~ /hypoth/i) { return 1 }
590 :     if ($func =~ /conserved protein/i) { return 1 }
591 :     if ($func =~ /gene product/i) { return 1 }
592 :     if ($func =~ /interpro/i) { return 1 }
593 :     if ($func =~ /B[sl][lr]\d/i) { return 1 }
594 :     if ($func =~ /^U\d/) { return 1 }
595 :     if ($func =~ /^orf[^_]/i) { return 1 }
596 :     if ($func =~ /uncharacterized/i) { return 1 }
597 :     if ($func =~ /pseudogene/i) { return 1 }
598 :     if ($func =~ /^predicted/i) { return 1 }
599 :     if ($func =~ /AGR_/) { return 1 }
600 :     if ($func =~ /similar to/i) { return 1 }
601 :     if ($func =~ /similarity/i) { return 1 }
602 :     if ($func =~ /glimmer/i) { return 1 }
603 :     if ($func =~ /unknown/i) { return 1 }
604 :     if (($func =~ /domain/i) ||
605 :     ($func =~ /^y[a-z]{2,4}\b/i) ||
606 :     ($func =~ /complete/i) ||
607 :     ($func =~ /ensang/i) ||
608 :     ($func =~ /unnamed/i) ||
609 :     ($func =~ /EG:/) ||
610 :     ($func =~ /orf\d+/i) ||
611 :     ($func =~ /RIKEN/) ||
612 :     ($func =~ /Expressed/i) ||
613 :     ($func =~ /[a-zA-Z]{2,3}\|/) ||
614 :     ($func =~ /predicted by Psort/) ||
615 :     ($func =~ /^bh\d+/i) ||
616 :     ($func =~ /cds_/i) ||
617 :     ($func =~ /^[a-z]{2,3}\d+[^:\+\-0-9]/i) ||
618 :     ($func =~ /similar to/i) ||
619 :     ($func =~ / identi/i) ||
620 :     ($func =~ /ortholog of/i) ||
621 :     ($func =~ /structural feature/i)) { return 1 }
622 :     return 0;
623 :    
624 :     }
625 :    
626 :     =head3 location_cmp
627 :    
628 :     my $cmp = location_cmp($loc1, $loc2);
629 :    
630 :     Compare two location strings (see L<SAP/Location Strings>).
631 :    
632 :     The ordering principle for locations is that they are sorted first by contig ID, then by
633 :     leftmost position, in reverse order by length, and then by direction. The effect is that
634 :     within a contig, the locations are ordered first and foremost in the way they would
635 :     appear when displayed in a picture of the contig and second in such a way that embedded
636 :     locations come after the locations in which they are embedded. In the case of two
637 :     locations that represent the exact same base pairs, the forward (C<+>) location is
638 :     arbitrarily placed first.
639 :    
640 :     =over 4
641 :    
642 :     =item loc1
643 :    
644 :     First location string to compare.
645 :    
646 :     =item loc2
647 :    
648 :     Second location string to compare.
649 :    
650 :     =item RETURN
651 :    
652 :     Returns a negative number if the B<loc1> location sorts first, a positive number if the
653 :     B<loc2> location sorts first, and zero if the two locations are the same.
654 : parrello 1.19
655 :    
656 :     =back
657 :    
658 :     =cut
659 :    
660 :     sub location_cmp {
661 :     # Get the parameters.
662 :     my ($loc1, $loc2) = @_;
663 :     # Parse the locations.
664 :     my ($contig1, $beg1, $strand1, $len1) = $loc1 =~ /^(.+)_(\d+)([+-])(\d+)$/;
665 :     my $left1 = ($strand1 eq '+' ? $beg1 : $beg1 - $len1 + 1);
666 :     my ($contig2, $beg2, $strand2, $len2) = $loc2 =~ /^(.+)_(\d+)([+-])(\d+)$/;
667 :     my $left2 = ($strand2 eq '+' ? $beg2 : $beg2 - $len2 + 1);
668 :     # Declare the return variable. We compare the indicative parts of the location
669 :     # in order. Note that we sort in reverse order by length, so the last comparison
670 :     # puts 2 before 1.
671 :     my $retVal = ($contig1 cmp $contig2) || ($left1 <=> $left2) ||
672 :     ($len2 <=> $len1);
673 :     # If everything matches to this point, check the strands.
674 :     if (! $retVal) {
675 :     if ($strand1 eq '+') {
676 :     # First location is positive, so if the locations are unequal, it
677 :     # sorts first.
678 :     $retVal = ($strand2 eq '+' ? 0 : -1);
679 :     } else {
680 :     # First location is negative, so if the locations are unequal, it
681 :     # sorts second.
682 :     $retVal = ($strand1 eq '-' ? 0 : 1);
683 :     }
684 :     }
685 :     # Return the result.
686 :     return $retVal;
687 :     }
688 :    
689 : parrello 1.21 =head3 location_string
690 : parrello 1.19
691 : parrello 1.21 my $locString = location_string($contig, $beg, $end);
692 : parrello 1.19
693 : parrello 1.21 Form a location string for the specified contig that starts at the
694 :     indicated begin location and stops at the indicated end location. A
695 :     single-base location will automatically be put on the forward strand.
696 : parrello 1.19
697 :     =over 4
698 :    
699 : parrello 1.21 =item contig
700 :    
701 :     ID of the contig to contain this location.
702 :    
703 :     =item beg
704 :    
705 :     Beginning offset of the location.
706 :    
707 :     =item end
708 : parrello 1.19
709 : parrello 1.21 Ending offset of the location.
710 : parrello 1.19
711 :     =item RETURN
712 :    
713 : parrello 1.21 Returns a location string (see L<SAP/Location String>) for the specified
714 :     location.
715 : parrello 1.19
716 :     =back
717 :    
718 :     =cut
719 :    
720 : parrello 1.21 sub location_string {
721 : parrello 1.19 # Get the parameters.
722 : parrello 1.21 my ($contig, $beg, $end) = @_;
723 :     # Compute the strand and length.
724 :     my ($strand, $len);
725 :     if ($beg <= $end) {
726 :     $strand = '+';
727 :     $len = $end + 1 - $beg;
728 :     } else {
729 :     $strand = '-';
730 :     $len = $beg + 1 - $end;
731 : parrello 1.19 }
732 :     # Return the result.
733 : parrello 1.21 return $contig . "_$beg$strand$len";
734 : parrello 1.16 }
735 :    
736 : parrello 1.5 =head3 max
737 :    
738 :     my $max = max(@nums);
739 :    
740 :     Return the maximum number from all the values in the specified list.
741 :    
742 :     =over 4
743 :    
744 :     =item nums
745 :    
746 :     List of numbers to examine.
747 :    
748 :     =item RETURN
749 :    
750 :     Returns the maximum numeric value from the specified parameters, or
751 :     an undefined value if an empty list is passed in.
752 :    
753 :     =back
754 :    
755 :     =cut
756 :    
757 :     sub max {
758 :     my ($retVal, @nums) = @_;
759 :     for my $num (@nums) {
760 :     if ($num > $retVal) {
761 :     $retVal = $num;
762 :     }
763 :     }
764 :     return $retVal;
765 :     }
766 :    
767 :     =head3 min
768 :    
769 :     my $min = min(@nums);
770 :    
771 :     Return the minimum number from all the values in the specified list.
772 :    
773 :     =over 4
774 :    
775 :     =item nums
776 :    
777 :     List of numbers to examine.
778 :    
779 :     =item RETURN
780 :    
781 :     Returns the minimum numeric value from the specified parameters, or
782 :     an undefined value if an empty list is passed in.
783 :    
784 :     =back
785 :    
786 :     =cut
787 :    
788 :     sub min {
789 :     my ($retVal, @nums) = @_;
790 :     for my $num (@nums) {
791 :     if ($num < $retVal) {
792 :     $retVal = $num;
793 :     }
794 :     }
795 :     return $retVal;
796 :     }
797 :    
798 : parrello 1.21 =head3 parse_fasta_record
799 : parrello 1.1
800 : parrello 1.21 my ($id, $comment, $seq) = parse_fasta_record($string);
801 : parrello 1.1
802 : parrello 1.21 Extract the ID, comment, and sequence from a single FASTA record. For
803 :     backward compatability, instead of a FASTA record the ID and sequence can
804 :     be specified separated by a comma. In this case, the returned comment
805 :     will be empty.
806 : parrello 1.1
807 :     =over 4
808 :    
809 : parrello 1.21 =item string
810 : parrello 1.1
811 : parrello 1.21 A single FASTA record, or an ID and sequence separated by a single comma,
812 :     an unadorned sequence, a 2-element list consisting of an ID and a sequence,
813 :     or a 3-element list consisting of an ID, a comment, and a sequence.
814 : parrello 1.7
815 : parrello 1.1 =item RETURN
816 :    
817 : parrello 1.21 Returns a three-element list consisting of the incoming ID, the associated
818 :     comment, and the specified DNA or protein sequence. If the incoming string is
819 :     invalid, all three list elements will come back undefined. If no ID is
820 :     specified, an MD5 will be provided.
821 : parrello 1.1
822 :     =back
823 :    
824 :     =cut
825 :    
826 : parrello 1.21 sub parse_fasta_record {
827 : parrello 1.1 # Get the parameters.
828 : parrello 1.21 my ($string) = @_;
829 :     # Declare the return variables.
830 :     my ($id, $comment, $seq);
831 :     # Check the type of input string.
832 :     if (! defined $string) {
833 :     # Do nothing if no string was passed in. This extra check prevents a
834 :     # warning at runtime.
835 :     } elsif ($string =~ /^>(\S+)([\t ]+[^\r\n]*)?[\r\n]+(.+)/s) {
836 :     # Here we have a standard FASTA string.
837 :     ($id, $comment, $seq) = ($1, $2, $3);
838 :     # Remove white space from the sequence string.
839 :     $seq =~ s/\s+//sg;
840 :     # Trim front of comment.
841 :     $comment =~ s/^s+//;
842 :     } elsif ($string =~ /(.+?)\s*,\s*(.+)/) {
843 :     ($id, $comment, $seq) = ($1, '', $2);
844 :     } elsif (ref $string eq 'ARRAY') {
845 :     # Here the data came in pre-formatted as a list reference.
846 :     ($id, $comment, $seq) = @$string;
847 :     # If there's no comment, we need to adjust.
848 :     if (! defined $seq) {
849 :     $seq = $comment;
850 :     $comment = '';
851 :     }
852 : parrello 1.7 } else {
853 : parrello 1.21 # Here we have only a sequence. We need to construct the ID.
854 :     $seq = $string;
855 :     require Digest::MD5;
856 :     $id = "md5|" . Digest::MD5::md5_base64($seq);
857 :     $comment = "";
858 :     }
859 :     # Return the results.
860 :     return ($id, $comment, $seq);
861 :     }
862 :    
863 :     =head3 parse_location
864 :    
865 :     my ($contig, $begin, $end, $strand) = parse_location($locString);
866 :    
867 :     Return the contigID, start offset, and end offset for a specified
868 :     location string (see L<SAP/Location Strings>).
869 :    
870 :     =over 4
871 :    
872 :     =item locString
873 :    
874 :     Location string to parse.
875 :    
876 :     =item RETURN
877 :    
878 :     Returns a four-element list containing the contig ID from the location
879 :     string, the starting offset of the location, the ending offset, and the
880 :     strand. If the location string is not valid, the values returned will be
881 :     C<undef>.
882 :    
883 :     =back
884 :    
885 :     =cut
886 :    
887 :     sub parse_location {
888 :     # Get the parameters.
889 :     my ($locString) = @_;
890 :     # Declare the return variables.
891 :     my ($contig, $begin, $end, $strand);
892 :     # Parse the location string.
893 :     if ($locString =~ /^(.+)_(\d+)([+-])(\d+)$/) {
894 :     # Pull out the contig ID, strand, and begin location.
895 :     $contig = $1;
896 :     $begin = $2;
897 :     $strand = $3;
898 :     # Compute the ending location from the direction and length.
899 :     if ($3 eq '+') {
900 :     $end = $begin + $4 - 1;
901 :     } else {
902 :     $end = $begin - $4 + 1;
903 : parrello 1.7 }
904 : parrello 1.1 }
905 : parrello 1.21 # Return the results.
906 :     return ($contig, $begin, $end, $strand);
907 : parrello 1.1 }
908 :    
909 : parrello 1.2 =head3 rev_comp
910 :    
911 :     my $revcmp = rev_comp($dna);
912 :    
913 :     or
914 :    
915 :     rev_comp(\$dna);
916 :    
917 :     Return the reverse complement of a DNA string.
918 :    
919 :     =over 4
920 :    
921 :     =item dna
922 :    
923 :     Either a DNA string, or a reference to a DNA string.
924 :    
925 :     =item RETURN
926 :    
927 :     If the input is a DNA string, returns the reverse complement. If the
928 :     input is a reference to a DNA string, the string itself is reverse
929 :     complemented.
930 :    
931 :     =back
932 :    
933 :     =cut
934 :    
935 :     sub rev_comp {
936 :     # Get the parameters.
937 :     my ($dna) = @_;
938 :     # Determine how we were called.
939 :     my ($retVal, $refMode);
940 : parrello 1.24 if (! ref $dna) {
941 : parrello 1.21 $retVal = reverse $dna;
942 : parrello 1.2 $refMode = 0;
943 :     } else {
944 : parrello 1.21 $retVal = reverse $$dna;
945 : parrello 1.2 $refMode = 1;
946 :     }
947 : parrello 1.21 # Now $retVal contains the reversed DNA string, and $refMode is TRUE iff the
948 :     # user passed in a reference. The following translation step complements the
949 :     # string.
950 :     $retVal =~ tr/acgtumrwsykbdhvACGTUMRWSYKBDHV/tgcaakywsrmvhdbTGCAAKYWSRMVHDB/;
951 : parrello 1.2 # Return the result in the method corresponding to the way it came in.
952 :     if ($refMode) {
953 :     $$dna = $retVal;
954 :     return;
955 :     } else {
956 :     return $retVal;
957 :     }
958 :     }
959 :    
960 : parrello 1.21 # Synonym of rev_comp, for backward compatibility.
961 :     sub reverse_comp {
962 :     return rev_comp($_[0]);
963 : parrello 1.15 }
964 :    
965 : parrello 1.21 =head3 roles_of_function
966 : parrello 1.15
967 : parrello 1.21 my @roles = roles_of_function($assignment);
968 : parrello 1.3
969 : parrello 1.21 Return a list of the functional roles in the specified assignment string.
970 :     A single assignment may contain multiple roles as well as comments; this
971 :     method separates them out.
972 : parrello 1.3
973 :     =over 4
974 :    
975 : parrello 1.21 =item assignment
976 : parrello 1.3
977 : parrello 1.21 Functional assignment to parse for roles.
978 : parrello 1.3
979 :     =item RETURN
980 :    
981 : parrello 1.21 Returns a list of the individual roles in the assignment.
982 : parrello 1.3
983 :     =back
984 :    
985 :     =cut
986 :    
987 : parrello 1.21 sub roles_of_function {
988 : parrello 1.3 # Get the parameters.
989 : parrello 1.21 my ($assignment) = @_;
990 :     # Remove any comment.
991 : parrello 1.30 my $commentFree = ($assignment =~ /(.+?)\s*[#!]/ ? $1 : $assignment);
992 : parrello 1.21 # Split out the roles.
993 : parrello 1.30 my @retVal = split /\s+[\/@]\s+|\s*;\s+/, $commentFree;
994 : parrello 1.3 # Return the result.
995 : parrello 1.21 return @retVal;
996 : parrello 1.3 }
997 :    
998 : parrello 1.6 =head3 sims
999 :    
1000 : parrello 1.31 my @sims = sims($id, $maxN, $maxP, 'fig');
1001 :    
1002 :     or
1003 :    
1004 :     my @sims = sims($id, $maxN, $maxP, 'all);
1005 :    
1006 : parrello 1.6
1007 :     Retrieve similarities from the network similarity server. The similarity retrieval
1008 :     is performed using an HTTP user agent that returns similarity data in multiple
1009 :     chunks. An anonymous subroutine is passed to the user agent that parses and
1010 :     reformats the chunks as they come in. The similarites themselves are returned
1011 :     as B<Sim> objects. Sim objects are actually list references with 15 elements.
1012 :     The Sim object methods allow access to the elements by name.
1013 :    
1014 :     Similarities can be either raw or expanded. The raw similarities are basic
1015 :     hits between features with similar DNA. Expanding a raw similarity drags in any
1016 :     features considered substantially identical. So, for example, if features B<A1>,
1017 :     B<A2>, and B<A3> are all substatially identical to B<A>, then a raw similarity
1018 :     B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>.
1019 :    
1020 :     =over 4
1021 :    
1022 :     =item id
1023 :    
1024 :     ID of the feature whose similarities are desired, or reference to a list
1025 :     of the IDs of the features whose similarities are desired.
1026 :    
1027 : parrello 1.31 =item maxN (optional)
1028 : parrello 1.6
1029 : parrello 1.27 Maximum number of similarities to return for each incoming feature.
1030 : parrello 1.6
1031 : parrello 1.31 =item maxP (optional)
1032 : parrello 1.6
1033 :     The maximum allowable similarity score.
1034 :    
1035 : parrello 1.31 =item select (optional)
1036 : parrello 1.6
1037 :     Selection criterion: C<raw> means only raw similarities are returned; C<fig>
1038 :     means only similarities to FIG features are returned; C<all> means all expanded
1039 :     similarities are returned; and C<figx> means similarities are expanded until the
1040 :     number of FIG features equals the maximum.
1041 :    
1042 : parrello 1.31 =item max_expand (optional)
1043 : parrello 1.6
1044 :     The maximum number of features to expand.
1045 :    
1046 : parrello 1.31 =item filters (optional)
1047 : parrello 1.6
1048 :     Reference to a hash containing filter information, or a subroutine that can be
1049 :     used to filter the sims.
1050 :    
1051 :     =item RETURN
1052 :    
1053 :     Returns a list of L<Sim> objects.
1054 :    
1055 :     =back
1056 :    
1057 :     =cut
1058 :    
1059 :     sub sims {
1060 :     # Get the parameters.
1061 :     my($id, $maxN, $maxP, $select, $max_expand, $filters) = @_;
1062 :     # Get the URL for submitting to the sims server.
1063 :     my $url = $FIG_Config::sim_server_url || "http://bioseed.mcs.anl.gov/simserver/perl/sims.pl";
1064 :     # Get a list of the IDs to process.
1065 :     my @ids;
1066 :     if (ref($id) eq "ARRAY") {
1067 :     @ids = @$id;
1068 :     } else {
1069 :     @ids = ($id);
1070 :     }
1071 :     # Form a list of the parameters to pass to the server.
1072 :     my %args = ();
1073 :     $args{id} = \@ids;
1074 :     $args{maxN} = $maxN if defined($maxN);
1075 :     $args{maxP} = $maxP if defined($maxP);
1076 :     $args{select} = $select if defined($select);
1077 :     $args{max_expand} = $max_expand if defined($max_expand);
1078 :     # If the filter is a hash, put the filters in the argument list.
1079 :     if (ref($filters) eq 'HASH') {
1080 :     for my $k (keys(%$filters))
1081 :     {
1082 :     $args{"filter_$k"}= $filters->{$k};
1083 :     }
1084 :     }
1085 :     # Get the user agent.
1086 :     require LWP::UserAgent;
1087 :     my $ua = LWP::UserAgent->new();
1088 :     # Insure we have the Sim module.
1089 :     require Sim;
1090 :     #
1091 :     # Our next task is to create the anonymous subroutine that will process the
1092 :     # chunks that come back from the server. We require three global variables:
1093 :     # @sims to hold the similarities found, $tail to remember the unprocessed
1094 :     # data from the previous chunk, and $chunks to count the chunks.
1095 :     #
1096 :     my @retVal;
1097 :     my $tail;
1098 :     my $chunks = 0;
1099 :     #
1100 :     # ANONYMOUS SUBROUTINE
1101 :     #
1102 :     my $cb = sub {
1103 :     eval {
1104 :     # Get the parameters.
1105 :     my ($data, $command) = @_;
1106 :     # Check for a reset command. If we get one, we discard any data
1107 :     # in progress.
1108 :     if ($command && $command eq 'reset') {
1109 :     $tail = '';
1110 :     } else {
1111 :     $chunks++;
1112 :     # Get the data to process. Note we concatenate it to the incoming
1113 :     # tail from last time.
1114 :     my $c = $tail . $data;
1115 :     # Make sure the caller hasn't messed up the new-line character.
1116 :     # FASTA readers in particular are notorious for doing things
1117 :     # like that.
1118 :     local $/ = "\n";
1119 :     # Split the input into lines.
1120 :     my @lines = split(/\n/, $c);
1121 :     # If the input does not end with a new-line, we have a partial
1122 :     # chunk and need to put it in the tail for next time. If not,
1123 :     # there is no tail for next time.
1124 :     if (substr($c, -1, 1) ne "\n") {
1125 :     $tail = pop @lines;
1126 :     } else {
1127 :     $tail = '';
1128 :     }
1129 :     # Loop through the lines. Note there's no need to chomp because
1130 :     # the SPLIT took out the new-line characters.
1131 :     for my $l (@lines) {
1132 :     # Split the line into fields.
1133 :     my @s = split(/\t/, $l);
1134 :     # Insure we have all the fields we need.
1135 : parrello 1.12 if (@s >= 9) {
1136 : parrello 1.6 # Check to see if we've seen this SIM before.
1137 :     my $id1 = $s[0];
1138 :     my $id2 = $s[1];
1139 :     # Add it to the result list.
1140 :     push(@retVal, bless \@s, 'Sim');
1141 :     }
1142 :     }
1143 :     }
1144 :     };
1145 :     };
1146 :     #
1147 :     # END OF ANONYMOUS SUBROUTINE
1148 :     #
1149 :     # Now we're ready to start. Because networking is an iffy thing, we set up
1150 :     # to try our request multiple times.
1151 :     my $n_retries = 10;
1152 :     my $attempts = 0;
1153 :     # Set the timeout value, in seconds.
1154 :     $ua->timeout(180);
1155 :     # Loop until we succeed or run out of retries.
1156 :     my $done = 0;
1157 :     while (! $done && $attempts++ < $n_retries) {
1158 :     # Reset the content processor. This clears the tail.
1159 :     &$cb(undef, 'reset');
1160 :     my $resp = $ua->post($url, \%args, ':content_cb' => $cb);
1161 :     if ($resp->is_success) {
1162 :     # If the response was successful, get the content. This triggers
1163 :     # the anonymous subroutine.
1164 :     my $x = $resp->content;
1165 :     # Denote we've been successful.
1166 :     $done = 1;
1167 :     }
1168 :     }
1169 :     return @retVal;
1170 :     }
1171 :    
1172 : parrello 1.21 =head3 standard_genetic_code
1173 :    
1174 :     my $code = standard_genetic_code();
1175 :    
1176 :     Return a hash containing the standard translation of nucleotide triples to proteins.
1177 :     Methods such as L</translate> can take a translation scheme as a parameter. This method
1178 :     returns the default translation scheme. The scheme is implemented as a reference to a
1179 :     hash that contains nucleotide triplets as keys and has protein letters as values.
1180 :    
1181 :     =cut
1182 :    
1183 :     sub standard_genetic_code {
1184 :    
1185 :     my $code = {};
1186 : parrello 1.8
1187 : parrello 1.21 $code->{"AAA"} = "K";
1188 :     $code->{"AAC"} = "N";
1189 :     $code->{"AAG"} = "K";
1190 :     $code->{"AAT"} = "N";
1191 :     $code->{"ACA"} = "T";
1192 :     $code->{"ACC"} = "T";
1193 :     $code->{"ACG"} = "T";
1194 :     $code->{"ACT"} = "T";
1195 :     $code->{"AGA"} = "R";
1196 :     $code->{"AGC"} = "S";
1197 :     $code->{"AGG"} = "R";
1198 :     $code->{"AGT"} = "S";
1199 :     $code->{"ATA"} = "I";
1200 :     $code->{"ATC"} = "I";
1201 :     $code->{"ATG"} = "M";
1202 :     $code->{"ATT"} = "I";
1203 :     $code->{"CAA"} = "Q";
1204 :     $code->{"CAC"} = "H";
1205 :     $code->{"CAG"} = "Q";
1206 :     $code->{"CAT"} = "H";
1207 :     $code->{"CCA"} = "P";
1208 :     $code->{"CCC"} = "P";
1209 :     $code->{"CCG"} = "P";
1210 :     $code->{"CCT"} = "P";
1211 :     $code->{"CGA"} = "R";
1212 :     $code->{"CGC"} = "R";
1213 :     $code->{"CGG"} = "R";
1214 :     $code->{"CGT"} = "R";
1215 :     $code->{"CTA"} = "L";
1216 :     $code->{"CTC"} = "L";
1217 :     $code->{"CTG"} = "L";
1218 :     $code->{"CTT"} = "L";
1219 :     $code->{"GAA"} = "E";
1220 :     $code->{"GAC"} = "D";
1221 :     $code->{"GAG"} = "E";
1222 :     $code->{"GAT"} = "D";
1223 :     $code->{"GCA"} = "A";
1224 :     $code->{"GCC"} = "A";
1225 :     $code->{"GCG"} = "A";
1226 :     $code->{"GCT"} = "A";
1227 :     $code->{"GGA"} = "G";
1228 :     $code->{"GGC"} = "G";
1229 :     $code->{"GGG"} = "G";
1230 :     $code->{"GGT"} = "G";
1231 :     $code->{"GTA"} = "V";
1232 :     $code->{"GTC"} = "V";
1233 :     $code->{"GTG"} = "V";
1234 :     $code->{"GTT"} = "V";
1235 :     $code->{"TAA"} = "*";
1236 :     $code->{"TAC"} = "Y";
1237 :     $code->{"TAG"} = "*";
1238 :     $code->{"TAT"} = "Y";
1239 :     $code->{"TCA"} = "S";
1240 :     $code->{"TCC"} = "S";
1241 :     $code->{"TCG"} = "S";
1242 :     $code->{"TCT"} = "S";
1243 :     $code->{"TGA"} = "*";
1244 :     $code->{"TGC"} = "C";
1245 :     $code->{"TGG"} = "W";
1246 :     $code->{"TGT"} = "C";
1247 :     $code->{"TTA"} = "L";
1248 :     $code->{"TTC"} = "F";
1249 :     $code->{"TTG"} = "L";
1250 :     $code->{"TTT"} = "F";
1251 : parrello 1.8
1252 : parrello 1.21 return $code;
1253 :     }
1254 : parrello 1.8
1255 : parrello 1.21 =head3 strand_of
1256 : parrello 1.8
1257 : parrello 1.21 my $plusOrMinus = strand_of($loc);
1258 : parrello 1.8
1259 : parrello 1.21 Return the strand (C<+> or C<->) from the specified location string.
1260 : parrello 1.8
1261 : parrello 1.21 =over 4
1262 : parrello 1.8
1263 : parrello 1.21 =item loc
1264 : parrello 1.8
1265 : parrello 1.21 Location string to parse (see L<SAP/Location Strings>).
1266 : parrello 1.9
1267 :     =item RETURN
1268 :    
1269 : parrello 1.21 Returns C<+> if the location is on the forward strand, else C<->.
1270 : parrello 1.9
1271 :     =back
1272 :    
1273 :     =cut
1274 :    
1275 : parrello 1.21 sub strand_of {
1276 : parrello 1.9 # Get the parameters.
1277 : parrello 1.21 my ($loc) = @_;
1278 :     # Declare the return variable.
1279 :     my $retVal;
1280 :     # Parse the strand indicator from the location.
1281 :     if ($loc =~ /\d+([+-])\d+/) {
1282 :     $retVal = $1;
1283 : parrello 1.9 }
1284 : parrello 1.21 # Return the result.
1285 :     return $retVal;
1286 : parrello 1.9 }
1287 :    
1288 : parrello 1.21 =head3 strip_ec
1289 : parrello 1.9
1290 : parrello 1.21 my $role = strip_ec($rawRole);
1291 : parrello 1.11
1292 : parrello 1.21 Strip the EC number (if any) from the specified role or functional
1293 :     assignment.
1294 : parrello 1.11
1295 :     =over 4
1296 :    
1297 : parrello 1.21 =item rawRole
1298 : parrello 1.11
1299 : parrello 1.21 Role or functional assignment from which the EC numbers are to be stripped.
1300 : parrello 1.11
1301 :     =item RETURN
1302 :    
1303 : parrello 1.21 Returns the incoming string with any EC numbers removed. The EC numbers must
1304 :     be formatted in the standard format used by the SEED (with the C<EC> prefix
1305 :     and surrounding parentheses).
1306 : parrello 1.11
1307 :     =back
1308 :    
1309 :     =cut
1310 :    
1311 : parrello 1.21 sub strip_ec {
1312 :     # Get the parameters.
1313 :     my ($rawRole) = @_;
1314 :     # Declare the return variable.
1315 :     my $retVal = $rawRole;
1316 :     # Remove the EC numbers.
1317 :     $retVal =~ s/\s*\(EC\s+[0-9.\-]+\)//g;
1318 :     # Return the result.
1319 :     return $retVal;
1320 : parrello 1.15 }
1321 :    
1322 :     =head3 translate
1323 :    
1324 :     my $aa_seq = translate($dna_seq, $code, $fix_start);
1325 :    
1326 :     Translate a DNA sequence to a protein sequence using the specified genetic code.
1327 :     If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to
1328 :     C<M>. (In the standard genetic code, these two combinations normally translate
1329 :     to C<V> and C<L>, respectively.)
1330 :    
1331 :     =over 4
1332 :    
1333 :     =item dna_seq
1334 :    
1335 :     DNA sequence to translate. Note that the DNA sequence can only contain
1336 :     known nucleotides.
1337 :    
1338 :     =item code
1339 :    
1340 :     Reference to a hash specifying the translation code. The hash is keyed by
1341 :     nucleotide triples, and the value for each key is the corresponding protein
1342 :     letter. If this parameter is omitted, the L</standard_genetic_code> will be
1343 :     used.
1344 :    
1345 :     =item fix_start
1346 :    
1347 :     TRUE if the first triple is to get special treatment, else FALSE. If TRUE,
1348 :     then a value of C<TTG> or C<GTG> in the first position will be translated to
1349 :     C<M> instead of the value specified in the translation code.
1350 :    
1351 :     =item RETURN
1352 :    
1353 :     Returns a string resulting from translating each nucleotide triple into a
1354 :     protein letter.
1355 :    
1356 :     =back
1357 :    
1358 :     =cut
1359 :     #: Return Type $;
1360 :     sub translate {
1361 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
1362 :    
1363 :     my( $dna,$code,$start ) = @_;
1364 :     my( $i,$j,$ln );
1365 :     my( $x,$y );
1366 :     my( $prot );
1367 :    
1368 :     if (! defined($code)) {
1369 :     $code = &FIG::standard_genetic_code;
1370 :     }
1371 :     $ln = length($dna);
1372 :     $prot = "X" x ($ln/3);
1373 :     $dna =~ tr/a-z/A-Z/;
1374 :    
1375 :     for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) {
1376 :     $x = substr($dna,$i,3);
1377 :     if ($y = $code->{$x}) {
1378 :     substr($prot,$j,1) = $y;
1379 :     }
1380 :     }
1381 :    
1382 :     if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) {
1383 :     substr($prot,0,1) = 'M';
1384 :     }
1385 :     return $prot;
1386 :     }
1387 :    
1388 : parrello 1.21 =head3 verify_dir
1389 : parrello 1.18
1390 : parrello 1.21 verify_dir($dirName);
1391 : parrello 1.18
1392 : parrello 1.21 Insure that the specified directory exists. If the directory does not
1393 :     exist, it will be created.
1394 : parrello 1.18
1395 :     =over 4
1396 :    
1397 : parrello 1.21 =item dirName
1398 : parrello 1.18
1399 : parrello 1.21 Name of the relevant directory.
1400 : parrello 1.18
1401 :     =back
1402 :    
1403 :     =cut
1404 :    
1405 : parrello 1.21 sub verify_dir {
1406 : parrello 1.18 # Get the parameters.
1407 : parrello 1.21 my ($dirName) = @_;
1408 :     # Strip off the final slash, if any.
1409 :     $dirName =~ s#/$##;
1410 :     # Only proceed if the directory does NOT already exist.
1411 :     if (! -d $dirName) {
1412 :     # If there is a parent directory, recursively insure it is there.
1413 :     if ($dirName =~ m#(.+)/[^/]+$#) {
1414 :     verify_dir($1);
1415 : overbeek 1.20 }
1416 : parrello 1.21 # Create this particular directory with full permissions.
1417 :     mkdir $dirName, 0777;
1418 : overbeek 1.20 }
1419 :     }
1420 :    
1421 : disz 1.28 sub strip_func {
1422 :     my($func) = @_;
1423 : overbeek 1.20
1424 : disz 1.28 $func =~ s/^FIG\d{6}[^:]*:\s*//;
1425 :     $func =~ s/\s*\#.*$//;
1426 :     return($func);
1427 :     }
1428 :    
1429 : olson 1.32 sub verify_db {
1430 :     my($db,$type) = @_;
1431 :    
1432 :     #
1433 :     # Find formatdb; if we're operating in a SEED environment
1434 :     # use it from there.
1435 :     #
1436 :    
1437 :     my $path = '';
1438 :     if ($FIG_Config::blastbin ne '' && -d $FIG_Config::blastbin)
1439 :     {
1440 :     $path = "$FIG_Config::blastbin/";
1441 :     }
1442 :     elsif ($FIG_Config::ext_bin ne '' && -d $FIG_Config::ext_bin)
1443 :     {
1444 :     $path = "$FIG_Config::ext_bin/";
1445 :     }
1446 :    
1447 :    
1448 :     my @cmd;
1449 :     if ($type =~ /^p/i)
1450 :     {
1451 :     if ((! -s "$db.psq") || (-M "$db.psq" > -M $db))
1452 :     {
1453 :     @cmd = ("${path}formatdb", "-p", "T", "-i", $db);
1454 :     }
1455 :     }
1456 :     else
1457 :     {
1458 :     if ((! -s "$db.nsq") || (-M "$db.nsq" > -M $db))
1459 :     {
1460 :     @cmd = ("${path}formatdb", "-p", "F", "-i", $db);
1461 :     }
1462 :     }
1463 :     if (@cmd)
1464 :     {
1465 :     my $rc = system(@cmd);
1466 :     if ($rc != 0)
1467 :     {
1468 :     warn "SeedUtils::verify_db: formatdb failed with rc=$rc: @cmd\n";
1469 :     }
1470 :     }
1471 :     }
1472 :    
1473 :    
1474 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3