[Bio] / FigKernelPackages / SeedUtils.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/SeedUtils.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.37 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 : parrello 1.11 # This is a SAS component.
4 :    
5 : parrello 1.1 #
6 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
7 :     # for Interpretations of Genomes. All Rights Reserved.
8 :     #
9 :     # This file is part of the SEED Toolkit.
10 :     #
11 :     # The SEED Toolkit is free software. You can redistribute
12 :     # it and/or modify it under the terms of the SEED Toolkit
13 :     # Public License.
14 :     #
15 :     # You should have received a copy of the SEED Toolkit Public License
16 :     # along with this program; if not write to the University of Chicago
17 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
18 :     # Genomes at veronika@thefig.info or download a copy from
19 :     # http://www.theseed.org/LICENSE.TXT.
20 :     #
21 :    
22 :     package SeedUtils;
23 :    
24 : olson 1.32 #
25 :     # In case we are running in a SEED, pull in the FIG_Config
26 :     #
27 :     BEGIN
28 :     {
29 :     eval {
30 :     require FIG_Config;
31 :     };
32 :     }
33 :    
34 : parrello 1.1 use strict;
35 : parrello 1.12 no warnings qw(once);
36 : parrello 1.1 use base qw(Exporter);
37 : parrello 1.15 our @EXPORT = qw(hypo boundaries_of parse_fasta_record create_fasta_record
38 :     rev_comp genome_of min max sims verify_dir between translate
39 : parrello 1.21 standard_genetic_code parse_location roles_of_function
40 : olson 1.33 strip_ec location_string location_cmp strand_of by_fig_id
41 : parrello 1.37 verify_db bbh_data id_url);
42 : parrello 1.1
43 :     =head1 SEED Utility Methods
44 :    
45 :     =head2 Introduction
46 :    
47 :     This is a simple utility package that performs functions useful for
48 :     bioinformatics, but that do not require access to the databases.
49 :    
50 :     =head2 Public Methods
51 :    
52 : parrello 1.21 =head3 abbrev
53 :    
54 :     my $abbrev = SeedUtils::abbrev($genome_name);
55 :    
56 :     Return an abbreviation of the specified genome name. This method is used to create
57 :     a reasonably indicative genome name that fits in 10 characters.
58 :    
59 :     =over 4
60 :    
61 :     =item genome_name
62 :    
63 :     Genome name to abbreviate.
64 :    
65 :     =item RETURN
66 :    
67 :     Returns a shortened version of the genome name that is 10 characters or less in
68 :     length.
69 :    
70 :     =back
71 :    
72 :     =cut
73 :    
74 :     sub abbrev {
75 :     my($genome_name) = @_;
76 :    
77 :     $genome_name =~ s/^(\S{3})\S+/$1./;
78 :     $genome_name =~ s/^(\S+)\s+(\S{3})\S+/$1$2./;
79 :     $genome_name =~ s/ //g;
80 :     if (length($genome_name) > 10) {
81 :     $genome_name = substr($genome_name,0,10);
82 :     }
83 :     return $genome_name;
84 :     }
85 :    
86 : parrello 1.34 =head3 bbh_data
87 :    
88 :     my $bbhList = FIGRules::bbh_data($peg, $cutoff);
89 :    
90 :     Return a list of the bi-directional best hits relevant to the specified PEG.
91 :    
92 :     =over 4
93 :    
94 :     =item peg
95 :    
96 :     ID of the feature whose bidirectional best hits are desired.
97 :    
98 :     =item cutoff
99 :    
100 :     Similarity cutoff. If omitted, 1e-10 is used.
101 :    
102 :     =item RETURN
103 :    
104 :     Returns a reference to a list of 3-tuples. The first element of the list is the best-hit
105 :     PEG; the second element is the score. A lower score indicates a better match. The third
106 :     element is the normalized bit score for the pair; it is normalized to the length
107 :     of the protein.
108 :    
109 :     =back
110 :    
111 :     =cut
112 :     #: Return Type @@;
113 :     sub bbh_data {
114 :     my ($peg, $cutoff) = @_;
115 :     my @retVal = ();
116 :     require LWP::UserAgent;
117 :     my $ua = LWP::UserAgent->new();
118 :     my $url = "http://bioseed.mcs.anl.gov/simserver/perl/bbhs.pl";
119 :     my $retries = 5;
120 :     my $done = 0;
121 :     my $resp;
122 :     while ($retries > 0 && ! $done) {
123 :     $resp = $ua->post($url, { id => $peg, cutoff => $cutoff });
124 :     if ($resp->is_success) {
125 :     my $dat = $resp->content;
126 :     while ($dat =~ /([^\n]+)\n/g) {
127 :     my @l = split(/\t/, $1);
128 :     push @retVal, \@l;
129 :     }
130 :     $done = 1;
131 :     } else {
132 :     $retries--;
133 :     }
134 :     }
135 :     if (! $done) {
136 :     die("Failure retrieving bbh data for $peg: " . $resp->status_line);
137 :     }
138 :     return \@retVal;
139 :     }
140 :    
141 :    
142 : parrello 1.21 =head3 between
143 :    
144 :     my $flag = between($x, $y, $z);
145 :    
146 :     Determine whether or not $y is between $x and $z.
147 :    
148 :     =over 4
149 :    
150 :     =item x
151 :    
152 :     First edge number.
153 :    
154 :     =item y
155 :    
156 :     Number to examine.
157 :    
158 :     =item z
159 :    
160 :     Second edge number.
161 :    
162 :     =item RETURN
163 :    
164 :     Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check
165 :     is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns
166 :     TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than
167 :     I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower,
168 :     then the return is TRUE if I<$x> >= I$<$y> >= I<$z>.
169 :    
170 :     =back
171 :    
172 :     =cut
173 :     #: Return Type $;
174 :     sub between {
175 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
176 :     my($x,$y,$z) = @_;
177 :    
178 :     if ($x < $z) {
179 :     return (($x <= $y) && ($y <= $z));
180 :     } else {
181 :     return (($x >= $y) && ($y >= $z));
182 :     }
183 :     }
184 :    
185 :    
186 : parrello 1.13 =head3 boundaries_of
187 :    
188 : parrello 1.21 my ($contig, $min, $max, $dir) = boundaries_of($locs);
189 : parrello 1.13
190 :     Return the boundaries of a set of locations. The contig, the leftmost
191 :     location, and the rightmost location will be returned to the caller. If
192 :     more than one contig is represented, the method will return an undefined
193 :     value for the contig (indicating failure).
194 :    
195 :     =over 4
196 :    
197 :     =item locs
198 :    
199 :     Reference to a list of location strings. A location string contains a contig ID,
200 :     and underscore (C<_>), a starting offset, a strand identifier (C<+> or C<->), and
201 :     a length (e.g. C<360108.3:NC_10023P_1000+2000> begins at offset 1000 of contig
202 :     B<360108.3:NC_10023P> and covers 2000 base pairs on the C<+> strand).
203 :    
204 :     =item RETURN
205 :    
206 : parrello 1.21 Returns a 4-element list. The first element is the contig ID from all the locations,
207 :     the second is the offset of leftmost base pair represented in the locations, the
208 :     third is the offset of the rightmost base pair represented in the locations, and
209 :     the fourth is the dominant strand.
210 : parrello 1.13
211 :     =back
212 :    
213 :     =cut
214 :    
215 :     sub boundaries_of {
216 :     # Get the parameters.
217 :     my ($locs) = @_;
218 :     # Declare the return variables.
219 :     my ($contig, $min, $max);
220 :     # We'll put all the starting and ending offsets found in here.
221 :     my @offsets;
222 : parrello 1.21 # This will be used to count the orientations.
223 :     my %dirs = ('+' => 0, '-' => 0);
224 : parrello 1.13 # This will count the number of errors found.
225 :     my $error = 0;
226 : parrello 1.35 # Insure the location is an array. If it's not, we assume it's a string of
227 :     # comma-separated values.
228 :     if (! ref $locs) {
229 :     $locs = [ split /\s*,\s*/, $locs ];
230 :     }
231 : parrello 1.13 # Loop through the locations.
232 :     for my $loc (@$locs) {
233 :     # Parse this location.
234 :     if ($loc =~ /^(.+)_(\d+)(\+|\-)(\d+)$/) {
235 : parrello 1.23 # This is a valid location string.
236 : parrello 1.13 my ($newContig, $begin, $dir, $len) = ($1, $2, $3, $4);
237 :     # Is this contig valid?
238 :     if ($contig && $newContig ne $contig) {
239 :     # No, skip this location.
240 :     $error++;
241 :     } else {
242 :     # Save the contig.
243 :     $contig = $newContig;
244 : parrello 1.21 # Count the orientation.
245 :     $dirs{$dir}++;
246 : parrello 1.13 # Compute the ending offset.
247 :     my $end = ($dir eq '+' ? $begin + $len - 1 : $begin - $len + 1);
248 :     # Save both offsets.
249 :     push @offsets, $begin, $end;
250 :     }
251 : parrello 1.23 } elsif ($loc =~ /^(.+)_(\d+)_(\d+)/) {
252 :     # Here we have an old-style location string.
253 :     my($newContig, $start, $stop) = ($1, $2, $3);
254 :     # Is this contig valid?
255 : olson 1.22 if ($contig && $newContig ne $contig) {
256 :     # No, skip this location.
257 :     $error++;
258 :     } else {
259 :     # Save the contig.
260 :     $contig = $newContig;
261 : parrello 1.23 # Compute the orientation.
262 :     my $dir;
263 :     if ($start > $stop) {
264 :     $dir = '-';
265 :     } else {
266 :     $dir = '+';
267 :     }
268 :     # Count it.
269 : olson 1.22 $dirs{$dir}++;
270 : parrello 1.23 # Save both offsets.
271 : olson 1.22 push @offsets, $start, $stop;
272 :     }
273 : parrello 1.13 } else {
274 :     # The location is invalid, so it's an error,
275 :     $error++;
276 :     }
277 :     }
278 :     # If there's an error, clear the contig ID.
279 :     if ($error) {
280 :     $contig = undef;
281 :     }
282 :     # Compute the min and max from the offsets collected.
283 :     $min = min(@offsets);
284 :     $max = max(@offsets);
285 : parrello 1.21 # Save the dominant orientation.
286 :     my $dir = ($dirs{'-'} > $dirs{'+'} ? '-' : '+');
287 : parrello 1.13 # Return the results.
288 : parrello 1.21 return ($contig, $min, $max, $dir);
289 : parrello 1.13 }
290 :    
291 : parrello 1.21 =head3 boundary_loc
292 : parrello 1.17
293 : parrello 1.21 my $singleLoc = SeedUtils::boundary_loc($locations);
294 : parrello 1.17
295 : parrello 1.21 Return a single location string (see L<SAP/Location Strings>) that covers
296 :     the incoming list of locations. NOTE that if the locations listed span
297 :     more than one contig, this method may return an unexpected result.
298 : parrello 1.17
299 : parrello 1.21 This method is useful for converting the output of L<SAP/fid_locations> to
300 :     location strings.
301 : parrello 1.17
302 :     =over 4
303 :    
304 : parrello 1.21 =item locations
305 : parrello 1.17
306 : parrello 1.21 A set of location strings formatted as a comma-separated list or as a reference
307 :     to a list of location strings.
308 : parrello 1.17
309 :     =item RETURN
310 :    
311 : parrello 1.21 Returns a single location string that covers as best as possible the list of
312 :     incoming locations.
313 : parrello 1.17
314 :     =back
315 :    
316 :     =cut
317 :    
318 : parrello 1.21 sub boundary_loc {
319 : parrello 1.17 # Get the parameters.
320 : parrello 1.21 my ($locations) = @_;
321 :     # Convert the incoming locations to a list.
322 :     my @locs;
323 :     if (ref $locations eq 'ARRAY') {
324 :     @locs = @$locations;
325 :     } else {
326 :     @locs = split /\s*,\s*/, $locations;
327 :     }
328 :     # Get the boundary information for the listed locations.
329 :     my ($contig, $min, $max, $dir) = boundaries_of(\@locs);
330 :     # Compute the indicated location string.
331 :     my $retVal = $contig . "_" . ($dir eq '+' ? $min : $max) . $dir .
332 :     ($max + 1 - $min);
333 : parrello 1.17 # Return the result.
334 : parrello 1.21 return $retVal;
335 : parrello 1.17 }
336 :    
337 : parrello 1.21 =head3 by_fig_id
338 : parrello 1.17
339 : parrello 1.21 my @sorted_by_fig_id = sort { by_fig_id($a,$b) } @fig_ids;
340 : parrello 1.16
341 : parrello 1.21 Compare two feature IDs.
342 : parrello 1.16
343 : parrello 1.21 This function is designed to assist in sorting features by ID. The sort is by
344 :     genome ID followed by feature type and then feature number.
345 : parrello 1.16
346 :     =over 4
347 :    
348 : parrello 1.21 =item a
349 :    
350 :     First feature ID.
351 :    
352 :     =item b
353 : parrello 1.16
354 : parrello 1.21 Second feature ID.
355 : parrello 1.16
356 :     =item RETURN
357 :    
358 : parrello 1.21 Returns a negative number if the first parameter is smaller, zero if both parameters
359 :     are equal, and a positive number if the first parameter is greater.
360 : parrello 1.16
361 :     =back
362 :    
363 :     =cut
364 :    
365 : parrello 1.21 sub by_fig_id {
366 :     my($a,$b) = @_;
367 :     my($g1,$g2,$t1,$t2,$n1,$n2);
368 :     if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
369 :     ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {
370 :     ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
371 :     } else {
372 :     $a cmp $b;
373 : parrello 1.16 }
374 : parrello 1.19 }
375 :    
376 : parrello 1.21 =head3 create_fasta_record
377 : parrello 1.19
378 : parrello 1.21 my $fastaString = create_fasta_record($id, $comment, $sequence, $stripped);
379 : parrello 1.19
380 : parrello 1.21 Create a FASTA record from the specified DNA or protein sequence. The
381 :     sequence will be split into 60-character lines, and the record will
382 :     include an identifier line.
383 : parrello 1.19
384 :     =over 4
385 :    
386 : parrello 1.21 =item id
387 :    
388 :     ID for the sequence, to be placed at the beginning of the identifier
389 :     line.
390 :    
391 :     =item comment (optional)
392 : parrello 1.19
393 : parrello 1.21 Comment text to place after the ID on the identifier line. If this parameter
394 :     is empty, undefined, or 0, no comment will be placed.
395 : parrello 1.19
396 : parrello 1.21 =item sequence
397 : parrello 1.19
398 : parrello 1.21 Sequence of letters to form into FASTA. For purposes of convenience, whitespace
399 :     characters in the sequence will be removed automatically.
400 : parrello 1.19
401 : parrello 1.21 =item stripped (optional)
402 : parrello 1.19
403 : parrello 1.21 If TRUE, then the sequence will be returned unmodified instead of converted
404 :     to FASTA format. The default is FALSE.
405 : parrello 1.19
406 :     =item RETURN
407 :    
408 : parrello 1.21 Returns the desired sequence in FASTA format.
409 : parrello 1.19
410 :     =back
411 :    
412 :     =cut
413 :    
414 : parrello 1.21 sub create_fasta_record {
415 : parrello 1.19 # Get the parameters.
416 : parrello 1.21 my ($id, $comment, $sequence, $stripped) = @_;
417 :     # Declare the return variable.
418 :     my $retVal;
419 :     # If we're in stripped mode, we just return the sequence.
420 :     if ($stripped) {
421 :     $retVal = $sequence;
422 : parrello 1.19 } else {
423 : parrello 1.21 # Here we have to do the FASTA conversion. Start with the ID.
424 :     my $header = ">$id";
425 :     # Add a comment, if any.
426 :     if ($comment) {
427 :     $header .= " $comment";
428 :     }
429 :     # Clean up the sequence.
430 :     $sequence =~ s/\s+//g;
431 :     # We need to format the sequence into 60-byte chunks. We use the infamous
432 :     # grep-split trick. The split, because of the presence of the parentheses,
433 :     # includes the matched delimiters in the output list. The grep strips out
434 :     # the empty list items that appear between the so-called delimiters, since
435 :     # the delimiters are what we want.
436 :     my @chunks = grep { $_ } split /(.{1,60})/, $sequence;
437 :     # Add the chunks and the trailer.
438 :     $retVal = join("\n", $header, @chunks) . "\n";
439 : parrello 1.19 }
440 :     # Return the result.
441 : parrello 1.21 return $retVal;
442 : parrello 1.19 }
443 :    
444 : parrello 1.21 =head3 display_id_and_seq
445 : parrello 1.19
446 : parrello 1.21 SeedUtils::display_id_and_seq($id_and_comment, $seqP, $fh);
447 : parrello 1.19
448 : parrello 1.21 Display a fasta ID and sequence to the specified open file. This method is designed
449 :     to work well with L</read_fasta_sequence> and L</rev_comp>, because it takes as
450 :     input a string pointer rather than a string. If the file handle is omitted it
451 :     defaults to STDOUT.
452 : parrello 1.19
453 : parrello 1.21 The output is formatted into a FASTA record. The first line of the output is
454 :     preceded by a C<< > >> symbol, and the sequence is split into 60-character
455 :     chunks displayed one per line. Thus, this method can be used to produce
456 :     FASTA files from data gathered by the rest of the system.
457 : parrello 1.19
458 :     =over 4
459 :    
460 : parrello 1.21 =item id_and_comment
461 : parrello 1.19
462 : parrello 1.21 The sequence ID and (optionally) the comment from the sequence's FASTA record.
463 :     The ID
464 : parrello 1.19
465 : parrello 1.21 =item seqP
466 : parrello 1.19
467 : parrello 1.21 Reference to a string containing the sequence. The sequence is automatically
468 :     formatted into 60-character chunks displayed one per line.
469 : parrello 1.19
470 : parrello 1.21 =item fh
471 :    
472 :     Open file handle to which the ID and sequence should be output. If omitted,
473 :     C<\*STDOUT> is assumed.
474 :    
475 :     =back
476 :    
477 :     =cut
478 :    
479 :     sub display_id_and_seq {
480 :    
481 :     my( $id, $seqP, $fh ) = @_;
482 :    
483 :     if (! defined($fh) ) { $fh = \*STDOUT; }
484 :    
485 :     print $fh ">$id\n";
486 :     &display_seq($seqP, $fh);
487 :     }
488 :    
489 :     =head3 display_seq
490 :    
491 :     SeedUtils::display_seq(\$seqP, $fh);
492 :    
493 :     Display a fasta sequence to the specified open file. If the file handle is
494 :     omitted it defaults to STDOUT.
495 :    
496 :     The sequence is split into 60-character chunks displayed one per line for
497 :     readability.
498 :    
499 :     =over 4
500 :    
501 :     =item seqP
502 :    
503 :     Reference to a string containing the sequence.
504 :    
505 :     =item fh
506 :    
507 :     Open file handle to which the sequence should be output. If omitted,
508 :     C<STDOUT> is assumed.
509 :    
510 :     =back
511 :    
512 :     =cut
513 :    
514 :     sub display_seq {
515 :    
516 :     my ( $seqP, $fh ) = @_;
517 :     my ( $i, $n, $ln );
518 :    
519 :     if (! defined($fh) ) { $fh = \*STDOUT; }
520 :    
521 :     $n = length($$seqP);
522 :     # confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
523 :     for ($i=0; ($i < $n); $i += 60) {
524 :     if (($i + 60) <= $n) {
525 :     $ln = substr($$seqP,$i,60);
526 :     } else {
527 :     $ln = substr($$seqP,$i,($n-$i));
528 :     }
529 :     print $fh "$ln\n";
530 :     }
531 :     }
532 :    
533 :    
534 :     =head3 genome_of
535 :    
536 :     my $genomeID = genome_of($fid);
537 :    
538 :     Return the Genome ID embedded in the specified FIG feature ID.
539 :    
540 :     =over 4
541 :    
542 :     =item fid
543 :    
544 :     Feature ID of interest.
545 :    
546 :     =item RETURN
547 :    
548 :     Returns the genome ID in the middle portion of the FIG feature ID. If the
549 :     feature ID is invalid, this method returns an undefined value.
550 :    
551 :     =back
552 :    
553 :     =cut
554 :    
555 :     sub genome_of {
556 :     # Get the parameters.
557 :     my ($fid) = @_;
558 :     # Declare the return variable.
559 :     my $retVal;
560 :     # Parse the feature ID.
561 :     if ($fid =~ /^fig\|(\d+\.\d+)\./) {
562 :     $retVal = $1;
563 :     }
564 :     # Return the result.
565 :     return $retVal;
566 :     }
567 :    
568 :     =head3 hypo
569 :    
570 :     my $flag = hypo($func);
571 :    
572 :     Return TRUE if the specified functional role is hypothetical, else FALSE.
573 :     Hypothetical functional roles are identified by key words in the text,
574 :     such as I<hypothesis>, I<predicted>, or I<glimmer> (among others).
575 :    
576 :     =over 4
577 :    
578 :     =item func
579 :    
580 :     Text of the functional role whose nature is to be determined.
581 :    
582 :     =item RETURN
583 :    
584 :     Returns TRUE if the role is hypothetical, else FALSE.
585 :    
586 :     =back
587 :    
588 :     =cut
589 :    
590 :     sub hypo {
591 :     my ($func) = @_;
592 :     if (! $func) { return 1 }
593 :     if ($func =~ /lmo\d+ protein/i) { return 1 }
594 :     if ($func =~ /hypoth/i) { return 1 }
595 :     if ($func =~ /conserved protein/i) { return 1 }
596 :     if ($func =~ /gene product/i) { return 1 }
597 :     if ($func =~ /interpro/i) { return 1 }
598 :     if ($func =~ /B[sl][lr]\d/i) { return 1 }
599 :     if ($func =~ /^U\d/) { return 1 }
600 :     if ($func =~ /^orf[^_]/i) { return 1 }
601 :     if ($func =~ /uncharacterized/i) { return 1 }
602 :     if ($func =~ /pseudogene/i) { return 1 }
603 :     if ($func =~ /^predicted/i) { return 1 }
604 :     if ($func =~ /AGR_/) { return 1 }
605 :     if ($func =~ /similar to/i) { return 1 }
606 :     if ($func =~ /similarity/i) { return 1 }
607 :     if ($func =~ /glimmer/i) { return 1 }
608 :     if ($func =~ /unknown/i) { return 1 }
609 :     if (($func =~ /domain/i) ||
610 :     ($func =~ /^y[a-z]{2,4}\b/i) ||
611 :     ($func =~ /complete/i) ||
612 :     ($func =~ /ensang/i) ||
613 :     ($func =~ /unnamed/i) ||
614 :     ($func =~ /EG:/) ||
615 :     ($func =~ /orf\d+/i) ||
616 :     ($func =~ /RIKEN/) ||
617 :     ($func =~ /Expressed/i) ||
618 :     ($func =~ /[a-zA-Z]{2,3}\|/) ||
619 :     ($func =~ /predicted by Psort/) ||
620 :     ($func =~ /^bh\d+/i) ||
621 :     ($func =~ /cds_/i) ||
622 :     ($func =~ /^[a-z]{2,3}\d+[^:\+\-0-9]/i) ||
623 :     ($func =~ /similar to/i) ||
624 :     ($func =~ / identi/i) ||
625 :     ($func =~ /ortholog of/i) ||
626 :     ($func =~ /structural feature/i)) { return 1 }
627 :     return 0;
628 :    
629 :     }
630 :    
631 : parrello 1.37 =head3 id_url
632 :    
633 :     my $url = id_url($id);
634 :    
635 :     Return the URL for a specified external gene ID.
636 :    
637 :     =over 4
638 :    
639 :     =item id
640 :    
641 :     ID of the gene whose URL is desired.
642 :    
643 :     =item RETURN
644 :    
645 :     Returns a URL for displaying information about the specified gene. The structure
646 :     of the ID is used to determine the web site to which the gene belongs.
647 :    
648 :     =back
649 :    
650 :     =cut
651 :    
652 :     sub id_url {
653 :     # Get the parameters.
654 :     my ($id) = @_;
655 :     # Declare the return variable.
656 :     my $retVal;
657 :     # Parse the ID to determine the URL.
658 :     if ($id =~ /^(?:ref\|)?([NXYZA]P_[0-9\.]+)$/) {
659 :     $retVal = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=protein;cmd=search;term=$1";
660 :     } elsif ($id =~ /^gi\|(\d+)$/) {
661 :     $retVal = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve;db=Protein&list_uids=$1;dopt=GenPept";
662 :     } elsif ($id =~ /^cmr\|(.+)$/) {
663 :     $retVal = "http://cmr.jcvi.org/tigr-scripts/CMR/shared/GenePage.cgi?locus=$1";
664 :     } elsif ($id =~ /^sp\|([A-Z0-9]{6})$/) {
665 :     $retVal = "http://us.expasy.org/cgi-bin/get-sprot-entry?$1";
666 :     } elsif ($id =~ /^uni\|([A-Z0-9_]+?)$/) {
667 :     $retVal = "http://www.ebi.uniprot.org/uniprot-srv/uniProtView.do?proteinAc=$1";
668 :     } elsif ($id =~ /^kegg\|(([a-z]{2,4}):([a-zA-Z_0-9]+))$/) {
669 :     $retVal = "http://www.genome.ad.jp/dbget-bin/www_bget?$2+$3";
670 :     } elsif ($id =~ /^tr\|([a-zA-Z0-9]+)$/) {
671 :     $retVal = "http://ca.expasy.org/uniprot/$1";
672 :     } elsif ($id =~ /^(fig\|\d+\.\d+\.\w+\.\d+)$/) {
673 :     $retVal = "http://seed-viewer.theseed.org/?pattern=$1&page=SearchResult&action=check_search";
674 :     }
675 :     # Return the result.
676 :     return $retVal;
677 :     }
678 :    
679 :    
680 : parrello 1.21 =head3 location_cmp
681 :    
682 :     my $cmp = location_cmp($loc1, $loc2);
683 :    
684 :     Compare two location strings (see L<SAP/Location Strings>).
685 :    
686 :     The ordering principle for locations is that they are sorted first by contig ID, then by
687 :     leftmost position, in reverse order by length, and then by direction. The effect is that
688 :     within a contig, the locations are ordered first and foremost in the way they would
689 :     appear when displayed in a picture of the contig and second in such a way that embedded
690 :     locations come after the locations in which they are embedded. In the case of two
691 :     locations that represent the exact same base pairs, the forward (C<+>) location is
692 :     arbitrarily placed first.
693 :    
694 :     =over 4
695 :    
696 :     =item loc1
697 :    
698 :     First location string to compare.
699 :    
700 :     =item loc2
701 :    
702 :     Second location string to compare.
703 :    
704 :     =item RETURN
705 :    
706 :     Returns a negative number if the B<loc1> location sorts first, a positive number if the
707 :     B<loc2> location sorts first, and zero if the two locations are the same.
708 : parrello 1.19
709 :    
710 :     =back
711 :    
712 :     =cut
713 :    
714 :     sub location_cmp {
715 :     # Get the parameters.
716 :     my ($loc1, $loc2) = @_;
717 :     # Parse the locations.
718 :     my ($contig1, $beg1, $strand1, $len1) = $loc1 =~ /^(.+)_(\d+)([+-])(\d+)$/;
719 :     my $left1 = ($strand1 eq '+' ? $beg1 : $beg1 - $len1 + 1);
720 :     my ($contig2, $beg2, $strand2, $len2) = $loc2 =~ /^(.+)_(\d+)([+-])(\d+)$/;
721 :     my $left2 = ($strand2 eq '+' ? $beg2 : $beg2 - $len2 + 1);
722 :     # Declare the return variable. We compare the indicative parts of the location
723 :     # in order. Note that we sort in reverse order by length, so the last comparison
724 :     # puts 2 before 1.
725 :     my $retVal = ($contig1 cmp $contig2) || ($left1 <=> $left2) ||
726 :     ($len2 <=> $len1);
727 :     # If everything matches to this point, check the strands.
728 :     if (! $retVal) {
729 :     if ($strand1 eq '+') {
730 :     # First location is positive, so if the locations are unequal, it
731 :     # sorts first.
732 :     $retVal = ($strand2 eq '+' ? 0 : -1);
733 :     } else {
734 :     # First location is negative, so if the locations are unequal, it
735 :     # sorts second.
736 :     $retVal = ($strand1 eq '-' ? 0 : 1);
737 :     }
738 :     }
739 :     # Return the result.
740 :     return $retVal;
741 :     }
742 :    
743 : parrello 1.21 =head3 location_string
744 : parrello 1.19
745 : parrello 1.21 my $locString = location_string($contig, $beg, $end);
746 : parrello 1.19
747 : parrello 1.21 Form a location string for the specified contig that starts at the
748 :     indicated begin location and stops at the indicated end location. A
749 :     single-base location will automatically be put on the forward strand.
750 : parrello 1.19
751 :     =over 4
752 :    
753 : parrello 1.21 =item contig
754 :    
755 :     ID of the contig to contain this location.
756 :    
757 :     =item beg
758 :    
759 :     Beginning offset of the location.
760 :    
761 :     =item end
762 : parrello 1.19
763 : parrello 1.21 Ending offset of the location.
764 : parrello 1.19
765 :     =item RETURN
766 :    
767 : parrello 1.21 Returns a location string (see L<SAP/Location String>) for the specified
768 :     location.
769 : parrello 1.19
770 :     =back
771 :    
772 :     =cut
773 :    
774 : parrello 1.21 sub location_string {
775 : parrello 1.19 # Get the parameters.
776 : parrello 1.21 my ($contig, $beg, $end) = @_;
777 :     # Compute the strand and length.
778 :     my ($strand, $len);
779 :     if ($beg <= $end) {
780 :     $strand = '+';
781 :     $len = $end + 1 - $beg;
782 :     } else {
783 :     $strand = '-';
784 :     $len = $beg + 1 - $end;
785 : parrello 1.19 }
786 :     # Return the result.
787 : parrello 1.21 return $contig . "_$beg$strand$len";
788 : parrello 1.16 }
789 :    
790 : parrello 1.5 =head3 max
791 :    
792 :     my $max = max(@nums);
793 :    
794 :     Return the maximum number from all the values in the specified list.
795 :    
796 :     =over 4
797 :    
798 :     =item nums
799 :    
800 :     List of numbers to examine.
801 :    
802 :     =item RETURN
803 :    
804 :     Returns the maximum numeric value from the specified parameters, or
805 :     an undefined value if an empty list is passed in.
806 :    
807 :     =back
808 :    
809 :     =cut
810 :    
811 :     sub max {
812 :     my ($retVal, @nums) = @_;
813 :     for my $num (@nums) {
814 :     if ($num > $retVal) {
815 :     $retVal = $num;
816 :     }
817 :     }
818 :     return $retVal;
819 :     }
820 :    
821 :     =head3 min
822 :    
823 :     my $min = min(@nums);
824 :    
825 :     Return the minimum number from all the values in the specified list.
826 :    
827 :     =over 4
828 :    
829 :     =item nums
830 :    
831 :     List of numbers to examine.
832 :    
833 :     =item RETURN
834 :    
835 :     Returns the minimum numeric value from the specified parameters, or
836 :     an undefined value if an empty list is passed in.
837 :    
838 :     =back
839 :    
840 :     =cut
841 :    
842 :     sub min {
843 :     my ($retVal, @nums) = @_;
844 :     for my $num (@nums) {
845 :     if ($num < $retVal) {
846 :     $retVal = $num;
847 :     }
848 :     }
849 :     return $retVal;
850 :     }
851 :    
852 : parrello 1.21 =head3 parse_fasta_record
853 : parrello 1.1
854 : parrello 1.21 my ($id, $comment, $seq) = parse_fasta_record($string);
855 : parrello 1.1
856 : parrello 1.21 Extract the ID, comment, and sequence from a single FASTA record. For
857 :     backward compatability, instead of a FASTA record the ID and sequence can
858 :     be specified separated by a comma. In this case, the returned comment
859 :     will be empty.
860 : parrello 1.1
861 :     =over 4
862 :    
863 : parrello 1.21 =item string
864 : parrello 1.1
865 : parrello 1.21 A single FASTA record, or an ID and sequence separated by a single comma,
866 :     an unadorned sequence, a 2-element list consisting of an ID and a sequence,
867 :     or a 3-element list consisting of an ID, a comment, and a sequence.
868 : parrello 1.7
869 : parrello 1.1 =item RETURN
870 :    
871 : parrello 1.21 Returns a three-element list consisting of the incoming ID, the associated
872 :     comment, and the specified DNA or protein sequence. If the incoming string is
873 :     invalid, all three list elements will come back undefined. If no ID is
874 :     specified, an MD5 will be provided.
875 : parrello 1.1
876 :     =back
877 :    
878 :     =cut
879 :    
880 : parrello 1.21 sub parse_fasta_record {
881 : parrello 1.1 # Get the parameters.
882 : parrello 1.21 my ($string) = @_;
883 :     # Declare the return variables.
884 :     my ($id, $comment, $seq);
885 :     # Check the type of input string.
886 :     if (! defined $string) {
887 :     # Do nothing if no string was passed in. This extra check prevents a
888 :     # warning at runtime.
889 :     } elsif ($string =~ /^>(\S+)([\t ]+[^\r\n]*)?[\r\n]+(.+)/s) {
890 :     # Here we have a standard FASTA string.
891 :     ($id, $comment, $seq) = ($1, $2, $3);
892 :     # Remove white space from the sequence string.
893 :     $seq =~ s/\s+//sg;
894 :     # Trim front of comment.
895 :     $comment =~ s/^s+//;
896 :     } elsif ($string =~ /(.+?)\s*,\s*(.+)/) {
897 :     ($id, $comment, $seq) = ($1, '', $2);
898 :     } elsif (ref $string eq 'ARRAY') {
899 :     # Here the data came in pre-formatted as a list reference.
900 :     ($id, $comment, $seq) = @$string;
901 :     # If there's no comment, we need to adjust.
902 :     if (! defined $seq) {
903 :     $seq = $comment;
904 :     $comment = '';
905 :     }
906 : parrello 1.7 } else {
907 : parrello 1.21 # Here we have only a sequence. We need to construct the ID.
908 :     $seq = $string;
909 :     require Digest::MD5;
910 :     $id = "md5|" . Digest::MD5::md5_base64($seq);
911 :     $comment = "";
912 :     }
913 :     # Return the results.
914 :     return ($id, $comment, $seq);
915 :     }
916 :    
917 :     =head3 parse_location
918 :    
919 :     my ($contig, $begin, $end, $strand) = parse_location($locString);
920 :    
921 :     Return the contigID, start offset, and end offset for a specified
922 :     location string (see L<SAP/Location Strings>).
923 :    
924 :     =over 4
925 :    
926 :     =item locString
927 :    
928 :     Location string to parse.
929 :    
930 :     =item RETURN
931 :    
932 :     Returns a four-element list containing the contig ID from the location
933 :     string, the starting offset of the location, the ending offset, and the
934 :     strand. If the location string is not valid, the values returned will be
935 :     C<undef>.
936 :    
937 :     =back
938 :    
939 :     =cut
940 :    
941 :     sub parse_location {
942 :     # Get the parameters.
943 :     my ($locString) = @_;
944 :     # Declare the return variables.
945 :     my ($contig, $begin, $end, $strand);
946 :     # Parse the location string.
947 :     if ($locString =~ /^(.+)_(\d+)([+-])(\d+)$/) {
948 :     # Pull out the contig ID, strand, and begin location.
949 :     $contig = $1;
950 :     $begin = $2;
951 :     $strand = $3;
952 :     # Compute the ending location from the direction and length.
953 :     if ($3 eq '+') {
954 :     $end = $begin + $4 - 1;
955 :     } else {
956 :     $end = $begin - $4 + 1;
957 : parrello 1.7 }
958 : parrello 1.1 }
959 : parrello 1.21 # Return the results.
960 :     return ($contig, $begin, $end, $strand);
961 : parrello 1.1 }
962 :    
963 : parrello 1.2 =head3 rev_comp
964 :    
965 :     my $revcmp = rev_comp($dna);
966 :    
967 :     or
968 :    
969 :     rev_comp(\$dna);
970 :    
971 :     Return the reverse complement of a DNA string.
972 :    
973 :     =over 4
974 :    
975 :     =item dna
976 :    
977 :     Either a DNA string, or a reference to a DNA string.
978 :    
979 :     =item RETURN
980 :    
981 :     If the input is a DNA string, returns the reverse complement. If the
982 :     input is a reference to a DNA string, the string itself is reverse
983 :     complemented.
984 :    
985 :     =back
986 :    
987 :     =cut
988 :    
989 :     sub rev_comp {
990 :     # Get the parameters.
991 :     my ($dna) = @_;
992 :     # Determine how we were called.
993 :     my ($retVal, $refMode);
994 : parrello 1.24 if (! ref $dna) {
995 : parrello 1.21 $retVal = reverse $dna;
996 : parrello 1.2 $refMode = 0;
997 :     } else {
998 : parrello 1.21 $retVal = reverse $$dna;
999 : parrello 1.2 $refMode = 1;
1000 :     }
1001 : parrello 1.21 # Now $retVal contains the reversed DNA string, and $refMode is TRUE iff the
1002 :     # user passed in a reference. The following translation step complements the
1003 :     # string.
1004 :     $retVal =~ tr/acgtumrwsykbdhvACGTUMRWSYKBDHV/tgcaakywsrmvhdbTGCAAKYWSRMVHDB/;
1005 : parrello 1.2 # Return the result in the method corresponding to the way it came in.
1006 :     if ($refMode) {
1007 :     $$dna = $retVal;
1008 :     return;
1009 :     } else {
1010 :     return $retVal;
1011 :     }
1012 :     }
1013 :    
1014 : parrello 1.21 # Synonym of rev_comp, for backward compatibility.
1015 :     sub reverse_comp {
1016 :     return rev_comp($_[0]);
1017 : parrello 1.15 }
1018 :    
1019 : parrello 1.21 =head3 roles_of_function
1020 : parrello 1.15
1021 : parrello 1.21 my @roles = roles_of_function($assignment);
1022 : parrello 1.3
1023 : parrello 1.21 Return a list of the functional roles in the specified assignment string.
1024 :     A single assignment may contain multiple roles as well as comments; this
1025 :     method separates them out.
1026 : parrello 1.3
1027 :     =over 4
1028 :    
1029 : parrello 1.21 =item assignment
1030 : parrello 1.3
1031 : parrello 1.21 Functional assignment to parse for roles.
1032 : parrello 1.3
1033 :     =item RETURN
1034 :    
1035 : parrello 1.21 Returns a list of the individual roles in the assignment.
1036 : parrello 1.3
1037 :     =back
1038 :    
1039 :     =cut
1040 :    
1041 : parrello 1.21 sub roles_of_function {
1042 : parrello 1.3 # Get the parameters.
1043 : parrello 1.21 my ($assignment) = @_;
1044 :     # Remove any comment.
1045 : parrello 1.30 my $commentFree = ($assignment =~ /(.+?)\s*[#!]/ ? $1 : $assignment);
1046 : parrello 1.21 # Split out the roles.
1047 : parrello 1.30 my @retVal = split /\s+[\/@]\s+|\s*;\s+/, $commentFree;
1048 : parrello 1.3 # Return the result.
1049 : parrello 1.21 return @retVal;
1050 : parrello 1.3 }
1051 :    
1052 : parrello 1.6 =head3 sims
1053 :    
1054 : parrello 1.31 my @sims = sims($id, $maxN, $maxP, 'fig');
1055 :    
1056 :     or
1057 :    
1058 :     my @sims = sims($id, $maxN, $maxP, 'all);
1059 :    
1060 : parrello 1.6
1061 :     Retrieve similarities from the network similarity server. The similarity retrieval
1062 :     is performed using an HTTP user agent that returns similarity data in multiple
1063 :     chunks. An anonymous subroutine is passed to the user agent that parses and
1064 :     reformats the chunks as they come in. The similarites themselves are returned
1065 :     as B<Sim> objects. Sim objects are actually list references with 15 elements.
1066 :     The Sim object methods allow access to the elements by name.
1067 :    
1068 :     Similarities can be either raw or expanded. The raw similarities are basic
1069 :     hits between features with similar DNA. Expanding a raw similarity drags in any
1070 :     features considered substantially identical. So, for example, if features B<A1>,
1071 :     B<A2>, and B<A3> are all substatially identical to B<A>, then a raw similarity
1072 :     B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>.
1073 :    
1074 :     =over 4
1075 :    
1076 :     =item id
1077 :    
1078 :     ID of the feature whose similarities are desired, or reference to a list
1079 :     of the IDs of the features whose similarities are desired.
1080 :    
1081 : parrello 1.31 =item maxN (optional)
1082 : parrello 1.6
1083 : parrello 1.27 Maximum number of similarities to return for each incoming feature.
1084 : parrello 1.6
1085 : parrello 1.31 =item maxP (optional)
1086 : parrello 1.6
1087 :     The maximum allowable similarity score.
1088 :    
1089 : parrello 1.31 =item select (optional)
1090 : parrello 1.6
1091 :     Selection criterion: C<raw> means only raw similarities are returned; C<fig>
1092 :     means only similarities to FIG features are returned; C<all> means all expanded
1093 :     similarities are returned; and C<figx> means similarities are expanded until the
1094 :     number of FIG features equals the maximum.
1095 :    
1096 : parrello 1.31 =item max_expand (optional)
1097 : parrello 1.6
1098 :     The maximum number of features to expand.
1099 :    
1100 : parrello 1.31 =item filters (optional)
1101 : parrello 1.6
1102 :     Reference to a hash containing filter information, or a subroutine that can be
1103 :     used to filter the sims.
1104 :    
1105 :     =item RETURN
1106 :    
1107 :     Returns a list of L<Sim> objects.
1108 :    
1109 :     =back
1110 :    
1111 :     =cut
1112 :    
1113 :     sub sims {
1114 :     # Get the parameters.
1115 :     my($id, $maxN, $maxP, $select, $max_expand, $filters) = @_;
1116 :     # Get the URL for submitting to the sims server.
1117 :     my $url = $FIG_Config::sim_server_url || "http://bioseed.mcs.anl.gov/simserver/perl/sims.pl";
1118 :     # Get a list of the IDs to process.
1119 :     my @ids;
1120 :     if (ref($id) eq "ARRAY") {
1121 :     @ids = @$id;
1122 :     } else {
1123 :     @ids = ($id);
1124 :     }
1125 :     # Form a list of the parameters to pass to the server.
1126 :     my %args = ();
1127 :     $args{id} = \@ids;
1128 :     $args{maxN} = $maxN if defined($maxN);
1129 :     $args{maxP} = $maxP if defined($maxP);
1130 :     $args{select} = $select if defined($select);
1131 :     $args{max_expand} = $max_expand if defined($max_expand);
1132 :     # If the filter is a hash, put the filters in the argument list.
1133 :     if (ref($filters) eq 'HASH') {
1134 :     for my $k (keys(%$filters))
1135 :     {
1136 :     $args{"filter_$k"}= $filters->{$k};
1137 :     }
1138 :     }
1139 :     # Get the user agent.
1140 :     require LWP::UserAgent;
1141 :     my $ua = LWP::UserAgent->new();
1142 :     # Insure we have the Sim module.
1143 :     require Sim;
1144 :     #
1145 :     # Our next task is to create the anonymous subroutine that will process the
1146 :     # chunks that come back from the server. We require three global variables:
1147 :     # @sims to hold the similarities found, $tail to remember the unprocessed
1148 :     # data from the previous chunk, and $chunks to count the chunks.
1149 :     #
1150 :     my @retVal;
1151 :     my $tail;
1152 :     my $chunks = 0;
1153 :     #
1154 :     # ANONYMOUS SUBROUTINE
1155 :     #
1156 :     my $cb = sub {
1157 :     eval {
1158 :     # Get the parameters.
1159 :     my ($data, $command) = @_;
1160 :     # Check for a reset command. If we get one, we discard any data
1161 :     # in progress.
1162 :     if ($command && $command eq 'reset') {
1163 :     $tail = '';
1164 :     } else {
1165 :     $chunks++;
1166 :     # Get the data to process. Note we concatenate it to the incoming
1167 :     # tail from last time.
1168 :     my $c = $tail . $data;
1169 :     # Make sure the caller hasn't messed up the new-line character.
1170 :     # FASTA readers in particular are notorious for doing things
1171 :     # like that.
1172 :     local $/ = "\n";
1173 :     # Split the input into lines.
1174 :     my @lines = split(/\n/, $c);
1175 :     # If the input does not end with a new-line, we have a partial
1176 :     # chunk and need to put it in the tail for next time. If not,
1177 :     # there is no tail for next time.
1178 :     if (substr($c, -1, 1) ne "\n") {
1179 :     $tail = pop @lines;
1180 :     } else {
1181 :     $tail = '';
1182 :     }
1183 :     # Loop through the lines. Note there's no need to chomp because
1184 :     # the SPLIT took out the new-line characters.
1185 :     for my $l (@lines) {
1186 :     # Split the line into fields.
1187 :     my @s = split(/\t/, $l);
1188 :     # Insure we have all the fields we need.
1189 : parrello 1.12 if (@s >= 9) {
1190 : parrello 1.6 # Check to see if we've seen this SIM before.
1191 :     my $id1 = $s[0];
1192 :     my $id2 = $s[1];
1193 :     # Add it to the result list.
1194 :     push(@retVal, bless \@s, 'Sim');
1195 :     }
1196 :     }
1197 :     }
1198 :     };
1199 :     };
1200 :     #
1201 :     # END OF ANONYMOUS SUBROUTINE
1202 :     #
1203 :     # Now we're ready to start. Because networking is an iffy thing, we set up
1204 :     # to try our request multiple times.
1205 :     my $n_retries = 10;
1206 :     my $attempts = 0;
1207 :     # Set the timeout value, in seconds.
1208 :     $ua->timeout(180);
1209 :     # Loop until we succeed or run out of retries.
1210 :     my $done = 0;
1211 :     while (! $done && $attempts++ < $n_retries) {
1212 :     # Reset the content processor. This clears the tail.
1213 :     &$cb(undef, 'reset');
1214 :     my $resp = $ua->post($url, \%args, ':content_cb' => $cb);
1215 :     if ($resp->is_success) {
1216 :     # If the response was successful, get the content. This triggers
1217 :     # the anonymous subroutine.
1218 :     my $x = $resp->content;
1219 :     # Denote we've been successful.
1220 :     $done = 1;
1221 :     }
1222 :     }
1223 :     return @retVal;
1224 :     }
1225 :    
1226 : parrello 1.21 =head3 standard_genetic_code
1227 :    
1228 :     my $code = standard_genetic_code();
1229 :    
1230 :     Return a hash containing the standard translation of nucleotide triples to proteins.
1231 :     Methods such as L</translate> can take a translation scheme as a parameter. This method
1232 :     returns the default translation scheme. The scheme is implemented as a reference to a
1233 :     hash that contains nucleotide triplets as keys and has protein letters as values.
1234 :    
1235 :     =cut
1236 :    
1237 :     sub standard_genetic_code {
1238 :    
1239 :     my $code = {};
1240 : parrello 1.8
1241 : parrello 1.21 $code->{"AAA"} = "K";
1242 :     $code->{"AAC"} = "N";
1243 :     $code->{"AAG"} = "K";
1244 :     $code->{"AAT"} = "N";
1245 :     $code->{"ACA"} = "T";
1246 :     $code->{"ACC"} = "T";
1247 :     $code->{"ACG"} = "T";
1248 :     $code->{"ACT"} = "T";
1249 :     $code->{"AGA"} = "R";
1250 :     $code->{"AGC"} = "S";
1251 :     $code->{"AGG"} = "R";
1252 :     $code->{"AGT"} = "S";
1253 :     $code->{"ATA"} = "I";
1254 :     $code->{"ATC"} = "I";
1255 :     $code->{"ATG"} = "M";
1256 :     $code->{"ATT"} = "I";
1257 :     $code->{"CAA"} = "Q";
1258 :     $code->{"CAC"} = "H";
1259 :     $code->{"CAG"} = "Q";
1260 :     $code->{"CAT"} = "H";
1261 :     $code->{"CCA"} = "P";
1262 :     $code->{"CCC"} = "P";
1263 :     $code->{"CCG"} = "P";
1264 :     $code->{"CCT"} = "P";
1265 :     $code->{"CGA"} = "R";
1266 :     $code->{"CGC"} = "R";
1267 :     $code->{"CGG"} = "R";
1268 :     $code->{"CGT"} = "R";
1269 :     $code->{"CTA"} = "L";
1270 :     $code->{"CTC"} = "L";
1271 :     $code->{"CTG"} = "L";
1272 :     $code->{"CTT"} = "L";
1273 :     $code->{"GAA"} = "E";
1274 :     $code->{"GAC"} = "D";
1275 :     $code->{"GAG"} = "E";
1276 :     $code->{"GAT"} = "D";
1277 :     $code->{"GCA"} = "A";
1278 :     $code->{"GCC"} = "A";
1279 :     $code->{"GCG"} = "A";
1280 :     $code->{"GCT"} = "A";
1281 :     $code->{"GGA"} = "G";
1282 :     $code->{"GGC"} = "G";
1283 :     $code->{"GGG"} = "G";
1284 :     $code->{"GGT"} = "G";
1285 :     $code->{"GTA"} = "V";
1286 :     $code->{"GTC"} = "V";
1287 :     $code->{"GTG"} = "V";
1288 :     $code->{"GTT"} = "V";
1289 :     $code->{"TAA"} = "*";
1290 :     $code->{"TAC"} = "Y";
1291 :     $code->{"TAG"} = "*";
1292 :     $code->{"TAT"} = "Y";
1293 :     $code->{"TCA"} = "S";
1294 :     $code->{"TCC"} = "S";
1295 :     $code->{"TCG"} = "S";
1296 :     $code->{"TCT"} = "S";
1297 :     $code->{"TGA"} = "*";
1298 :     $code->{"TGC"} = "C";
1299 :     $code->{"TGG"} = "W";
1300 :     $code->{"TGT"} = "C";
1301 :     $code->{"TTA"} = "L";
1302 :     $code->{"TTC"} = "F";
1303 :     $code->{"TTG"} = "L";
1304 :     $code->{"TTT"} = "F";
1305 : parrello 1.8
1306 : parrello 1.21 return $code;
1307 :     }
1308 : parrello 1.8
1309 : parrello 1.21 =head3 strand_of
1310 : parrello 1.8
1311 : parrello 1.21 my $plusOrMinus = strand_of($loc);
1312 : parrello 1.8
1313 : parrello 1.21 Return the strand (C<+> or C<->) from the specified location string.
1314 : parrello 1.8
1315 : parrello 1.21 =over 4
1316 : parrello 1.8
1317 : parrello 1.21 =item loc
1318 : parrello 1.8
1319 : parrello 1.21 Location string to parse (see L<SAP/Location Strings>).
1320 : parrello 1.9
1321 :     =item RETURN
1322 :    
1323 : parrello 1.21 Returns C<+> if the location is on the forward strand, else C<->.
1324 : parrello 1.9
1325 :     =back
1326 :    
1327 :     =cut
1328 :    
1329 : parrello 1.21 sub strand_of {
1330 : parrello 1.9 # Get the parameters.
1331 : parrello 1.21 my ($loc) = @_;
1332 :     # Declare the return variable.
1333 :     my $retVal;
1334 :     # Parse the strand indicator from the location.
1335 :     if ($loc =~ /\d+([+-])\d+/) {
1336 :     $retVal = $1;
1337 : parrello 1.9 }
1338 : parrello 1.21 # Return the result.
1339 :     return $retVal;
1340 : parrello 1.9 }
1341 :    
1342 : parrello 1.21 =head3 strip_ec
1343 : parrello 1.9
1344 : parrello 1.21 my $role = strip_ec($rawRole);
1345 : parrello 1.11
1346 : parrello 1.21 Strip the EC number (if any) from the specified role or functional
1347 :     assignment.
1348 : parrello 1.11
1349 :     =over 4
1350 :    
1351 : parrello 1.21 =item rawRole
1352 : parrello 1.11
1353 : parrello 1.21 Role or functional assignment from which the EC numbers are to be stripped.
1354 : parrello 1.11
1355 :     =item RETURN
1356 :    
1357 : parrello 1.21 Returns the incoming string with any EC numbers removed. The EC numbers must
1358 :     be formatted in the standard format used by the SEED (with the C<EC> prefix
1359 :     and surrounding parentheses).
1360 : parrello 1.11
1361 :     =back
1362 :    
1363 :     =cut
1364 :    
1365 : parrello 1.21 sub strip_ec {
1366 :     # Get the parameters.
1367 :     my ($rawRole) = @_;
1368 :     # Declare the return variable.
1369 :     my $retVal = $rawRole;
1370 :     # Remove the EC numbers.
1371 :     $retVal =~ s/\s*\(EC\s+[0-9.\-]+\)//g;
1372 :     # Return the result.
1373 :     return $retVal;
1374 : parrello 1.15 }
1375 :    
1376 :     =head3 translate
1377 :    
1378 :     my $aa_seq = translate($dna_seq, $code, $fix_start);
1379 :    
1380 :     Translate a DNA sequence to a protein sequence using the specified genetic code.
1381 :     If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to
1382 :     C<M>. (In the standard genetic code, these two combinations normally translate
1383 :     to C<V> and C<L>, respectively.)
1384 :    
1385 :     =over 4
1386 :    
1387 :     =item dna_seq
1388 :    
1389 :     DNA sequence to translate. Note that the DNA sequence can only contain
1390 :     known nucleotides.
1391 :    
1392 :     =item code
1393 :    
1394 :     Reference to a hash specifying the translation code. The hash is keyed by
1395 :     nucleotide triples, and the value for each key is the corresponding protein
1396 :     letter. If this parameter is omitted, the L</standard_genetic_code> will be
1397 :     used.
1398 :    
1399 :     =item fix_start
1400 :    
1401 :     TRUE if the first triple is to get special treatment, else FALSE. If TRUE,
1402 :     then a value of C<TTG> or C<GTG> in the first position will be translated to
1403 :     C<M> instead of the value specified in the translation code.
1404 :    
1405 :     =item RETURN
1406 :    
1407 :     Returns a string resulting from translating each nucleotide triple into a
1408 :     protein letter.
1409 :    
1410 :     =back
1411 :    
1412 :     =cut
1413 :     #: Return Type $;
1414 :     sub translate {
1415 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
1416 :    
1417 :     my( $dna,$code,$start ) = @_;
1418 :     my( $i,$j,$ln );
1419 :     my( $x,$y );
1420 :     my( $prot );
1421 :    
1422 :     if (! defined($code)) {
1423 :     $code = &FIG::standard_genetic_code;
1424 :     }
1425 :     $ln = length($dna);
1426 :     $prot = "X" x ($ln/3);
1427 :     $dna =~ tr/a-z/A-Z/;
1428 :    
1429 :     for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) {
1430 :     $x = substr($dna,$i,3);
1431 :     if ($y = $code->{$x}) {
1432 :     substr($prot,$j,1) = $y;
1433 :     }
1434 :     }
1435 :    
1436 :     if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) {
1437 :     substr($prot,0,1) = 'M';
1438 :     }
1439 :     return $prot;
1440 :     }
1441 :    
1442 : parrello 1.36 =head3 type_of
1443 :    
1444 :     my $type = SeedUtils::type_of($fid);
1445 :    
1446 :     Return the type of a feature, given a FIG feature ID (e.g. C<fig|100226.1.peg.3361>).
1447 :    
1448 :     =over 4
1449 :    
1450 :     =item fid
1451 :    
1452 :     ID of a feature whose type is desired.
1453 :    
1454 :     =item RETURN
1455 :    
1456 :     Returns the type of the feature (e.g. C<peg>, C<rna>, ...).
1457 :    
1458 :     =back
1459 :    
1460 :     =cut
1461 :    
1462 :     sub type_of {
1463 :     # Get the parameter.
1464 :     my ($fid) = @_;
1465 :     # Declare the return variable. We return undefined if the ID is unparseable.
1466 :     my $retVal;
1467 :     # Parse the FIG ID.
1468 :     if ($fid =~ /fig\|\d+\.\d+\.(\w+)\./) {
1469 :     # Save the type segment.
1470 :     $retVal = $1;
1471 :     }
1472 :     # Return the result.
1473 :     return $retVal;
1474 :     }
1475 :    
1476 :    
1477 : parrello 1.21 =head3 verify_dir
1478 : parrello 1.18
1479 : parrello 1.21 verify_dir($dirName);
1480 : parrello 1.18
1481 : parrello 1.21 Insure that the specified directory exists. If the directory does not
1482 :     exist, it will be created.
1483 : parrello 1.18
1484 :     =over 4
1485 :    
1486 : parrello 1.21 =item dirName
1487 : parrello 1.18
1488 : parrello 1.21 Name of the relevant directory.
1489 : parrello 1.18
1490 :     =back
1491 :    
1492 :     =cut
1493 :    
1494 : parrello 1.21 sub verify_dir {
1495 : parrello 1.18 # Get the parameters.
1496 : parrello 1.21 my ($dirName) = @_;
1497 :     # Strip off the final slash, if any.
1498 :     $dirName =~ s#/$##;
1499 :     # Only proceed if the directory does NOT already exist.
1500 :     if (! -d $dirName) {
1501 :     # If there is a parent directory, recursively insure it is there.
1502 :     if ($dirName =~ m#(.+)/[^/]+$#) {
1503 :     verify_dir($1);
1504 : overbeek 1.20 }
1505 : parrello 1.21 # Create this particular directory with full permissions.
1506 :     mkdir $dirName, 0777;
1507 : overbeek 1.20 }
1508 :     }
1509 :    
1510 : disz 1.28 sub strip_func {
1511 :     my($func) = @_;
1512 : overbeek 1.20
1513 : disz 1.28 $func =~ s/^FIG\d{6}[^:]*:\s*//;
1514 :     $func =~ s/\s*\#.*$//;
1515 :     return($func);
1516 :     }
1517 :    
1518 : olson 1.32 sub verify_db {
1519 :     my($db,$type) = @_;
1520 :    
1521 :     #
1522 :     # Find formatdb; if we're operating in a SEED environment
1523 :     # use it from there.
1524 :     #
1525 :    
1526 :     my $path = '';
1527 :     if ($FIG_Config::blastbin ne '' && -d $FIG_Config::blastbin)
1528 :     {
1529 :     $path = "$FIG_Config::blastbin/";
1530 :     }
1531 :     elsif ($FIG_Config::ext_bin ne '' && -d $FIG_Config::ext_bin)
1532 :     {
1533 :     $path = "$FIG_Config::ext_bin/";
1534 :     }
1535 :    
1536 :    
1537 :     my @cmd;
1538 :     if ($type =~ /^p/i)
1539 :     {
1540 :     if ((! -s "$db.psq") || (-M "$db.psq" > -M $db))
1541 :     {
1542 :     @cmd = ("${path}formatdb", "-p", "T", "-i", $db);
1543 :     }
1544 :     }
1545 :     else
1546 :     {
1547 :     if ((! -s "$db.nsq") || (-M "$db.nsq" > -M $db))
1548 :     {
1549 :     @cmd = ("${path}formatdb", "-p", "F", "-i", $db);
1550 :     }
1551 :     }
1552 :     if (@cmd)
1553 :     {
1554 :     my $rc = system(@cmd);
1555 :     if ($rc != 0)
1556 :     {
1557 :     warn "SeedUtils::verify_db: formatdb failed with rc=$rc: @cmd\n";
1558 :     }
1559 :     }
1560 :     }
1561 :    
1562 :    
1563 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3