[Bio] / FigKernelPackages / SeedUtils.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/SeedUtils.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.39 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 : parrello 1.11 # This is a SAS component.
4 :    
5 : parrello 1.1 #
6 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
7 :     # for Interpretations of Genomes. All Rights Reserved.
8 :     #
9 :     # This file is part of the SEED Toolkit.
10 :     #
11 :     # The SEED Toolkit is free software. You can redistribute
12 :     # it and/or modify it under the terms of the SEED Toolkit
13 :     # Public License.
14 :     #
15 :     # You should have received a copy of the SEED Toolkit Public License
16 :     # along with this program; if not write to the University of Chicago
17 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
18 :     # Genomes at veronika@thefig.info or download a copy from
19 :     # http://www.theseed.org/LICENSE.TXT.
20 :     #
21 :    
22 :     package SeedUtils;
23 : olson 1.38 use BerkTable;
24 :     use DB_File;
25 : parrello 1.1
26 : olson 1.32 #
27 :     # In case we are running in a SEED, pull in the FIG_Config
28 :     #
29 :     BEGIN
30 :     {
31 :     eval {
32 :     require FIG_Config;
33 :     };
34 :     }
35 :    
36 : parrello 1.1 use strict;
37 : parrello 1.12 no warnings qw(once);
38 : parrello 1.1 use base qw(Exporter);
39 : parrello 1.15 our @EXPORT = qw(hypo boundaries_of parse_fasta_record create_fasta_record
40 :     rev_comp genome_of min max sims verify_dir between translate
41 : parrello 1.21 standard_genetic_code parse_location roles_of_function
42 : olson 1.33 strip_ec location_string location_cmp strand_of by_fig_id
43 : olson 1.39 verify_db bbh_data id_url validate_fasta_file);
44 : parrello 1.1
45 :     =head1 SEED Utility Methods
46 :    
47 :     =head2 Introduction
48 :    
49 :     This is a simple utility package that performs functions useful for
50 :     bioinformatics, but that do not require access to the databases.
51 :    
52 :     =head2 Public Methods
53 :    
54 : parrello 1.21 =head3 abbrev
55 :    
56 :     my $abbrev = SeedUtils::abbrev($genome_name);
57 :    
58 :     Return an abbreviation of the specified genome name. This method is used to create
59 :     a reasonably indicative genome name that fits in 10 characters.
60 :    
61 :     =over 4
62 :    
63 :     =item genome_name
64 :    
65 :     Genome name to abbreviate.
66 :    
67 :     =item RETURN
68 :    
69 :     Returns a shortened version of the genome name that is 10 characters or less in
70 :     length.
71 :    
72 :     =back
73 :    
74 :     =cut
75 :    
76 :     sub abbrev {
77 :     my($genome_name) = @_;
78 :    
79 :     $genome_name =~ s/^(\S{3})\S+/$1./;
80 :     $genome_name =~ s/^(\S+)\s+(\S{3})\S+/$1$2./;
81 :     $genome_name =~ s/ //g;
82 :     if (length($genome_name) > 10) {
83 :     $genome_name = substr($genome_name,0,10);
84 :     }
85 :     return $genome_name;
86 :     }
87 :    
88 : parrello 1.34 =head3 bbh_data
89 :    
90 :     my $bbhList = FIGRules::bbh_data($peg, $cutoff);
91 :    
92 :     Return a list of the bi-directional best hits relevant to the specified PEG.
93 :    
94 :     =over 4
95 :    
96 :     =item peg
97 :    
98 :     ID of the feature whose bidirectional best hits are desired.
99 :    
100 :     =item cutoff
101 :    
102 :     Similarity cutoff. If omitted, 1e-10 is used.
103 :    
104 :     =item RETURN
105 :    
106 :     Returns a reference to a list of 3-tuples. The first element of the list is the best-hit
107 :     PEG; the second element is the score. A lower score indicates a better match. The third
108 :     element is the normalized bit score for the pair; it is normalized to the length
109 :     of the protein.
110 :    
111 :     =back
112 :    
113 :     =cut
114 :     #: Return Type @@;
115 :     sub bbh_data {
116 :     my ($peg, $cutoff) = @_;
117 :     my @retVal = ();
118 :     require LWP::UserAgent;
119 :     my $ua = LWP::UserAgent->new();
120 :     my $url = "http://bioseed.mcs.anl.gov/simserver/perl/bbhs.pl";
121 :     my $retries = 5;
122 :     my $done = 0;
123 :     my $resp;
124 :     while ($retries > 0 && ! $done) {
125 :     $resp = $ua->post($url, { id => $peg, cutoff => $cutoff });
126 :     if ($resp->is_success) {
127 :     my $dat = $resp->content;
128 :     while ($dat =~ /([^\n]+)\n/g) {
129 :     my @l = split(/\t/, $1);
130 :     push @retVal, \@l;
131 :     }
132 :     $done = 1;
133 :     } else {
134 :     $retries--;
135 :     }
136 :     }
137 :     if (! $done) {
138 :     die("Failure retrieving bbh data for $peg: " . $resp->status_line);
139 :     }
140 :     return \@retVal;
141 :     }
142 :    
143 :    
144 : parrello 1.21 =head3 between
145 :    
146 :     my $flag = between($x, $y, $z);
147 :    
148 :     Determine whether or not $y is between $x and $z.
149 :    
150 :     =over 4
151 :    
152 :     =item x
153 :    
154 :     First edge number.
155 :    
156 :     =item y
157 :    
158 :     Number to examine.
159 :    
160 :     =item z
161 :    
162 :     Second edge number.
163 :    
164 :     =item RETURN
165 :    
166 :     Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check
167 :     is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns
168 :     TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than
169 :     I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower,
170 :     then the return is TRUE if I<$x> >= I$<$y> >= I<$z>.
171 :    
172 :     =back
173 :    
174 :     =cut
175 :     #: Return Type $;
176 :     sub between {
177 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
178 :     my($x,$y,$z) = @_;
179 :    
180 :     if ($x < $z) {
181 :     return (($x <= $y) && ($y <= $z));
182 :     } else {
183 :     return (($x >= $y) && ($y >= $z));
184 :     }
185 :     }
186 :    
187 :    
188 : parrello 1.13 =head3 boundaries_of
189 :    
190 : parrello 1.21 my ($contig, $min, $max, $dir) = boundaries_of($locs);
191 : parrello 1.13
192 :     Return the boundaries of a set of locations. The contig, the leftmost
193 :     location, and the rightmost location will be returned to the caller. If
194 :     more than one contig is represented, the method will return an undefined
195 :     value for the contig (indicating failure).
196 :    
197 :     =over 4
198 :    
199 :     =item locs
200 :    
201 :     Reference to a list of location strings. A location string contains a contig ID,
202 :     and underscore (C<_>), a starting offset, a strand identifier (C<+> or C<->), and
203 :     a length (e.g. C<360108.3:NC_10023P_1000+2000> begins at offset 1000 of contig
204 :     B<360108.3:NC_10023P> and covers 2000 base pairs on the C<+> strand).
205 :    
206 :     =item RETURN
207 :    
208 : parrello 1.21 Returns a 4-element list. The first element is the contig ID from all the locations,
209 :     the second is the offset of leftmost base pair represented in the locations, the
210 :     third is the offset of the rightmost base pair represented in the locations, and
211 :     the fourth is the dominant strand.
212 : parrello 1.13
213 :     =back
214 :    
215 :     =cut
216 :    
217 :     sub boundaries_of {
218 :     # Get the parameters.
219 :     my ($locs) = @_;
220 :     # Declare the return variables.
221 :     my ($contig, $min, $max);
222 :     # We'll put all the starting and ending offsets found in here.
223 :     my @offsets;
224 : parrello 1.21 # This will be used to count the orientations.
225 :     my %dirs = ('+' => 0, '-' => 0);
226 : parrello 1.13 # This will count the number of errors found.
227 :     my $error = 0;
228 : parrello 1.35 # Insure the location is an array. If it's not, we assume it's a string of
229 :     # comma-separated values.
230 :     if (! ref $locs) {
231 :     $locs = [ split /\s*,\s*/, $locs ];
232 :     }
233 : parrello 1.13 # Loop through the locations.
234 :     for my $loc (@$locs) {
235 :     # Parse this location.
236 :     if ($loc =~ /^(.+)_(\d+)(\+|\-)(\d+)$/) {
237 : parrello 1.23 # This is a valid location string.
238 : parrello 1.13 my ($newContig, $begin, $dir, $len) = ($1, $2, $3, $4);
239 :     # Is this contig valid?
240 :     if ($contig && $newContig ne $contig) {
241 :     # No, skip this location.
242 :     $error++;
243 :     } else {
244 :     # Save the contig.
245 :     $contig = $newContig;
246 : parrello 1.21 # Count the orientation.
247 :     $dirs{$dir}++;
248 : parrello 1.13 # Compute the ending offset.
249 :     my $end = ($dir eq '+' ? $begin + $len - 1 : $begin - $len + 1);
250 :     # Save both offsets.
251 :     push @offsets, $begin, $end;
252 :     }
253 : parrello 1.23 } elsif ($loc =~ /^(.+)_(\d+)_(\d+)/) {
254 :     # Here we have an old-style location string.
255 :     my($newContig, $start, $stop) = ($1, $2, $3);
256 :     # Is this contig valid?
257 : olson 1.22 if ($contig && $newContig ne $contig) {
258 :     # No, skip this location.
259 :     $error++;
260 :     } else {
261 :     # Save the contig.
262 :     $contig = $newContig;
263 : parrello 1.23 # Compute the orientation.
264 :     my $dir;
265 :     if ($start > $stop) {
266 :     $dir = '-';
267 :     } else {
268 :     $dir = '+';
269 :     }
270 :     # Count it.
271 : olson 1.22 $dirs{$dir}++;
272 : parrello 1.23 # Save both offsets.
273 : olson 1.22 push @offsets, $start, $stop;
274 :     }
275 : parrello 1.13 } else {
276 :     # The location is invalid, so it's an error,
277 :     $error++;
278 :     }
279 :     }
280 :     # If there's an error, clear the contig ID.
281 :     if ($error) {
282 :     $contig = undef;
283 :     }
284 :     # Compute the min and max from the offsets collected.
285 :     $min = min(@offsets);
286 :     $max = max(@offsets);
287 : parrello 1.21 # Save the dominant orientation.
288 :     my $dir = ($dirs{'-'} > $dirs{'+'} ? '-' : '+');
289 : parrello 1.13 # Return the results.
290 : parrello 1.21 return ($contig, $min, $max, $dir);
291 : parrello 1.13 }
292 :    
293 : parrello 1.21 =head3 boundary_loc
294 : parrello 1.17
295 : parrello 1.21 my $singleLoc = SeedUtils::boundary_loc($locations);
296 : parrello 1.17
297 : parrello 1.21 Return a single location string (see L<SAP/Location Strings>) that covers
298 :     the incoming list of locations. NOTE that if the locations listed span
299 :     more than one contig, this method may return an unexpected result.
300 : parrello 1.17
301 : parrello 1.21 This method is useful for converting the output of L<SAP/fid_locations> to
302 :     location strings.
303 : parrello 1.17
304 :     =over 4
305 :    
306 : parrello 1.21 =item locations
307 : parrello 1.17
308 : parrello 1.21 A set of location strings formatted as a comma-separated list or as a reference
309 :     to a list of location strings.
310 : parrello 1.17
311 :     =item RETURN
312 :    
313 : parrello 1.21 Returns a single location string that covers as best as possible the list of
314 :     incoming locations.
315 : parrello 1.17
316 :     =back
317 :    
318 :     =cut
319 :    
320 : parrello 1.21 sub boundary_loc {
321 : parrello 1.17 # Get the parameters.
322 : parrello 1.21 my ($locations) = @_;
323 :     # Convert the incoming locations to a list.
324 :     my @locs;
325 :     if (ref $locations eq 'ARRAY') {
326 :     @locs = @$locations;
327 :     } else {
328 :     @locs = split /\s*,\s*/, $locations;
329 :     }
330 :     # Get the boundary information for the listed locations.
331 :     my ($contig, $min, $max, $dir) = boundaries_of(\@locs);
332 :     # Compute the indicated location string.
333 :     my $retVal = $contig . "_" . ($dir eq '+' ? $min : $max) . $dir .
334 :     ($max + 1 - $min);
335 : parrello 1.17 # Return the result.
336 : parrello 1.21 return $retVal;
337 : parrello 1.17 }
338 :    
339 : parrello 1.21 =head3 by_fig_id
340 : parrello 1.17
341 : parrello 1.21 my @sorted_by_fig_id = sort { by_fig_id($a,$b) } @fig_ids;
342 : parrello 1.16
343 : parrello 1.21 Compare two feature IDs.
344 : parrello 1.16
345 : parrello 1.21 This function is designed to assist in sorting features by ID. The sort is by
346 :     genome ID followed by feature type and then feature number.
347 : parrello 1.16
348 :     =over 4
349 :    
350 : parrello 1.21 =item a
351 :    
352 :     First feature ID.
353 :    
354 :     =item b
355 : parrello 1.16
356 : parrello 1.21 Second feature ID.
357 : parrello 1.16
358 :     =item RETURN
359 :    
360 : parrello 1.21 Returns a negative number if the first parameter is smaller, zero if both parameters
361 :     are equal, and a positive number if the first parameter is greater.
362 : parrello 1.16
363 :     =back
364 :    
365 :     =cut
366 :    
367 : parrello 1.21 sub by_fig_id {
368 :     my($a,$b) = @_;
369 :     my($g1,$g2,$t1,$t2,$n1,$n2);
370 :     if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
371 :     ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {
372 :     ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
373 :     } else {
374 :     $a cmp $b;
375 : parrello 1.16 }
376 : parrello 1.19 }
377 :    
378 : parrello 1.21 =head3 create_fasta_record
379 : parrello 1.19
380 : parrello 1.21 my $fastaString = create_fasta_record($id, $comment, $sequence, $stripped);
381 : parrello 1.19
382 : parrello 1.21 Create a FASTA record from the specified DNA or protein sequence. The
383 :     sequence will be split into 60-character lines, and the record will
384 :     include an identifier line.
385 : parrello 1.19
386 :     =over 4
387 :    
388 : parrello 1.21 =item id
389 :    
390 :     ID for the sequence, to be placed at the beginning of the identifier
391 :     line.
392 :    
393 :     =item comment (optional)
394 : parrello 1.19
395 : parrello 1.21 Comment text to place after the ID on the identifier line. If this parameter
396 :     is empty, undefined, or 0, no comment will be placed.
397 : parrello 1.19
398 : parrello 1.21 =item sequence
399 : parrello 1.19
400 : parrello 1.21 Sequence of letters to form into FASTA. For purposes of convenience, whitespace
401 :     characters in the sequence will be removed automatically.
402 : parrello 1.19
403 : parrello 1.21 =item stripped (optional)
404 : parrello 1.19
405 : parrello 1.21 If TRUE, then the sequence will be returned unmodified instead of converted
406 :     to FASTA format. The default is FALSE.
407 : parrello 1.19
408 :     =item RETURN
409 :    
410 : parrello 1.21 Returns the desired sequence in FASTA format.
411 : parrello 1.19
412 :     =back
413 :    
414 :     =cut
415 :    
416 : parrello 1.21 sub create_fasta_record {
417 : parrello 1.19 # Get the parameters.
418 : parrello 1.21 my ($id, $comment, $sequence, $stripped) = @_;
419 :     # Declare the return variable.
420 :     my $retVal;
421 :     # If we're in stripped mode, we just return the sequence.
422 :     if ($stripped) {
423 :     $retVal = $sequence;
424 : parrello 1.19 } else {
425 : parrello 1.21 # Here we have to do the FASTA conversion. Start with the ID.
426 :     my $header = ">$id";
427 :     # Add a comment, if any.
428 :     if ($comment) {
429 :     $header .= " $comment";
430 :     }
431 :     # Clean up the sequence.
432 :     $sequence =~ s/\s+//g;
433 :     # We need to format the sequence into 60-byte chunks. We use the infamous
434 :     # grep-split trick. The split, because of the presence of the parentheses,
435 :     # includes the matched delimiters in the output list. The grep strips out
436 :     # the empty list items that appear between the so-called delimiters, since
437 :     # the delimiters are what we want.
438 :     my @chunks = grep { $_ } split /(.{1,60})/, $sequence;
439 :     # Add the chunks and the trailer.
440 :     $retVal = join("\n", $header, @chunks) . "\n";
441 : parrello 1.19 }
442 :     # Return the result.
443 : parrello 1.21 return $retVal;
444 : parrello 1.19 }
445 :    
446 : parrello 1.21 =head3 display_id_and_seq
447 : parrello 1.19
448 : parrello 1.21 SeedUtils::display_id_and_seq($id_and_comment, $seqP, $fh);
449 : parrello 1.19
450 : parrello 1.21 Display a fasta ID and sequence to the specified open file. This method is designed
451 :     to work well with L</read_fasta_sequence> and L</rev_comp>, because it takes as
452 :     input a string pointer rather than a string. If the file handle is omitted it
453 :     defaults to STDOUT.
454 : parrello 1.19
455 : parrello 1.21 The output is formatted into a FASTA record. The first line of the output is
456 :     preceded by a C<< > >> symbol, and the sequence is split into 60-character
457 :     chunks displayed one per line. Thus, this method can be used to produce
458 :     FASTA files from data gathered by the rest of the system.
459 : parrello 1.19
460 :     =over 4
461 :    
462 : parrello 1.21 =item id_and_comment
463 : parrello 1.19
464 : parrello 1.21 The sequence ID and (optionally) the comment from the sequence's FASTA record.
465 :     The ID
466 : parrello 1.19
467 : parrello 1.21 =item seqP
468 : parrello 1.19
469 : parrello 1.21 Reference to a string containing the sequence. The sequence is automatically
470 :     formatted into 60-character chunks displayed one per line.
471 : parrello 1.19
472 : parrello 1.21 =item fh
473 :    
474 :     Open file handle to which the ID and sequence should be output. If omitted,
475 :     C<\*STDOUT> is assumed.
476 :    
477 :     =back
478 :    
479 :     =cut
480 :    
481 :     sub display_id_and_seq {
482 :    
483 :     my( $id, $seqP, $fh ) = @_;
484 :    
485 :     if (! defined($fh) ) { $fh = \*STDOUT; }
486 :    
487 :     print $fh ">$id\n";
488 :     &display_seq($seqP, $fh);
489 :     }
490 :    
491 :     =head3 display_seq
492 :    
493 :     SeedUtils::display_seq(\$seqP, $fh);
494 :    
495 :     Display a fasta sequence to the specified open file. If the file handle is
496 :     omitted it defaults to STDOUT.
497 :    
498 :     The sequence is split into 60-character chunks displayed one per line for
499 :     readability.
500 :    
501 :     =over 4
502 :    
503 :     =item seqP
504 :    
505 :     Reference to a string containing the sequence.
506 :    
507 :     =item fh
508 :    
509 :     Open file handle to which the sequence should be output. If omitted,
510 :     C<STDOUT> is assumed.
511 :    
512 :     =back
513 :    
514 :     =cut
515 :    
516 :     sub display_seq {
517 :    
518 :     my ( $seqP, $fh ) = @_;
519 :     my ( $i, $n, $ln );
520 :    
521 :     if (! defined($fh) ) { $fh = \*STDOUT; }
522 :    
523 :     $n = length($$seqP);
524 :     # confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
525 :     for ($i=0; ($i < $n); $i += 60) {
526 :     if (($i + 60) <= $n) {
527 :     $ln = substr($$seqP,$i,60);
528 :     } else {
529 :     $ln = substr($$seqP,$i,($n-$i));
530 :     }
531 :     print $fh "$ln\n";
532 :     }
533 :     }
534 :    
535 :    
536 :     =head3 genome_of
537 :    
538 :     my $genomeID = genome_of($fid);
539 :    
540 :     Return the Genome ID embedded in the specified FIG feature ID.
541 :    
542 :     =over 4
543 :    
544 :     =item fid
545 :    
546 :     Feature ID of interest.
547 :    
548 :     =item RETURN
549 :    
550 :     Returns the genome ID in the middle portion of the FIG feature ID. If the
551 :     feature ID is invalid, this method returns an undefined value.
552 :    
553 :     =back
554 :    
555 :     =cut
556 :    
557 :     sub genome_of {
558 :     # Get the parameters.
559 :     my ($fid) = @_;
560 :     # Declare the return variable.
561 :     my $retVal;
562 :     # Parse the feature ID.
563 :     if ($fid =~ /^fig\|(\d+\.\d+)\./) {
564 :     $retVal = $1;
565 :     }
566 :     # Return the result.
567 :     return $retVal;
568 :     }
569 :    
570 :     =head3 hypo
571 :    
572 :     my $flag = hypo($func);
573 :    
574 :     Return TRUE if the specified functional role is hypothetical, else FALSE.
575 :     Hypothetical functional roles are identified by key words in the text,
576 :     such as I<hypothesis>, I<predicted>, or I<glimmer> (among others).
577 :    
578 :     =over 4
579 :    
580 :     =item func
581 :    
582 :     Text of the functional role whose nature is to be determined.
583 :    
584 :     =item RETURN
585 :    
586 :     Returns TRUE if the role is hypothetical, else FALSE.
587 :    
588 :     =back
589 :    
590 :     =cut
591 :    
592 :     sub hypo {
593 :     my ($func) = @_;
594 :     if (! $func) { return 1 }
595 :     if ($func =~ /lmo\d+ protein/i) { return 1 }
596 :     if ($func =~ /hypoth/i) { return 1 }
597 :     if ($func =~ /conserved protein/i) { return 1 }
598 :     if ($func =~ /gene product/i) { return 1 }
599 :     if ($func =~ /interpro/i) { return 1 }
600 :     if ($func =~ /B[sl][lr]\d/i) { return 1 }
601 :     if ($func =~ /^U\d/) { return 1 }
602 :     if ($func =~ /^orf[^_]/i) { return 1 }
603 :     if ($func =~ /uncharacterized/i) { return 1 }
604 :     if ($func =~ /pseudogene/i) { return 1 }
605 :     if ($func =~ /^predicted/i) { return 1 }
606 :     if ($func =~ /AGR_/) { return 1 }
607 :     if ($func =~ /similar to/i) { return 1 }
608 :     if ($func =~ /similarity/i) { return 1 }
609 :     if ($func =~ /glimmer/i) { return 1 }
610 :     if ($func =~ /unknown/i) { return 1 }
611 :     if (($func =~ /domain/i) ||
612 :     ($func =~ /^y[a-z]{2,4}\b/i) ||
613 :     ($func =~ /complete/i) ||
614 :     ($func =~ /ensang/i) ||
615 :     ($func =~ /unnamed/i) ||
616 :     ($func =~ /EG:/) ||
617 :     ($func =~ /orf\d+/i) ||
618 :     ($func =~ /RIKEN/) ||
619 :     ($func =~ /Expressed/i) ||
620 :     ($func =~ /[a-zA-Z]{2,3}\|/) ||
621 :     ($func =~ /predicted by Psort/) ||
622 :     ($func =~ /^bh\d+/i) ||
623 :     ($func =~ /cds_/i) ||
624 :     ($func =~ /^[a-z]{2,3}\d+[^:\+\-0-9]/i) ||
625 :     ($func =~ /similar to/i) ||
626 :     ($func =~ / identi/i) ||
627 :     ($func =~ /ortholog of/i) ||
628 :     ($func =~ /structural feature/i)) { return 1 }
629 :     return 0;
630 :    
631 :     }
632 :    
633 : parrello 1.37 =head3 id_url
634 :    
635 :     my $url = id_url($id);
636 :    
637 :     Return the URL for a specified external gene ID.
638 :    
639 :     =over 4
640 :    
641 :     =item id
642 :    
643 :     ID of the gene whose URL is desired.
644 :    
645 :     =item RETURN
646 :    
647 :     Returns a URL for displaying information about the specified gene. The structure
648 :     of the ID is used to determine the web site to which the gene belongs.
649 :    
650 :     =back
651 :    
652 :     =cut
653 :    
654 :     sub id_url {
655 :     # Get the parameters.
656 :     my ($id) = @_;
657 :     # Declare the return variable.
658 :     my $retVal;
659 :     # Parse the ID to determine the URL.
660 :     if ($id =~ /^(?:ref\|)?([NXYZA]P_[0-9\.]+)$/) {
661 :     $retVal = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=protein;cmd=search;term=$1";
662 :     } elsif ($id =~ /^gi\|(\d+)$/) {
663 :     $retVal = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve;db=Protein&list_uids=$1;dopt=GenPept";
664 :     } elsif ($id =~ /^cmr\|(.+)$/) {
665 :     $retVal = "http://cmr.jcvi.org/tigr-scripts/CMR/shared/GenePage.cgi?locus=$1";
666 :     } elsif ($id =~ /^sp\|([A-Z0-9]{6})$/) {
667 :     $retVal = "http://us.expasy.org/cgi-bin/get-sprot-entry?$1";
668 :     } elsif ($id =~ /^uni\|([A-Z0-9_]+?)$/) {
669 :     $retVal = "http://www.ebi.uniprot.org/uniprot-srv/uniProtView.do?proteinAc=$1";
670 :     } elsif ($id =~ /^kegg\|(([a-z]{2,4}):([a-zA-Z_0-9]+))$/) {
671 :     $retVal = "http://www.genome.ad.jp/dbget-bin/www_bget?$2+$3";
672 :     } elsif ($id =~ /^tr\|([a-zA-Z0-9]+)$/) {
673 :     $retVal = "http://ca.expasy.org/uniprot/$1";
674 :     } elsif ($id =~ /^(fig\|\d+\.\d+\.\w+\.\d+)$/) {
675 :     $retVal = "http://seed-viewer.theseed.org/?pattern=$1&page=SearchResult&action=check_search";
676 :     }
677 :     # Return the result.
678 :     return $retVal;
679 :     }
680 :    
681 :    
682 : parrello 1.21 =head3 location_cmp
683 :    
684 :     my $cmp = location_cmp($loc1, $loc2);
685 :    
686 :     Compare two location strings (see L<SAP/Location Strings>).
687 :    
688 :     The ordering principle for locations is that they are sorted first by contig ID, then by
689 :     leftmost position, in reverse order by length, and then by direction. The effect is that
690 :     within a contig, the locations are ordered first and foremost in the way they would
691 :     appear when displayed in a picture of the contig and second in such a way that embedded
692 :     locations come after the locations in which they are embedded. In the case of two
693 :     locations that represent the exact same base pairs, the forward (C<+>) location is
694 :     arbitrarily placed first.
695 :    
696 :     =over 4
697 :    
698 :     =item loc1
699 :    
700 :     First location string to compare.
701 :    
702 :     =item loc2
703 :    
704 :     Second location string to compare.
705 :    
706 :     =item RETURN
707 :    
708 :     Returns a negative number if the B<loc1> location sorts first, a positive number if the
709 :     B<loc2> location sorts first, and zero if the two locations are the same.
710 : parrello 1.19
711 :    
712 :     =back
713 :    
714 :     =cut
715 :    
716 :     sub location_cmp {
717 :     # Get the parameters.
718 :     my ($loc1, $loc2) = @_;
719 :     # Parse the locations.
720 :     my ($contig1, $beg1, $strand1, $len1) = $loc1 =~ /^(.+)_(\d+)([+-])(\d+)$/;
721 :     my $left1 = ($strand1 eq '+' ? $beg1 : $beg1 - $len1 + 1);
722 :     my ($contig2, $beg2, $strand2, $len2) = $loc2 =~ /^(.+)_(\d+)([+-])(\d+)$/;
723 :     my $left2 = ($strand2 eq '+' ? $beg2 : $beg2 - $len2 + 1);
724 :     # Declare the return variable. We compare the indicative parts of the location
725 :     # in order. Note that we sort in reverse order by length, so the last comparison
726 :     # puts 2 before 1.
727 :     my $retVal = ($contig1 cmp $contig2) || ($left1 <=> $left2) ||
728 :     ($len2 <=> $len1);
729 :     # If everything matches to this point, check the strands.
730 :     if (! $retVal) {
731 :     if ($strand1 eq '+') {
732 :     # First location is positive, so if the locations are unequal, it
733 :     # sorts first.
734 :     $retVal = ($strand2 eq '+' ? 0 : -1);
735 :     } else {
736 :     # First location is negative, so if the locations are unequal, it
737 :     # sorts second.
738 :     $retVal = ($strand1 eq '-' ? 0 : 1);
739 :     }
740 :     }
741 :     # Return the result.
742 :     return $retVal;
743 :     }
744 :    
745 : parrello 1.21 =head3 location_string
746 : parrello 1.19
747 : parrello 1.21 my $locString = location_string($contig, $beg, $end);
748 : parrello 1.19
749 : parrello 1.21 Form a location string for the specified contig that starts at the
750 :     indicated begin location and stops at the indicated end location. A
751 :     single-base location will automatically be put on the forward strand.
752 : parrello 1.19
753 :     =over 4
754 :    
755 : parrello 1.21 =item contig
756 :    
757 :     ID of the contig to contain this location.
758 :    
759 :     =item beg
760 :    
761 :     Beginning offset of the location.
762 :    
763 :     =item end
764 : parrello 1.19
765 : parrello 1.21 Ending offset of the location.
766 : parrello 1.19
767 :     =item RETURN
768 :    
769 : parrello 1.21 Returns a location string (see L<SAP/Location String>) for the specified
770 :     location.
771 : parrello 1.19
772 :     =back
773 :    
774 :     =cut
775 :    
776 : parrello 1.21 sub location_string {
777 : parrello 1.19 # Get the parameters.
778 : parrello 1.21 my ($contig, $beg, $end) = @_;
779 :     # Compute the strand and length.
780 :     my ($strand, $len);
781 :     if ($beg <= $end) {
782 :     $strand = '+';
783 :     $len = $end + 1 - $beg;
784 :     } else {
785 :     $strand = '-';
786 :     $len = $beg + 1 - $end;
787 : parrello 1.19 }
788 :     # Return the result.
789 : parrello 1.21 return $contig . "_$beg$strand$len";
790 : parrello 1.16 }
791 :    
792 : parrello 1.5 =head3 max
793 :    
794 :     my $max = max(@nums);
795 :    
796 :     Return the maximum number from all the values in the specified list.
797 :    
798 :     =over 4
799 :    
800 :     =item nums
801 :    
802 :     List of numbers to examine.
803 :    
804 :     =item RETURN
805 :    
806 :     Returns the maximum numeric value from the specified parameters, or
807 :     an undefined value if an empty list is passed in.
808 :    
809 :     =back
810 :    
811 :     =cut
812 :    
813 :     sub max {
814 :     my ($retVal, @nums) = @_;
815 :     for my $num (@nums) {
816 :     if ($num > $retVal) {
817 :     $retVal = $num;
818 :     }
819 :     }
820 :     return $retVal;
821 :     }
822 :    
823 :     =head3 min
824 :    
825 :     my $min = min(@nums);
826 :    
827 :     Return the minimum number from all the values in the specified list.
828 :    
829 :     =over 4
830 :    
831 :     =item nums
832 :    
833 :     List of numbers to examine.
834 :    
835 :     =item RETURN
836 :    
837 :     Returns the minimum numeric value from the specified parameters, or
838 :     an undefined value if an empty list is passed in.
839 :    
840 :     =back
841 :    
842 :     =cut
843 :    
844 :     sub min {
845 :     my ($retVal, @nums) = @_;
846 :     for my $num (@nums) {
847 :     if ($num < $retVal) {
848 :     $retVal = $num;
849 :     }
850 :     }
851 :     return $retVal;
852 :     }
853 :    
854 : parrello 1.21 =head3 parse_fasta_record
855 : parrello 1.1
856 : parrello 1.21 my ($id, $comment, $seq) = parse_fasta_record($string);
857 : parrello 1.1
858 : parrello 1.21 Extract the ID, comment, and sequence from a single FASTA record. For
859 :     backward compatability, instead of a FASTA record the ID and sequence can
860 :     be specified separated by a comma. In this case, the returned comment
861 :     will be empty.
862 : parrello 1.1
863 :     =over 4
864 :    
865 : parrello 1.21 =item string
866 : parrello 1.1
867 : parrello 1.21 A single FASTA record, or an ID and sequence separated by a single comma,
868 :     an unadorned sequence, a 2-element list consisting of an ID and a sequence,
869 :     or a 3-element list consisting of an ID, a comment, and a sequence.
870 : parrello 1.7
871 : parrello 1.1 =item RETURN
872 :    
873 : parrello 1.21 Returns a three-element list consisting of the incoming ID, the associated
874 :     comment, and the specified DNA or protein sequence. If the incoming string is
875 :     invalid, all three list elements will come back undefined. If no ID is
876 :     specified, an MD5 will be provided.
877 : parrello 1.1
878 :     =back
879 :    
880 :     =cut
881 :    
882 : parrello 1.21 sub parse_fasta_record {
883 : parrello 1.1 # Get the parameters.
884 : parrello 1.21 my ($string) = @_;
885 :     # Declare the return variables.
886 :     my ($id, $comment, $seq);
887 :     # Check the type of input string.
888 :     if (! defined $string) {
889 :     # Do nothing if no string was passed in. This extra check prevents a
890 :     # warning at runtime.
891 :     } elsif ($string =~ /^>(\S+)([\t ]+[^\r\n]*)?[\r\n]+(.+)/s) {
892 :     # Here we have a standard FASTA string.
893 :     ($id, $comment, $seq) = ($1, $2, $3);
894 :     # Remove white space from the sequence string.
895 :     $seq =~ s/\s+//sg;
896 :     # Trim front of comment.
897 :     $comment =~ s/^s+//;
898 :     } elsif ($string =~ /(.+?)\s*,\s*(.+)/) {
899 :     ($id, $comment, $seq) = ($1, '', $2);
900 :     } elsif (ref $string eq 'ARRAY') {
901 :     # Here the data came in pre-formatted as a list reference.
902 :     ($id, $comment, $seq) = @$string;
903 :     # If there's no comment, we need to adjust.
904 :     if (! defined $seq) {
905 :     $seq = $comment;
906 :     $comment = '';
907 :     }
908 : parrello 1.7 } else {
909 : parrello 1.21 # Here we have only a sequence. We need to construct the ID.
910 :     $seq = $string;
911 :     require Digest::MD5;
912 :     $id = "md5|" . Digest::MD5::md5_base64($seq);
913 :     $comment = "";
914 :     }
915 :     # Return the results.
916 :     return ($id, $comment, $seq);
917 :     }
918 :    
919 :     =head3 parse_location
920 :    
921 :     my ($contig, $begin, $end, $strand) = parse_location($locString);
922 :    
923 :     Return the contigID, start offset, and end offset for a specified
924 :     location string (see L<SAP/Location Strings>).
925 :    
926 :     =over 4
927 :    
928 :     =item locString
929 :    
930 :     Location string to parse.
931 :    
932 :     =item RETURN
933 :    
934 :     Returns a four-element list containing the contig ID from the location
935 :     string, the starting offset of the location, the ending offset, and the
936 :     strand. If the location string is not valid, the values returned will be
937 :     C<undef>.
938 :    
939 :     =back
940 :    
941 :     =cut
942 :    
943 :     sub parse_location {
944 :     # Get the parameters.
945 :     my ($locString) = @_;
946 :     # Declare the return variables.
947 :     my ($contig, $begin, $end, $strand);
948 :     # Parse the location string.
949 :     if ($locString =~ /^(.+)_(\d+)([+-])(\d+)$/) {
950 :     # Pull out the contig ID, strand, and begin location.
951 :     $contig = $1;
952 :     $begin = $2;
953 :     $strand = $3;
954 :     # Compute the ending location from the direction and length.
955 :     if ($3 eq '+') {
956 :     $end = $begin + $4 - 1;
957 :     } else {
958 :     $end = $begin - $4 + 1;
959 : parrello 1.7 }
960 : parrello 1.1 }
961 : parrello 1.21 # Return the results.
962 :     return ($contig, $begin, $end, $strand);
963 : parrello 1.1 }
964 :    
965 : parrello 1.2 =head3 rev_comp
966 :    
967 :     my $revcmp = rev_comp($dna);
968 :    
969 :     or
970 :    
971 :     rev_comp(\$dna);
972 :    
973 :     Return the reverse complement of a DNA string.
974 :    
975 :     =over 4
976 :    
977 :     =item dna
978 :    
979 :     Either a DNA string, or a reference to a DNA string.
980 :    
981 :     =item RETURN
982 :    
983 :     If the input is a DNA string, returns the reverse complement. If the
984 :     input is a reference to a DNA string, the string itself is reverse
985 :     complemented.
986 :    
987 :     =back
988 :    
989 :     =cut
990 :    
991 :     sub rev_comp {
992 :     # Get the parameters.
993 :     my ($dna) = @_;
994 :     # Determine how we were called.
995 :     my ($retVal, $refMode);
996 : parrello 1.24 if (! ref $dna) {
997 : parrello 1.21 $retVal = reverse $dna;
998 : parrello 1.2 $refMode = 0;
999 :     } else {
1000 : parrello 1.21 $retVal = reverse $$dna;
1001 : parrello 1.2 $refMode = 1;
1002 :     }
1003 : parrello 1.21 # Now $retVal contains the reversed DNA string, and $refMode is TRUE iff the
1004 :     # user passed in a reference. The following translation step complements the
1005 :     # string.
1006 :     $retVal =~ tr/acgtumrwsykbdhvACGTUMRWSYKBDHV/tgcaakywsrmvhdbTGCAAKYWSRMVHDB/;
1007 : parrello 1.2 # Return the result in the method corresponding to the way it came in.
1008 :     if ($refMode) {
1009 :     $$dna = $retVal;
1010 :     return;
1011 :     } else {
1012 :     return $retVal;
1013 :     }
1014 :     }
1015 :    
1016 : parrello 1.21 # Synonym of rev_comp, for backward compatibility.
1017 :     sub reverse_comp {
1018 :     return rev_comp($_[0]);
1019 : parrello 1.15 }
1020 :    
1021 : parrello 1.21 =head3 roles_of_function
1022 : parrello 1.15
1023 : parrello 1.21 my @roles = roles_of_function($assignment);
1024 : parrello 1.3
1025 : parrello 1.21 Return a list of the functional roles in the specified assignment string.
1026 :     A single assignment may contain multiple roles as well as comments; this
1027 :     method separates them out.
1028 : parrello 1.3
1029 :     =over 4
1030 :    
1031 : parrello 1.21 =item assignment
1032 : parrello 1.3
1033 : parrello 1.21 Functional assignment to parse for roles.
1034 : parrello 1.3
1035 :     =item RETURN
1036 :    
1037 : parrello 1.21 Returns a list of the individual roles in the assignment.
1038 : parrello 1.3
1039 :     =back
1040 :    
1041 :     =cut
1042 :    
1043 : parrello 1.21 sub roles_of_function {
1044 : parrello 1.3 # Get the parameters.
1045 : parrello 1.21 my ($assignment) = @_;
1046 :     # Remove any comment.
1047 : parrello 1.30 my $commentFree = ($assignment =~ /(.+?)\s*[#!]/ ? $1 : $assignment);
1048 : parrello 1.21 # Split out the roles.
1049 : parrello 1.30 my @retVal = split /\s+[\/@]\s+|\s*;\s+/, $commentFree;
1050 : parrello 1.3 # Return the result.
1051 : parrello 1.21 return @retVal;
1052 : parrello 1.3 }
1053 :    
1054 : parrello 1.6 =head3 sims
1055 :    
1056 : parrello 1.31 my @sims = sims($id, $maxN, $maxP, 'fig');
1057 :    
1058 :     or
1059 :    
1060 :     my @sims = sims($id, $maxN, $maxP, 'all);
1061 :    
1062 : parrello 1.6
1063 :     Retrieve similarities from the network similarity server. The similarity retrieval
1064 :     is performed using an HTTP user agent that returns similarity data in multiple
1065 :     chunks. An anonymous subroutine is passed to the user agent that parses and
1066 :     reformats the chunks as they come in. The similarites themselves are returned
1067 :     as B<Sim> objects. Sim objects are actually list references with 15 elements.
1068 :     The Sim object methods allow access to the elements by name.
1069 :    
1070 :     Similarities can be either raw or expanded. The raw similarities are basic
1071 :     hits between features with similar DNA. Expanding a raw similarity drags in any
1072 :     features considered substantially identical. So, for example, if features B<A1>,
1073 :     B<A2>, and B<A3> are all substatially identical to B<A>, then a raw similarity
1074 :     B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>.
1075 :    
1076 :     =over 4
1077 :    
1078 :     =item id
1079 :    
1080 :     ID of the feature whose similarities are desired, or reference to a list
1081 :     of the IDs of the features whose similarities are desired.
1082 :    
1083 : parrello 1.31 =item maxN (optional)
1084 : parrello 1.6
1085 : parrello 1.27 Maximum number of similarities to return for each incoming feature.
1086 : parrello 1.6
1087 : parrello 1.31 =item maxP (optional)
1088 : parrello 1.6
1089 :     The maximum allowable similarity score.
1090 :    
1091 : parrello 1.31 =item select (optional)
1092 : parrello 1.6
1093 :     Selection criterion: C<raw> means only raw similarities are returned; C<fig>
1094 :     means only similarities to FIG features are returned; C<all> means all expanded
1095 :     similarities are returned; and C<figx> means similarities are expanded until the
1096 :     number of FIG features equals the maximum.
1097 :    
1098 : parrello 1.31 =item max_expand (optional)
1099 : parrello 1.6
1100 :     The maximum number of features to expand.
1101 :    
1102 : parrello 1.31 =item filters (optional)
1103 : parrello 1.6
1104 :     Reference to a hash containing filter information, or a subroutine that can be
1105 :     used to filter the sims.
1106 :    
1107 :     =item RETURN
1108 :    
1109 :     Returns a list of L<Sim> objects.
1110 :    
1111 :     =back
1112 :    
1113 :     =cut
1114 :    
1115 :     sub sims {
1116 :     # Get the parameters.
1117 :     my($id, $maxN, $maxP, $select, $max_expand, $filters) = @_;
1118 :     # Get the URL for submitting to the sims server.
1119 :     my $url = $FIG_Config::sim_server_url || "http://bioseed.mcs.anl.gov/simserver/perl/sims.pl";
1120 :     # Get a list of the IDs to process.
1121 :     my @ids;
1122 :     if (ref($id) eq "ARRAY") {
1123 :     @ids = @$id;
1124 :     } else {
1125 :     @ids = ($id);
1126 :     }
1127 :     # Form a list of the parameters to pass to the server.
1128 :     my %args = ();
1129 :     $args{id} = \@ids;
1130 :     $args{maxN} = $maxN if defined($maxN);
1131 :     $args{maxP} = $maxP if defined($maxP);
1132 :     $args{select} = $select if defined($select);
1133 :     $args{max_expand} = $max_expand if defined($max_expand);
1134 :     # If the filter is a hash, put the filters in the argument list.
1135 :     if (ref($filters) eq 'HASH') {
1136 :     for my $k (keys(%$filters))
1137 :     {
1138 :     $args{"filter_$k"}= $filters->{$k};
1139 :     }
1140 :     }
1141 :     # Get the user agent.
1142 :     require LWP::UserAgent;
1143 :     my $ua = LWP::UserAgent->new();
1144 :     # Insure we have the Sim module.
1145 :     require Sim;
1146 :     #
1147 :     # Our next task is to create the anonymous subroutine that will process the
1148 :     # chunks that come back from the server. We require three global variables:
1149 :     # @sims to hold the similarities found, $tail to remember the unprocessed
1150 :     # data from the previous chunk, and $chunks to count the chunks.
1151 :     #
1152 :     my @retVal;
1153 :     my $tail;
1154 :     my $chunks = 0;
1155 :     #
1156 :     # ANONYMOUS SUBROUTINE
1157 :     #
1158 :     my $cb = sub {
1159 :     eval {
1160 :     # Get the parameters.
1161 :     my ($data, $command) = @_;
1162 :     # Check for a reset command. If we get one, we discard any data
1163 :     # in progress.
1164 :     if ($command && $command eq 'reset') {
1165 :     $tail = '';
1166 :     } else {
1167 :     $chunks++;
1168 :     # Get the data to process. Note we concatenate it to the incoming
1169 :     # tail from last time.
1170 :     my $c = $tail . $data;
1171 :     # Make sure the caller hasn't messed up the new-line character.
1172 :     # FASTA readers in particular are notorious for doing things
1173 :     # like that.
1174 :     local $/ = "\n";
1175 :     # Split the input into lines.
1176 :     my @lines = split(/\n/, $c);
1177 :     # If the input does not end with a new-line, we have a partial
1178 :     # chunk and need to put it in the tail for next time. If not,
1179 :     # there is no tail for next time.
1180 :     if (substr($c, -1, 1) ne "\n") {
1181 :     $tail = pop @lines;
1182 :     } else {
1183 :     $tail = '';
1184 :     }
1185 :     # Loop through the lines. Note there's no need to chomp because
1186 :     # the SPLIT took out the new-line characters.
1187 :     for my $l (@lines) {
1188 :     # Split the line into fields.
1189 :     my @s = split(/\t/, $l);
1190 :     # Insure we have all the fields we need.
1191 : parrello 1.12 if (@s >= 9) {
1192 : parrello 1.6 # Check to see if we've seen this SIM before.
1193 :     my $id1 = $s[0];
1194 :     my $id2 = $s[1];
1195 :     # Add it to the result list.
1196 :     push(@retVal, bless \@s, 'Sim');
1197 :     }
1198 :     }
1199 :     }
1200 :     };
1201 :     };
1202 :     #
1203 :     # END OF ANONYMOUS SUBROUTINE
1204 :     #
1205 :     # Now we're ready to start. Because networking is an iffy thing, we set up
1206 :     # to try our request multiple times.
1207 :     my $n_retries = 10;
1208 :     my $attempts = 0;
1209 :     # Set the timeout value, in seconds.
1210 :     $ua->timeout(180);
1211 :     # Loop until we succeed or run out of retries.
1212 :     my $done = 0;
1213 :     while (! $done && $attempts++ < $n_retries) {
1214 :     # Reset the content processor. This clears the tail.
1215 :     &$cb(undef, 'reset');
1216 :     my $resp = $ua->post($url, \%args, ':content_cb' => $cb);
1217 :     if ($resp->is_success) {
1218 :     # If the response was successful, get the content. This triggers
1219 :     # the anonymous subroutine.
1220 :     my $x = $resp->content;
1221 :     # Denote we've been successful.
1222 :     $done = 1;
1223 :     }
1224 :     }
1225 :     return @retVal;
1226 :     }
1227 :    
1228 : parrello 1.21 =head3 standard_genetic_code
1229 :    
1230 :     my $code = standard_genetic_code();
1231 :    
1232 :     Return a hash containing the standard translation of nucleotide triples to proteins.
1233 :     Methods such as L</translate> can take a translation scheme as a parameter. This method
1234 :     returns the default translation scheme. The scheme is implemented as a reference to a
1235 :     hash that contains nucleotide triplets as keys and has protein letters as values.
1236 :    
1237 :     =cut
1238 :    
1239 :     sub standard_genetic_code {
1240 :    
1241 :     my $code = {};
1242 : parrello 1.8
1243 : parrello 1.21 $code->{"AAA"} = "K";
1244 :     $code->{"AAC"} = "N";
1245 :     $code->{"AAG"} = "K";
1246 :     $code->{"AAT"} = "N";
1247 :     $code->{"ACA"} = "T";
1248 :     $code->{"ACC"} = "T";
1249 :     $code->{"ACG"} = "T";
1250 :     $code->{"ACT"} = "T";
1251 :     $code->{"AGA"} = "R";
1252 :     $code->{"AGC"} = "S";
1253 :     $code->{"AGG"} = "R";
1254 :     $code->{"AGT"} = "S";
1255 :     $code->{"ATA"} = "I";
1256 :     $code->{"ATC"} = "I";
1257 :     $code->{"ATG"} = "M";
1258 :     $code->{"ATT"} = "I";
1259 :     $code->{"CAA"} = "Q";
1260 :     $code->{"CAC"} = "H";
1261 :     $code->{"CAG"} = "Q";
1262 :     $code->{"CAT"} = "H";
1263 :     $code->{"CCA"} = "P";
1264 :     $code->{"CCC"} = "P";
1265 :     $code->{"CCG"} = "P";
1266 :     $code->{"CCT"} = "P";
1267 :     $code->{"CGA"} = "R";
1268 :     $code->{"CGC"} = "R";
1269 :     $code->{"CGG"} = "R";
1270 :     $code->{"CGT"} = "R";
1271 :     $code->{"CTA"} = "L";
1272 :     $code->{"CTC"} = "L";
1273 :     $code->{"CTG"} = "L";
1274 :     $code->{"CTT"} = "L";
1275 :     $code->{"GAA"} = "E";
1276 :     $code->{"GAC"} = "D";
1277 :     $code->{"GAG"} = "E";
1278 :     $code->{"GAT"} = "D";
1279 :     $code->{"GCA"} = "A";
1280 :     $code->{"GCC"} = "A";
1281 :     $code->{"GCG"} = "A";
1282 :     $code->{"GCT"} = "A";
1283 :     $code->{"GGA"} = "G";
1284 :     $code->{"GGC"} = "G";
1285 :     $code->{"GGG"} = "G";
1286 :     $code->{"GGT"} = "G";
1287 :     $code->{"GTA"} = "V";
1288 :     $code->{"GTC"} = "V";
1289 :     $code->{"GTG"} = "V";
1290 :     $code->{"GTT"} = "V";
1291 :     $code->{"TAA"} = "*";
1292 :     $code->{"TAC"} = "Y";
1293 :     $code->{"TAG"} = "*";
1294 :     $code->{"TAT"} = "Y";
1295 :     $code->{"TCA"} = "S";
1296 :     $code->{"TCC"} = "S";
1297 :     $code->{"TCG"} = "S";
1298 :     $code->{"TCT"} = "S";
1299 :     $code->{"TGA"} = "*";
1300 :     $code->{"TGC"} = "C";
1301 :     $code->{"TGG"} = "W";
1302 :     $code->{"TGT"} = "C";
1303 :     $code->{"TTA"} = "L";
1304 :     $code->{"TTC"} = "F";
1305 :     $code->{"TTG"} = "L";
1306 :     $code->{"TTT"} = "F";
1307 : parrello 1.8
1308 : parrello 1.21 return $code;
1309 :     }
1310 : parrello 1.8
1311 : parrello 1.21 =head3 strand_of
1312 : parrello 1.8
1313 : parrello 1.21 my $plusOrMinus = strand_of($loc);
1314 : parrello 1.8
1315 : parrello 1.21 Return the strand (C<+> or C<->) from the specified location string.
1316 : parrello 1.8
1317 : parrello 1.21 =over 4
1318 : parrello 1.8
1319 : parrello 1.21 =item loc
1320 : parrello 1.8
1321 : parrello 1.21 Location string to parse (see L<SAP/Location Strings>).
1322 : parrello 1.9
1323 :     =item RETURN
1324 :    
1325 : parrello 1.21 Returns C<+> if the location is on the forward strand, else C<->.
1326 : parrello 1.9
1327 :     =back
1328 :    
1329 :     =cut
1330 :    
1331 : parrello 1.21 sub strand_of {
1332 : parrello 1.9 # Get the parameters.
1333 : parrello 1.21 my ($loc) = @_;
1334 :     # Declare the return variable.
1335 :     my $retVal;
1336 :     # Parse the strand indicator from the location.
1337 :     if ($loc =~ /\d+([+-])\d+/) {
1338 :     $retVal = $1;
1339 : parrello 1.9 }
1340 : parrello 1.21 # Return the result.
1341 :     return $retVal;
1342 : parrello 1.9 }
1343 :    
1344 : parrello 1.21 =head3 strip_ec
1345 : parrello 1.9
1346 : parrello 1.21 my $role = strip_ec($rawRole);
1347 : parrello 1.11
1348 : parrello 1.21 Strip the EC number (if any) from the specified role or functional
1349 :     assignment.
1350 : parrello 1.11
1351 :     =over 4
1352 :    
1353 : parrello 1.21 =item rawRole
1354 : parrello 1.11
1355 : parrello 1.21 Role or functional assignment from which the EC numbers are to be stripped.
1356 : parrello 1.11
1357 :     =item RETURN
1358 :    
1359 : parrello 1.21 Returns the incoming string with any EC numbers removed. The EC numbers must
1360 :     be formatted in the standard format used by the SEED (with the C<EC> prefix
1361 :     and surrounding parentheses).
1362 : parrello 1.11
1363 :     =back
1364 :    
1365 :     =cut
1366 :    
1367 : parrello 1.21 sub strip_ec {
1368 :     # Get the parameters.
1369 :     my ($rawRole) = @_;
1370 :     # Declare the return variable.
1371 :     my $retVal = $rawRole;
1372 :     # Remove the EC numbers.
1373 :     $retVal =~ s/\s*\(EC\s+[0-9.\-]+\)//g;
1374 :     # Return the result.
1375 :     return $retVal;
1376 : parrello 1.15 }
1377 :    
1378 :     =head3 translate
1379 :    
1380 :     my $aa_seq = translate($dna_seq, $code, $fix_start);
1381 :    
1382 :     Translate a DNA sequence to a protein sequence using the specified genetic code.
1383 :     If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to
1384 :     C<M>. (In the standard genetic code, these two combinations normally translate
1385 :     to C<V> and C<L>, respectively.)
1386 :    
1387 :     =over 4
1388 :    
1389 :     =item dna_seq
1390 :    
1391 :     DNA sequence to translate. Note that the DNA sequence can only contain
1392 :     known nucleotides.
1393 :    
1394 :     =item code
1395 :    
1396 :     Reference to a hash specifying the translation code. The hash is keyed by
1397 :     nucleotide triples, and the value for each key is the corresponding protein
1398 :     letter. If this parameter is omitted, the L</standard_genetic_code> will be
1399 :     used.
1400 :    
1401 :     =item fix_start
1402 :    
1403 :     TRUE if the first triple is to get special treatment, else FALSE. If TRUE,
1404 :     then a value of C<TTG> or C<GTG> in the first position will be translated to
1405 :     C<M> instead of the value specified in the translation code.
1406 :    
1407 :     =item RETURN
1408 :    
1409 :     Returns a string resulting from translating each nucleotide triple into a
1410 :     protein letter.
1411 :    
1412 :     =back
1413 :    
1414 :     =cut
1415 :     #: Return Type $;
1416 :     sub translate {
1417 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
1418 :    
1419 :     my( $dna,$code,$start ) = @_;
1420 :     my( $i,$j,$ln );
1421 :     my( $x,$y );
1422 :     my( $prot );
1423 :    
1424 :     if (! defined($code)) {
1425 :     $code = &FIG::standard_genetic_code;
1426 :     }
1427 :     $ln = length($dna);
1428 :     $prot = "X" x ($ln/3);
1429 :     $dna =~ tr/a-z/A-Z/;
1430 :    
1431 :     for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) {
1432 :     $x = substr($dna,$i,3);
1433 :     if ($y = $code->{$x}) {
1434 :     substr($prot,$j,1) = $y;
1435 :     }
1436 :     }
1437 :    
1438 :     if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) {
1439 :     substr($prot,0,1) = 'M';
1440 :     }
1441 :     return $prot;
1442 :     }
1443 :    
1444 : parrello 1.36 =head3 type_of
1445 :    
1446 :     my $type = SeedUtils::type_of($fid);
1447 :    
1448 :     Return the type of a feature, given a FIG feature ID (e.g. C<fig|100226.1.peg.3361>).
1449 :    
1450 :     =over 4
1451 :    
1452 :     =item fid
1453 :    
1454 :     ID of a feature whose type is desired.
1455 :    
1456 :     =item RETURN
1457 :    
1458 :     Returns the type of the feature (e.g. C<peg>, C<rna>, ...).
1459 :    
1460 :     =back
1461 :    
1462 :     =cut
1463 :    
1464 :     sub type_of {
1465 :     # Get the parameter.
1466 :     my ($fid) = @_;
1467 :     # Declare the return variable. We return undefined if the ID is unparseable.
1468 :     my $retVal;
1469 :     # Parse the FIG ID.
1470 :     if ($fid =~ /fig\|\d+\.\d+\.(\w+)\./) {
1471 :     # Save the type segment.
1472 :     $retVal = $1;
1473 :     }
1474 :     # Return the result.
1475 :     return $retVal;
1476 :     }
1477 :    
1478 :    
1479 : parrello 1.21 =head3 verify_dir
1480 : parrello 1.18
1481 : parrello 1.21 verify_dir($dirName);
1482 : parrello 1.18
1483 : parrello 1.21 Insure that the specified directory exists. If the directory does not
1484 :     exist, it will be created.
1485 : parrello 1.18
1486 :     =over 4
1487 :    
1488 : parrello 1.21 =item dirName
1489 : parrello 1.18
1490 : parrello 1.21 Name of the relevant directory.
1491 : parrello 1.18
1492 :     =back
1493 :    
1494 :     =cut
1495 :    
1496 : parrello 1.21 sub verify_dir {
1497 : parrello 1.18 # Get the parameters.
1498 : parrello 1.21 my ($dirName) = @_;
1499 :     # Strip off the final slash, if any.
1500 :     $dirName =~ s#/$##;
1501 :     # Only proceed if the directory does NOT already exist.
1502 :     if (! -d $dirName) {
1503 :     # If there is a parent directory, recursively insure it is there.
1504 :     if ($dirName =~ m#(.+)/[^/]+$#) {
1505 :     verify_dir($1);
1506 : overbeek 1.20 }
1507 : parrello 1.21 # Create this particular directory with full permissions.
1508 :     mkdir $dirName, 0777;
1509 : overbeek 1.20 }
1510 :     }
1511 :    
1512 : olson 1.39 =head3 validate_fasta_file
1513 :    
1514 :     $sequence_type = validate_fasta_file($in_file, $out_file)
1515 :    
1516 :     Ensure the given file is in valid fasta format. If $out_file
1517 :     is given, write the data to $out_file as a normalized fasta file
1518 :     (with cleaned up line endings, upper case data).
1519 :    
1520 :     If successful, returns the string "dna" or "protein".
1521 :    
1522 :     Will invoke die() on failure; call inside eval{} to ensure full error catching.
1523 :    
1524 :     =cut
1525 :    
1526 :     sub validate_fasta_file
1527 :     {
1528 :     my($file, $norm) = @_;
1529 :    
1530 :     my($input_fh, $clean_fh);
1531 :     open($input_fh, "<", $file) or die "cannot open $file: $!";
1532 :    
1533 :     if ($norm)
1534 :     {
1535 :     open($clean_fh, ">", $norm) or die "cannot write normalized file $norm: $!";
1536 :     }
1537 :    
1538 :     my $state = 'expect_header';
1539 :     my $cur_id;
1540 :     my $dna_chars;
1541 :     my $prot_chars;
1542 :    
1543 :     while (<$input_fh>)
1544 :     {
1545 :     chomp;
1546 :    
1547 :     if ($state eq 'expect_header')
1548 :     {
1549 :     if (/^>(\S+)/)
1550 :     {
1551 :     $cur_id = $1;
1552 :     $state = 'expect_data';
1553 :     print $clean_fh ">$cur_id\n" if $clean_fh;
1554 :     next;
1555 :     }
1556 :     else
1557 :     {
1558 :     die "Invalid fasta: Expected header at line $.\n";
1559 :     }
1560 :     }
1561 :     elsif ($state eq 'expect_data')
1562 :     {
1563 :     if (/^>(\S+)/)
1564 :     {
1565 :     $cur_id = $1;
1566 :     $state = 'expect_data';
1567 :     print $clean_fh ">$cur_id\n" if $clean_fh;
1568 :     next;
1569 :     }
1570 :     elsif (/^([acgtumrwsykbdhvn]*)\s*$/i)
1571 :     {
1572 :     print $clean_fh uc($1) . "\n" if $clean_fh;
1573 :     $dna_chars += length($1);
1574 :     next;
1575 :     }
1576 :     elsif (/^([*abcdefghijklmnopqrstuvwxyz]*)\s*$/i)
1577 :     {
1578 :     print $clean_fh uc($1) . "\n" if $clean_fh;
1579 :     $prot_chars += length($1);
1580 :     next;
1581 :     }
1582 :     else
1583 :     {
1584 :     my $str = $_;
1585 :     if (length($_) > 100)
1586 :     {
1587 :     $str = substr($_, 0, 50) . " [...] " . substr($_, -50);
1588 :     }
1589 :     die "Invalid fasta: Bad data at line $.\n$str\n";
1590 :     }
1591 :     }
1592 :     else
1593 :     {
1594 :     die "Internal error: invalid state $state\n";
1595 :     }
1596 :     }
1597 :     close($input_fh);
1598 :     close($clean_fh) if $clean_fh;
1599 :    
1600 :     my $what = ($prot_chars > 0) ? "protein" : "dna";
1601 :    
1602 :     return $what;
1603 :     }
1604 :    
1605 : disz 1.28 sub strip_func {
1606 :     my($func) = @_;
1607 : overbeek 1.20
1608 : disz 1.28 $func =~ s/^FIG\d{6}[^:]*:\s*//;
1609 :     $func =~ s/\s*\#.*$//;
1610 :     return($func);
1611 :     }
1612 :    
1613 : olson 1.32 sub verify_db {
1614 :     my($db,$type) = @_;
1615 :    
1616 :     #
1617 :     # Find formatdb; if we're operating in a SEED environment
1618 :     # use it from there.
1619 :     #
1620 :    
1621 :     my $path = '';
1622 :     if ($FIG_Config::blastbin ne '' && -d $FIG_Config::blastbin)
1623 :     {
1624 :     $path = "$FIG_Config::blastbin/";
1625 :     }
1626 :     elsif ($FIG_Config::ext_bin ne '' && -d $FIG_Config::ext_bin)
1627 :     {
1628 :     $path = "$FIG_Config::ext_bin/";
1629 :     }
1630 :    
1631 :    
1632 :     my @cmd;
1633 :     if ($type =~ /^p/i)
1634 :     {
1635 :     if ((! -s "$db.psq") || (-M "$db.psq" > -M $db))
1636 :     {
1637 :     @cmd = ("${path}formatdb", "-p", "T", "-i", $db);
1638 :     }
1639 :     }
1640 :     else
1641 :     {
1642 :     if ((! -s "$db.nsq") || (-M "$db.nsq" > -M $db))
1643 :     {
1644 :     @cmd = ("${path}formatdb", "-p", "F", "-i", $db);
1645 :     }
1646 :     }
1647 :     if (@cmd)
1648 :     {
1649 :     my $rc = system(@cmd);
1650 :     if ($rc != 0)
1651 :     {
1652 :     warn "SeedUtils::verify_db: formatdb failed with rc=$rc: @cmd\n";
1653 :     }
1654 :     }
1655 :     }
1656 :    
1657 : olson 1.38 #
1658 :     # Some berkeley-db building utilities.
1659 :     #
1660 :    
1661 :     sub create_berk_table
1662 :     {
1663 :     my($input_file, $key_columns, $value_columns, $db_file, %opts) = @_;
1664 :    
1665 :     local $DB_BTREE->{flags};
1666 :     if ($opts{-multiple_values})
1667 :     {
1668 :     $DB_BTREE->{flags} = R_DUP;
1669 :     }
1670 :    
1671 :     my $ifh;
1672 :    
1673 :     if ($opts{-sort})
1674 :     {
1675 :     my $sk = join(" ", map { "-k " . ($_ + 1) } @$key_columns);
1676 :     my $cmd = "sort $sk $input_file";
1677 :     print "Run $cmd\n";
1678 :    
1679 :     open($ifh, "$cmd |") or die "Cannot open sort $sk $input_file for reading: $!";
1680 :     }
1681 :     else
1682 :     {
1683 :     open($ifh, "<", $input_file) or die "Cannot open $input_file for reading: $!";
1684 :     }
1685 :    
1686 :     my $hash = {};
1687 :     my $tie = tie %$hash, "DB_File", $db_file, O_RDWR | O_CREAT, 0666, $DB_BTREE;
1688 :     $tie or die "Cannot create $db_file: $!";
1689 :    
1690 :     while (<$ifh>)
1691 :     {
1692 :     chomp;
1693 :     my @a = split(/\t/);
1694 :     my $k = join($;, @a[@$key_columns]);
1695 :     my $v = join($;, @a[@$value_columns]);
1696 :    
1697 :     $hash->{$k} = $v;
1698 :     }
1699 :     close($ifh);
1700 :     undef $hash;
1701 :     untie $tie;
1702 :     }
1703 :    
1704 :     sub open_berk_table
1705 :     {
1706 :     my($table, %opts) = @_;
1707 :    
1708 :     if (! -f $table)
1709 :     {
1710 :     warn "Cannot read table file $table\n";
1711 :     return undef;
1712 :     }
1713 :     my $h = {};
1714 :     tie %$h, 'BerkTable', $table, %opts;
1715 :     return $h;
1716 :     }
1717 : olson 1.32
1718 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3