[Bio] / FigKernelPackages / SeedUtils.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/SeedUtils.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.29 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 : parrello 1.11 # This is a SAS component.
4 :    
5 : parrello 1.1 #
6 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
7 :     # for Interpretations of Genomes. All Rights Reserved.
8 :     #
9 :     # This file is part of the SEED Toolkit.
10 :     #
11 :     # The SEED Toolkit is free software. You can redistribute
12 :     # it and/or modify it under the terms of the SEED Toolkit
13 :     # Public License.
14 :     #
15 :     # You should have received a copy of the SEED Toolkit Public License
16 :     # along with this program; if not write to the University of Chicago
17 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
18 :     # Genomes at veronika@thefig.info or download a copy from
19 :     # http://www.theseed.org/LICENSE.TXT.
20 :     #
21 :    
22 :     package SeedUtils;
23 :    
24 :     use strict;
25 : parrello 1.12 no warnings qw(once);
26 : parrello 1.1 use base qw(Exporter);
27 : parrello 1.15 our @EXPORT = qw(hypo boundaries_of parse_fasta_record create_fasta_record
28 :     rev_comp genome_of min max sims verify_dir between translate
29 : parrello 1.21 standard_genetic_code parse_location roles_of_function
30 : parrello 1.25 strip_ec location_string location_cmp strand_of by_fig_id);
31 : parrello 1.1
32 :     =head1 SEED Utility Methods
33 :    
34 :     =head2 Introduction
35 :    
36 :     This is a simple utility package that performs functions useful for
37 :     bioinformatics, but that do not require access to the databases.
38 :    
39 :     =head2 Public Methods
40 :    
41 : parrello 1.21 =head3 abbrev
42 :    
43 :     my $abbrev = SeedUtils::abbrev($genome_name);
44 :    
45 :     Return an abbreviation of the specified genome name. This method is used to create
46 :     a reasonably indicative genome name that fits in 10 characters.
47 :    
48 :     =over 4
49 :    
50 :     =item genome_name
51 :    
52 :     Genome name to abbreviate.
53 :    
54 :     =item RETURN
55 :    
56 :     Returns a shortened version of the genome name that is 10 characters or less in
57 :     length.
58 :    
59 :     =back
60 :    
61 :     =cut
62 :    
63 :     sub abbrev {
64 :     my($genome_name) = @_;
65 :    
66 :     $genome_name =~ s/^(\S{3})\S+/$1./;
67 :     $genome_name =~ s/^(\S+)\s+(\S{3})\S+/$1$2./;
68 :     $genome_name =~ s/ //g;
69 :     if (length($genome_name) > 10) {
70 :     $genome_name = substr($genome_name,0,10);
71 :     }
72 :     return $genome_name;
73 :     }
74 :    
75 :     =head3 between
76 :    
77 :     my $flag = between($x, $y, $z);
78 :    
79 :     Determine whether or not $y is between $x and $z.
80 :    
81 :     =over 4
82 :    
83 :     =item x
84 :    
85 :     First edge number.
86 :    
87 :     =item y
88 :    
89 :     Number to examine.
90 :    
91 :     =item z
92 :    
93 :     Second edge number.
94 :    
95 :     =item RETURN
96 :    
97 :     Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check
98 :     is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns
99 :     TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than
100 :     I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower,
101 :     then the return is TRUE if I<$x> >= I$<$y> >= I<$z>.
102 :    
103 :     =back
104 :    
105 :     =cut
106 :     #: Return Type $;
107 :     sub between {
108 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
109 :     my($x,$y,$z) = @_;
110 :    
111 :     if ($x < $z) {
112 :     return (($x <= $y) && ($y <= $z));
113 :     } else {
114 :     return (($x >= $y) && ($y >= $z));
115 :     }
116 :     }
117 :    
118 :    
119 : parrello 1.13 =head3 boundaries_of
120 :    
121 : parrello 1.21 my ($contig, $min, $max, $dir) = boundaries_of($locs);
122 : parrello 1.13
123 :     Return the boundaries of a set of locations. The contig, the leftmost
124 :     location, and the rightmost location will be returned to the caller. If
125 :     more than one contig is represented, the method will return an undefined
126 :     value for the contig (indicating failure).
127 :    
128 :     =over 4
129 :    
130 :     =item locs
131 :    
132 :     Reference to a list of location strings. A location string contains a contig ID,
133 :     and underscore (C<_>), a starting offset, a strand identifier (C<+> or C<->), and
134 :     a length (e.g. C<360108.3:NC_10023P_1000+2000> begins at offset 1000 of contig
135 :     B<360108.3:NC_10023P> and covers 2000 base pairs on the C<+> strand).
136 :    
137 :     =item RETURN
138 :    
139 : parrello 1.21 Returns a 4-element list. The first element is the contig ID from all the locations,
140 :     the second is the offset of leftmost base pair represented in the locations, the
141 :     third is the offset of the rightmost base pair represented in the locations, and
142 :     the fourth is the dominant strand.
143 : parrello 1.13
144 :     =back
145 :    
146 :     =cut
147 :    
148 :     sub boundaries_of {
149 :     # Get the parameters.
150 :     my ($locs) = @_;
151 :     # Declare the return variables.
152 :     my ($contig, $min, $max);
153 :     # We'll put all the starting and ending offsets found in here.
154 :     my @offsets;
155 : parrello 1.21 # This will be used to count the orientations.
156 :     my %dirs = ('+' => 0, '-' => 0);
157 : parrello 1.13 # This will count the number of errors found.
158 :     my $error = 0;
159 :     # Loop through the locations.
160 :     for my $loc (@$locs) {
161 :     # Parse this location.
162 :     if ($loc =~ /^(.+)_(\d+)(\+|\-)(\d+)$/) {
163 : parrello 1.23 # This is a valid location string.
164 : parrello 1.13 my ($newContig, $begin, $dir, $len) = ($1, $2, $3, $4);
165 :     # Is this contig valid?
166 :     if ($contig && $newContig ne $contig) {
167 :     # No, skip this location.
168 :     $error++;
169 :     } else {
170 :     # Save the contig.
171 :     $contig = $newContig;
172 : parrello 1.21 # Count the orientation.
173 :     $dirs{$dir}++;
174 : parrello 1.13 # Compute the ending offset.
175 :     my $end = ($dir eq '+' ? $begin + $len - 1 : $begin - $len + 1);
176 :     # Save both offsets.
177 :     push @offsets, $begin, $end;
178 :     }
179 : parrello 1.23 } elsif ($loc =~ /^(.+)_(\d+)_(\d+)/) {
180 :     # Here we have an old-style location string.
181 :     my($newContig, $start, $stop) = ($1, $2, $3);
182 :     # Is this contig valid?
183 : olson 1.22 if ($contig && $newContig ne $contig) {
184 :     # No, skip this location.
185 :     $error++;
186 :     } else {
187 :     # Save the contig.
188 :     $contig = $newContig;
189 : parrello 1.23 # Compute the orientation.
190 :     my $dir;
191 :     if ($start > $stop) {
192 :     $dir = '-';
193 :     } else {
194 :     $dir = '+';
195 :     }
196 :     # Count it.
197 : olson 1.22 $dirs{$dir}++;
198 : parrello 1.23 # Save both offsets.
199 : olson 1.22 push @offsets, $start, $stop;
200 :     }
201 : parrello 1.13 } else {
202 :     # The location is invalid, so it's an error,
203 :     $error++;
204 :     }
205 :     }
206 :     # If there's an error, clear the contig ID.
207 :     if ($error) {
208 :     $contig = undef;
209 :     }
210 :     # Compute the min and max from the offsets collected.
211 :     $min = min(@offsets);
212 :     $max = max(@offsets);
213 : parrello 1.21 # Save the dominant orientation.
214 :     my $dir = ($dirs{'-'} > $dirs{'+'} ? '-' : '+');
215 : parrello 1.13 # Return the results.
216 : parrello 1.21 return ($contig, $min, $max, $dir);
217 : parrello 1.13 }
218 :    
219 : parrello 1.21 =head3 boundary_loc
220 : parrello 1.17
221 : parrello 1.21 my $singleLoc = SeedUtils::boundary_loc($locations);
222 : parrello 1.17
223 : parrello 1.21 Return a single location string (see L<SAP/Location Strings>) that covers
224 :     the incoming list of locations. NOTE that if the locations listed span
225 :     more than one contig, this method may return an unexpected result.
226 : parrello 1.17
227 : parrello 1.21 This method is useful for converting the output of L<SAP/fid_locations> to
228 :     location strings.
229 : parrello 1.17
230 :     =over 4
231 :    
232 : parrello 1.21 =item locations
233 : parrello 1.17
234 : parrello 1.21 A set of location strings formatted as a comma-separated list or as a reference
235 :     to a list of location strings.
236 : parrello 1.17
237 :     =item RETURN
238 :    
239 : parrello 1.21 Returns a single location string that covers as best as possible the list of
240 :     incoming locations.
241 : parrello 1.17
242 :     =back
243 :    
244 :     =cut
245 :    
246 : parrello 1.21 sub boundary_loc {
247 : parrello 1.17 # Get the parameters.
248 : parrello 1.21 my ($locations) = @_;
249 :     # Convert the incoming locations to a list.
250 :     my @locs;
251 :     if (ref $locations eq 'ARRAY') {
252 :     @locs = @$locations;
253 :     } else {
254 :     @locs = split /\s*,\s*/, $locations;
255 :     }
256 :     # Get the boundary information for the listed locations.
257 :     my ($contig, $min, $max, $dir) = boundaries_of(\@locs);
258 :     # Compute the indicated location string.
259 :     my $retVal = $contig . "_" . ($dir eq '+' ? $min : $max) . $dir .
260 :     ($max + 1 - $min);
261 : parrello 1.17 # Return the result.
262 : parrello 1.21 return $retVal;
263 : parrello 1.17 }
264 :    
265 : parrello 1.21 =head3 by_fig_id
266 : parrello 1.17
267 : parrello 1.21 my @sorted_by_fig_id = sort { by_fig_id($a,$b) } @fig_ids;
268 : parrello 1.16
269 : parrello 1.21 Compare two feature IDs.
270 : parrello 1.16
271 : parrello 1.21 This function is designed to assist in sorting features by ID. The sort is by
272 :     genome ID followed by feature type and then feature number.
273 : parrello 1.16
274 :     =over 4
275 :    
276 : parrello 1.21 =item a
277 :    
278 :     First feature ID.
279 :    
280 :     =item b
281 : parrello 1.16
282 : parrello 1.21 Second feature ID.
283 : parrello 1.16
284 :     =item RETURN
285 :    
286 : parrello 1.21 Returns a negative number if the first parameter is smaller, zero if both parameters
287 :     are equal, and a positive number if the first parameter is greater.
288 : parrello 1.16
289 :     =back
290 :    
291 :     =cut
292 :    
293 : parrello 1.21 sub by_fig_id {
294 :     my($a,$b) = @_;
295 :     my($g1,$g2,$t1,$t2,$n1,$n2);
296 :     if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
297 :     ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {
298 :     ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
299 :     } else {
300 :     $a cmp $b;
301 : parrello 1.16 }
302 : parrello 1.19 }
303 :    
304 : parrello 1.21 =head3 create_fasta_record
305 : parrello 1.19
306 : parrello 1.21 my $fastaString = create_fasta_record($id, $comment, $sequence, $stripped);
307 : parrello 1.19
308 : parrello 1.21 Create a FASTA record from the specified DNA or protein sequence. The
309 :     sequence will be split into 60-character lines, and the record will
310 :     include an identifier line.
311 : parrello 1.19
312 :     =over 4
313 :    
314 : parrello 1.21 =item id
315 :    
316 :     ID for the sequence, to be placed at the beginning of the identifier
317 :     line.
318 :    
319 :     =item comment (optional)
320 : parrello 1.19
321 : parrello 1.21 Comment text to place after the ID on the identifier line. If this parameter
322 :     is empty, undefined, or 0, no comment will be placed.
323 : parrello 1.19
324 : parrello 1.21 =item sequence
325 : parrello 1.19
326 : parrello 1.21 Sequence of letters to form into FASTA. For purposes of convenience, whitespace
327 :     characters in the sequence will be removed automatically.
328 : parrello 1.19
329 : parrello 1.21 =item stripped (optional)
330 : parrello 1.19
331 : parrello 1.21 If TRUE, then the sequence will be returned unmodified instead of converted
332 :     to FASTA format. The default is FALSE.
333 : parrello 1.19
334 :     =item RETURN
335 :    
336 : parrello 1.21 Returns the desired sequence in FASTA format.
337 : parrello 1.19
338 :     =back
339 :    
340 :     =cut
341 :    
342 : parrello 1.21 sub create_fasta_record {
343 : parrello 1.19 # Get the parameters.
344 : parrello 1.21 my ($id, $comment, $sequence, $stripped) = @_;
345 :     # Declare the return variable.
346 :     my $retVal;
347 :     # If we're in stripped mode, we just return the sequence.
348 :     if ($stripped) {
349 :     $retVal = $sequence;
350 : parrello 1.19 } else {
351 : parrello 1.21 # Here we have to do the FASTA conversion. Start with the ID.
352 :     my $header = ">$id";
353 :     # Add a comment, if any.
354 :     if ($comment) {
355 :     $header .= " $comment";
356 :     }
357 :     # Clean up the sequence.
358 :     $sequence =~ s/\s+//g;
359 :     # We need to format the sequence into 60-byte chunks. We use the infamous
360 :     # grep-split trick. The split, because of the presence of the parentheses,
361 :     # includes the matched delimiters in the output list. The grep strips out
362 :     # the empty list items that appear between the so-called delimiters, since
363 :     # the delimiters are what we want.
364 :     my @chunks = grep { $_ } split /(.{1,60})/, $sequence;
365 :     # Add the chunks and the trailer.
366 :     $retVal = join("\n", $header, @chunks) . "\n";
367 : parrello 1.19 }
368 :     # Return the result.
369 : parrello 1.21 return $retVal;
370 : parrello 1.19 }
371 :    
372 : parrello 1.21 =head3 display_id_and_seq
373 : parrello 1.19
374 : parrello 1.21 SeedUtils::display_id_and_seq($id_and_comment, $seqP, $fh);
375 : parrello 1.19
376 : parrello 1.21 Display a fasta ID and sequence to the specified open file. This method is designed
377 :     to work well with L</read_fasta_sequence> and L</rev_comp>, because it takes as
378 :     input a string pointer rather than a string. If the file handle is omitted it
379 :     defaults to STDOUT.
380 : parrello 1.19
381 : parrello 1.21 The output is formatted into a FASTA record. The first line of the output is
382 :     preceded by a C<< > >> symbol, and the sequence is split into 60-character
383 :     chunks displayed one per line. Thus, this method can be used to produce
384 :     FASTA files from data gathered by the rest of the system.
385 : parrello 1.19
386 :     =over 4
387 :    
388 : parrello 1.21 =item id_and_comment
389 : parrello 1.19
390 : parrello 1.21 The sequence ID and (optionally) the comment from the sequence's FASTA record.
391 :     The ID
392 : parrello 1.19
393 : parrello 1.21 =item seqP
394 : parrello 1.19
395 : parrello 1.21 Reference to a string containing the sequence. The sequence is automatically
396 :     formatted into 60-character chunks displayed one per line.
397 : parrello 1.19
398 : parrello 1.21 =item fh
399 :    
400 :     Open file handle to which the ID and sequence should be output. If omitted,
401 :     C<\*STDOUT> is assumed.
402 :    
403 :     =back
404 :    
405 :     =cut
406 :    
407 :     sub display_id_and_seq {
408 :    
409 :     my( $id, $seqP, $fh ) = @_;
410 :    
411 :     if (! defined($fh) ) { $fh = \*STDOUT; }
412 :    
413 :     print $fh ">$id\n";
414 :     &display_seq($seqP, $fh);
415 :     }
416 :    
417 :     =head3 display_seq
418 :    
419 :     SeedUtils::display_seq(\$seqP, $fh);
420 :    
421 :     Display a fasta sequence to the specified open file. If the file handle is
422 :     omitted it defaults to STDOUT.
423 :    
424 :     The sequence is split into 60-character chunks displayed one per line for
425 :     readability.
426 :    
427 :     =over 4
428 :    
429 :     =item seqP
430 :    
431 :     Reference to a string containing the sequence.
432 :    
433 :     =item fh
434 :    
435 :     Open file handle to which the sequence should be output. If omitted,
436 :     C<STDOUT> is assumed.
437 :    
438 :     =back
439 :    
440 :     =cut
441 :    
442 :     sub display_seq {
443 :    
444 :     my ( $seqP, $fh ) = @_;
445 :     my ( $i, $n, $ln );
446 :    
447 :     if (! defined($fh) ) { $fh = \*STDOUT; }
448 :    
449 :     $n = length($$seqP);
450 :     # confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
451 :     for ($i=0; ($i < $n); $i += 60) {
452 :     if (($i + 60) <= $n) {
453 :     $ln = substr($$seqP,$i,60);
454 :     } else {
455 :     $ln = substr($$seqP,$i,($n-$i));
456 :     }
457 :     print $fh "$ln\n";
458 :     }
459 :     }
460 :    
461 :    
462 :     =head3 genome_of
463 :    
464 :     my $genomeID = genome_of($fid);
465 :    
466 :     Return the Genome ID embedded in the specified FIG feature ID.
467 :    
468 :     =over 4
469 :    
470 :     =item fid
471 :    
472 :     Feature ID of interest.
473 :    
474 :     =item RETURN
475 :    
476 :     Returns the genome ID in the middle portion of the FIG feature ID. If the
477 :     feature ID is invalid, this method returns an undefined value.
478 :    
479 :     =back
480 :    
481 :     =cut
482 :    
483 :     sub genome_of {
484 :     # Get the parameters.
485 :     my ($fid) = @_;
486 :     # Declare the return variable.
487 :     my $retVal;
488 :     # Parse the feature ID.
489 :     if ($fid =~ /^fig\|(\d+\.\d+)\./) {
490 :     $retVal = $1;
491 :     }
492 :     # Return the result.
493 :     return $retVal;
494 :     }
495 :    
496 :     =head3 hypo
497 :    
498 :     my $flag = hypo($func);
499 :    
500 :     Return TRUE if the specified functional role is hypothetical, else FALSE.
501 :     Hypothetical functional roles are identified by key words in the text,
502 :     such as I<hypothesis>, I<predicted>, or I<glimmer> (among others).
503 :    
504 :     =over 4
505 :    
506 :     =item func
507 :    
508 :     Text of the functional role whose nature is to be determined.
509 :    
510 :     =item RETURN
511 :    
512 :     Returns TRUE if the role is hypothetical, else FALSE.
513 :    
514 :     =back
515 :    
516 :     =cut
517 :    
518 :     sub hypo {
519 :     my ($func) = @_;
520 :     if (! $func) { return 1 }
521 :     if ($func =~ /lmo\d+ protein/i) { return 1 }
522 :     if ($func =~ /hypoth/i) { return 1 }
523 :     if ($func =~ /conserved protein/i) { return 1 }
524 :     if ($func =~ /gene product/i) { return 1 }
525 :     if ($func =~ /interpro/i) { return 1 }
526 :     if ($func =~ /B[sl][lr]\d/i) { return 1 }
527 :     if ($func =~ /^U\d/) { return 1 }
528 :     if ($func =~ /^orf[^_]/i) { return 1 }
529 :     if ($func =~ /uncharacterized/i) { return 1 }
530 :     if ($func =~ /pseudogene/i) { return 1 }
531 :     if ($func =~ /^predicted/i) { return 1 }
532 :     if ($func =~ /AGR_/) { return 1 }
533 :     if ($func =~ /similar to/i) { return 1 }
534 :     if ($func =~ /similarity/i) { return 1 }
535 :     if ($func =~ /glimmer/i) { return 1 }
536 :     if ($func =~ /unknown/i) { return 1 }
537 :     if (($func =~ /domain/i) ||
538 :     ($func =~ /^y[a-z]{2,4}\b/i) ||
539 :     ($func =~ /complete/i) ||
540 :     ($func =~ /ensang/i) ||
541 :     ($func =~ /unnamed/i) ||
542 :     ($func =~ /EG:/) ||
543 :     ($func =~ /orf\d+/i) ||
544 :     ($func =~ /RIKEN/) ||
545 :     ($func =~ /Expressed/i) ||
546 :     ($func =~ /[a-zA-Z]{2,3}\|/) ||
547 :     ($func =~ /predicted by Psort/) ||
548 :     ($func =~ /^bh\d+/i) ||
549 :     ($func =~ /cds_/i) ||
550 :     ($func =~ /^[a-z]{2,3}\d+[^:\+\-0-9]/i) ||
551 :     ($func =~ /similar to/i) ||
552 :     ($func =~ / identi/i) ||
553 :     ($func =~ /ortholog of/i) ||
554 :     ($func =~ /structural feature/i)) { return 1 }
555 :     return 0;
556 :    
557 :     }
558 :    
559 :     =head3 location_cmp
560 :    
561 :     my $cmp = location_cmp($loc1, $loc2);
562 :    
563 :     Compare two location strings (see L<SAP/Location Strings>).
564 :    
565 :     The ordering principle for locations is that they are sorted first by contig ID, then by
566 :     leftmost position, in reverse order by length, and then by direction. The effect is that
567 :     within a contig, the locations are ordered first and foremost in the way they would
568 :     appear when displayed in a picture of the contig and second in such a way that embedded
569 :     locations come after the locations in which they are embedded. In the case of two
570 :     locations that represent the exact same base pairs, the forward (C<+>) location is
571 :     arbitrarily placed first.
572 :    
573 :     =over 4
574 :    
575 :     =item loc1
576 :    
577 :     First location string to compare.
578 :    
579 :     =item loc2
580 :    
581 :     Second location string to compare.
582 :    
583 :     =item RETURN
584 :    
585 :     Returns a negative number if the B<loc1> location sorts first, a positive number if the
586 :     B<loc2> location sorts first, and zero if the two locations are the same.
587 : parrello 1.19
588 :    
589 :     =back
590 :    
591 :     =cut
592 :    
593 :     sub location_cmp {
594 :     # Get the parameters.
595 :     my ($loc1, $loc2) = @_;
596 :     # Parse the locations.
597 :     my ($contig1, $beg1, $strand1, $len1) = $loc1 =~ /^(.+)_(\d+)([+-])(\d+)$/;
598 :     my $left1 = ($strand1 eq '+' ? $beg1 : $beg1 - $len1 + 1);
599 :     my ($contig2, $beg2, $strand2, $len2) = $loc2 =~ /^(.+)_(\d+)([+-])(\d+)$/;
600 :     my $left2 = ($strand2 eq '+' ? $beg2 : $beg2 - $len2 + 1);
601 :     # Declare the return variable. We compare the indicative parts of the location
602 :     # in order. Note that we sort in reverse order by length, so the last comparison
603 :     # puts 2 before 1.
604 :     my $retVal = ($contig1 cmp $contig2) || ($left1 <=> $left2) ||
605 :     ($len2 <=> $len1);
606 :     # If everything matches to this point, check the strands.
607 :     if (! $retVal) {
608 :     if ($strand1 eq '+') {
609 :     # First location is positive, so if the locations are unequal, it
610 :     # sorts first.
611 :     $retVal = ($strand2 eq '+' ? 0 : -1);
612 :     } else {
613 :     # First location is negative, so if the locations are unequal, it
614 :     # sorts second.
615 :     $retVal = ($strand1 eq '-' ? 0 : 1);
616 :     }
617 :     }
618 :     # Return the result.
619 :     return $retVal;
620 :     }
621 :    
622 : parrello 1.21 =head3 location_string
623 : parrello 1.19
624 : parrello 1.21 my $locString = location_string($contig, $beg, $end);
625 : parrello 1.19
626 : parrello 1.21 Form a location string for the specified contig that starts at the
627 :     indicated begin location and stops at the indicated end location. A
628 :     single-base location will automatically be put on the forward strand.
629 : parrello 1.19
630 :     =over 4
631 :    
632 : parrello 1.21 =item contig
633 :    
634 :     ID of the contig to contain this location.
635 :    
636 :     =item beg
637 :    
638 :     Beginning offset of the location.
639 :    
640 :     =item end
641 : parrello 1.19
642 : parrello 1.21 Ending offset of the location.
643 : parrello 1.19
644 :     =item RETURN
645 :    
646 : parrello 1.21 Returns a location string (see L<SAP/Location String>) for the specified
647 :     location.
648 : parrello 1.19
649 :     =back
650 :    
651 :     =cut
652 :    
653 : parrello 1.21 sub location_string {
654 : parrello 1.19 # Get the parameters.
655 : parrello 1.21 my ($contig, $beg, $end) = @_;
656 :     # Compute the strand and length.
657 :     my ($strand, $len);
658 :     if ($beg <= $end) {
659 :     $strand = '+';
660 :     $len = $end + 1 - $beg;
661 :     } else {
662 :     $strand = '-';
663 :     $len = $beg + 1 - $end;
664 : parrello 1.19 }
665 :     # Return the result.
666 : parrello 1.21 return $contig . "_$beg$strand$len";
667 : parrello 1.16 }
668 :    
669 : parrello 1.5 =head3 max
670 :    
671 :     my $max = max(@nums);
672 :    
673 :     Return the maximum number from all the values in the specified list.
674 :    
675 :     =over 4
676 :    
677 :     =item nums
678 :    
679 :     List of numbers to examine.
680 :    
681 :     =item RETURN
682 :    
683 :     Returns the maximum numeric value from the specified parameters, or
684 :     an undefined value if an empty list is passed in.
685 :    
686 :     =back
687 :    
688 :     =cut
689 :    
690 :     sub max {
691 :     my ($retVal, @nums) = @_;
692 :     for my $num (@nums) {
693 :     if ($num > $retVal) {
694 :     $retVal = $num;
695 :     }
696 :     }
697 :     return $retVal;
698 :     }
699 :    
700 :     =head3 min
701 :    
702 :     my $min = min(@nums);
703 :    
704 :     Return the minimum number from all the values in the specified list.
705 :    
706 :     =over 4
707 :    
708 :     =item nums
709 :    
710 :     List of numbers to examine.
711 :    
712 :     =item RETURN
713 :    
714 :     Returns the minimum numeric value from the specified parameters, or
715 :     an undefined value if an empty list is passed in.
716 :    
717 :     =back
718 :    
719 :     =cut
720 :    
721 :     sub min {
722 :     my ($retVal, @nums) = @_;
723 :     for my $num (@nums) {
724 :     if ($num < $retVal) {
725 :     $retVal = $num;
726 :     }
727 :     }
728 :     return $retVal;
729 :     }
730 :    
731 : parrello 1.21 =head3 parse_fasta_record
732 : parrello 1.1
733 : parrello 1.21 my ($id, $comment, $seq) = parse_fasta_record($string);
734 : parrello 1.1
735 : parrello 1.21 Extract the ID, comment, and sequence from a single FASTA record. For
736 :     backward compatability, instead of a FASTA record the ID and sequence can
737 :     be specified separated by a comma. In this case, the returned comment
738 :     will be empty.
739 : parrello 1.1
740 :     =over 4
741 :    
742 : parrello 1.21 =item string
743 : parrello 1.1
744 : parrello 1.21 A single FASTA record, or an ID and sequence separated by a single comma,
745 :     an unadorned sequence, a 2-element list consisting of an ID and a sequence,
746 :     or a 3-element list consisting of an ID, a comment, and a sequence.
747 : parrello 1.7
748 : parrello 1.1 =item RETURN
749 :    
750 : parrello 1.21 Returns a three-element list consisting of the incoming ID, the associated
751 :     comment, and the specified DNA or protein sequence. If the incoming string is
752 :     invalid, all three list elements will come back undefined. If no ID is
753 :     specified, an MD5 will be provided.
754 : parrello 1.1
755 :     =back
756 :    
757 :     =cut
758 :    
759 : parrello 1.21 sub parse_fasta_record {
760 : parrello 1.1 # Get the parameters.
761 : parrello 1.21 my ($string) = @_;
762 :     # Declare the return variables.
763 :     my ($id, $comment, $seq);
764 :     # Check the type of input string.
765 :     if (! defined $string) {
766 :     # Do nothing if no string was passed in. This extra check prevents a
767 :     # warning at runtime.
768 :     } elsif ($string =~ /^>(\S+)([\t ]+[^\r\n]*)?[\r\n]+(.+)/s) {
769 :     # Here we have a standard FASTA string.
770 :     ($id, $comment, $seq) = ($1, $2, $3);
771 :     # Remove white space from the sequence string.
772 :     $seq =~ s/\s+//sg;
773 :     # Trim front of comment.
774 :     $comment =~ s/^s+//;
775 :     } elsif ($string =~ /(.+?)\s*,\s*(.+)/) {
776 :     ($id, $comment, $seq) = ($1, '', $2);
777 :     } elsif (ref $string eq 'ARRAY') {
778 :     # Here the data came in pre-formatted as a list reference.
779 :     ($id, $comment, $seq) = @$string;
780 :     # If there's no comment, we need to adjust.
781 :     if (! defined $seq) {
782 :     $seq = $comment;
783 :     $comment = '';
784 :     }
785 : parrello 1.7 } else {
786 : parrello 1.21 # Here we have only a sequence. We need to construct the ID.
787 :     $seq = $string;
788 :     require Digest::MD5;
789 :     $id = "md5|" . Digest::MD5::md5_base64($seq);
790 :     $comment = "";
791 :     }
792 :     # Return the results.
793 :     return ($id, $comment, $seq);
794 :     }
795 :    
796 :     =head3 parse_location
797 :    
798 :     my ($contig, $begin, $end, $strand) = parse_location($locString);
799 :    
800 :     Return the contigID, start offset, and end offset for a specified
801 :     location string (see L<SAP/Location Strings>).
802 :    
803 :     =over 4
804 :    
805 :     =item locString
806 :    
807 :     Location string to parse.
808 :    
809 :     =item RETURN
810 :    
811 :     Returns a four-element list containing the contig ID from the location
812 :     string, the starting offset of the location, the ending offset, and the
813 :     strand. If the location string is not valid, the values returned will be
814 :     C<undef>.
815 :    
816 :     =back
817 :    
818 :     =cut
819 :    
820 :     sub parse_location {
821 :     # Get the parameters.
822 :     my ($locString) = @_;
823 :     # Declare the return variables.
824 :     my ($contig, $begin, $end, $strand);
825 :     # Parse the location string.
826 :     if ($locString =~ /^(.+)_(\d+)([+-])(\d+)$/) {
827 :     # Pull out the contig ID, strand, and begin location.
828 :     $contig = $1;
829 :     $begin = $2;
830 :     $strand = $3;
831 :     # Compute the ending location from the direction and length.
832 :     if ($3 eq '+') {
833 :     $end = $begin + $4 - 1;
834 :     } else {
835 :     $end = $begin - $4 + 1;
836 : parrello 1.7 }
837 : parrello 1.1 }
838 : parrello 1.21 # Return the results.
839 :     return ($contig, $begin, $end, $strand);
840 : parrello 1.1 }
841 :    
842 : parrello 1.2 =head3 rev_comp
843 :    
844 :     my $revcmp = rev_comp($dna);
845 :    
846 :     or
847 :    
848 :     rev_comp(\$dna);
849 :    
850 :     Return the reverse complement of a DNA string.
851 :    
852 :     =over 4
853 :    
854 :     =item dna
855 :    
856 :     Either a DNA string, or a reference to a DNA string.
857 :    
858 :     =item RETURN
859 :    
860 :     If the input is a DNA string, returns the reverse complement. If the
861 :     input is a reference to a DNA string, the string itself is reverse
862 :     complemented.
863 :    
864 :     =back
865 :    
866 :     =cut
867 :    
868 :     sub rev_comp {
869 :     # Get the parameters.
870 :     my ($dna) = @_;
871 :     # Determine how we were called.
872 :     my ($retVal, $refMode);
873 : parrello 1.24 if (! ref $dna) {
874 : parrello 1.21 $retVal = reverse $dna;
875 : parrello 1.2 $refMode = 0;
876 :     } else {
877 : parrello 1.21 $retVal = reverse $$dna;
878 : parrello 1.2 $refMode = 1;
879 :     }
880 : parrello 1.21 # Now $retVal contains the reversed DNA string, and $refMode is TRUE iff the
881 :     # user passed in a reference. The following translation step complements the
882 :     # string.
883 :     $retVal =~ tr/acgtumrwsykbdhvACGTUMRWSYKBDHV/tgcaakywsrmvhdbTGCAAKYWSRMVHDB/;
884 : parrello 1.2 # Return the result in the method corresponding to the way it came in.
885 :     if ($refMode) {
886 :     $$dna = $retVal;
887 :     return;
888 :     } else {
889 :     return $retVal;
890 :     }
891 :     }
892 :    
893 : parrello 1.21 # Synonym of rev_comp, for backward compatibility.
894 :     sub reverse_comp {
895 :     return rev_comp($_[0]);
896 : parrello 1.15 }
897 :    
898 : parrello 1.21 =head3 roles_of_function
899 : parrello 1.15
900 : parrello 1.21 my @roles = roles_of_function($assignment);
901 : parrello 1.3
902 : parrello 1.21 Return a list of the functional roles in the specified assignment string.
903 :     A single assignment may contain multiple roles as well as comments; this
904 :     method separates them out.
905 : parrello 1.3
906 :     =over 4
907 :    
908 : parrello 1.21 =item assignment
909 : parrello 1.3
910 : parrello 1.21 Functional assignment to parse for roles.
911 : parrello 1.3
912 :     =item RETURN
913 :    
914 : parrello 1.21 Returns a list of the individual roles in the assignment.
915 : parrello 1.3
916 :     =back
917 :    
918 :     =cut
919 :    
920 : parrello 1.21 sub roles_of_function {
921 : parrello 1.3 # Get the parameters.
922 : parrello 1.21 my ($assignment) = @_;
923 :     # Remove any comment.
924 :     my $commentFree = ($assignment =~ /(.+?)\s*#/ ? $1 : $assignment);
925 :     # Split out the roles.
926 :     my @retVal = split /\s+[\/@;]\s+/, $commentFree;
927 : parrello 1.3 # Return the result.
928 : parrello 1.21 return @retVal;
929 : parrello 1.3 }
930 :    
931 : parrello 1.6 =head3 sims
932 :    
933 : parrello 1.26 my $sims = sims($id, $maxN, $maxP, $select, $max_expand, $filters);
934 : parrello 1.6
935 :     Retrieve similarities from the network similarity server. The similarity retrieval
936 :     is performed using an HTTP user agent that returns similarity data in multiple
937 :     chunks. An anonymous subroutine is passed to the user agent that parses and
938 :     reformats the chunks as they come in. The similarites themselves are returned
939 :     as B<Sim> objects. Sim objects are actually list references with 15 elements.
940 :     The Sim object methods allow access to the elements by name.
941 :    
942 :     Similarities can be either raw or expanded. The raw similarities are basic
943 :     hits between features with similar DNA. Expanding a raw similarity drags in any
944 :     features considered substantially identical. So, for example, if features B<A1>,
945 :     B<A2>, and B<A3> are all substatially identical to B<A>, then a raw similarity
946 :     B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>.
947 :    
948 :     =over 4
949 :    
950 :     =item id
951 :    
952 :     ID of the feature whose similarities are desired, or reference to a list
953 :     of the IDs of the features whose similarities are desired.
954 :    
955 :     =item maxN
956 :    
957 : parrello 1.27 Maximum number of similarities to return for each incoming feature.
958 : parrello 1.6
959 :     =item maxP
960 :    
961 :     The maximum allowable similarity score.
962 :    
963 :     =item select
964 :    
965 :     Selection criterion: C<raw> means only raw similarities are returned; C<fig>
966 :     means only similarities to FIG features are returned; C<all> means all expanded
967 :     similarities are returned; and C<figx> means similarities are expanded until the
968 :     number of FIG features equals the maximum.
969 :    
970 :     =item max_expand
971 :    
972 :     The maximum number of features to expand.
973 :    
974 :     =item filters
975 :    
976 :     Reference to a hash containing filter information, or a subroutine that can be
977 :     used to filter the sims.
978 :    
979 :     =item RETURN
980 :    
981 :     Returns a list of L<Sim> objects.
982 :    
983 :     =back
984 :    
985 :     =cut
986 :    
987 :     sub sims {
988 :     # Get the parameters.
989 :     my($id, $maxN, $maxP, $select, $max_expand, $filters) = @_;
990 :     # Get the URL for submitting to the sims server.
991 :     my $url = $FIG_Config::sim_server_url || "http://bioseed.mcs.anl.gov/simserver/perl/sims.pl";
992 :     # Get a list of the IDs to process.
993 :     my @ids;
994 :     if (ref($id) eq "ARRAY") {
995 :     @ids = @$id;
996 :     } else {
997 :     @ids = ($id);
998 :     }
999 :     # Form a list of the parameters to pass to the server.
1000 :     my %args = ();
1001 :     $args{id} = \@ids;
1002 :     $args{maxN} = $maxN if defined($maxN);
1003 :     $args{maxP} = $maxP if defined($maxP);
1004 :     $args{select} = $select if defined($select);
1005 :     $args{max_expand} = $max_expand if defined($max_expand);
1006 :     # If the filter is a hash, put the filters in the argument list.
1007 :     if (ref($filters) eq 'HASH') {
1008 :     for my $k (keys(%$filters))
1009 :     {
1010 :     $args{"filter_$k"}= $filters->{$k};
1011 :     }
1012 :     }
1013 :     # Get the user agent.
1014 :     require LWP::UserAgent;
1015 :     my $ua = LWP::UserAgent->new();
1016 :     # Insure we have the Sim module.
1017 :     require Sim;
1018 :     #
1019 :     # Our next task is to create the anonymous subroutine that will process the
1020 :     # chunks that come back from the server. We require three global variables:
1021 :     # @sims to hold the similarities found, $tail to remember the unprocessed
1022 :     # data from the previous chunk, and $chunks to count the chunks.
1023 :     #
1024 :     my @retVal;
1025 :     my $tail;
1026 :     my $chunks = 0;
1027 :     #
1028 :     # ANONYMOUS SUBROUTINE
1029 :     #
1030 :     my $cb = sub {
1031 :     eval {
1032 :     # Get the parameters.
1033 :     my ($data, $command) = @_;
1034 :     # Check for a reset command. If we get one, we discard any data
1035 :     # in progress.
1036 :     if ($command && $command eq 'reset') {
1037 :     $tail = '';
1038 :     } else {
1039 :     $chunks++;
1040 :     # Get the data to process. Note we concatenate it to the incoming
1041 :     # tail from last time.
1042 :     my $c = $tail . $data;
1043 :     # Make sure the caller hasn't messed up the new-line character.
1044 :     # FASTA readers in particular are notorious for doing things
1045 :     # like that.
1046 :     local $/ = "\n";
1047 :     # Split the input into lines.
1048 :     my @lines = split(/\n/, $c);
1049 :     # If the input does not end with a new-line, we have a partial
1050 :     # chunk and need to put it in the tail for next time. If not,
1051 :     # there is no tail for next time.
1052 :     if (substr($c, -1, 1) ne "\n") {
1053 :     $tail = pop @lines;
1054 :     } else {
1055 :     $tail = '';
1056 :     }
1057 :     # Loop through the lines. Note there's no need to chomp because
1058 :     # the SPLIT took out the new-line characters.
1059 :     for my $l (@lines) {
1060 :     # Split the line into fields.
1061 :     my @s = split(/\t/, $l);
1062 :     # Insure we have all the fields we need.
1063 : parrello 1.12 if (@s >= 9) {
1064 : parrello 1.6 # Check to see if we've seen this SIM before.
1065 :     my $id1 = $s[0];
1066 :     my $id2 = $s[1];
1067 :     # Add it to the result list.
1068 :     push(@retVal, bless \@s, 'Sim');
1069 :     }
1070 :     }
1071 :     }
1072 :     };
1073 :     };
1074 :     #
1075 :     # END OF ANONYMOUS SUBROUTINE
1076 :     #
1077 :     # Now we're ready to start. Because networking is an iffy thing, we set up
1078 :     # to try our request multiple times.
1079 :     my $n_retries = 10;
1080 :     my $attempts = 0;
1081 :     # Set the timeout value, in seconds.
1082 :     $ua->timeout(180);
1083 :     # Loop until we succeed or run out of retries.
1084 :     my $done = 0;
1085 :     while (! $done && $attempts++ < $n_retries) {
1086 :     # Reset the content processor. This clears the tail.
1087 :     &$cb(undef, 'reset');
1088 :     my $resp = $ua->post($url, \%args, ':content_cb' => $cb);
1089 :     if ($resp->is_success) {
1090 :     # If the response was successful, get the content. This triggers
1091 :     # the anonymous subroutine.
1092 :     my $x = $resp->content;
1093 :     # Denote we've been successful.
1094 :     $done = 1;
1095 :     }
1096 :     }
1097 :     return @retVal;
1098 :     }
1099 :    
1100 : parrello 1.21 =head3 standard_genetic_code
1101 :    
1102 :     my $code = standard_genetic_code();
1103 :    
1104 :     Return a hash containing the standard translation of nucleotide triples to proteins.
1105 :     Methods such as L</translate> can take a translation scheme as a parameter. This method
1106 :     returns the default translation scheme. The scheme is implemented as a reference to a
1107 :     hash that contains nucleotide triplets as keys and has protein letters as values.
1108 :    
1109 :     =cut
1110 :    
1111 :     sub standard_genetic_code {
1112 :    
1113 :     my $code = {};
1114 : parrello 1.8
1115 : parrello 1.21 $code->{"AAA"} = "K";
1116 :     $code->{"AAC"} = "N";
1117 :     $code->{"AAG"} = "K";
1118 :     $code->{"AAT"} = "N";
1119 :     $code->{"ACA"} = "T";
1120 :     $code->{"ACC"} = "T";
1121 :     $code->{"ACG"} = "T";
1122 :     $code->{"ACT"} = "T";
1123 :     $code->{"AGA"} = "R";
1124 :     $code->{"AGC"} = "S";
1125 :     $code->{"AGG"} = "R";
1126 :     $code->{"AGT"} = "S";
1127 :     $code->{"ATA"} = "I";
1128 :     $code->{"ATC"} = "I";
1129 :     $code->{"ATG"} = "M";
1130 :     $code->{"ATT"} = "I";
1131 :     $code->{"CAA"} = "Q";
1132 :     $code->{"CAC"} = "H";
1133 :     $code->{"CAG"} = "Q";
1134 :     $code->{"CAT"} = "H";
1135 :     $code->{"CCA"} = "P";
1136 :     $code->{"CCC"} = "P";
1137 :     $code->{"CCG"} = "P";
1138 :     $code->{"CCT"} = "P";
1139 :     $code->{"CGA"} = "R";
1140 :     $code->{"CGC"} = "R";
1141 :     $code->{"CGG"} = "R";
1142 :     $code->{"CGT"} = "R";
1143 :     $code->{"CTA"} = "L";
1144 :     $code->{"CTC"} = "L";
1145 :     $code->{"CTG"} = "L";
1146 :     $code->{"CTT"} = "L";
1147 :     $code->{"GAA"} = "E";
1148 :     $code->{"GAC"} = "D";
1149 :     $code->{"GAG"} = "E";
1150 :     $code->{"GAT"} = "D";
1151 :     $code->{"GCA"} = "A";
1152 :     $code->{"GCC"} = "A";
1153 :     $code->{"GCG"} = "A";
1154 :     $code->{"GCT"} = "A";
1155 :     $code->{"GGA"} = "G";
1156 :     $code->{"GGC"} = "G";
1157 :     $code->{"GGG"} = "G";
1158 :     $code->{"GGT"} = "G";
1159 :     $code->{"GTA"} = "V";
1160 :     $code->{"GTC"} = "V";
1161 :     $code->{"GTG"} = "V";
1162 :     $code->{"GTT"} = "V";
1163 :     $code->{"TAA"} = "*";
1164 :     $code->{"TAC"} = "Y";
1165 :     $code->{"TAG"} = "*";
1166 :     $code->{"TAT"} = "Y";
1167 :     $code->{"TCA"} = "S";
1168 :     $code->{"TCC"} = "S";
1169 :     $code->{"TCG"} = "S";
1170 :     $code->{"TCT"} = "S";
1171 :     $code->{"TGA"} = "*";
1172 :     $code->{"TGC"} = "C";
1173 :     $code->{"TGG"} = "W";
1174 :     $code->{"TGT"} = "C";
1175 :     $code->{"TTA"} = "L";
1176 :     $code->{"TTC"} = "F";
1177 :     $code->{"TTG"} = "L";
1178 :     $code->{"TTT"} = "F";
1179 : parrello 1.8
1180 : parrello 1.21 return $code;
1181 :     }
1182 : parrello 1.8
1183 : parrello 1.21 =head3 strand_of
1184 : parrello 1.8
1185 : parrello 1.21 my $plusOrMinus = strand_of($loc);
1186 : parrello 1.8
1187 : parrello 1.21 Return the strand (C<+> or C<->) from the specified location string.
1188 : parrello 1.8
1189 : parrello 1.21 =over 4
1190 : parrello 1.8
1191 : parrello 1.21 =item loc
1192 : parrello 1.8
1193 : parrello 1.21 Location string to parse (see L<SAP/Location Strings>).
1194 : parrello 1.9
1195 :     =item RETURN
1196 :    
1197 : parrello 1.21 Returns C<+> if the location is on the forward strand, else C<->.
1198 : parrello 1.9
1199 :     =back
1200 :    
1201 :     =cut
1202 :    
1203 : parrello 1.21 sub strand_of {
1204 : parrello 1.9 # Get the parameters.
1205 : parrello 1.21 my ($loc) = @_;
1206 :     # Declare the return variable.
1207 :     my $retVal;
1208 :     # Parse the strand indicator from the location.
1209 :     if ($loc =~ /\d+([+-])\d+/) {
1210 :     $retVal = $1;
1211 : parrello 1.9 }
1212 : parrello 1.21 # Return the result.
1213 :     return $retVal;
1214 : parrello 1.9 }
1215 :    
1216 : parrello 1.21 =head3 strip_ec
1217 : parrello 1.9
1218 : parrello 1.21 my $role = strip_ec($rawRole);
1219 : parrello 1.11
1220 : parrello 1.21 Strip the EC number (if any) from the specified role or functional
1221 :     assignment.
1222 : parrello 1.11
1223 :     =over 4
1224 :    
1225 : parrello 1.21 =item rawRole
1226 : parrello 1.11
1227 : parrello 1.21 Role or functional assignment from which the EC numbers are to be stripped.
1228 : parrello 1.11
1229 :     =item RETURN
1230 :    
1231 : parrello 1.21 Returns the incoming string with any EC numbers removed. The EC numbers must
1232 :     be formatted in the standard format used by the SEED (with the C<EC> prefix
1233 :     and surrounding parentheses).
1234 : parrello 1.11
1235 :     =back
1236 :    
1237 :     =cut
1238 :    
1239 : parrello 1.21 sub strip_ec {
1240 :     # Get the parameters.
1241 :     my ($rawRole) = @_;
1242 :     # Declare the return variable.
1243 :     my $retVal = $rawRole;
1244 :     # Remove the EC numbers.
1245 :     $retVal =~ s/\s*\(EC\s+[0-9.\-]+\)//g;
1246 :     # Return the result.
1247 :     return $retVal;
1248 : parrello 1.15 }
1249 :    
1250 :     =head3 translate
1251 :    
1252 :     my $aa_seq = translate($dna_seq, $code, $fix_start);
1253 :    
1254 :     Translate a DNA sequence to a protein sequence using the specified genetic code.
1255 :     If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to
1256 :     C<M>. (In the standard genetic code, these two combinations normally translate
1257 :     to C<V> and C<L>, respectively.)
1258 :    
1259 :     =over 4
1260 :    
1261 :     =item dna_seq
1262 :    
1263 :     DNA sequence to translate. Note that the DNA sequence can only contain
1264 :     known nucleotides.
1265 :    
1266 :     =item code
1267 :    
1268 :     Reference to a hash specifying the translation code. The hash is keyed by
1269 :     nucleotide triples, and the value for each key is the corresponding protein
1270 :     letter. If this parameter is omitted, the L</standard_genetic_code> will be
1271 :     used.
1272 :    
1273 :     =item fix_start
1274 :    
1275 :     TRUE if the first triple is to get special treatment, else FALSE. If TRUE,
1276 :     then a value of C<TTG> or C<GTG> in the first position will be translated to
1277 :     C<M> instead of the value specified in the translation code.
1278 :    
1279 :     =item RETURN
1280 :    
1281 :     Returns a string resulting from translating each nucleotide triple into a
1282 :     protein letter.
1283 :    
1284 :     =back
1285 :    
1286 :     =cut
1287 :     #: Return Type $;
1288 :     sub translate {
1289 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
1290 :    
1291 :     my( $dna,$code,$start ) = @_;
1292 :     my( $i,$j,$ln );
1293 :     my( $x,$y );
1294 :     my( $prot );
1295 :    
1296 :     if (! defined($code)) {
1297 :     $code = &FIG::standard_genetic_code;
1298 :     }
1299 :     $ln = length($dna);
1300 :     $prot = "X" x ($ln/3);
1301 :     $dna =~ tr/a-z/A-Z/;
1302 :    
1303 :     for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) {
1304 :     $x = substr($dna,$i,3);
1305 :     if ($y = $code->{$x}) {
1306 :     substr($prot,$j,1) = $y;
1307 :     }
1308 :     }
1309 :    
1310 :     if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) {
1311 :     substr($prot,0,1) = 'M';
1312 :     }
1313 :     return $prot;
1314 :     }
1315 :    
1316 : parrello 1.21 =head3 verify_dir
1317 : parrello 1.18
1318 : parrello 1.21 verify_dir($dirName);
1319 : parrello 1.18
1320 : parrello 1.21 Insure that the specified directory exists. If the directory does not
1321 :     exist, it will be created.
1322 : parrello 1.18
1323 :     =over 4
1324 :    
1325 : parrello 1.21 =item dirName
1326 : parrello 1.18
1327 : parrello 1.21 Name of the relevant directory.
1328 : parrello 1.18
1329 :     =back
1330 :    
1331 :     =cut
1332 :    
1333 : parrello 1.21 sub verify_dir {
1334 : parrello 1.18 # Get the parameters.
1335 : parrello 1.21 my ($dirName) = @_;
1336 :     # Strip off the final slash, if any.
1337 :     $dirName =~ s#/$##;
1338 :     # Only proceed if the directory does NOT already exist.
1339 :     if (! -d $dirName) {
1340 :     # If there is a parent directory, recursively insure it is there.
1341 :     if ($dirName =~ m#(.+)/[^/]+$#) {
1342 :     verify_dir($1);
1343 : overbeek 1.20 }
1344 : parrello 1.21 # Create this particular directory with full permissions.
1345 :     mkdir $dirName, 0777;
1346 : overbeek 1.20 }
1347 :     }
1348 :    
1349 : disz 1.28 sub strip_func {
1350 :     my($func) = @_;
1351 : overbeek 1.20
1352 : disz 1.28 $func =~ s/^FIG\d{6}[^:]*:\s*//;
1353 :     $func =~ s/\s*\#.*$//;
1354 :     return($func);
1355 :     }
1356 :    
1357 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3