[Bio] / FigKernelPackages / SeedUtils.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/SeedUtils.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.56 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 : parrello 1.11 # This is a SAS component.
4 :    
5 : gdpusch 1.55 ########################################################################
6 : parrello 1.1 # Copyright (c) 2003-2006 University of Chicago and Fellowship
7 :     # for Interpretations of Genomes. All Rights Reserved.
8 :     #
9 :     # This file is part of the SEED Toolkit.
10 :     #
11 :     # The SEED Toolkit is free software. You can redistribute
12 :     # it and/or modify it under the terms of the SEED Toolkit
13 :     # Public License.
14 :     #
15 :     # You should have received a copy of the SEED Toolkit Public License
16 :     # along with this program; if not write to the University of Chicago
17 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
18 :     # Genomes at veronika@thefig.info or download a copy from
19 :     # http://www.theseed.org/LICENSE.TXT.
20 : gdpusch 1.55 ########################################################################
21 : parrello 1.1
22 :     package SeedUtils;
23 : olson 1.38 use BerkTable;
24 :     use DB_File;
25 : olson 1.42 use Carp;
26 : parrello 1.1
27 : olson 1.32 #
28 :     # In case we are running in a SEED, pull in the FIG_Config
29 :     #
30 :     BEGIN
31 :     {
32 :     eval {
33 :     require FIG_Config;
34 :     };
35 :     }
36 :    
37 : parrello 1.1 use strict;
38 : parrello 1.12 no warnings qw(once);
39 : parrello 1.1 use base qw(Exporter);
40 : parrello 1.15 our @EXPORT = qw(hypo boundaries_of parse_fasta_record create_fasta_record
41 :     rev_comp genome_of min max sims verify_dir between translate
42 : parrello 1.21 standard_genetic_code parse_location roles_of_function
43 : olson 1.33 strip_ec location_string location_cmp strand_of by_fig_id
44 : olson 1.39 verify_db bbh_data id_url validate_fasta_file);
45 : parrello 1.1
46 :     =head1 SEED Utility Methods
47 :    
48 :     =head2 Introduction
49 :    
50 :     This is a simple utility package that performs functions useful for
51 :     bioinformatics, but that do not require access to the databases.
52 :    
53 :     =head2 Public Methods
54 :    
55 : parrello 1.21 =head3 abbrev
56 :    
57 :     my $abbrev = SeedUtils::abbrev($genome_name);
58 :    
59 :     Return an abbreviation of the specified genome name. This method is used to create
60 :     a reasonably indicative genome name that fits in 10 characters.
61 :    
62 :     =over 4
63 :    
64 :     =item genome_name
65 :    
66 :     Genome name to abbreviate.
67 :    
68 :     =item RETURN
69 :    
70 :     Returns a shortened version of the genome name that is 10 characters or less in
71 :     length.
72 :    
73 :     =back
74 :    
75 :     =cut
76 :    
77 :     sub abbrev {
78 :     my($genome_name) = @_;
79 : overbeek 1.44 my %exclude = map { $_ => 1 }
80 : overbeek 1.50 qw( candidatus subspecies subsp strain str bv pv sp );
81 :     my ($p1,$p2,@rest) = grep { ! $exclude{lc $_} }
82 :     map { $_ =~ s/\W//g; $_ }
83 :     split(/\s/,$genome_name);
84 :     my $p3 = join("",@rest);
85 : parrello 1.21
86 : overbeek 1.44 my $lbl_ln = 10;
87 :    
88 :     if (! $p2)
89 :     {
90 :     if (length($p1) > $lbl_ln) { $p1 = substr($p1,0,$lbl_ln) }
91 :     return $p1;
92 :     }
93 :     elsif (! $p3)
94 :     {
95 :     my $l1 = length($p1);
96 :     my $l2 = length($p2);
97 :     my $ln1 = $l1;
98 :     my $ln2 = $l2;
99 :    
100 :     if (($l1 + $l2 + 1) > $lbl_ln)
101 :     {
102 :     $ln1 = $lbl_ln - ($l2+1);
103 :     $ln1 = &min($l1,&max($ln1,3));
104 :     $ln2 = $lbl_ln - ($ln1+1);
105 :     $p1 = substr($p1,0,$ln1);
106 :     $p2 = substr($p2,0,$ln2);
107 :     }
108 :     my $sep = ($l1 == $ln1) ? '_' : '.';
109 :     return $p1 . $sep . $p2;
110 :     }
111 :     else
112 :     {
113 :     my $l1 = length($p1);
114 :     my $l2 = length($p2);
115 :     my $l3 = length($p3);
116 :     my $l23 = $l2+$l3+1;
117 :     my $ln1 = $l1;
118 :     my $ln2 = $l2;
119 :     my $ln3 = $l3;
120 :    
121 :     if (($l1 + $l2 + $l3 + 2) > $lbl_ln)
122 :     {
123 :     $ln1 = $lbl_ln - ($l23 + 1);
124 :     $ln1 = &min($l1,&max($ln1,3));
125 :     my $rest = $lbl_ln - ($ln1+1);
126 :     $ln2 = $rest - ($ln3+1);
127 :     $ln2 = &min($l2,&max($ln2,3));
128 : overbeek 1.51 $ln3 = $lbl_ln - ($ln1+$ln2+2);
129 : overbeek 1.44
130 :     $p1 = substr($p1,0,$ln1);
131 :     $p2 = substr($p2,0,$ln2);
132 :     $p3 = substr($p3,0,$ln3);
133 :     }
134 :     my $sep1 = ($l1 == $ln1) ? '_' : '.';
135 :     my $sep2 = ($l2 == $ln2) ? '_' : '.';
136 :     return $p1 . $sep1 . $p2 . $sep2 . $p3;
137 :     }
138 :     }
139 :    
140 :     =head3 abbrev_set
141 :    
142 :     my $abbrevH = SeedUtils::abbrev_set($genome_names);
143 :    
144 :     Takes a pointer to a list of genome names and returns a hash mapping names to unique
145 :     abbreviations. The names will be less than or equal to 10 characters in length.
146 :    
147 :     =over 4
148 :    
149 :     =item genome_names
150 :    
151 :     Pointer to a list of genome names
152 :    
153 :     =item RETURN
154 :    
155 :     Returns a hash mapping full names to unique abbreviations.
156 :    
157 :     =back
158 :    
159 :     =cut
160 :    
161 :     sub abbrev_set {
162 :     my($genome_names) = @_;
163 :    
164 :     my %seen;
165 :     my $hash = {};
166 :     foreach my $name (@$genome_names)
167 :     {
168 :     next if ($hash->{$name});
169 :     my $abbrev = &abbrev($name);
170 :     while ($seen{$abbrev})
171 :     {
172 :     $abbrev = &next_try($abbrev);
173 :     }
174 :     $hash->{$name} = $abbrev;
175 :     $seen{$abbrev} = 1;
176 :     }
177 :     return $hash;
178 :     }
179 :    
180 :     sub next_try {
181 :     my($abbrev) = @_;
182 :    
183 :     my($ext) = ($abbrev =~ s/\.(\d+)$//);
184 :     $ext ||= 0;
185 :     $ext++;
186 :     my $ln = length($abbrev) + length($ext) + 1;
187 :     if ($ln > 10)
188 :     {
189 :     $abbrev = substr($abbrev,0,(10 - (length($ext)+1)));
190 : parrello 1.21 }
191 : overbeek 1.44 return "$abbrev.$ext";
192 : parrello 1.21 }
193 :    
194 : parrello 1.34 =head3 bbh_data
195 :    
196 :     my $bbhList = FIGRules::bbh_data($peg, $cutoff);
197 :    
198 :     Return a list of the bi-directional best hits relevant to the specified PEG.
199 :    
200 :     =over 4
201 :    
202 :     =item peg
203 :    
204 :     ID of the feature whose bidirectional best hits are desired.
205 :    
206 :     =item cutoff
207 :    
208 :     Similarity cutoff. If omitted, 1e-10 is used.
209 :    
210 :     =item RETURN
211 :    
212 :     Returns a reference to a list of 3-tuples. The first element of the list is the best-hit
213 :     PEG; the second element is the score. A lower score indicates a better match. The third
214 :     element is the normalized bit score for the pair; it is normalized to the length
215 :     of the protein.
216 :    
217 :     =back
218 :    
219 :     =cut
220 : gdpusch 1.47
221 : parrello 1.34 #: Return Type @@;
222 :     sub bbh_data {
223 :     my ($peg, $cutoff) = @_;
224 :     my @retVal = ();
225 :     require LWP::UserAgent;
226 :     my $ua = LWP::UserAgent->new();
227 :     my $url = "http://bioseed.mcs.anl.gov/simserver/perl/bbhs.pl";
228 :     my $retries = 5;
229 :     my $done = 0;
230 :     my $resp;
231 :     while ($retries > 0 && ! $done) {
232 :     $resp = $ua->post($url, { id => $peg, cutoff => $cutoff });
233 :     if ($resp->is_success) {
234 :     my $dat = $resp->content;
235 :     while ($dat =~ /([^\n]+)\n/g) {
236 :     my @l = split(/\t/, $1);
237 :     push @retVal, \@l;
238 :     }
239 :     $done = 1;
240 :     } else {
241 :     $retries--;
242 :     }
243 :     }
244 :     if (! $done) {
245 :     die("Failure retrieving bbh data for $peg: " . $resp->status_line);
246 :     }
247 :     return \@retVal;
248 :     }
249 :    
250 :    
251 : parrello 1.21 =head3 between
252 :    
253 :     my $flag = between($x, $y, $z);
254 :    
255 :     Determine whether or not $y is between $x and $z.
256 :    
257 :     =over 4
258 :    
259 :     =item x
260 :    
261 :     First edge number.
262 :    
263 :     =item y
264 :    
265 :     Number to examine.
266 :    
267 :     =item z
268 :    
269 :     Second edge number.
270 :    
271 :     =item RETURN
272 :    
273 :     Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check
274 :     is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns
275 :     TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than
276 :     I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower,
277 :     then the return is TRUE if I<$x> >= I$<$y> >= I<$z>.
278 :    
279 :     =back
280 :    
281 :     =cut
282 : gdpusch 1.47
283 : parrello 1.21 #: Return Type $;
284 :     sub between {
285 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
286 :     my($x,$y,$z) = @_;
287 :    
288 :     if ($x < $z) {
289 :     return (($x <= $y) && ($y <= $z));
290 :     } else {
291 :     return (($x >= $y) && ($y >= $z));
292 :     }
293 :     }
294 :    
295 :    
296 : parrello 1.13 =head3 boundaries_of
297 :    
298 : parrello 1.21 my ($contig, $min, $max, $dir) = boundaries_of($locs);
299 : parrello 1.13
300 :     Return the boundaries of a set of locations. The contig, the leftmost
301 :     location, and the rightmost location will be returned to the caller. If
302 :     more than one contig is represented, the method will return an undefined
303 :     value for the contig (indicating failure).
304 :    
305 :     =over 4
306 :    
307 :     =item locs
308 :    
309 :     Reference to a list of location strings. A location string contains a contig ID,
310 :     and underscore (C<_>), a starting offset, a strand identifier (C<+> or C<->), and
311 :     a length (e.g. C<360108.3:NC_10023P_1000+2000> begins at offset 1000 of contig
312 :     B<360108.3:NC_10023P> and covers 2000 base pairs on the C<+> strand).
313 :    
314 :     =item RETURN
315 :    
316 : parrello 1.21 Returns a 4-element list. The first element is the contig ID from all the locations,
317 :     the second is the offset of leftmost base pair represented in the locations, the
318 :     third is the offset of the rightmost base pair represented in the locations, and
319 :     the fourth is the dominant strand.
320 : parrello 1.13
321 :     =back
322 :    
323 :     =cut
324 :    
325 :     sub boundaries_of {
326 :     # Get the parameters.
327 :     my ($locs) = @_;
328 :     # Declare the return variables.
329 :     my ($contig, $min, $max);
330 :     # We'll put all the starting and ending offsets found in here.
331 :     my @offsets;
332 : parrello 1.21 # This will be used to count the orientations.
333 :     my %dirs = ('+' => 0, '-' => 0);
334 : parrello 1.13 # This will count the number of errors found.
335 :     my $error = 0;
336 : parrello 1.35 # Insure the location is an array. If it's not, we assume it's a string of
337 :     # comma-separated values.
338 :     if (! ref $locs) {
339 :     $locs = [ split /\s*,\s*/, $locs ];
340 :     }
341 : parrello 1.13 # Loop through the locations.
342 :     for my $loc (@$locs) {
343 :     # Parse this location.
344 :     if ($loc =~ /^(.+)_(\d+)(\+|\-)(\d+)$/) {
345 : parrello 1.23 # This is a valid location string.
346 : parrello 1.13 my ($newContig, $begin, $dir, $len) = ($1, $2, $3, $4);
347 :     # Is this contig valid?
348 :     if ($contig && $newContig ne $contig) {
349 :     # No, skip this location.
350 :     $error++;
351 :     } else {
352 :     # Save the contig.
353 :     $contig = $newContig;
354 : parrello 1.21 # Count the orientation.
355 :     $dirs{$dir}++;
356 : parrello 1.13 # Compute the ending offset.
357 :     my $end = ($dir eq '+' ? $begin + $len - 1 : $begin - $len + 1);
358 :     # Save both offsets.
359 :     push @offsets, $begin, $end;
360 :     }
361 : parrello 1.23 } elsif ($loc =~ /^(.+)_(\d+)_(\d+)/) {
362 :     # Here we have an old-style location string.
363 :     my($newContig, $start, $stop) = ($1, $2, $3);
364 :     # Is this contig valid?
365 : olson 1.22 if ($contig && $newContig ne $contig) {
366 :     # No, skip this location.
367 :     $error++;
368 :     } else {
369 :     # Save the contig.
370 :     $contig = $newContig;
371 : parrello 1.23 # Compute the orientation.
372 :     my $dir;
373 :     if ($start > $stop) {
374 :     $dir = '-';
375 :     } else {
376 :     $dir = '+';
377 :     }
378 :     # Count it.
379 : olson 1.22 $dirs{$dir}++;
380 : parrello 1.23 # Save both offsets.
381 : olson 1.22 push @offsets, $start, $stop;
382 :     }
383 : parrello 1.13 } else {
384 :     # The location is invalid, so it's an error,
385 :     $error++;
386 :     }
387 :     }
388 :     # If there's an error, clear the contig ID.
389 :     if ($error) {
390 :     $contig = undef;
391 :     }
392 :     # Compute the min and max from the offsets collected.
393 :     $min = min(@offsets);
394 :     $max = max(@offsets);
395 : parrello 1.21 # Save the dominant orientation.
396 :     my $dir = ($dirs{'-'} > $dirs{'+'} ? '-' : '+');
397 : parrello 1.13 # Return the results.
398 : parrello 1.21 return ($contig, $min, $max, $dir);
399 : parrello 1.13 }
400 :    
401 : parrello 1.21 =head3 boundary_loc
402 : parrello 1.17
403 : parrello 1.21 my $singleLoc = SeedUtils::boundary_loc($locations);
404 : parrello 1.17
405 : parrello 1.21 Return a single location string (see L<SAP/Location Strings>) that covers
406 :     the incoming list of locations. NOTE that if the locations listed span
407 :     more than one contig, this method may return an unexpected result.
408 : parrello 1.17
409 : parrello 1.21 This method is useful for converting the output of L<SAP/fid_locations> to
410 :     location strings.
411 : parrello 1.17
412 :     =over 4
413 :    
414 : parrello 1.21 =item locations
415 : parrello 1.17
416 : parrello 1.21 A set of location strings formatted as a comma-separated list or as a reference
417 :     to a list of location strings.
418 : parrello 1.17
419 :     =item RETURN
420 :    
421 : parrello 1.21 Returns a single location string that covers as best as possible the list of
422 :     incoming locations.
423 : parrello 1.17
424 :     =back
425 :    
426 :     =cut
427 :    
428 : parrello 1.21 sub boundary_loc {
429 : parrello 1.17 # Get the parameters.
430 : parrello 1.21 my ($locations) = @_;
431 :     # Convert the incoming locations to a list.
432 :     my @locs;
433 :     if (ref $locations eq 'ARRAY') {
434 :     @locs = @$locations;
435 :     } else {
436 :     @locs = split /\s*,\s*/, $locations;
437 :     }
438 :     # Get the boundary information for the listed locations.
439 :     my ($contig, $min, $max, $dir) = boundaries_of(\@locs);
440 :     # Compute the indicated location string.
441 :     my $retVal = $contig . "_" . ($dir eq '+' ? $min : $max) . $dir .
442 :     ($max + 1 - $min);
443 : parrello 1.17 # Return the result.
444 : parrello 1.21 return $retVal;
445 : parrello 1.17 }
446 :    
447 : parrello 1.21 =head3 by_fig_id
448 : parrello 1.17
449 : parrello 1.21 my @sorted_by_fig_id = sort { by_fig_id($a,$b) } @fig_ids;
450 : parrello 1.16
451 : parrello 1.21 Compare two feature IDs.
452 : parrello 1.16
453 : parrello 1.21 This function is designed to assist in sorting features by ID. The sort is by
454 :     genome ID followed by feature type and then feature number.
455 : parrello 1.16
456 :     =over 4
457 :    
458 : parrello 1.21 =item a
459 :    
460 :     First feature ID.
461 :    
462 :     =item b
463 : parrello 1.16
464 : parrello 1.21 Second feature ID.
465 : parrello 1.16
466 :     =item RETURN
467 :    
468 : parrello 1.21 Returns a negative number if the first parameter is smaller, zero if both parameters
469 :     are equal, and a positive number if the first parameter is greater.
470 : parrello 1.16
471 :     =back
472 :    
473 :     =cut
474 :    
475 : parrello 1.21 sub by_fig_id {
476 :     my($a,$b) = @_;
477 :     my($g1,$g2,$t1,$t2,$n1,$n2);
478 :     if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
479 :     ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {
480 :     ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
481 :     } else {
482 :     $a cmp $b;
483 : parrello 1.16 }
484 : parrello 1.19 }
485 :    
486 : parrello 1.21 =head3 create_fasta_record
487 : parrello 1.19
488 : parrello 1.21 my $fastaString = create_fasta_record($id, $comment, $sequence, $stripped);
489 : parrello 1.19
490 : parrello 1.21 Create a FASTA record from the specified DNA or protein sequence. The
491 :     sequence will be split into 60-character lines, and the record will
492 :     include an identifier line.
493 : parrello 1.19
494 :     =over 4
495 :    
496 : parrello 1.21 =item id
497 :    
498 :     ID for the sequence, to be placed at the beginning of the identifier
499 :     line.
500 :    
501 :     =item comment (optional)
502 : parrello 1.19
503 : parrello 1.21 Comment text to place after the ID on the identifier line. If this parameter
504 :     is empty, undefined, or 0, no comment will be placed.
505 : parrello 1.19
506 : parrello 1.21 =item sequence
507 : parrello 1.19
508 : parrello 1.21 Sequence of letters to form into FASTA. For purposes of convenience, whitespace
509 :     characters in the sequence will be removed automatically.
510 : parrello 1.19
511 : parrello 1.21 =item stripped (optional)
512 : parrello 1.19
513 : parrello 1.21 If TRUE, then the sequence will be returned unmodified instead of converted
514 :     to FASTA format. The default is FALSE.
515 : parrello 1.19
516 :     =item RETURN
517 :    
518 : parrello 1.21 Returns the desired sequence in FASTA format.
519 : parrello 1.19
520 :     =back
521 :    
522 :     =cut
523 :    
524 : parrello 1.21 sub create_fasta_record {
525 : parrello 1.19 # Get the parameters.
526 : parrello 1.21 my ($id, $comment, $sequence, $stripped) = @_;
527 :     # Declare the return variable.
528 :     my $retVal;
529 :     # If we're in stripped mode, we just return the sequence.
530 :     if ($stripped) {
531 :     $retVal = $sequence;
532 : parrello 1.19 } else {
533 : parrello 1.21 # Here we have to do the FASTA conversion. Start with the ID.
534 :     my $header = ">$id";
535 :     # Add a comment, if any.
536 :     if ($comment) {
537 :     $header .= " $comment";
538 :     }
539 :     # Clean up the sequence.
540 :     $sequence =~ s/\s+//g;
541 :     # We need to format the sequence into 60-byte chunks. We use the infamous
542 :     # grep-split trick. The split, because of the presence of the parentheses,
543 :     # includes the matched delimiters in the output list. The grep strips out
544 :     # the empty list items that appear between the so-called delimiters, since
545 :     # the delimiters are what we want.
546 :     my @chunks = grep { $_ } split /(.{1,60})/, $sequence;
547 :     # Add the chunks and the trailer.
548 :     $retVal = join("\n", $header, @chunks) . "\n";
549 : parrello 1.19 }
550 :     # Return the result.
551 : parrello 1.21 return $retVal;
552 : parrello 1.19 }
553 :    
554 : parrello 1.21 =head3 display_id_and_seq
555 : parrello 1.19
556 : parrello 1.21 SeedUtils::display_id_and_seq($id_and_comment, $seqP, $fh);
557 : parrello 1.19
558 : parrello 1.21 Display a fasta ID and sequence to the specified open file. This method is designed
559 :     to work well with L</read_fasta_sequence> and L</rev_comp>, because it takes as
560 :     input a string pointer rather than a string. If the file handle is omitted it
561 :     defaults to STDOUT.
562 : parrello 1.19
563 : parrello 1.21 The output is formatted into a FASTA record. The first line of the output is
564 :     preceded by a C<< > >> symbol, and the sequence is split into 60-character
565 :     chunks displayed one per line. Thus, this method can be used to produce
566 :     FASTA files from data gathered by the rest of the system.
567 : parrello 1.19
568 :     =over 4
569 :    
570 : parrello 1.21 =item id_and_comment
571 : parrello 1.19
572 : parrello 1.21 The sequence ID and (optionally) the comment from the sequence's FASTA record.
573 :     The ID
574 : parrello 1.19
575 : parrello 1.21 =item seqP
576 : parrello 1.19
577 : parrello 1.21 Reference to a string containing the sequence. The sequence is automatically
578 :     formatted into 60-character chunks displayed one per line.
579 : parrello 1.19
580 : parrello 1.21 =item fh
581 :    
582 :     Open file handle to which the ID and sequence should be output. If omitted,
583 :     C<\*STDOUT> is assumed.
584 :    
585 :     =back
586 :    
587 :     =cut
588 :    
589 :     sub display_id_and_seq {
590 :    
591 :     my( $id, $seqP, $fh ) = @_;
592 :    
593 :     if (! defined($fh) ) { $fh = \*STDOUT; }
594 :    
595 :     print $fh ">$id\n";
596 :     &display_seq($seqP, $fh);
597 :     }
598 :    
599 :     =head3 display_seq
600 :    
601 :     SeedUtils::display_seq(\$seqP, $fh);
602 :    
603 :     Display a fasta sequence to the specified open file. If the file handle is
604 :     omitted it defaults to STDOUT.
605 :    
606 :     The sequence is split into 60-character chunks displayed one per line for
607 :     readability.
608 :    
609 :     =over 4
610 :    
611 :     =item seqP
612 :    
613 :     Reference to a string containing the sequence.
614 :    
615 :     =item fh
616 :    
617 :     Open file handle to which the sequence should be output. If omitted,
618 :     C<STDOUT> is assumed.
619 :    
620 :     =back
621 :    
622 :     =cut
623 :    
624 :     sub display_seq {
625 :    
626 :     my ( $seqP, $fh ) = @_;
627 :     my ( $i, $n, $ln );
628 :    
629 :     if (! defined($fh) ) { $fh = \*STDOUT; }
630 :    
631 :     $n = length($$seqP);
632 :     # confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
633 :     for ($i=0; ($i < $n); $i += 60) {
634 :     if (($i + 60) <= $n) {
635 :     $ln = substr($$seqP,$i,60);
636 :     } else {
637 :     $ln = substr($$seqP,$i,($n-$i));
638 :     }
639 :     print $fh "$ln\n";
640 :     }
641 :     }
642 :    
643 : olson 1.42 =head3 extract_seq
644 :    
645 :     $seq = &SeedUtils::extract_seq($contigs,$loc)
646 :    
647 :     This is just a little utility routine that I have found convenient. It assumes that
648 :     $contigs is a hash that contains IDs as keys and sequences as values. $loc must be of the
649 :     form
650 :    
651 :     Contig_Beg_End
652 :    
653 :     where Contig is the ID of one of the sequences; Beg and End give the coordinates of the sought
654 :     subsequence. If Beg > End, it is assumed that you want the reverse complement of the subsequence.
655 :     This routine plucks out the subsequence for you.
656 :    
657 :     =cut
658 :    
659 :     sub extract_seq {
660 :     my($contigs,$loc) = @_;
661 :     my($contig,$beg,$end,$contig_seq);
662 :     my($plus,$minus);
663 :    
664 :     $plus = $minus = 0;
665 :     my $strand = "";
666 :     my @loc = split(/,/,$loc);
667 :     my @seq = ();
668 :     foreach $loc (@loc)
669 :     {
670 :     if ($loc =~ /^\S+_(\d+)_(\d+)$/)
671 :     {
672 :     if ($1 < $2)
673 :     {
674 :     $plus++;
675 :     }
676 :     elsif ($2 < $1)
677 :     {
678 :     $minus++;
679 :     }
680 :     }
681 :     }
682 :     if ($plus > $minus)
683 :     {
684 :     $strand = "+";
685 :     }
686 :     elsif ($plus < $minus)
687 :     {
688 :     $strand = "-";
689 :     }
690 :    
691 :     foreach $loc (@loc)
692 :     {
693 :     if ($loc =~ /^(\S+)_(\d+)_(\d+)$/)
694 :     {
695 :     ($contig,$beg,$end) = ($1,$2,$3);
696 :    
697 :     my $len = length($contigs->{$contig});
698 :     if (!$len)
699 :     {
700 :     carp "Undefined or zero-length contig $contig";
701 :     return "";
702 :     }
703 :    
704 :     if (($beg > $len) || ($end > $len))
705 :     {
706 :     carp "Region $loc out of bounds (contig len=$len)";
707 :     }
708 :     else
709 :     {
710 :     if (($beg < $end) || (($beg == $end) && ($strand eq "+")))
711 :     {
712 :     push(@seq,substr($contigs->{$contig},$beg-1,($end+1-$beg)));
713 :     }
714 :     else
715 :     {
716 :     $strand = "-";
717 :     push(@seq,&reverse_comp(substr($contigs->{$contig},$end-1,($beg+1-$end))));
718 :     }
719 :     }
720 :     }
721 :     }
722 :     return join("",@seq);
723 :     }
724 :    
725 : parrello 1.21
726 : gdpusch 1.46
727 :     =head3 file_read
728 :    
729 :     my $text = $fig->file_read($fileName);
730 :    
731 :     or
732 :    
733 :     my @lines = $fig->file_read($fileName);
734 :    
735 :     or
736 :    
737 :     my $text = FIG::file_read($fileName);
738 :    
739 :     or
740 :    
741 :     my @lines = FIG::file_read($fileName);
742 :    
743 :     Read an entire file into memory. In a scalar context, the file is returned
744 :     as a single text string with line delimiters included. In a list context, the
745 :     file is returned as a list of lines, each line terminated by a line
746 :     delimiter. (For a method that automatically strips the line delimiters,
747 :     use C<Tracer::GetFile>.)
748 :    
749 :     =over 4
750 :    
751 :     =item fileName
752 :    
753 :     Fully-qualified name of the file to read.
754 :    
755 :     =item RETURN
756 :    
757 :     In a list context, returns a list of the file lines. In a scalar context, returns
758 :     a string containing all the lines of the file with delimiters included.
759 :    
760 :     =back
761 :    
762 :     =cut
763 :    
764 :     #: Return Type $;
765 :     #: Return Type @;
766 :     sub file_read {
767 :    
768 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
769 :     my($fileName) = @_;
770 :     return file_head($fileName, '*');
771 :    
772 :     }
773 :    
774 :    
775 :     =head3 file_head
776 :    
777 :     my $text = $fig->file_head($fileName, $count);
778 :    
779 :     or
780 :    
781 :     my @lines = $fig->file_head($fileName, $count);
782 :    
783 :     or
784 :    
785 :     my $text = FIG::file_head($fileName, $count);
786 :    
787 :     or
788 :    
789 :     my @lines = FIG::file_head($fileName, $count);
790 :    
791 :     Read a portion of a file into memory. In a scalar context, the file portion is
792 :     returned as a single text string with line delimiters included. In a list
793 :     context, the file portion is returned as a list of lines, each line terminated
794 :     by a line delimiter.
795 :    
796 :     =over 4
797 :    
798 :     =item fileName
799 :    
800 :     Fully-qualified name of the file to read.
801 :    
802 :     =item count (optional)
803 :    
804 :     Number of lines to read from the file. If omitted, C<1> is assumed. If the
805 :     non-numeric string C<*> is specified, the entire file will be read.
806 :    
807 :     =item RETURN
808 :    
809 :     In a list context, returns a list of the desired file lines. In a scalar context, returns
810 :     a string containing the desired lines of the file with delimiters included.
811 :    
812 :     =back
813 :    
814 :     =cut
815 :    
816 :     #: Return Type $;
817 :     #: Return Type @;
818 :     sub file_head {
819 :    
820 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
821 :     my($file, $count) = @_;
822 :    
823 :     my ($n, $allFlag);
824 :     if ($count eq '*') {
825 :     $allFlag = 1;
826 :     $n = 0;
827 :     } else {
828 :     $allFlag = 0;
829 :     $n = (!$count ? 1 : $count);
830 :     }
831 :    
832 :     if (open(my $fh, "<$file")) {
833 :     my(@ret, $i);
834 :     $i = 0;
835 :     while (<$fh>) {
836 :     push(@ret, $_);
837 :     $i++;
838 :     last if !$allFlag && $i >= $n;
839 :     }
840 :     close($fh);
841 :     if (wantarray) {
842 :     return @ret;
843 :     } else {
844 :     return join("", @ret);
845 :     }
846 :     }
847 :     }
848 :    
849 :    
850 :    
851 : parrello 1.21 =head3 genome_of
852 :    
853 :     my $genomeID = genome_of($fid);
854 :    
855 :     Return the Genome ID embedded in the specified FIG feature ID.
856 :    
857 :     =over 4
858 :    
859 :     =item fid
860 :    
861 :     Feature ID of interest.
862 :    
863 :     =item RETURN
864 :    
865 :     Returns the genome ID in the middle portion of the FIG feature ID. If the
866 :     feature ID is invalid, this method returns an undefined value.
867 :    
868 :     =back
869 :    
870 :     =cut
871 :    
872 :     sub genome_of {
873 :     # Get the parameters.
874 :     my ($fid) = @_;
875 :     # Declare the return variable.
876 :     my $retVal;
877 :     # Parse the feature ID.
878 :     if ($fid =~ /^fig\|(\d+\.\d+)\./) {
879 :     $retVal = $1;
880 :     }
881 :     # Return the result.
882 :     return $retVal;
883 :     }
884 :    
885 :     =head3 hypo
886 :    
887 :     my $flag = hypo($func);
888 :    
889 :     Return TRUE if the specified functional role is hypothetical, else FALSE.
890 :     Hypothetical functional roles are identified by key words in the text,
891 :     such as I<hypothesis>, I<predicted>, or I<glimmer> (among others).
892 :    
893 :     =over 4
894 :    
895 :     =item func
896 :    
897 :     Text of the functional role whose nature is to be determined.
898 :    
899 :     =item RETURN
900 :    
901 :     Returns TRUE if the role is hypothetical, else FALSE.
902 :    
903 :     =back
904 :    
905 :     =cut
906 :    
907 :     sub hypo {
908 :     my ($func) = @_;
909 :     if (! $func) { return 1 }
910 :     if ($func =~ /lmo\d+ protein/i) { return 1 }
911 :     if ($func =~ /hypoth/i) { return 1 }
912 :     if ($func =~ /conserved protein/i) { return 1 }
913 :     if ($func =~ /gene product/i) { return 1 }
914 :     if ($func =~ /interpro/i) { return 1 }
915 :     if ($func =~ /B[sl][lr]\d/i) { return 1 }
916 :     if ($func =~ /^U\d/) { return 1 }
917 :     if ($func =~ /^orf[^_]/i) { return 1 }
918 :     if ($func =~ /uncharacterized/i) { return 1 }
919 :     if ($func =~ /pseudogene/i) { return 1 }
920 :     if ($func =~ /^predicted/i) { return 1 }
921 :     if ($func =~ /AGR_/) { return 1 }
922 :     if ($func =~ /similar to/i) { return 1 }
923 :     if ($func =~ /similarity/i) { return 1 }
924 :     if ($func =~ /glimmer/i) { return 1 }
925 :     if ($func =~ /unknown/i) { return 1 }
926 :     if (($func =~ /domain/i) ||
927 :     ($func =~ /^y[a-z]{2,4}\b/i) ||
928 :     ($func =~ /complete/i) ||
929 :     ($func =~ /ensang/i) ||
930 :     ($func =~ /unnamed/i) ||
931 :     ($func =~ /EG:/) ||
932 :     ($func =~ /orf\d+/i) ||
933 :     ($func =~ /RIKEN/) ||
934 :     ($func =~ /Expressed/i) ||
935 :     ($func =~ /[a-zA-Z]{2,3}\|/) ||
936 :     ($func =~ /predicted by Psort/) ||
937 :     ($func =~ /^bh\d+/i) ||
938 :     ($func =~ /cds_/i) ||
939 :     ($func =~ /^[a-z]{2,3}\d+[^:\+\-0-9]/i) ||
940 :     ($func =~ /similar to/i) ||
941 :     ($func =~ / identi/i) ||
942 :     ($func =~ /ortholog of/i) ||
943 :     ($func =~ /structural feature/i)) { return 1 }
944 :     return 0;
945 :    
946 :     }
947 :    
948 : parrello 1.37 =head3 id_url
949 :    
950 :     my $url = id_url($id);
951 :    
952 :     Return the URL for a specified external gene ID.
953 :    
954 :     =over 4
955 :    
956 :     =item id
957 :    
958 :     ID of the gene whose URL is desired.
959 :    
960 :     =item RETURN
961 :    
962 :     Returns a URL for displaying information about the specified gene. The structure
963 :     of the ID is used to determine the web site to which the gene belongs.
964 :    
965 :     =back
966 :    
967 :     =cut
968 :    
969 :     sub id_url {
970 :     # Get the parameters.
971 :     my ($id) = @_;
972 :     # Declare the return variable.
973 :     my $retVal;
974 :     # Parse the ID to determine the URL.
975 :     if ($id =~ /^(?:ref\|)?([NXYZA]P_[0-9\.]+)$/) {
976 :     $retVal = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=protein;cmd=search;term=$1";
977 :     } elsif ($id =~ /^gi\|(\d+)$/) {
978 :     $retVal = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve;db=Protein&list_uids=$1;dopt=GenPept";
979 :     } elsif ($id =~ /^cmr\|(.+)$/) {
980 :     $retVal = "http://cmr.jcvi.org/tigr-scripts/CMR/shared/GenePage.cgi?locus=$1";
981 :     } elsif ($id =~ /^sp\|([A-Z0-9]{6})$/) {
982 :     $retVal = "http://us.expasy.org/cgi-bin/get-sprot-entry?$1";
983 :     } elsif ($id =~ /^uni\|([A-Z0-9_]+?)$/) {
984 :     $retVal = "http://www.ebi.uniprot.org/uniprot-srv/uniProtView.do?proteinAc=$1";
985 :     } elsif ($id =~ /^kegg\|(([a-z]{2,4}):([a-zA-Z_0-9]+))$/) {
986 :     $retVal = "http://www.genome.ad.jp/dbget-bin/www_bget?$2+$3";
987 :     } elsif ($id =~ /^tr\|([a-zA-Z0-9]+)$/) {
988 :     $retVal = "http://ca.expasy.org/uniprot/$1";
989 :     } elsif ($id =~ /^(fig\|\d+\.\d+\.\w+\.\d+)$/) {
990 :     $retVal = "http://seed-viewer.theseed.org/?pattern=$1&page=SearchResult&action=check_search";
991 :     }
992 :     # Return the result.
993 :     return $retVal;
994 :     }
995 :    
996 :    
997 : parrello 1.21 =head3 location_cmp
998 :    
999 :     my $cmp = location_cmp($loc1, $loc2);
1000 :    
1001 :     Compare two location strings (see L<SAP/Location Strings>).
1002 :    
1003 :     The ordering principle for locations is that they are sorted first by contig ID, then by
1004 :     leftmost position, in reverse order by length, and then by direction. The effect is that
1005 :     within a contig, the locations are ordered first and foremost in the way they would
1006 :     appear when displayed in a picture of the contig and second in such a way that embedded
1007 :     locations come after the locations in which they are embedded. In the case of two
1008 :     locations that represent the exact same base pairs, the forward (C<+>) location is
1009 :     arbitrarily placed first.
1010 :    
1011 :     =over 4
1012 :    
1013 :     =item loc1
1014 :    
1015 :     First location string to compare.
1016 :    
1017 :     =item loc2
1018 :    
1019 :     Second location string to compare.
1020 :    
1021 :     =item RETURN
1022 :    
1023 :     Returns a negative number if the B<loc1> location sorts first, a positive number if the
1024 :     B<loc2> location sorts first, and zero if the two locations are the same.
1025 : parrello 1.19
1026 :    
1027 :     =back
1028 :    
1029 :     =cut
1030 :    
1031 :     sub location_cmp {
1032 :     # Get the parameters.
1033 :     my ($loc1, $loc2) = @_;
1034 :     # Parse the locations.
1035 :     my ($contig1, $beg1, $strand1, $len1) = $loc1 =~ /^(.+)_(\d+)([+-])(\d+)$/;
1036 :     my $left1 = ($strand1 eq '+' ? $beg1 : $beg1 - $len1 + 1);
1037 :     my ($contig2, $beg2, $strand2, $len2) = $loc2 =~ /^(.+)_(\d+)([+-])(\d+)$/;
1038 :     my $left2 = ($strand2 eq '+' ? $beg2 : $beg2 - $len2 + 1);
1039 :     # Declare the return variable. We compare the indicative parts of the location
1040 :     # in order. Note that we sort in reverse order by length, so the last comparison
1041 :     # puts 2 before 1.
1042 :     my $retVal = ($contig1 cmp $contig2) || ($left1 <=> $left2) ||
1043 :     ($len2 <=> $len1);
1044 :     # If everything matches to this point, check the strands.
1045 :     if (! $retVal) {
1046 :     if ($strand1 eq '+') {
1047 :     # First location is positive, so if the locations are unequal, it
1048 :     # sorts first.
1049 :     $retVal = ($strand2 eq '+' ? 0 : -1);
1050 :     } else {
1051 :     # First location is negative, so if the locations are unequal, it
1052 :     # sorts second.
1053 :     $retVal = ($strand1 eq '-' ? 0 : 1);
1054 :     }
1055 :     }
1056 :     # Return the result.
1057 :     return $retVal;
1058 :     }
1059 :    
1060 : parrello 1.21 =head3 location_string
1061 : parrello 1.19
1062 : parrello 1.21 my $locString = location_string($contig, $beg, $end);
1063 : parrello 1.19
1064 : parrello 1.21 Form a location string for the specified contig that starts at the
1065 :     indicated begin location and stops at the indicated end location. A
1066 :     single-base location will automatically be put on the forward strand.
1067 : parrello 1.19
1068 :     =over 4
1069 :    
1070 : parrello 1.21 =item contig
1071 :    
1072 :     ID of the contig to contain this location.
1073 :    
1074 :     =item beg
1075 :    
1076 :     Beginning offset of the location.
1077 :    
1078 :     =item end
1079 : parrello 1.19
1080 : parrello 1.21 Ending offset of the location.
1081 : parrello 1.19
1082 :     =item RETURN
1083 :    
1084 : parrello 1.21 Returns a location string (see L<SAP/Location String>) for the specified
1085 :     location.
1086 : parrello 1.19
1087 :     =back
1088 :    
1089 :     =cut
1090 :    
1091 : parrello 1.21 sub location_string {
1092 : parrello 1.19 # Get the parameters.
1093 : parrello 1.21 my ($contig, $beg, $end) = @_;
1094 :     # Compute the strand and length.
1095 :     my ($strand, $len);
1096 :     if ($beg <= $end) {
1097 :     $strand = '+';
1098 :     $len = $end + 1 - $beg;
1099 :     } else {
1100 :     $strand = '-';
1101 :     $len = $beg + 1 - $end;
1102 : parrello 1.19 }
1103 :     # Return the result.
1104 : parrello 1.21 return $contig . "_$beg$strand$len";
1105 : parrello 1.16 }
1106 :    
1107 : parrello 1.5 =head3 max
1108 :    
1109 :     my $max = max(@nums);
1110 :    
1111 :     Return the maximum number from all the values in the specified list.
1112 :    
1113 :     =over 4
1114 :    
1115 :     =item nums
1116 :    
1117 :     List of numbers to examine.
1118 :    
1119 :     =item RETURN
1120 :    
1121 :     Returns the maximum numeric value from the specified parameters, or
1122 :     an undefined value if an empty list is passed in.
1123 :    
1124 :     =back
1125 :    
1126 :     =cut
1127 :    
1128 :     sub max {
1129 :     my ($retVal, @nums) = @_;
1130 :     for my $num (@nums) {
1131 :     if ($num > $retVal) {
1132 :     $retVal = $num;
1133 :     }
1134 :     }
1135 :     return $retVal;
1136 :     }
1137 :    
1138 :     =head3 min
1139 :    
1140 :     my $min = min(@nums);
1141 :    
1142 :     Return the minimum number from all the values in the specified list.
1143 :    
1144 :     =over 4
1145 :    
1146 :     =item nums
1147 :    
1148 :     List of numbers to examine.
1149 :    
1150 :     =item RETURN
1151 :    
1152 :     Returns the minimum numeric value from the specified parameters, or
1153 :     an undefined value if an empty list is passed in.
1154 :    
1155 :     =back
1156 :    
1157 :     =cut
1158 :    
1159 :     sub min {
1160 :     my ($retVal, @nums) = @_;
1161 :     for my $num (@nums) {
1162 :     if ($num < $retVal) {
1163 :     $retVal = $num;
1164 :     }
1165 :     }
1166 :     return $retVal;
1167 :     }
1168 :    
1169 : parrello 1.21 =head3 parse_fasta_record
1170 : parrello 1.1
1171 : parrello 1.21 my ($id, $comment, $seq) = parse_fasta_record($string);
1172 : parrello 1.1
1173 : parrello 1.21 Extract the ID, comment, and sequence from a single FASTA record. For
1174 :     backward compatability, instead of a FASTA record the ID and sequence can
1175 :     be specified separated by a comma. In this case, the returned comment
1176 :     will be empty.
1177 : parrello 1.1
1178 :     =over 4
1179 :    
1180 : parrello 1.21 =item string
1181 : parrello 1.1
1182 : parrello 1.21 A single FASTA record, or an ID and sequence separated by a single comma,
1183 :     an unadorned sequence, a 2-element list consisting of an ID and a sequence,
1184 :     or a 3-element list consisting of an ID, a comment, and a sequence.
1185 : parrello 1.7
1186 : parrello 1.1 =item RETURN
1187 :    
1188 : parrello 1.21 Returns a three-element list consisting of the incoming ID, the associated
1189 :     comment, and the specified DNA or protein sequence. If the incoming string is
1190 :     invalid, all three list elements will come back undefined. If no ID is
1191 :     specified, an MD5 will be provided.
1192 : parrello 1.1
1193 :     =back
1194 :    
1195 :     =cut
1196 :    
1197 : parrello 1.21 sub parse_fasta_record {
1198 : parrello 1.1 # Get the parameters.
1199 : parrello 1.21 my ($string) = @_;
1200 :     # Declare the return variables.
1201 :     my ($id, $comment, $seq);
1202 :     # Check the type of input string.
1203 :     if (! defined $string) {
1204 :     # Do nothing if no string was passed in. This extra check prevents a
1205 :     # warning at runtime.
1206 :     } elsif ($string =~ /^>(\S+)([\t ]+[^\r\n]*)?[\r\n]+(.+)/s) {
1207 :     # Here we have a standard FASTA string.
1208 :     ($id, $comment, $seq) = ($1, $2, $3);
1209 :     # Remove white space from the sequence string.
1210 :     $seq =~ s/\s+//sg;
1211 :     # Trim front of comment.
1212 :     $comment =~ s/^s+//;
1213 :     } elsif ($string =~ /(.+?)\s*,\s*(.+)/) {
1214 :     ($id, $comment, $seq) = ($1, '', $2);
1215 :     } elsif (ref $string eq 'ARRAY') {
1216 :     # Here the data came in pre-formatted as a list reference.
1217 :     ($id, $comment, $seq) = @$string;
1218 :     # If there's no comment, we need to adjust.
1219 :     if (! defined $seq) {
1220 :     $seq = $comment;
1221 :     $comment = '';
1222 :     }
1223 : parrello 1.7 } else {
1224 : parrello 1.21 # Here we have only a sequence. We need to construct the ID.
1225 :     $seq = $string;
1226 :     require Digest::MD5;
1227 :     $id = "md5|" . Digest::MD5::md5_base64($seq);
1228 :     $comment = "";
1229 :     }
1230 :     # Return the results.
1231 :     return ($id, $comment, $seq);
1232 :     }
1233 :    
1234 :     =head3 parse_location
1235 :    
1236 :     my ($contig, $begin, $end, $strand) = parse_location($locString);
1237 :    
1238 :     Return the contigID, start offset, and end offset for a specified
1239 :     location string (see L<SAP/Location Strings>).
1240 :    
1241 :     =over 4
1242 :    
1243 :     =item locString
1244 :    
1245 :     Location string to parse.
1246 :    
1247 :     =item RETURN
1248 :    
1249 :     Returns a four-element list containing the contig ID from the location
1250 :     string, the starting offset of the location, the ending offset, and the
1251 :     strand. If the location string is not valid, the values returned will be
1252 :     C<undef>.
1253 :    
1254 :     =back
1255 :    
1256 :     =cut
1257 :    
1258 :     sub parse_location {
1259 :     # Get the parameters.
1260 :     my ($locString) = @_;
1261 :     # Declare the return variables.
1262 :     my ($contig, $begin, $end, $strand);
1263 :     # Parse the location string.
1264 :     if ($locString =~ /^(.+)_(\d+)([+-])(\d+)$/) {
1265 :     # Pull out the contig ID, strand, and begin location.
1266 :     $contig = $1;
1267 :     $begin = $2;
1268 :     $strand = $3;
1269 :     # Compute the ending location from the direction and length.
1270 :     if ($3 eq '+') {
1271 :     $end = $begin + $4 - 1;
1272 :     } else {
1273 :     $end = $begin - $4 + 1;
1274 : parrello 1.7 }
1275 : parrello 1.1 }
1276 : olson 1.43 elsif ($locString =~ /^(.*)_(\d+)_(\d+)$/)
1277 :     {
1278 :     $contig = $1;
1279 :     $begin = $2;
1280 :     $end = $3;
1281 :     $strand = $begin < $end ? "+" : "-";
1282 :     }
1283 :    
1284 : parrello 1.21 # Return the results.
1285 :     return ($contig, $begin, $end, $strand);
1286 : parrello 1.1 }
1287 :    
1288 : parrello 1.2 =head3 rev_comp
1289 :    
1290 :     my $revcmp = rev_comp($dna);
1291 :    
1292 :     or
1293 :    
1294 :     rev_comp(\$dna);
1295 :    
1296 :     Return the reverse complement of a DNA string.
1297 :    
1298 :     =over 4
1299 :    
1300 :     =item dna
1301 :    
1302 :     Either a DNA string, or a reference to a DNA string.
1303 :    
1304 :     =item RETURN
1305 :    
1306 :     If the input is a DNA string, returns the reverse complement. If the
1307 :     input is a reference to a DNA string, the string itself is reverse
1308 :     complemented.
1309 :    
1310 :     =back
1311 :    
1312 :     =cut
1313 :    
1314 :     sub rev_comp {
1315 :     # Get the parameters.
1316 :     my ($dna) = @_;
1317 :     # Determine how we were called.
1318 :     my ($retVal, $refMode);
1319 : parrello 1.24 if (! ref $dna) {
1320 : parrello 1.21 $retVal = reverse $dna;
1321 : parrello 1.2 $refMode = 0;
1322 :     } else {
1323 : parrello 1.21 $retVal = reverse $$dna;
1324 : parrello 1.2 $refMode = 1;
1325 :     }
1326 : parrello 1.21 # Now $retVal contains the reversed DNA string, and $refMode is TRUE iff the
1327 :     # user passed in a reference. The following translation step complements the
1328 :     # string.
1329 :     $retVal =~ tr/acgtumrwsykbdhvACGTUMRWSYKBDHV/tgcaakywsrmvhdbTGCAAKYWSRMVHDB/;
1330 : parrello 1.2 # Return the result in the method corresponding to the way it came in.
1331 :     if ($refMode) {
1332 :     $$dna = $retVal;
1333 :     return;
1334 :     } else {
1335 :     return $retVal;
1336 :     }
1337 :     }
1338 :    
1339 : parrello 1.21 # Synonym of rev_comp, for backward compatibility.
1340 :     sub reverse_comp {
1341 :     return rev_comp($_[0]);
1342 : parrello 1.15 }
1343 :    
1344 : parrello 1.21 =head3 roles_of_function
1345 : parrello 1.15
1346 : parrello 1.21 my @roles = roles_of_function($assignment);
1347 : parrello 1.3
1348 : parrello 1.21 Return a list of the functional roles in the specified assignment string.
1349 :     A single assignment may contain multiple roles as well as comments; this
1350 :     method separates them out.
1351 : parrello 1.3
1352 :     =over 4
1353 :    
1354 : parrello 1.21 =item assignment
1355 : parrello 1.3
1356 : parrello 1.21 Functional assignment to parse for roles.
1357 : parrello 1.3
1358 :     =item RETURN
1359 :    
1360 : parrello 1.21 Returns a list of the individual roles in the assignment.
1361 : parrello 1.3
1362 :     =back
1363 :    
1364 :     =cut
1365 :    
1366 : parrello 1.21 sub roles_of_function {
1367 : parrello 1.3 # Get the parameters.
1368 : parrello 1.21 my ($assignment) = @_;
1369 :     # Remove any comment.
1370 : parrello 1.30 my $commentFree = ($assignment =~ /(.+?)\s*[#!]/ ? $1 : $assignment);
1371 : parrello 1.21 # Split out the roles.
1372 : parrello 1.30 my @retVal = split /\s+[\/@]\s+|\s*;\s+/, $commentFree;
1373 : parrello 1.3 # Return the result.
1374 : parrello 1.21 return @retVal;
1375 : parrello 1.3 }
1376 :    
1377 : parrello 1.6 =head3 sims
1378 :    
1379 : parrello 1.31 my @sims = sims($id, $maxN, $maxP, 'fig');
1380 :    
1381 :     or
1382 :    
1383 :     my @sims = sims($id, $maxN, $maxP, 'all);
1384 :    
1385 : parrello 1.6
1386 :     Retrieve similarities from the network similarity server. The similarity retrieval
1387 :     is performed using an HTTP user agent that returns similarity data in multiple
1388 :     chunks. An anonymous subroutine is passed to the user agent that parses and
1389 :     reformats the chunks as they come in. The similarites themselves are returned
1390 :     as B<Sim> objects. Sim objects are actually list references with 15 elements.
1391 :     The Sim object methods allow access to the elements by name.
1392 :    
1393 :     Similarities can be either raw or expanded. The raw similarities are basic
1394 :     hits between features with similar DNA. Expanding a raw similarity drags in any
1395 :     features considered substantially identical. So, for example, if features B<A1>,
1396 :     B<A2>, and B<A3> are all substatially identical to B<A>, then a raw similarity
1397 :     B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>.
1398 :    
1399 :     =over 4
1400 :    
1401 :     =item id
1402 :    
1403 :     ID of the feature whose similarities are desired, or reference to a list
1404 :     of the IDs of the features whose similarities are desired.
1405 :    
1406 : parrello 1.31 =item maxN (optional)
1407 : parrello 1.6
1408 : parrello 1.27 Maximum number of similarities to return for each incoming feature.
1409 : parrello 1.6
1410 : parrello 1.31 =item maxP (optional)
1411 : parrello 1.6
1412 :     The maximum allowable similarity score.
1413 :    
1414 : parrello 1.31 =item select (optional)
1415 : parrello 1.6
1416 :     Selection criterion: C<raw> means only raw similarities are returned; C<fig>
1417 :     means only similarities to FIG features are returned; C<all> means all expanded
1418 :     similarities are returned; and C<figx> means similarities are expanded until the
1419 :     number of FIG features equals the maximum.
1420 :    
1421 : parrello 1.31 =item max_expand (optional)
1422 : parrello 1.6
1423 :     The maximum number of features to expand.
1424 :    
1425 : parrello 1.31 =item filters (optional)
1426 : parrello 1.6
1427 :     Reference to a hash containing filter information, or a subroutine that can be
1428 :     used to filter the sims.
1429 :    
1430 :     =item RETURN
1431 :    
1432 :     Returns a list of L<Sim> objects.
1433 :    
1434 :     =back
1435 :    
1436 :     =cut
1437 :    
1438 :     sub sims {
1439 :     # Get the parameters.
1440 :     my($id, $maxN, $maxP, $select, $max_expand, $filters) = @_;
1441 :     # Get the URL for submitting to the sims server.
1442 :     my $url = $FIG_Config::sim_server_url || "http://bioseed.mcs.anl.gov/simserver/perl/sims.pl";
1443 :     # Get a list of the IDs to process.
1444 :     my @ids;
1445 :     if (ref($id) eq "ARRAY") {
1446 :     @ids = @$id;
1447 :     } else {
1448 :     @ids = ($id);
1449 :     }
1450 :     # Form a list of the parameters to pass to the server.
1451 :     my %args = ();
1452 :     $args{id} = \@ids;
1453 :     $args{maxN} = $maxN if defined($maxN);
1454 :     $args{maxP} = $maxP if defined($maxP);
1455 :     $args{select} = $select if defined($select);
1456 :     $args{max_expand} = $max_expand if defined($max_expand);
1457 :     # If the filter is a hash, put the filters in the argument list.
1458 :     if (ref($filters) eq 'HASH') {
1459 :     for my $k (keys(%$filters))
1460 :     {
1461 :     $args{"filter_$k"}= $filters->{$k};
1462 :     }
1463 :     }
1464 :     # Get the user agent.
1465 :     require LWP::UserAgent;
1466 :     my $ua = LWP::UserAgent->new();
1467 :     # Insure we have the Sim module.
1468 :     require Sim;
1469 :     #
1470 :     # Our next task is to create the anonymous subroutine that will process the
1471 :     # chunks that come back from the server. We require three global variables:
1472 :     # @sims to hold the similarities found, $tail to remember the unprocessed
1473 :     # data from the previous chunk, and $chunks to count the chunks.
1474 :     #
1475 :     my @retVal;
1476 :     my $tail;
1477 :     my $chunks = 0;
1478 :     #
1479 :     # ANONYMOUS SUBROUTINE
1480 :     #
1481 :     my $cb = sub {
1482 :     eval {
1483 :     # Get the parameters.
1484 :     my ($data, $command) = @_;
1485 :     # Check for a reset command. If we get one, we discard any data
1486 :     # in progress.
1487 :     if ($command && $command eq 'reset') {
1488 :     $tail = '';
1489 :     } else {
1490 :     $chunks++;
1491 :     # Get the data to process. Note we concatenate it to the incoming
1492 :     # tail from last time.
1493 :     my $c = $tail . $data;
1494 :     # Make sure the caller hasn't messed up the new-line character.
1495 :     # FASTA readers in particular are notorious for doing things
1496 :     # like that.
1497 :     local $/ = "\n";
1498 :     # Split the input into lines.
1499 :     my @lines = split(/\n/, $c);
1500 :     # If the input does not end with a new-line, we have a partial
1501 :     # chunk and need to put it in the tail for next time. If not,
1502 :     # there is no tail for next time.
1503 :     if (substr($c, -1, 1) ne "\n") {
1504 :     $tail = pop @lines;
1505 :     } else {
1506 :     $tail = '';
1507 :     }
1508 :     # Loop through the lines. Note there's no need to chomp because
1509 :     # the SPLIT took out the new-line characters.
1510 :     for my $l (@lines) {
1511 :     # Split the line into fields.
1512 :     my @s = split(/\t/, $l);
1513 :     # Insure we have all the fields we need.
1514 : parrello 1.12 if (@s >= 9) {
1515 : parrello 1.6 # Check to see if we've seen this SIM before.
1516 :     my $id1 = $s[0];
1517 :     my $id2 = $s[1];
1518 :     # Add it to the result list.
1519 :     push(@retVal, bless \@s, 'Sim');
1520 :     }
1521 :     }
1522 :     }
1523 :     };
1524 :     };
1525 :     #
1526 :     # END OF ANONYMOUS SUBROUTINE
1527 :     #
1528 :     # Now we're ready to start. Because networking is an iffy thing, we set up
1529 :     # to try our request multiple times.
1530 :     my $n_retries = 10;
1531 :     my $attempts = 0;
1532 :     # Set the timeout value, in seconds.
1533 :     $ua->timeout(180);
1534 :     # Loop until we succeed or run out of retries.
1535 :     my $done = 0;
1536 :     while (! $done && $attempts++ < $n_retries) {
1537 :     # Reset the content processor. This clears the tail.
1538 :     &$cb(undef, 'reset');
1539 :     my $resp = $ua->post($url, \%args, ':content_cb' => $cb);
1540 :     if ($resp->is_success) {
1541 :     # If the response was successful, get the content. This triggers
1542 :     # the anonymous subroutine.
1543 :     my $x = $resp->content;
1544 :     # Denote we've been successful.
1545 :     $done = 1;
1546 :     }
1547 :     }
1548 :     return @retVal;
1549 :     }
1550 :    
1551 : gdpusch 1.55
1552 :     =head3 genetic_code
1553 :    
1554 :     my $code = genetic_code();
1555 :    
1556 :     Return a hash containing the translation of nucleotide triples to proteins.
1557 :     Methods such as L</translate> can take a translation scheme as a parameter.
1558 :     This method returns the translation scheme for genetic code 11 or 4,
1559 :     and an error for all other cocdes. The scheme is implemented as a reference to a
1560 :     hash that contains nucleotide triplets as keys and has protein letters as values.
1561 :    
1562 :     =cut
1563 :    
1564 :     sub genetic_code {
1565 :     my ($ncbi_genetic_code_num) = @_;
1566 :     my $code = &standard_genetic_code();
1567 :    
1568 :     if ($ncbi_genetic_code_num == 11) {
1569 :     #...Do nothing
1570 :     }
1571 :     elsif ($ncbi_genetic_code_num == 4) {
1572 :     $code->{TGA} = 'W';
1573 :     }
1574 :     else {
1575 :     die "Sorry, only genetic codes 11 and 4 are currently supported";
1576 :     }
1577 :    
1578 :     return $code;
1579 :     }
1580 :    
1581 :    
1582 : parrello 1.21 =head3 standard_genetic_code
1583 :    
1584 :     my $code = standard_genetic_code();
1585 :    
1586 :     Return a hash containing the standard translation of nucleotide triples to proteins.
1587 :     Methods such as L</translate> can take a translation scheme as a parameter. This method
1588 :     returns the default translation scheme. The scheme is implemented as a reference to a
1589 :     hash that contains nucleotide triplets as keys and has protein letters as values.
1590 :    
1591 :     =cut
1592 :    
1593 :     sub standard_genetic_code {
1594 :    
1595 :     my $code = {};
1596 : parrello 1.8
1597 : parrello 1.21 $code->{"AAA"} = "K";
1598 :     $code->{"AAC"} = "N";
1599 :     $code->{"AAG"} = "K";
1600 :     $code->{"AAT"} = "N";
1601 :     $code->{"ACA"} = "T";
1602 :     $code->{"ACC"} = "T";
1603 :     $code->{"ACG"} = "T";
1604 :     $code->{"ACT"} = "T";
1605 :     $code->{"AGA"} = "R";
1606 :     $code->{"AGC"} = "S";
1607 :     $code->{"AGG"} = "R";
1608 :     $code->{"AGT"} = "S";
1609 :     $code->{"ATA"} = "I";
1610 :     $code->{"ATC"} = "I";
1611 :     $code->{"ATG"} = "M";
1612 :     $code->{"ATT"} = "I";
1613 :     $code->{"CAA"} = "Q";
1614 :     $code->{"CAC"} = "H";
1615 :     $code->{"CAG"} = "Q";
1616 :     $code->{"CAT"} = "H";
1617 :     $code->{"CCA"} = "P";
1618 :     $code->{"CCC"} = "P";
1619 :     $code->{"CCG"} = "P";
1620 :     $code->{"CCT"} = "P";
1621 :     $code->{"CGA"} = "R";
1622 :     $code->{"CGC"} = "R";
1623 :     $code->{"CGG"} = "R";
1624 :     $code->{"CGT"} = "R";
1625 :     $code->{"CTA"} = "L";
1626 :     $code->{"CTC"} = "L";
1627 :     $code->{"CTG"} = "L";
1628 :     $code->{"CTT"} = "L";
1629 :     $code->{"GAA"} = "E";
1630 :     $code->{"GAC"} = "D";
1631 :     $code->{"GAG"} = "E";
1632 :     $code->{"GAT"} = "D";
1633 :     $code->{"GCA"} = "A";
1634 :     $code->{"GCC"} = "A";
1635 :     $code->{"GCG"} = "A";
1636 :     $code->{"GCT"} = "A";
1637 :     $code->{"GGA"} = "G";
1638 :     $code->{"GGC"} = "G";
1639 :     $code->{"GGG"} = "G";
1640 :     $code->{"GGT"} = "G";
1641 :     $code->{"GTA"} = "V";
1642 :     $code->{"GTC"} = "V";
1643 :     $code->{"GTG"} = "V";
1644 :     $code->{"GTT"} = "V";
1645 :     $code->{"TAA"} = "*";
1646 :     $code->{"TAC"} = "Y";
1647 :     $code->{"TAG"} = "*";
1648 :     $code->{"TAT"} = "Y";
1649 :     $code->{"TCA"} = "S";
1650 :     $code->{"TCC"} = "S";
1651 :     $code->{"TCG"} = "S";
1652 :     $code->{"TCT"} = "S";
1653 :     $code->{"TGA"} = "*";
1654 :     $code->{"TGC"} = "C";
1655 :     $code->{"TGG"} = "W";
1656 :     $code->{"TGT"} = "C";
1657 :     $code->{"TTA"} = "L";
1658 :     $code->{"TTC"} = "F";
1659 :     $code->{"TTG"} = "L";
1660 :     $code->{"TTT"} = "F";
1661 : parrello 1.8
1662 : parrello 1.21 return $code;
1663 :     }
1664 : parrello 1.8
1665 : parrello 1.21 =head3 strand_of
1666 : parrello 1.8
1667 : parrello 1.21 my $plusOrMinus = strand_of($loc);
1668 : parrello 1.8
1669 : parrello 1.21 Return the strand (C<+> or C<->) from the specified location string.
1670 : parrello 1.8
1671 : parrello 1.21 =over 4
1672 : parrello 1.8
1673 : parrello 1.21 =item loc
1674 : parrello 1.8
1675 : parrello 1.21 Location string to parse (see L<SAP/Location Strings>).
1676 : parrello 1.9
1677 :     =item RETURN
1678 :    
1679 : parrello 1.21 Returns C<+> if the location is on the forward strand, else C<->.
1680 : parrello 1.9
1681 :     =back
1682 :    
1683 :     =cut
1684 :    
1685 : parrello 1.21 sub strand_of {
1686 : parrello 1.9 # Get the parameters.
1687 : parrello 1.21 my ($loc) = @_;
1688 :     # Declare the return variable.
1689 :     my $retVal;
1690 :     # Parse the strand indicator from the location.
1691 :     if ($loc =~ /\d+([+-])\d+/) {
1692 :     $retVal = $1;
1693 : parrello 1.9 }
1694 : parrello 1.21 # Return the result.
1695 :     return $retVal;
1696 : parrello 1.9 }
1697 :    
1698 : parrello 1.21 =head3 strip_ec
1699 : parrello 1.9
1700 : parrello 1.21 my $role = strip_ec($rawRole);
1701 : parrello 1.11
1702 : parrello 1.21 Strip the EC number (if any) from the specified role or functional
1703 :     assignment.
1704 : parrello 1.11
1705 :     =over 4
1706 :    
1707 : parrello 1.21 =item rawRole
1708 : parrello 1.11
1709 : parrello 1.21 Role or functional assignment from which the EC numbers are to be stripped.
1710 : parrello 1.11
1711 :     =item RETURN
1712 :    
1713 : parrello 1.21 Returns the incoming string with any EC numbers removed. The EC numbers must
1714 :     be formatted in the standard format used by the SEED (with the C<EC> prefix
1715 :     and surrounding parentheses).
1716 : parrello 1.11
1717 :     =back
1718 :    
1719 :     =cut
1720 :    
1721 : parrello 1.21 sub strip_ec {
1722 :     # Get the parameters.
1723 :     my ($rawRole) = @_;
1724 :     # Declare the return variable.
1725 :     my $retVal = $rawRole;
1726 :     # Remove the EC numbers.
1727 :     $retVal =~ s/\s*\(EC\s+[0-9.\-]+\)//g;
1728 :     # Return the result.
1729 :     return $retVal;
1730 : parrello 1.15 }
1731 :    
1732 :     =head3 translate
1733 :    
1734 :     my $aa_seq = translate($dna_seq, $code, $fix_start);
1735 :    
1736 :     Translate a DNA sequence to a protein sequence using the specified genetic code.
1737 :     If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to
1738 :     C<M>. (In the standard genetic code, these two combinations normally translate
1739 :     to C<V> and C<L>, respectively.)
1740 :    
1741 :     =over 4
1742 :    
1743 :     =item dna_seq
1744 :    
1745 :     DNA sequence to translate. Note that the DNA sequence can only contain
1746 :     known nucleotides.
1747 :    
1748 :     =item code
1749 :    
1750 :     Reference to a hash specifying the translation code. The hash is keyed by
1751 :     nucleotide triples, and the value for each key is the corresponding protein
1752 :     letter. If this parameter is omitted, the L</standard_genetic_code> will be
1753 :     used.
1754 :    
1755 :     =item fix_start
1756 :    
1757 :     TRUE if the first triple is to get special treatment, else FALSE. If TRUE,
1758 :     then a value of C<TTG> or C<GTG> in the first position will be translated to
1759 :     C<M> instead of the value specified in the translation code.
1760 :    
1761 :     =item RETURN
1762 :    
1763 :     Returns a string resulting from translating each nucleotide triple into a
1764 :     protein letter.
1765 :    
1766 :     =back
1767 :    
1768 :     =cut
1769 : gdpusch 1.47
1770 : parrello 1.15 #: Return Type $;
1771 :     sub translate {
1772 :    
1773 :     my( $dna,$code,$start ) = @_;
1774 :     my( $i,$j,$ln );
1775 :     my( $x,$y );
1776 :     my( $prot );
1777 :    
1778 :     if (! defined($code)) {
1779 : overbeek 1.45 $code = &standard_genetic_code;
1780 : parrello 1.15 }
1781 :     $ln = length($dna);
1782 :     $prot = "X" x ($ln/3);
1783 :     $dna =~ tr/a-z/A-Z/;
1784 :    
1785 :     for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) {
1786 :     $x = substr($dna,$i,3);
1787 :     if ($y = $code->{$x}) {
1788 :     substr($prot,$j,1) = $y;
1789 :     }
1790 :     }
1791 :    
1792 :     if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) {
1793 :     substr($prot,0,1) = 'M';
1794 :     }
1795 :     return $prot;
1796 :     }
1797 :    
1798 : parrello 1.36 =head3 type_of
1799 :    
1800 :     my $type = SeedUtils::type_of($fid);
1801 :    
1802 :     Return the type of a feature, given a FIG feature ID (e.g. C<fig|100226.1.peg.3361>).
1803 :    
1804 :     =over 4
1805 :    
1806 :     =item fid
1807 :    
1808 :     ID of a feature whose type is desired.
1809 :    
1810 :     =item RETURN
1811 :    
1812 :     Returns the type of the feature (e.g. C<peg>, C<rna>, ...).
1813 :    
1814 :     =back
1815 :    
1816 :     =cut
1817 :    
1818 :     sub type_of {
1819 :     # Get the parameter.
1820 :     my ($fid) = @_;
1821 :     # Declare the return variable. We return undefined if the ID is unparseable.
1822 :     my $retVal;
1823 :     # Parse the FIG ID.
1824 :     if ($fid =~ /fig\|\d+\.\d+\.(\w+)\./) {
1825 :     # Save the type segment.
1826 :     $retVal = $1;
1827 :     }
1828 :     # Return the result.
1829 :     return $retVal;
1830 :     }
1831 :    
1832 :    
1833 : parrello 1.21 =head3 verify_dir
1834 : parrello 1.18
1835 : parrello 1.21 verify_dir($dirName);
1836 : parrello 1.18
1837 : parrello 1.21 Insure that the specified directory exists. If the directory does not
1838 :     exist, it will be created.
1839 : parrello 1.18
1840 :     =over 4
1841 :    
1842 : parrello 1.21 =item dirName
1843 : parrello 1.18
1844 : parrello 1.21 Name of the relevant directory.
1845 : parrello 1.18
1846 :     =back
1847 :    
1848 :     =cut
1849 :    
1850 : parrello 1.21 sub verify_dir {
1851 : parrello 1.18 # Get the parameters.
1852 : parrello 1.21 my ($dirName) = @_;
1853 :     # Strip off the final slash, if any.
1854 :     $dirName =~ s#/$##;
1855 :     # Only proceed if the directory does NOT already exist.
1856 :     if (! -d $dirName) {
1857 :     # If there is a parent directory, recursively insure it is there.
1858 :     if ($dirName =~ m#(.+)/[^/]+$#) {
1859 :     verify_dir($1);
1860 : overbeek 1.20 }
1861 : parrello 1.21 # Create this particular directory with full permissions.
1862 :     mkdir $dirName, 0777;
1863 : overbeek 1.20 }
1864 :     }
1865 :    
1866 : olson 1.39 =head3 validate_fasta_file
1867 :    
1868 :     $sequence_type = validate_fasta_file($in_file, $out_file)
1869 :    
1870 :     Ensure the given file is in valid fasta format. If $out_file
1871 :     is given, write the data to $out_file as a normalized fasta file
1872 :     (with cleaned up line endings, upper case data).
1873 :    
1874 :     If successful, returns the string "dna" or "protein".
1875 :    
1876 :     Will invoke die() on failure; call inside eval{} to ensure full error catching.
1877 :    
1878 :     =cut
1879 :    
1880 :     sub validate_fasta_file
1881 :     {
1882 :     my($file, $norm) = @_;
1883 :    
1884 :     my($input_fh, $clean_fh);
1885 : olson 1.56
1886 :     if ($file =~ /\.gz$/)
1887 :     {
1888 :     open($input_fh, "-|", "gunzip", "-c", $file) or die "cannot unzip $file: $!";
1889 :     }
1890 :     else
1891 :     {
1892 :     open($input_fh, "<", $file) or die "cannot open $file: $!";
1893 :     }
1894 : olson 1.39
1895 :     if ($norm)
1896 :     {
1897 :     open($clean_fh, ">", $norm) or die "cannot write normalized file $norm: $!";
1898 :     }
1899 :    
1900 :     my $state = 'expect_header';
1901 :     my $cur_id;
1902 :     my $dna_chars;
1903 :     my $prot_chars;
1904 :    
1905 :     while (<$input_fh>)
1906 :     {
1907 :     chomp;
1908 :    
1909 :     if ($state eq 'expect_header')
1910 :     {
1911 :     if (/^>(\S+)/)
1912 :     {
1913 :     $cur_id = $1;
1914 :     $state = 'expect_data';
1915 :     print $clean_fh ">$cur_id\n" if $clean_fh;
1916 :     next;
1917 :     }
1918 :     else
1919 :     {
1920 :     die "Invalid fasta: Expected header at line $.\n";
1921 :     }
1922 :     }
1923 :     elsif ($state eq 'expect_data')
1924 :     {
1925 :     if (/^>(\S+)/)
1926 :     {
1927 :     $cur_id = $1;
1928 :     $state = 'expect_data';
1929 :     print $clean_fh ">$cur_id\n" if $clean_fh;
1930 :     next;
1931 :     }
1932 :     elsif (/^([acgtumrwsykbdhvn]*)\s*$/i)
1933 :     {
1934 :     print $clean_fh uc($1) . "\n" if $clean_fh;
1935 :     $dna_chars += length($1);
1936 :     next;
1937 :     }
1938 :     elsif (/^([*abcdefghijklmnopqrstuvwxyz]*)\s*$/i)
1939 :     {
1940 :     print $clean_fh uc($1) . "\n" if $clean_fh;
1941 :     $prot_chars += length($1);
1942 :     next;
1943 :     }
1944 :     else
1945 :     {
1946 :     my $str = $_;
1947 :     if (length($_) > 100)
1948 :     {
1949 :     $str = substr($_, 0, 50) . " [...] " . substr($_, -50);
1950 :     }
1951 :     die "Invalid fasta: Bad data at line $.\n$str\n";
1952 :     }
1953 :     }
1954 :     else
1955 :     {
1956 :     die "Internal error: invalid state $state\n";
1957 :     }
1958 :     }
1959 :     close($input_fh);
1960 :     close($clean_fh) if $clean_fh;
1961 :    
1962 :     my $what = ($prot_chars > 0) ? "protein" : "dna";
1963 :    
1964 :     return $what;
1965 :     }
1966 :    
1967 : disz 1.28 sub strip_func {
1968 :     my($func) = @_;
1969 : overbeek 1.20
1970 : disz 1.28 $func =~ s/^FIG\d{6}[^:]*:\s*//;
1971 :     $func =~ s/\s*\#.*$//;
1972 :     return($func);
1973 :     }
1974 :    
1975 : olson 1.49 sub strip_func_comment {
1976 :     my($func) = @_;
1977 :    
1978 :     $func =~ s/\s*\#.*$//;
1979 :     return($func);
1980 :     }
1981 :    
1982 : olson 1.32 sub verify_db {
1983 :     my($db,$type) = @_;
1984 :    
1985 :     #
1986 :     # Find formatdb; if we're operating in a SEED environment
1987 :     # use it from there.
1988 :     #
1989 :    
1990 :     my $path = '';
1991 :     if ($FIG_Config::blastbin ne '' && -d $FIG_Config::blastbin)
1992 :     {
1993 :     $path = "$FIG_Config::blastbin/";
1994 :     }
1995 :     elsif ($FIG_Config::ext_bin ne '' && -d $FIG_Config::ext_bin)
1996 :     {
1997 :     $path = "$FIG_Config::ext_bin/";
1998 :     }
1999 :    
2000 :    
2001 :     my @cmd;
2002 :     if ($type =~ /^p/i)
2003 :     {
2004 :     if ((! -s "$db.psq") || (-M "$db.psq" > -M $db))
2005 :     {
2006 :     @cmd = ("${path}formatdb", "-p", "T", "-i", $db);
2007 :     }
2008 :     }
2009 :     else
2010 :     {
2011 :     if ((! -s "$db.nsq") || (-M "$db.nsq" > -M $db))
2012 :     {
2013 :     @cmd = ("${path}formatdb", "-p", "F", "-i", $db);
2014 :     }
2015 :     }
2016 :     if (@cmd)
2017 :     {
2018 :     my $rc = system(@cmd);
2019 :     if ($rc != 0)
2020 :     {
2021 :     warn "SeedUtils::verify_db: formatdb failed with rc=$rc: @cmd\n";
2022 :     }
2023 :     }
2024 :     }
2025 :    
2026 : olson 1.38 #
2027 :     # Some berkeley-db building utilities.
2028 :     #
2029 :    
2030 :     sub create_berk_table
2031 :     {
2032 :     my($input_file, $key_columns, $value_columns, $db_file, %opts) = @_;
2033 :    
2034 :     local $DB_BTREE->{flags};
2035 :     if ($opts{-multiple_values})
2036 :     {
2037 :     $DB_BTREE->{flags} = R_DUP;
2038 :     }
2039 :    
2040 :     my $ifh;
2041 :    
2042 :     if ($opts{-sort})
2043 :     {
2044 :     my $sk = join(" ", map { "-k " . ($_ + 1) } @$key_columns);
2045 :     my $cmd = "sort $sk $input_file";
2046 :     print "Run $cmd\n";
2047 :    
2048 :     open($ifh, "$cmd |") or die "Cannot open sort $sk $input_file for reading: $!";
2049 :     }
2050 :     else
2051 :     {
2052 :     open($ifh, "<", $input_file) or die "Cannot open $input_file for reading: $!";
2053 :     }
2054 :    
2055 :     my $hash = {};
2056 :     my $tie = tie %$hash, "DB_File", $db_file, O_RDWR | O_CREAT, 0666, $DB_BTREE;
2057 :     $tie or die "Cannot create $db_file: $!";
2058 :    
2059 :     while (<$ifh>)
2060 :     {
2061 :     chomp;
2062 :     my @a = split(/\t/);
2063 :     my $k = join($;, @a[@$key_columns]);
2064 :     my $v = join($;, @a[@$value_columns]);
2065 :    
2066 :     $hash->{$k} = $v;
2067 :     }
2068 :     close($ifh);
2069 :     undef $hash;
2070 :     untie $tie;
2071 :     }
2072 :    
2073 :     sub open_berk_table
2074 :     {
2075 :     my($table, %opts) = @_;
2076 :    
2077 :     if (! -f $table)
2078 :     {
2079 :     warn "Cannot read table file $table\n";
2080 :     return undef;
2081 :     }
2082 :     my $h = {};
2083 :     tie %$h, 'BerkTable', $table, %opts;
2084 :     return $h;
2085 :     }
2086 : olson 1.32
2087 : redwards 1.54 our $AllColors;
2088 :    
2089 : olson 1.43 sub compare_region_color
2090 :     {
2091 :     my($n) = @_;
2092 :     my $nc = @$AllColors;
2093 :     my $c = $AllColors->[$n % $nc];
2094 :     return split(/-/, $c);
2095 :     }
2096 :    
2097 : olson 1.52 $AllColors =
2098 : olson 1.43 [
2099 :     '255-0-0', # red
2100 :     '0-255-0', # green
2101 :     '0-0-255', # blue
2102 :     '255-64-192',
2103 :     '255-128-64',
2104 :     '255-0-128',
2105 :     '255-192-64',
2106 :     '64-192-255',
2107 :     '64-255-192',
2108 :     '192-128-128',
2109 :     '192-255-0',
2110 :     '0-255-128',
2111 :     '0-192-64',
2112 :     '128-0-0',
2113 :     '255-0-192',
2114 :     '64-0-128',
2115 :     '128-64-64',
2116 :     '64-255-0',
2117 :     '128-0-64',
2118 :     '128-192-255',
2119 :     '128-192-0',
2120 :     '64-0-0',
2121 :     '128-128-0',
2122 :     '255-192-255',
2123 :     '128-64-255',
2124 :     '64-0-192',
2125 :     '0-64-64',
2126 :     '64-0-255',
2127 :     '192-64-255',
2128 :     '128-0-128',
2129 :     '192-255-64',
2130 :     '64-128-255',
2131 :     '255-128-192',
2132 :     '64-192-64',
2133 :     '0-128-128',
2134 :     '255-0-64',
2135 :     '128-64-0',
2136 :     '128-255-128',
2137 :     '255-64-128',
2138 :     '128-192-64',
2139 :     '128-128-64',
2140 :     '255-255-192',
2141 :     '192-192-128',
2142 :     '192-64-128',
2143 :     '64-128-192',
2144 :     '192-192-64',
2145 :     '192-0-128',
2146 :     '64-64-192',
2147 :     '0-128-192',
2148 :     '0-128-64',
2149 :     '255-192-128',
2150 :     '192-128-0',
2151 :     '64-255-255',
2152 :     '255-0-255',
2153 :     '128-255-255',
2154 :     '255-255-64',
2155 :     '0-128-0',
2156 :     '192-255-192',
2157 :     '0-192-0',
2158 :     '0-64-192',
2159 :     '0-64-128',
2160 :     '192-0-255',
2161 :     '192-192-255',
2162 :     '64-255-128',
2163 :     '0-0-128',
2164 :     '255-64-64',
2165 :     '192-192-0',
2166 :     '192-128-192',
2167 :     '128-64-192',
2168 :     '0-192-255',
2169 :     '128-192-192',
2170 :     '192-0-64',
2171 :     '192-255-255',
2172 :     '255-192-0',
2173 :     '255-255-128',
2174 :     '192-0-0',
2175 :     '64-64-0',
2176 :     '192-64-192',
2177 :     '192-128-255',
2178 :     '128-255-192',
2179 :     '64-64-255',
2180 :     '0-64-255',
2181 :     '128-64-128',
2182 :     '255-64-255',
2183 :     '192-128-64',
2184 :     '64-64-128',
2185 :     '0-128-255',
2186 :     '64-0-64',
2187 :     '128-0-192',
2188 :     '255-128-255',
2189 :     '64-128-0',
2190 :     '255-64-0',
2191 :     '64-192-192',
2192 :     '255-128-0',
2193 :     '0-0-64',
2194 :     '128-128-192',
2195 :     '128-128-255',
2196 :     '0-192-192',
2197 :     '0-255-192',
2198 :     '128-192-128',
2199 :     '192-0-192',
2200 :     '0-255-64',
2201 :     '64-192-0',
2202 :     '0-192-128',
2203 :     '128-255-64',
2204 :     '255-255-0',
2205 :     '64-255-64',
2206 :     '192-64-64',
2207 :     '192-64-0',
2208 :     '255-192-192',
2209 :     '192-255-128',
2210 :     '0-64-0',
2211 :     '0-0-192',
2212 :     '128-0-255',
2213 :     '64-128-64',
2214 :     '64-192-128',
2215 :     '0-255-255',
2216 :     '255-128-128',
2217 :     '64-128-128',
2218 :     '128-255-0'
2219 :     ];
2220 :    
2221 : overbeek 1.40 sub run {
2222 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
2223 :     my($cmd) = @_;
2224 :    
2225 :     if ($ENV{FIG_VERBOSE}) {
2226 :     my @tmp = `date`;
2227 :     chomp @tmp;
2228 :     print STDERR "$tmp[0]: running $cmd\n";
2229 :     }
2230 :     (system($cmd) == 0) || die("FAILED: $cmd");
2231 :     }
2232 :    
2233 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3