[Bio] / FigKernelPackages / SeedUtils.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/SeedUtils.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.59 - (view) (download) (as text)

1 : parrello 1.1
2 : parrello 1.11 # This is a SAS component.
3 :    
4 : gdpusch 1.55 ########################################################################
5 : parrello 1.1 # Copyright (c) 2003-2006 University of Chicago and Fellowship
6 :     # for Interpretations of Genomes. All Rights Reserved.
7 :     #
8 :     # This file is part of the SEED Toolkit.
9 :     #
10 :     # The SEED Toolkit is free software. You can redistribute
11 :     # it and/or modify it under the terms of the SEED Toolkit
12 :     # Public License.
13 :     #
14 :     # You should have received a copy of the SEED Toolkit Public License
15 :     # along with this program; if not write to the University of Chicago
16 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
17 :     # Genomes at veronika@thefig.info or download a copy from
18 :     # http://www.theseed.org/LICENSE.TXT.
19 : gdpusch 1.55 ########################################################################
20 : parrello 1.1
21 :     package SeedUtils;
22 : olson 1.38 use BerkTable;
23 :     use DB_File;
24 : olson 1.42 use Carp;
25 : parrello 1.1
26 : olson 1.32 #
27 :     # In case we are running in a SEED, pull in the FIG_Config
28 :     #
29 :     BEGIN
30 :     {
31 :     eval {
32 :     require FIG_Config;
33 :     };
34 :     }
35 :    
36 : parrello 1.1 use strict;
37 : parrello 1.12 no warnings qw(once);
38 : parrello 1.1 use base qw(Exporter);
39 : parrello 1.15 our @EXPORT = qw(hypo boundaries_of parse_fasta_record create_fasta_record
40 :     rev_comp genome_of min max sims verify_dir between translate
41 : parrello 1.21 standard_genetic_code parse_location roles_of_function
42 : olson 1.33 strip_ec location_string location_cmp strand_of by_fig_id
43 : olson 1.39 verify_db bbh_data id_url validate_fasta_file);
44 : parrello 1.1
45 :     =head1 SEED Utility Methods
46 :    
47 :     =head2 Introduction
48 :    
49 :     This is a simple utility package that performs functions useful for
50 :     bioinformatics, but that do not require access to the databases.
51 :    
52 :     =head2 Public Methods
53 :    
54 : parrello 1.21 =head3 abbrev
55 :    
56 :     my $abbrev = SeedUtils::abbrev($genome_name);
57 :    
58 :     Return an abbreviation of the specified genome name. This method is used to create
59 :     a reasonably indicative genome name that fits in 10 characters.
60 :    
61 :     =over 4
62 :    
63 :     =item genome_name
64 :    
65 :     Genome name to abbreviate.
66 :    
67 :     =item RETURN
68 :    
69 :     Returns a shortened version of the genome name that is 10 characters or less in
70 :     length.
71 :    
72 :     =back
73 :    
74 :     =cut
75 :    
76 :     sub abbrev {
77 :     my($genome_name) = @_;
78 : overbeek 1.44 my %exclude = map { $_ => 1 }
79 : overbeek 1.50 qw( candidatus subspecies subsp strain str bv pv sp );
80 :     my ($p1,$p2,@rest) = grep { ! $exclude{lc $_} }
81 :     map { $_ =~ s/\W//g; $_ }
82 :     split(/\s/,$genome_name);
83 :     my $p3 = join("",@rest);
84 : parrello 1.21
85 : overbeek 1.44 my $lbl_ln = 10;
86 :    
87 :     if (! $p2)
88 :     {
89 :     if (length($p1) > $lbl_ln) { $p1 = substr($p1,0,$lbl_ln) }
90 :     return $p1;
91 :     }
92 :     elsif (! $p3)
93 :     {
94 :     my $l1 = length($p1);
95 :     my $l2 = length($p2);
96 :     my $ln1 = $l1;
97 :     my $ln2 = $l2;
98 :    
99 :     if (($l1 + $l2 + 1) > $lbl_ln)
100 :     {
101 :     $ln1 = $lbl_ln - ($l2+1);
102 :     $ln1 = &min($l1,&max($ln1,3));
103 :     $ln2 = $lbl_ln - ($ln1+1);
104 :     $p1 = substr($p1,0,$ln1);
105 :     $p2 = substr($p2,0,$ln2);
106 :     }
107 :     my $sep = ($l1 == $ln1) ? '_' : '.';
108 :     return $p1 . $sep . $p2;
109 :     }
110 :     else
111 :     {
112 :     my $l1 = length($p1);
113 :     my $l2 = length($p2);
114 :     my $l3 = length($p3);
115 :     my $l23 = $l2+$l3+1;
116 :     my $ln1 = $l1;
117 :     my $ln2 = $l2;
118 :     my $ln3 = $l3;
119 :    
120 :     if (($l1 + $l2 + $l3 + 2) > $lbl_ln)
121 :     {
122 :     $ln1 = $lbl_ln - ($l23 + 1);
123 :     $ln1 = &min($l1,&max($ln1,3));
124 :     my $rest = $lbl_ln - ($ln1+1);
125 :     $ln2 = $rest - ($ln3+1);
126 :     $ln2 = &min($l2,&max($ln2,3));
127 : overbeek 1.51 $ln3 = $lbl_ln - ($ln1+$ln2+2);
128 : overbeek 1.44
129 :     $p1 = substr($p1,0,$ln1);
130 :     $p2 = substr($p2,0,$ln2);
131 :     $p3 = substr($p3,0,$ln3);
132 :     }
133 :     my $sep1 = ($l1 == $ln1) ? '_' : '.';
134 :     my $sep2 = ($l2 == $ln2) ? '_' : '.';
135 :     return $p1 . $sep1 . $p2 . $sep2 . $p3;
136 :     }
137 :     }
138 :    
139 :     =head3 abbrev_set
140 :    
141 :     my $abbrevH = SeedUtils::abbrev_set($genome_names);
142 :    
143 :     Takes a pointer to a list of genome names and returns a hash mapping names to unique
144 :     abbreviations. The names will be less than or equal to 10 characters in length.
145 :    
146 :     =over 4
147 :    
148 :     =item genome_names
149 :    
150 :     Pointer to a list of genome names
151 :    
152 :     =item RETURN
153 :    
154 :     Returns a hash mapping full names to unique abbreviations.
155 :    
156 :     =back
157 :    
158 :     =cut
159 :    
160 :     sub abbrev_set {
161 :     my($genome_names) = @_;
162 :    
163 :     my %seen;
164 :     my $hash = {};
165 :     foreach my $name (@$genome_names)
166 :     {
167 :     next if ($hash->{$name});
168 :     my $abbrev = &abbrev($name);
169 :     while ($seen{$abbrev})
170 :     {
171 :     $abbrev = &next_try($abbrev);
172 :     }
173 :     $hash->{$name} = $abbrev;
174 :     $seen{$abbrev} = 1;
175 :     }
176 :     return $hash;
177 :     }
178 :    
179 :     sub next_try {
180 :     my($abbrev) = @_;
181 :    
182 :     my($ext) = ($abbrev =~ s/\.(\d+)$//);
183 :     $ext ||= 0;
184 :     $ext++;
185 :     my $ln = length($abbrev) + length($ext) + 1;
186 :     if ($ln > 10)
187 :     {
188 :     $abbrev = substr($abbrev,0,(10 - (length($ext)+1)));
189 : parrello 1.21 }
190 : overbeek 1.44 return "$abbrev.$ext";
191 : parrello 1.21 }
192 :    
193 : parrello 1.34 =head3 bbh_data
194 :    
195 :     my $bbhList = FIGRules::bbh_data($peg, $cutoff);
196 :    
197 :     Return a list of the bi-directional best hits relevant to the specified PEG.
198 :    
199 :     =over 4
200 :    
201 :     =item peg
202 :    
203 :     ID of the feature whose bidirectional best hits are desired.
204 :    
205 :     =item cutoff
206 :    
207 :     Similarity cutoff. If omitted, 1e-10 is used.
208 :    
209 :     =item RETURN
210 :    
211 :     Returns a reference to a list of 3-tuples. The first element of the list is the best-hit
212 :     PEG; the second element is the score. A lower score indicates a better match. The third
213 :     element is the normalized bit score for the pair; it is normalized to the length
214 :     of the protein.
215 :    
216 :     =back
217 :    
218 :     =cut
219 : gdpusch 1.47
220 : parrello 1.34 #: Return Type @@;
221 :     sub bbh_data {
222 :     my ($peg, $cutoff) = @_;
223 :     my @retVal = ();
224 :     require LWP::UserAgent;
225 :     my $ua = LWP::UserAgent->new();
226 :     my $url = "http://bioseed.mcs.anl.gov/simserver/perl/bbhs.pl";
227 :     my $retries = 5;
228 :     my $done = 0;
229 :     my $resp;
230 :     while ($retries > 0 && ! $done) {
231 :     $resp = $ua->post($url, { id => $peg, cutoff => $cutoff });
232 :     if ($resp->is_success) {
233 :     my $dat = $resp->content;
234 :     while ($dat =~ /([^\n]+)\n/g) {
235 :     my @l = split(/\t/, $1);
236 :     push @retVal, \@l;
237 :     }
238 :     $done = 1;
239 :     } else {
240 :     $retries--;
241 :     }
242 :     }
243 :     if (! $done) {
244 :     die("Failure retrieving bbh data for $peg: " . $resp->status_line);
245 :     }
246 :     return \@retVal;
247 :     }
248 :    
249 :    
250 : parrello 1.21 =head3 between
251 :    
252 :     my $flag = between($x, $y, $z);
253 :    
254 :     Determine whether or not $y is between $x and $z.
255 :    
256 :     =over 4
257 :    
258 :     =item x
259 :    
260 :     First edge number.
261 :    
262 :     =item y
263 :    
264 :     Number to examine.
265 :    
266 :     =item z
267 :    
268 :     Second edge number.
269 :    
270 :     =item RETURN
271 :    
272 :     Return TRUE if the number I<$y> is between the numbers I<$x> and I<$z>. The check
273 :     is inclusive (that is, if I<$y> is equal to I<$x> or I<$z> the function returns
274 :     TRUE), and the order of I<$x> and I<$z> does not matter. If I<$x> is lower than
275 :     I<$z>, then the return is TRUE if I<$x> <= I<$y> <= I<$z>. If I<$z> is lower,
276 :     then the return is TRUE if I<$x> >= I$<$y> >= I<$z>.
277 :    
278 :     =back
279 :    
280 :     =cut
281 : gdpusch 1.47
282 : parrello 1.21 #: Return Type $;
283 :     sub between {
284 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
285 :     my($x,$y,$z) = @_;
286 :    
287 :     if ($x < $z) {
288 :     return (($x <= $y) && ($y <= $z));
289 :     } else {
290 :     return (($x >= $y) && ($y >= $z));
291 :     }
292 :     }
293 :    
294 :    
295 : parrello 1.13 =head3 boundaries_of
296 :    
297 : parrello 1.21 my ($contig, $min, $max, $dir) = boundaries_of($locs);
298 : parrello 1.13
299 :     Return the boundaries of a set of locations. The contig, the leftmost
300 :     location, and the rightmost location will be returned to the caller. If
301 :     more than one contig is represented, the method will return an undefined
302 :     value for the contig (indicating failure).
303 :    
304 :     =over 4
305 :    
306 :     =item locs
307 :    
308 :     Reference to a list of location strings. A location string contains a contig ID,
309 :     and underscore (C<_>), a starting offset, a strand identifier (C<+> or C<->), and
310 :     a length (e.g. C<360108.3:NC_10023P_1000+2000> begins at offset 1000 of contig
311 :     B<360108.3:NC_10023P> and covers 2000 base pairs on the C<+> strand).
312 :    
313 :     =item RETURN
314 :    
315 : parrello 1.21 Returns a 4-element list. The first element is the contig ID from all the locations,
316 :     the second is the offset of leftmost base pair represented in the locations, the
317 :     third is the offset of the rightmost base pair represented in the locations, and
318 :     the fourth is the dominant strand.
319 : parrello 1.13
320 :     =back
321 :    
322 :     =cut
323 :    
324 :     sub boundaries_of {
325 :     # Get the parameters.
326 :     my ($locs) = @_;
327 :     # Declare the return variables.
328 :     my ($contig, $min, $max);
329 :     # We'll put all the starting and ending offsets found in here.
330 :     my @offsets;
331 : parrello 1.21 # This will be used to count the orientations.
332 :     my %dirs = ('+' => 0, '-' => 0);
333 : parrello 1.13 # This will count the number of errors found.
334 :     my $error = 0;
335 : parrello 1.35 # Insure the location is an array. If it's not, we assume it's a string of
336 :     # comma-separated values.
337 :     if (! ref $locs) {
338 :     $locs = [ split /\s*,\s*/, $locs ];
339 :     }
340 : parrello 1.13 # Loop through the locations.
341 :     for my $loc (@$locs) {
342 :     # Parse this location.
343 :     if ($loc =~ /^(.+)_(\d+)(\+|\-)(\d+)$/) {
344 : parrello 1.23 # This is a valid location string.
345 : parrello 1.13 my ($newContig, $begin, $dir, $len) = ($1, $2, $3, $4);
346 :     # Is this contig valid?
347 :     if ($contig && $newContig ne $contig) {
348 :     # No, skip this location.
349 :     $error++;
350 :     } else {
351 :     # Save the contig.
352 :     $contig = $newContig;
353 : parrello 1.21 # Count the orientation.
354 :     $dirs{$dir}++;
355 : parrello 1.13 # Compute the ending offset.
356 :     my $end = ($dir eq '+' ? $begin + $len - 1 : $begin - $len + 1);
357 :     # Save both offsets.
358 :     push @offsets, $begin, $end;
359 :     }
360 : parrello 1.23 } elsif ($loc =~ /^(.+)_(\d+)_(\d+)/) {
361 :     # Here we have an old-style location string.
362 :     my($newContig, $start, $stop) = ($1, $2, $3);
363 :     # Is this contig valid?
364 : olson 1.22 if ($contig && $newContig ne $contig) {
365 :     # No, skip this location.
366 :     $error++;
367 :     } else {
368 :     # Save the contig.
369 :     $contig = $newContig;
370 : parrello 1.23 # Compute the orientation.
371 :     my $dir;
372 :     if ($start > $stop) {
373 :     $dir = '-';
374 :     } else {
375 :     $dir = '+';
376 :     }
377 :     # Count it.
378 : olson 1.22 $dirs{$dir}++;
379 : parrello 1.23 # Save both offsets.
380 : olson 1.22 push @offsets, $start, $stop;
381 :     }
382 : parrello 1.13 } else {
383 :     # The location is invalid, so it's an error,
384 :     $error++;
385 :     }
386 :     }
387 :     # If there's an error, clear the contig ID.
388 :     if ($error) {
389 :     $contig = undef;
390 :     }
391 :     # Compute the min and max from the offsets collected.
392 :     $min = min(@offsets);
393 :     $max = max(@offsets);
394 : parrello 1.21 # Save the dominant orientation.
395 :     my $dir = ($dirs{'-'} > $dirs{'+'} ? '-' : '+');
396 : parrello 1.13 # Return the results.
397 : parrello 1.21 return ($contig, $min, $max, $dir);
398 : parrello 1.13 }
399 :    
400 : parrello 1.21 =head3 boundary_loc
401 : parrello 1.17
402 : parrello 1.21 my $singleLoc = SeedUtils::boundary_loc($locations);
403 : parrello 1.17
404 : parrello 1.21 Return a single location string (see L<SAP/Location Strings>) that covers
405 :     the incoming list of locations. NOTE that if the locations listed span
406 :     more than one contig, this method may return an unexpected result.
407 : parrello 1.17
408 : parrello 1.21 This method is useful for converting the output of L<SAP/fid_locations> to
409 :     location strings.
410 : parrello 1.17
411 :     =over 4
412 :    
413 : parrello 1.21 =item locations
414 : parrello 1.17
415 : parrello 1.21 A set of location strings formatted as a comma-separated list or as a reference
416 :     to a list of location strings.
417 : parrello 1.17
418 :     =item RETURN
419 :    
420 : parrello 1.21 Returns a single location string that covers as best as possible the list of
421 :     incoming locations.
422 : parrello 1.17
423 :     =back
424 :    
425 :     =cut
426 :    
427 : parrello 1.21 sub boundary_loc {
428 : parrello 1.17 # Get the parameters.
429 : parrello 1.21 my ($locations) = @_;
430 :     # Convert the incoming locations to a list.
431 :     my @locs;
432 :     if (ref $locations eq 'ARRAY') {
433 :     @locs = @$locations;
434 :     } else {
435 :     @locs = split /\s*,\s*/, $locations;
436 :     }
437 :     # Get the boundary information for the listed locations.
438 :     my ($contig, $min, $max, $dir) = boundaries_of(\@locs);
439 :     # Compute the indicated location string.
440 :     my $retVal = $contig . "_" . ($dir eq '+' ? $min : $max) . $dir .
441 :     ($max + 1 - $min);
442 : parrello 1.17 # Return the result.
443 : parrello 1.21 return $retVal;
444 : parrello 1.17 }
445 :    
446 : parrello 1.21 =head3 by_fig_id
447 : parrello 1.17
448 : parrello 1.21 my @sorted_by_fig_id = sort { by_fig_id($a,$b) } @fig_ids;
449 : parrello 1.16
450 : parrello 1.21 Compare two feature IDs.
451 : parrello 1.16
452 : parrello 1.21 This function is designed to assist in sorting features by ID. The sort is by
453 :     genome ID followed by feature type and then feature number.
454 : parrello 1.16
455 :     =over 4
456 :    
457 : parrello 1.21 =item a
458 :    
459 :     First feature ID.
460 :    
461 :     =item b
462 : parrello 1.16
463 : parrello 1.21 Second feature ID.
464 : parrello 1.16
465 :     =item RETURN
466 :    
467 : parrello 1.21 Returns a negative number if the first parameter is smaller, zero if both parameters
468 :     are equal, and a positive number if the first parameter is greater.
469 : parrello 1.16
470 :     =back
471 :    
472 :     =cut
473 :    
474 : parrello 1.21 sub by_fig_id {
475 :     my($a,$b) = @_;
476 :     my($g1,$g2,$t1,$t2,$n1,$n2);
477 :     if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) &&
478 :     ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) {
479 :     ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2);
480 :     } else {
481 :     $a cmp $b;
482 : parrello 1.16 }
483 : parrello 1.19 }
484 :    
485 : parrello 1.21 =head3 create_fasta_record
486 : parrello 1.19
487 : parrello 1.21 my $fastaString = create_fasta_record($id, $comment, $sequence, $stripped);
488 : parrello 1.19
489 : parrello 1.21 Create a FASTA record from the specified DNA or protein sequence. The
490 :     sequence will be split into 60-character lines, and the record will
491 :     include an identifier line.
492 : parrello 1.19
493 :     =over 4
494 :    
495 : parrello 1.21 =item id
496 :    
497 :     ID for the sequence, to be placed at the beginning of the identifier
498 :     line.
499 :    
500 :     =item comment (optional)
501 : parrello 1.19
502 : parrello 1.21 Comment text to place after the ID on the identifier line. If this parameter
503 :     is empty, undefined, or 0, no comment will be placed.
504 : parrello 1.19
505 : parrello 1.21 =item sequence
506 : parrello 1.19
507 : parrello 1.21 Sequence of letters to form into FASTA. For purposes of convenience, whitespace
508 :     characters in the sequence will be removed automatically.
509 : parrello 1.19
510 : parrello 1.21 =item stripped (optional)
511 : parrello 1.19
512 : parrello 1.21 If TRUE, then the sequence will be returned unmodified instead of converted
513 :     to FASTA format. The default is FALSE.
514 : parrello 1.19
515 :     =item RETURN
516 :    
517 : parrello 1.21 Returns the desired sequence in FASTA format.
518 : parrello 1.19
519 :     =back
520 :    
521 :     =cut
522 :    
523 : parrello 1.21 sub create_fasta_record {
524 : parrello 1.19 # Get the parameters.
525 : parrello 1.21 my ($id, $comment, $sequence, $stripped) = @_;
526 :     # Declare the return variable.
527 :     my $retVal;
528 :     # If we're in stripped mode, we just return the sequence.
529 :     if ($stripped) {
530 :     $retVal = $sequence;
531 : parrello 1.19 } else {
532 : parrello 1.21 # Here we have to do the FASTA conversion. Start with the ID.
533 :     my $header = ">$id";
534 :     # Add a comment, if any.
535 :     if ($comment) {
536 :     $header .= " $comment";
537 :     }
538 :     # Clean up the sequence.
539 :     $sequence =~ s/\s+//g;
540 :     # We need to format the sequence into 60-byte chunks. We use the infamous
541 :     # grep-split trick. The split, because of the presence of the parentheses,
542 :     # includes the matched delimiters in the output list. The grep strips out
543 :     # the empty list items that appear between the so-called delimiters, since
544 :     # the delimiters are what we want.
545 :     my @chunks = grep { $_ } split /(.{1,60})/, $sequence;
546 :     # Add the chunks and the trailer.
547 :     $retVal = join("\n", $header, @chunks) . "\n";
548 : parrello 1.19 }
549 :     # Return the result.
550 : parrello 1.21 return $retVal;
551 : parrello 1.19 }
552 :    
553 : parrello 1.21 =head3 display_id_and_seq
554 : parrello 1.19
555 : parrello 1.21 SeedUtils::display_id_and_seq($id_and_comment, $seqP, $fh);
556 : parrello 1.19
557 : parrello 1.21 Display a fasta ID and sequence to the specified open file. This method is designed
558 :     to work well with L</read_fasta_sequence> and L</rev_comp>, because it takes as
559 :     input a string pointer rather than a string. If the file handle is omitted it
560 :     defaults to STDOUT.
561 : parrello 1.19
562 : parrello 1.21 The output is formatted into a FASTA record. The first line of the output is
563 :     preceded by a C<< > >> symbol, and the sequence is split into 60-character
564 :     chunks displayed one per line. Thus, this method can be used to produce
565 :     FASTA files from data gathered by the rest of the system.
566 : parrello 1.19
567 :     =over 4
568 :    
569 : parrello 1.21 =item id_and_comment
570 : parrello 1.19
571 : parrello 1.21 The sequence ID and (optionally) the comment from the sequence's FASTA record.
572 :     The ID
573 : parrello 1.19
574 : parrello 1.21 =item seqP
575 : parrello 1.19
576 : parrello 1.21 Reference to a string containing the sequence. The sequence is automatically
577 :     formatted into 60-character chunks displayed one per line.
578 : parrello 1.19
579 : parrello 1.21 =item fh
580 :    
581 :     Open file handle to which the ID and sequence should be output. If omitted,
582 :     C<\*STDOUT> is assumed.
583 :    
584 :     =back
585 :    
586 :     =cut
587 :    
588 :     sub display_id_and_seq {
589 :    
590 :     my( $id, $seqP, $fh ) = @_;
591 :    
592 :     if (! defined($fh) ) { $fh = \*STDOUT; }
593 :    
594 :     print $fh ">$id\n";
595 :     &display_seq($seqP, $fh);
596 :     }
597 :    
598 :     =head3 display_seq
599 :    
600 :     SeedUtils::display_seq(\$seqP, $fh);
601 :    
602 :     Display a fasta sequence to the specified open file. If the file handle is
603 :     omitted it defaults to STDOUT.
604 :    
605 :     The sequence is split into 60-character chunks displayed one per line for
606 :     readability.
607 :    
608 :     =over 4
609 :    
610 :     =item seqP
611 :    
612 :     Reference to a string containing the sequence.
613 :    
614 :     =item fh
615 :    
616 :     Open file handle to which the sequence should be output. If omitted,
617 :     C<STDOUT> is assumed.
618 :    
619 :     =back
620 :    
621 :     =cut
622 :    
623 :     sub display_seq {
624 :    
625 :     my ( $seqP, $fh ) = @_;
626 :     my ( $i, $n, $ln );
627 :    
628 :     if (! defined($fh) ) { $fh = \*STDOUT; }
629 :    
630 :     $n = length($$seqP);
631 :     # confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) );
632 :     for ($i=0; ($i < $n); $i += 60) {
633 :     if (($i + 60) <= $n) {
634 :     $ln = substr($$seqP,$i,60);
635 :     } else {
636 :     $ln = substr($$seqP,$i,($n-$i));
637 :     }
638 :     print $fh "$ln\n";
639 :     }
640 :     }
641 :    
642 : olson 1.42 =head3 extract_seq
643 :    
644 :     $seq = &SeedUtils::extract_seq($contigs,$loc)
645 :    
646 :     This is just a little utility routine that I have found convenient. It assumes that
647 :     $contigs is a hash that contains IDs as keys and sequences as values. $loc must be of the
648 :     form
649 :    
650 :     Contig_Beg_End
651 :    
652 :     where Contig is the ID of one of the sequences; Beg and End give the coordinates of the sought
653 :     subsequence. If Beg > End, it is assumed that you want the reverse complement of the subsequence.
654 :     This routine plucks out the subsequence for you.
655 :    
656 :     =cut
657 :    
658 :     sub extract_seq {
659 :     my($contigs,$loc) = @_;
660 :     my($contig,$beg,$end,$contig_seq);
661 :     my($plus,$minus);
662 :    
663 :     $plus = $minus = 0;
664 :     my $strand = "";
665 :     my @loc = split(/,/,$loc);
666 :     my @seq = ();
667 :     foreach $loc (@loc)
668 :     {
669 :     if ($loc =~ /^\S+_(\d+)_(\d+)$/)
670 :     {
671 :     if ($1 < $2)
672 :     {
673 :     $plus++;
674 :     }
675 :     elsif ($2 < $1)
676 :     {
677 :     $minus++;
678 :     }
679 :     }
680 :     }
681 :     if ($plus > $minus)
682 :     {
683 :     $strand = "+";
684 :     }
685 :     elsif ($plus < $minus)
686 :     {
687 :     $strand = "-";
688 :     }
689 :    
690 :     foreach $loc (@loc)
691 :     {
692 :     if ($loc =~ /^(\S+)_(\d+)_(\d+)$/)
693 :     {
694 :     ($contig,$beg,$end) = ($1,$2,$3);
695 :    
696 :     my $len = length($contigs->{$contig});
697 :     if (!$len)
698 :     {
699 :     carp "Undefined or zero-length contig $contig";
700 :     return "";
701 :     }
702 :    
703 :     if (($beg > $len) || ($end > $len))
704 :     {
705 :     carp "Region $loc out of bounds (contig len=$len)";
706 :     }
707 :     else
708 :     {
709 :     if (($beg < $end) || (($beg == $end) && ($strand eq "+")))
710 :     {
711 :     push(@seq,substr($contigs->{$contig},$beg-1,($end+1-$beg)));
712 :     }
713 :     else
714 :     {
715 :     $strand = "-";
716 :     push(@seq,&reverse_comp(substr($contigs->{$contig},$end-1,($beg+1-$end))));
717 :     }
718 :     }
719 :     }
720 :     }
721 :     return join("",@seq);
722 :     }
723 :    
724 : parrello 1.21
725 : gdpusch 1.46
726 :     =head3 file_read
727 :    
728 :     my $text = $fig->file_read($fileName);
729 :    
730 :     or
731 :    
732 :     my @lines = $fig->file_read($fileName);
733 :    
734 :     or
735 :    
736 :     my $text = FIG::file_read($fileName);
737 :    
738 :     or
739 :    
740 :     my @lines = FIG::file_read($fileName);
741 :    
742 :     Read an entire file into memory. In a scalar context, the file is returned
743 :     as a single text string with line delimiters included. In a list context, the
744 :     file is returned as a list of lines, each line terminated by a line
745 :     delimiter. (For a method that automatically strips the line delimiters,
746 :     use C<Tracer::GetFile>.)
747 :    
748 :     =over 4
749 :    
750 :     =item fileName
751 :    
752 :     Fully-qualified name of the file to read.
753 :    
754 :     =item RETURN
755 :    
756 :     In a list context, returns a list of the file lines. In a scalar context, returns
757 :     a string containing all the lines of the file with delimiters included.
758 :    
759 :     =back
760 :    
761 :     =cut
762 :    
763 :     #: Return Type $;
764 :     #: Return Type @;
765 :     sub file_read {
766 :    
767 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
768 :     my($fileName) = @_;
769 :     return file_head($fileName, '*');
770 :    
771 :     }
772 :    
773 :    
774 :     =head3 file_head
775 :    
776 :     my $text = $fig->file_head($fileName, $count);
777 :    
778 :     or
779 :    
780 :     my @lines = $fig->file_head($fileName, $count);
781 :    
782 :     or
783 :    
784 :     my $text = FIG::file_head($fileName, $count);
785 :    
786 :     or
787 :    
788 :     my @lines = FIG::file_head($fileName, $count);
789 :    
790 :     Read a portion of a file into memory. In a scalar context, the file portion is
791 :     returned as a single text string with line delimiters included. In a list
792 :     context, the file portion is returned as a list of lines, each line terminated
793 :     by a line delimiter.
794 :    
795 :     =over 4
796 :    
797 :     =item fileName
798 :    
799 :     Fully-qualified name of the file to read.
800 :    
801 :     =item count (optional)
802 :    
803 :     Number of lines to read from the file. If omitted, C<1> is assumed. If the
804 :     non-numeric string C<*> is specified, the entire file will be read.
805 :    
806 :     =item RETURN
807 :    
808 :     In a list context, returns a list of the desired file lines. In a scalar context, returns
809 :     a string containing the desired lines of the file with delimiters included.
810 :    
811 :     =back
812 :    
813 :     =cut
814 :    
815 :     #: Return Type $;
816 :     #: Return Type @;
817 :     sub file_head {
818 :    
819 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
820 :     my($file, $count) = @_;
821 :    
822 :     my ($n, $allFlag);
823 :     if ($count eq '*') {
824 :     $allFlag = 1;
825 :     $n = 0;
826 :     } else {
827 :     $allFlag = 0;
828 :     $n = (!$count ? 1 : $count);
829 :     }
830 :    
831 :     if (open(my $fh, "<$file")) {
832 :     my(@ret, $i);
833 :     $i = 0;
834 :     while (<$fh>) {
835 :     push(@ret, $_);
836 :     $i++;
837 :     last if !$allFlag && $i >= $n;
838 :     }
839 :     close($fh);
840 :     if (wantarray) {
841 :     return @ret;
842 :     } else {
843 :     return join("", @ret);
844 :     }
845 :     }
846 :     }
847 :    
848 :    
849 :    
850 : parrello 1.21 =head3 genome_of
851 :    
852 :     my $genomeID = genome_of($fid);
853 :    
854 :     Return the Genome ID embedded in the specified FIG feature ID.
855 :    
856 :     =over 4
857 :    
858 :     =item fid
859 :    
860 :     Feature ID of interest.
861 :    
862 :     =item RETURN
863 :    
864 :     Returns the genome ID in the middle portion of the FIG feature ID. If the
865 :     feature ID is invalid, this method returns an undefined value.
866 :    
867 :     =back
868 :    
869 :     =cut
870 :    
871 :     sub genome_of {
872 :     # Get the parameters.
873 :     my ($fid) = @_;
874 :     # Declare the return variable.
875 :     my $retVal;
876 :     # Parse the feature ID.
877 :     if ($fid =~ /^fig\|(\d+\.\d+)\./) {
878 :     $retVal = $1;
879 :     }
880 :     # Return the result.
881 :     return $retVal;
882 :     }
883 :    
884 :     =head3 hypo
885 :    
886 :     my $flag = hypo($func);
887 :    
888 :     Return TRUE if the specified functional role is hypothetical, else FALSE.
889 :     Hypothetical functional roles are identified by key words in the text,
890 :     such as I<hypothesis>, I<predicted>, or I<glimmer> (among others).
891 :    
892 :     =over 4
893 :    
894 :     =item func
895 :    
896 :     Text of the functional role whose nature is to be determined.
897 :    
898 :     =item RETURN
899 :    
900 :     Returns TRUE if the role is hypothetical, else FALSE.
901 :    
902 :     =back
903 :    
904 :     =cut
905 :    
906 :     sub hypo {
907 :     my ($func) = @_;
908 :     if (! $func) { return 1 }
909 :     if ($func =~ /lmo\d+ protein/i) { return 1 }
910 :     if ($func =~ /hypoth/i) { return 1 }
911 :     if ($func =~ /conserved protein/i) { return 1 }
912 :     if ($func =~ /gene product/i) { return 1 }
913 :     if ($func =~ /interpro/i) { return 1 }
914 :     if ($func =~ /B[sl][lr]\d/i) { return 1 }
915 :     if ($func =~ /^U\d/) { return 1 }
916 :     if ($func =~ /^orf[^_]/i) { return 1 }
917 :     if ($func =~ /uncharacterized/i) { return 1 }
918 :     if ($func =~ /pseudogene/i) { return 1 }
919 :     if ($func =~ /^predicted/i) { return 1 }
920 :     if ($func =~ /AGR_/) { return 1 }
921 :     if ($func =~ /similar to/i) { return 1 }
922 :     if ($func =~ /similarity/i) { return 1 }
923 :     if ($func =~ /glimmer/i) { return 1 }
924 :     if ($func =~ /unknown/i) { return 1 }
925 :     if (($func =~ /domain/i) ||
926 :     ($func =~ /^y[a-z]{2,4}\b/i) ||
927 :     ($func =~ /complete/i) ||
928 :     ($func =~ /ensang/i) ||
929 :     ($func =~ /unnamed/i) ||
930 :     ($func =~ /EG:/) ||
931 :     ($func =~ /orf\d+/i) ||
932 :     ($func =~ /RIKEN/) ||
933 :     ($func =~ /Expressed/i) ||
934 :     ($func =~ /[a-zA-Z]{2,3}\|/) ||
935 :     ($func =~ /predicted by Psort/) ||
936 :     ($func =~ /^bh\d+/i) ||
937 :     ($func =~ /cds_/i) ||
938 :     ($func =~ /^[a-z]{2,3}\d+[^:\+\-0-9]/i) ||
939 :     ($func =~ /similar to/i) ||
940 :     ($func =~ / identi/i) ||
941 :     ($func =~ /ortholog of/i) ||
942 :     ($func =~ /structural feature/i)) { return 1 }
943 :     return 0;
944 :    
945 :     }
946 :    
947 : parrello 1.37 =head3 id_url
948 :    
949 :     my $url = id_url($id);
950 :    
951 :     Return the URL for a specified external gene ID.
952 :    
953 :     =over 4
954 :    
955 :     =item id
956 :    
957 :     ID of the gene whose URL is desired.
958 :    
959 :     =item RETURN
960 :    
961 :     Returns a URL for displaying information about the specified gene. The structure
962 :     of the ID is used to determine the web site to which the gene belongs.
963 :    
964 :     =back
965 :    
966 :     =cut
967 :    
968 :     sub id_url {
969 :     # Get the parameters.
970 :     my ($id) = @_;
971 :     # Declare the return variable.
972 :     my $retVal;
973 :     # Parse the ID to determine the URL.
974 :     if ($id =~ /^(?:ref\|)?([NXYZA]P_[0-9\.]+)$/) {
975 :     $retVal = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=protein;cmd=search;term=$1";
976 :     } elsif ($id =~ /^gi\|(\d+)$/) {
977 :     $retVal = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve;db=Protein&list_uids=$1;dopt=GenPept";
978 :     } elsif ($id =~ /^cmr\|(.+)$/) {
979 :     $retVal = "http://cmr.jcvi.org/tigr-scripts/CMR/shared/GenePage.cgi?locus=$1";
980 :     } elsif ($id =~ /^sp\|([A-Z0-9]{6})$/) {
981 :     $retVal = "http://us.expasy.org/cgi-bin/get-sprot-entry?$1";
982 :     } elsif ($id =~ /^uni\|([A-Z0-9_]+?)$/) {
983 :     $retVal = "http://www.ebi.uniprot.org/uniprot-srv/uniProtView.do?proteinAc=$1";
984 :     } elsif ($id =~ /^kegg\|(([a-z]{2,4}):([a-zA-Z_0-9]+))$/) {
985 :     $retVal = "http://www.genome.ad.jp/dbget-bin/www_bget?$2+$3";
986 :     } elsif ($id =~ /^tr\|([a-zA-Z0-9]+)$/) {
987 :     $retVal = "http://ca.expasy.org/uniprot/$1";
988 :     } elsif ($id =~ /^(fig\|\d+\.\d+\.\w+\.\d+)$/) {
989 :     $retVal = "http://seed-viewer.theseed.org/?pattern=$1&page=SearchResult&action=check_search";
990 :     }
991 :     # Return the result.
992 :     return $retVal;
993 :     }
994 :    
995 :    
996 : parrello 1.21 =head3 location_cmp
997 :    
998 :     my $cmp = location_cmp($loc1, $loc2);
999 :    
1000 :     Compare two location strings (see L<SAP/Location Strings>).
1001 :    
1002 :     The ordering principle for locations is that they are sorted first by contig ID, then by
1003 :     leftmost position, in reverse order by length, and then by direction. The effect is that
1004 :     within a contig, the locations are ordered first and foremost in the way they would
1005 :     appear when displayed in a picture of the contig and second in such a way that embedded
1006 :     locations come after the locations in which they are embedded. In the case of two
1007 :     locations that represent the exact same base pairs, the forward (C<+>) location is
1008 :     arbitrarily placed first.
1009 :    
1010 :     =over 4
1011 :    
1012 :     =item loc1
1013 :    
1014 :     First location string to compare.
1015 :    
1016 :     =item loc2
1017 :    
1018 :     Second location string to compare.
1019 :    
1020 :     =item RETURN
1021 :    
1022 :     Returns a negative number if the B<loc1> location sorts first, a positive number if the
1023 :     B<loc2> location sorts first, and zero if the two locations are the same.
1024 : parrello 1.19
1025 :    
1026 :     =back
1027 :    
1028 :     =cut
1029 :    
1030 :     sub location_cmp {
1031 :     # Get the parameters.
1032 :     my ($loc1, $loc2) = @_;
1033 :     # Parse the locations.
1034 :     my ($contig1, $beg1, $strand1, $len1) = $loc1 =~ /^(.+)_(\d+)([+-])(\d+)$/;
1035 :     my $left1 = ($strand1 eq '+' ? $beg1 : $beg1 - $len1 + 1);
1036 :     my ($contig2, $beg2, $strand2, $len2) = $loc2 =~ /^(.+)_(\d+)([+-])(\d+)$/;
1037 :     my $left2 = ($strand2 eq '+' ? $beg2 : $beg2 - $len2 + 1);
1038 :     # Declare the return variable. We compare the indicative parts of the location
1039 :     # in order. Note that we sort in reverse order by length, so the last comparison
1040 :     # puts 2 before 1.
1041 :     my $retVal = ($contig1 cmp $contig2) || ($left1 <=> $left2) ||
1042 :     ($len2 <=> $len1);
1043 :     # If everything matches to this point, check the strands.
1044 :     if (! $retVal) {
1045 :     if ($strand1 eq '+') {
1046 :     # First location is positive, so if the locations are unequal, it
1047 :     # sorts first.
1048 :     $retVal = ($strand2 eq '+' ? 0 : -1);
1049 :     } else {
1050 :     # First location is negative, so if the locations are unequal, it
1051 :     # sorts second.
1052 :     $retVal = ($strand1 eq '-' ? 0 : 1);
1053 :     }
1054 :     }
1055 :     # Return the result.
1056 :     return $retVal;
1057 :     }
1058 :    
1059 : parrello 1.21 =head3 location_string
1060 : parrello 1.19
1061 : parrello 1.21 my $locString = location_string($contig, $beg, $end);
1062 : parrello 1.19
1063 : parrello 1.21 Form a location string for the specified contig that starts at the
1064 :     indicated begin location and stops at the indicated end location. A
1065 :     single-base location will automatically be put on the forward strand.
1066 : parrello 1.19
1067 :     =over 4
1068 :    
1069 : parrello 1.21 =item contig
1070 :    
1071 :     ID of the contig to contain this location.
1072 :    
1073 :     =item beg
1074 :    
1075 :     Beginning offset of the location.
1076 :    
1077 :     =item end
1078 : parrello 1.19
1079 : parrello 1.21 Ending offset of the location.
1080 : parrello 1.19
1081 :     =item RETURN
1082 :    
1083 : parrello 1.21 Returns a location string (see L<SAP/Location String>) for the specified
1084 :     location.
1085 : parrello 1.19
1086 :     =back
1087 :    
1088 :     =cut
1089 :    
1090 : parrello 1.21 sub location_string {
1091 : parrello 1.19 # Get the parameters.
1092 : parrello 1.21 my ($contig, $beg, $end) = @_;
1093 :     # Compute the strand and length.
1094 :     my ($strand, $len);
1095 :     if ($beg <= $end) {
1096 :     $strand = '+';
1097 :     $len = $end + 1 - $beg;
1098 :     } else {
1099 :     $strand = '-';
1100 :     $len = $beg + 1 - $end;
1101 : parrello 1.19 }
1102 :     # Return the result.
1103 : parrello 1.21 return $contig . "_$beg$strand$len";
1104 : parrello 1.16 }
1105 :    
1106 : parrello 1.5 =head3 max
1107 :    
1108 :     my $max = max(@nums);
1109 :    
1110 :     Return the maximum number from all the values in the specified list.
1111 :    
1112 :     =over 4
1113 :    
1114 :     =item nums
1115 :    
1116 :     List of numbers to examine.
1117 :    
1118 :     =item RETURN
1119 :    
1120 :     Returns the maximum numeric value from the specified parameters, or
1121 :     an undefined value if an empty list is passed in.
1122 :    
1123 :     =back
1124 :    
1125 :     =cut
1126 :    
1127 :     sub max {
1128 :     my ($retVal, @nums) = @_;
1129 :     for my $num (@nums) {
1130 :     if ($num > $retVal) {
1131 :     $retVal = $num;
1132 :     }
1133 :     }
1134 :     return $retVal;
1135 :     }
1136 :    
1137 :     =head3 min
1138 :    
1139 :     my $min = min(@nums);
1140 :    
1141 :     Return the minimum number from all the values in the specified list.
1142 :    
1143 :     =over 4
1144 :    
1145 :     =item nums
1146 :    
1147 :     List of numbers to examine.
1148 :    
1149 :     =item RETURN
1150 :    
1151 :     Returns the minimum numeric value from the specified parameters, or
1152 :     an undefined value if an empty list is passed in.
1153 :    
1154 :     =back
1155 :    
1156 :     =cut
1157 :    
1158 :     sub min {
1159 :     my ($retVal, @nums) = @_;
1160 :     for my $num (@nums) {
1161 :     if ($num < $retVal) {
1162 :     $retVal = $num;
1163 :     }
1164 :     }
1165 :     return $retVal;
1166 :     }
1167 :    
1168 : parrello 1.21 =head3 parse_fasta_record
1169 : parrello 1.1
1170 : parrello 1.21 my ($id, $comment, $seq) = parse_fasta_record($string);
1171 : parrello 1.1
1172 : parrello 1.21 Extract the ID, comment, and sequence from a single FASTA record. For
1173 :     backward compatability, instead of a FASTA record the ID and sequence can
1174 :     be specified separated by a comma. In this case, the returned comment
1175 :     will be empty.
1176 : parrello 1.1
1177 :     =over 4
1178 :    
1179 : parrello 1.21 =item string
1180 : parrello 1.1
1181 : parrello 1.21 A single FASTA record, or an ID and sequence separated by a single comma,
1182 :     an unadorned sequence, a 2-element list consisting of an ID and a sequence,
1183 :     or a 3-element list consisting of an ID, a comment, and a sequence.
1184 : parrello 1.7
1185 : parrello 1.1 =item RETURN
1186 :    
1187 : parrello 1.21 Returns a three-element list consisting of the incoming ID, the associated
1188 :     comment, and the specified DNA or protein sequence. If the incoming string is
1189 :     invalid, all three list elements will come back undefined. If no ID is
1190 :     specified, an MD5 will be provided.
1191 : parrello 1.1
1192 :     =back
1193 :    
1194 :     =cut
1195 :    
1196 : parrello 1.21 sub parse_fasta_record {
1197 : parrello 1.1 # Get the parameters.
1198 : parrello 1.21 my ($string) = @_;
1199 :     # Declare the return variables.
1200 :     my ($id, $comment, $seq);
1201 :     # Check the type of input string.
1202 :     if (! defined $string) {
1203 :     # Do nothing if no string was passed in. This extra check prevents a
1204 :     # warning at runtime.
1205 :     } elsif ($string =~ /^>(\S+)([\t ]+[^\r\n]*)?[\r\n]+(.+)/s) {
1206 :     # Here we have a standard FASTA string.
1207 :     ($id, $comment, $seq) = ($1, $2, $3);
1208 :     # Remove white space from the sequence string.
1209 :     $seq =~ s/\s+//sg;
1210 :     # Trim front of comment.
1211 :     $comment =~ s/^s+//;
1212 :     } elsif ($string =~ /(.+?)\s*,\s*(.+)/) {
1213 :     ($id, $comment, $seq) = ($1, '', $2);
1214 :     } elsif (ref $string eq 'ARRAY') {
1215 :     # Here the data came in pre-formatted as a list reference.
1216 :     ($id, $comment, $seq) = @$string;
1217 :     # If there's no comment, we need to adjust.
1218 :     if (! defined $seq) {
1219 :     $seq = $comment;
1220 :     $comment = '';
1221 :     }
1222 : parrello 1.7 } else {
1223 : parrello 1.21 # Here we have only a sequence. We need to construct the ID.
1224 :     $seq = $string;
1225 :     require Digest::MD5;
1226 :     $id = "md5|" . Digest::MD5::md5_base64($seq);
1227 :     $comment = "";
1228 :     }
1229 :     # Return the results.
1230 :     return ($id, $comment, $seq);
1231 :     }
1232 :    
1233 :     =head3 parse_location
1234 :    
1235 :     my ($contig, $begin, $end, $strand) = parse_location($locString);
1236 :    
1237 :     Return the contigID, start offset, and end offset for a specified
1238 :     location string (see L<SAP/Location Strings>).
1239 :    
1240 :     =over 4
1241 :    
1242 :     =item locString
1243 :    
1244 :     Location string to parse.
1245 :    
1246 :     =item RETURN
1247 :    
1248 :     Returns a four-element list containing the contig ID from the location
1249 :     string, the starting offset of the location, the ending offset, and the
1250 :     strand. If the location string is not valid, the values returned will be
1251 :     C<undef>.
1252 :    
1253 :     =back
1254 :    
1255 :     =cut
1256 :    
1257 :     sub parse_location {
1258 :     # Get the parameters.
1259 :     my ($locString) = @_;
1260 :     # Declare the return variables.
1261 :     my ($contig, $begin, $end, $strand);
1262 :     # Parse the location string.
1263 :     if ($locString =~ /^(.+)_(\d+)([+-])(\d+)$/) {
1264 :     # Pull out the contig ID, strand, and begin location.
1265 :     $contig = $1;
1266 :     $begin = $2;
1267 :     $strand = $3;
1268 :     # Compute the ending location from the direction and length.
1269 :     if ($3 eq '+') {
1270 :     $end = $begin + $4 - 1;
1271 :     } else {
1272 :     $end = $begin - $4 + 1;
1273 : parrello 1.7 }
1274 : parrello 1.1 }
1275 : olson 1.43 elsif ($locString =~ /^(.*)_(\d+)_(\d+)$/)
1276 :     {
1277 :     $contig = $1;
1278 :     $begin = $2;
1279 :     $end = $3;
1280 :     $strand = $begin < $end ? "+" : "-";
1281 :     }
1282 :    
1283 : parrello 1.21 # Return the results.
1284 :     return ($contig, $begin, $end, $strand);
1285 : parrello 1.1 }
1286 :    
1287 : parrello 1.2 =head3 rev_comp
1288 :    
1289 :     my $revcmp = rev_comp($dna);
1290 :    
1291 :     or
1292 :    
1293 :     rev_comp(\$dna);
1294 :    
1295 :     Return the reverse complement of a DNA string.
1296 :    
1297 :     =over 4
1298 :    
1299 :     =item dna
1300 :    
1301 :     Either a DNA string, or a reference to a DNA string.
1302 :    
1303 :     =item RETURN
1304 :    
1305 :     If the input is a DNA string, returns the reverse complement. If the
1306 :     input is a reference to a DNA string, the string itself is reverse
1307 :     complemented.
1308 :    
1309 :     =back
1310 :    
1311 :     =cut
1312 :    
1313 :     sub rev_comp {
1314 :     # Get the parameters.
1315 :     my ($dna) = @_;
1316 :     # Determine how we were called.
1317 :     my ($retVal, $refMode);
1318 : parrello 1.24 if (! ref $dna) {
1319 : parrello 1.21 $retVal = reverse $dna;
1320 : parrello 1.2 $refMode = 0;
1321 :     } else {
1322 : parrello 1.21 $retVal = reverse $$dna;
1323 : parrello 1.2 $refMode = 1;
1324 :     }
1325 : parrello 1.21 # Now $retVal contains the reversed DNA string, and $refMode is TRUE iff the
1326 :     # user passed in a reference. The following translation step complements the
1327 :     # string.
1328 :     $retVal =~ tr/acgtumrwsykbdhvACGTUMRWSYKBDHV/tgcaakywsrmvhdbTGCAAKYWSRMVHDB/;
1329 : parrello 1.2 # Return the result in the method corresponding to the way it came in.
1330 :     if ($refMode) {
1331 :     $$dna = $retVal;
1332 :     return;
1333 :     } else {
1334 :     return $retVal;
1335 :     }
1336 :     }
1337 :    
1338 : parrello 1.21 # Synonym of rev_comp, for backward compatibility.
1339 :     sub reverse_comp {
1340 :     return rev_comp($_[0]);
1341 : parrello 1.15 }
1342 :    
1343 : parrello 1.21 =head3 roles_of_function
1344 : parrello 1.15
1345 : parrello 1.21 my @roles = roles_of_function($assignment);
1346 : parrello 1.3
1347 : parrello 1.21 Return a list of the functional roles in the specified assignment string.
1348 :     A single assignment may contain multiple roles as well as comments; this
1349 :     method separates them out.
1350 : parrello 1.3
1351 :     =over 4
1352 :    
1353 : parrello 1.21 =item assignment
1354 : parrello 1.3
1355 : parrello 1.21 Functional assignment to parse for roles.
1356 : parrello 1.3
1357 :     =item RETURN
1358 :    
1359 : parrello 1.21 Returns a list of the individual roles in the assignment.
1360 : parrello 1.3
1361 :     =back
1362 :    
1363 :     =cut
1364 :    
1365 : parrello 1.21 sub roles_of_function {
1366 : parrello 1.3 # Get the parameters.
1367 : parrello 1.21 my ($assignment) = @_;
1368 :     # Remove any comment.
1369 : parrello 1.30 my $commentFree = ($assignment =~ /(.+?)\s*[#!]/ ? $1 : $assignment);
1370 : parrello 1.21 # Split out the roles.
1371 : parrello 1.30 my @retVal = split /\s+[\/@]\s+|\s*;\s+/, $commentFree;
1372 : parrello 1.3 # Return the result.
1373 : parrello 1.21 return @retVal;
1374 : parrello 1.3 }
1375 :    
1376 : parrello 1.6 =head3 sims
1377 :    
1378 : parrello 1.31 my @sims = sims($id, $maxN, $maxP, 'fig');
1379 :    
1380 :     or
1381 :    
1382 :     my @sims = sims($id, $maxN, $maxP, 'all);
1383 :    
1384 : parrello 1.6
1385 :     Retrieve similarities from the network similarity server. The similarity retrieval
1386 :     is performed using an HTTP user agent that returns similarity data in multiple
1387 :     chunks. An anonymous subroutine is passed to the user agent that parses and
1388 :     reformats the chunks as they come in. The similarites themselves are returned
1389 :     as B<Sim> objects. Sim objects are actually list references with 15 elements.
1390 :     The Sim object methods allow access to the elements by name.
1391 :    
1392 :     Similarities can be either raw or expanded. The raw similarities are basic
1393 :     hits between features with similar DNA. Expanding a raw similarity drags in any
1394 :     features considered substantially identical. So, for example, if features B<A1>,
1395 :     B<A2>, and B<A3> are all substatially identical to B<A>, then a raw similarity
1396 :     B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>.
1397 :    
1398 :     =over 4
1399 :    
1400 :     =item id
1401 :    
1402 :     ID of the feature whose similarities are desired, or reference to a list
1403 :     of the IDs of the features whose similarities are desired.
1404 :    
1405 : parrello 1.31 =item maxN (optional)
1406 : parrello 1.6
1407 : parrello 1.27 Maximum number of similarities to return for each incoming feature.
1408 : parrello 1.6
1409 : parrello 1.31 =item maxP (optional)
1410 : parrello 1.6
1411 :     The maximum allowable similarity score.
1412 :    
1413 : parrello 1.31 =item select (optional)
1414 : parrello 1.6
1415 :     Selection criterion: C<raw> means only raw similarities are returned; C<fig>
1416 :     means only similarities to FIG features are returned; C<all> means all expanded
1417 :     similarities are returned; and C<figx> means similarities are expanded until the
1418 :     number of FIG features equals the maximum.
1419 :    
1420 : parrello 1.31 =item max_expand (optional)
1421 : parrello 1.6
1422 :     The maximum number of features to expand.
1423 :    
1424 : parrello 1.31 =item filters (optional)
1425 : parrello 1.6
1426 :     Reference to a hash containing filter information, or a subroutine that can be
1427 :     used to filter the sims.
1428 :    
1429 :     =item RETURN
1430 :    
1431 :     Returns a list of L<Sim> objects.
1432 :    
1433 :     =back
1434 :    
1435 :     =cut
1436 :    
1437 :     sub sims {
1438 :     # Get the parameters.
1439 :     my($id, $maxN, $maxP, $select, $max_expand, $filters) = @_;
1440 :     # Get the URL for submitting to the sims server.
1441 :     my $url = $FIG_Config::sim_server_url || "http://bioseed.mcs.anl.gov/simserver/perl/sims.pl";
1442 :     # Get a list of the IDs to process.
1443 :     my @ids;
1444 :     if (ref($id) eq "ARRAY") {
1445 :     @ids = @$id;
1446 :     } else {
1447 :     @ids = ($id);
1448 :     }
1449 :     # Form a list of the parameters to pass to the server.
1450 :     my %args = ();
1451 :     $args{id} = \@ids;
1452 :     $args{maxN} = $maxN if defined($maxN);
1453 :     $args{maxP} = $maxP if defined($maxP);
1454 :     $args{select} = $select if defined($select);
1455 :     $args{max_expand} = $max_expand if defined($max_expand);
1456 :     # If the filter is a hash, put the filters in the argument list.
1457 :     if (ref($filters) eq 'HASH') {
1458 :     for my $k (keys(%$filters))
1459 :     {
1460 :     $args{"filter_$k"}= $filters->{$k};
1461 :     }
1462 :     }
1463 :     # Get the user agent.
1464 :     require LWP::UserAgent;
1465 :     my $ua = LWP::UserAgent->new();
1466 :     # Insure we have the Sim module.
1467 :     require Sim;
1468 :     #
1469 :     # Our next task is to create the anonymous subroutine that will process the
1470 :     # chunks that come back from the server. We require three global variables:
1471 :     # @sims to hold the similarities found, $tail to remember the unprocessed
1472 :     # data from the previous chunk, and $chunks to count the chunks.
1473 :     #
1474 :     my @retVal;
1475 :     my $tail;
1476 :     my $chunks = 0;
1477 :     #
1478 :     # ANONYMOUS SUBROUTINE
1479 :     #
1480 :     my $cb = sub {
1481 :     eval {
1482 :     # Get the parameters.
1483 :     my ($data, $command) = @_;
1484 :     # Check for a reset command. If we get one, we discard any data
1485 :     # in progress.
1486 :     if ($command && $command eq 'reset') {
1487 :     $tail = '';
1488 :     } else {
1489 :     $chunks++;
1490 :     # Get the data to process. Note we concatenate it to the incoming
1491 :     # tail from last time.
1492 :     my $c = $tail . $data;
1493 :     # Make sure the caller hasn't messed up the new-line character.
1494 :     # FASTA readers in particular are notorious for doing things
1495 :     # like that.
1496 :     local $/ = "\n";
1497 :     # Split the input into lines.
1498 :     my @lines = split(/\n/, $c);
1499 :     # If the input does not end with a new-line, we have a partial
1500 :     # chunk and need to put it in the tail for next time. If not,
1501 :     # there is no tail for next time.
1502 :     if (substr($c, -1, 1) ne "\n") {
1503 :     $tail = pop @lines;
1504 :     } else {
1505 :     $tail = '';
1506 :     }
1507 :     # Loop through the lines. Note there's no need to chomp because
1508 :     # the SPLIT took out the new-line characters.
1509 :     for my $l (@lines) {
1510 :     # Split the line into fields.
1511 :     my @s = split(/\t/, $l);
1512 :     # Insure we have all the fields we need.
1513 : parrello 1.12 if (@s >= 9) {
1514 : parrello 1.6 # Check to see if we've seen this SIM before.
1515 :     my $id1 = $s[0];
1516 :     my $id2 = $s[1];
1517 :     # Add it to the result list.
1518 :     push(@retVal, bless \@s, 'Sim');
1519 :     }
1520 :     }
1521 :     }
1522 :     };
1523 :     };
1524 :     #
1525 :     # END OF ANONYMOUS SUBROUTINE
1526 :     #
1527 :     # Now we're ready to start. Because networking is an iffy thing, we set up
1528 :     # to try our request multiple times.
1529 :     my $n_retries = 10;
1530 :     my $attempts = 0;
1531 :     # Set the timeout value, in seconds.
1532 :     $ua->timeout(180);
1533 :     # Loop until we succeed or run out of retries.
1534 :     my $done = 0;
1535 :     while (! $done && $attempts++ < $n_retries) {
1536 :     # Reset the content processor. This clears the tail.
1537 :     &$cb(undef, 'reset');
1538 :     my $resp = $ua->post($url, \%args, ':content_cb' => $cb);
1539 :     if ($resp->is_success) {
1540 :     # If the response was successful, get the content. This triggers
1541 :     # the anonymous subroutine.
1542 :     my $x = $resp->content;
1543 :     # Denote we've been successful.
1544 :     $done = 1;
1545 :     }
1546 :     }
1547 :     return @retVal;
1548 :     }
1549 :    
1550 : gdpusch 1.55
1551 :     =head3 genetic_code
1552 :    
1553 :     my $code = genetic_code();
1554 :    
1555 :     Return a hash containing the translation of nucleotide triples to proteins.
1556 :     Methods such as L</translate> can take a translation scheme as a parameter.
1557 :     This method returns the translation scheme for genetic code 11 or 4,
1558 :     and an error for all other cocdes. The scheme is implemented as a reference to a
1559 :     hash that contains nucleotide triplets as keys and has protein letters as values.
1560 :    
1561 :     =cut
1562 :    
1563 :     sub genetic_code {
1564 :     my ($ncbi_genetic_code_num) = @_;
1565 :     my $code = &standard_genetic_code();
1566 :    
1567 : gdpusch 1.58 if (($ncbi_genetic_code_num == 1) ||
1568 :     ($ncbi_genetic_code_num == 11)
1569 :     ) {
1570 : gdpusch 1.55 #...Do nothing
1571 :     }
1572 :     elsif ($ncbi_genetic_code_num == 4) {
1573 :     $code->{TGA} = 'W';
1574 :     }
1575 :     else {
1576 : gdpusch 1.58 die "Sorry, only genetic codes 1, 4, and 11 are currently supported";
1577 : gdpusch 1.55 }
1578 :    
1579 :     return $code;
1580 :     }
1581 :    
1582 :    
1583 : parrello 1.21 =head3 standard_genetic_code
1584 :    
1585 :     my $code = standard_genetic_code();
1586 :    
1587 :     Return a hash containing the standard translation of nucleotide triples to proteins.
1588 :     Methods such as L</translate> can take a translation scheme as a parameter. This method
1589 :     returns the default translation scheme. The scheme is implemented as a reference to a
1590 :     hash that contains nucleotide triplets as keys and has protein letters as values.
1591 :    
1592 :     =cut
1593 :    
1594 :     sub standard_genetic_code {
1595 :    
1596 :     my $code = {};
1597 : parrello 1.8
1598 : parrello 1.21 $code->{"AAA"} = "K";
1599 :     $code->{"AAC"} = "N";
1600 :     $code->{"AAG"} = "K";
1601 :     $code->{"AAT"} = "N";
1602 :     $code->{"ACA"} = "T";
1603 :     $code->{"ACC"} = "T";
1604 :     $code->{"ACG"} = "T";
1605 :     $code->{"ACT"} = "T";
1606 :     $code->{"AGA"} = "R";
1607 :     $code->{"AGC"} = "S";
1608 :     $code->{"AGG"} = "R";
1609 :     $code->{"AGT"} = "S";
1610 :     $code->{"ATA"} = "I";
1611 :     $code->{"ATC"} = "I";
1612 :     $code->{"ATG"} = "M";
1613 :     $code->{"ATT"} = "I";
1614 :     $code->{"CAA"} = "Q";
1615 :     $code->{"CAC"} = "H";
1616 :     $code->{"CAG"} = "Q";
1617 :     $code->{"CAT"} = "H";
1618 :     $code->{"CCA"} = "P";
1619 :     $code->{"CCC"} = "P";
1620 :     $code->{"CCG"} = "P";
1621 :     $code->{"CCT"} = "P";
1622 :     $code->{"CGA"} = "R";
1623 :     $code->{"CGC"} = "R";
1624 :     $code->{"CGG"} = "R";
1625 :     $code->{"CGT"} = "R";
1626 :     $code->{"CTA"} = "L";
1627 :     $code->{"CTC"} = "L";
1628 :     $code->{"CTG"} = "L";
1629 :     $code->{"CTT"} = "L";
1630 :     $code->{"GAA"} = "E";
1631 :     $code->{"GAC"} = "D";
1632 :     $code->{"GAG"} = "E";
1633 :     $code->{"GAT"} = "D";
1634 :     $code->{"GCA"} = "A";
1635 :     $code->{"GCC"} = "A";
1636 :     $code->{"GCG"} = "A";
1637 :     $code->{"GCT"} = "A";
1638 :     $code->{"GGA"} = "G";
1639 :     $code->{"GGC"} = "G";
1640 :     $code->{"GGG"} = "G";
1641 :     $code->{"GGT"} = "G";
1642 :     $code->{"GTA"} = "V";
1643 :     $code->{"GTC"} = "V";
1644 :     $code->{"GTG"} = "V";
1645 :     $code->{"GTT"} = "V";
1646 :     $code->{"TAA"} = "*";
1647 :     $code->{"TAC"} = "Y";
1648 :     $code->{"TAG"} = "*";
1649 :     $code->{"TAT"} = "Y";
1650 :     $code->{"TCA"} = "S";
1651 :     $code->{"TCC"} = "S";
1652 :     $code->{"TCG"} = "S";
1653 :     $code->{"TCT"} = "S";
1654 :     $code->{"TGA"} = "*";
1655 :     $code->{"TGC"} = "C";
1656 :     $code->{"TGG"} = "W";
1657 :     $code->{"TGT"} = "C";
1658 :     $code->{"TTA"} = "L";
1659 :     $code->{"TTC"} = "F";
1660 :     $code->{"TTG"} = "L";
1661 :     $code->{"TTT"} = "F";
1662 : parrello 1.8
1663 : parrello 1.21 return $code;
1664 :     }
1665 : parrello 1.8
1666 : parrello 1.21 =head3 strand_of
1667 : parrello 1.8
1668 : parrello 1.21 my $plusOrMinus = strand_of($loc);
1669 : parrello 1.8
1670 : parrello 1.21 Return the strand (C<+> or C<->) from the specified location string.
1671 : parrello 1.8
1672 : parrello 1.21 =over 4
1673 : parrello 1.8
1674 : parrello 1.21 =item loc
1675 : parrello 1.8
1676 : parrello 1.21 Location string to parse (see L<SAP/Location Strings>).
1677 : parrello 1.9
1678 :     =item RETURN
1679 :    
1680 : parrello 1.21 Returns C<+> if the location is on the forward strand, else C<->.
1681 : parrello 1.9
1682 :     =back
1683 :    
1684 :     =cut
1685 :    
1686 : parrello 1.21 sub strand_of {
1687 : parrello 1.9 # Get the parameters.
1688 : parrello 1.21 my ($loc) = @_;
1689 :     # Declare the return variable.
1690 :     my $retVal;
1691 :     # Parse the strand indicator from the location.
1692 :     if ($loc =~ /\d+([+-])\d+/) {
1693 :     $retVal = $1;
1694 : parrello 1.9 }
1695 : parrello 1.21 # Return the result.
1696 :     return $retVal;
1697 : parrello 1.9 }
1698 :    
1699 : parrello 1.21 =head3 strip_ec
1700 : parrello 1.9
1701 : parrello 1.21 my $role = strip_ec($rawRole);
1702 : parrello 1.11
1703 : parrello 1.21 Strip the EC number (if any) from the specified role or functional
1704 :     assignment.
1705 : parrello 1.11
1706 :     =over 4
1707 :    
1708 : parrello 1.21 =item rawRole
1709 : parrello 1.11
1710 : parrello 1.21 Role or functional assignment from which the EC numbers are to be stripped.
1711 : parrello 1.11
1712 :     =item RETURN
1713 :    
1714 : parrello 1.21 Returns the incoming string with any EC numbers removed. The EC numbers must
1715 :     be formatted in the standard format used by the SEED (with the C<EC> prefix
1716 :     and surrounding parentheses).
1717 : parrello 1.11
1718 :     =back
1719 :    
1720 :     =cut
1721 :    
1722 : parrello 1.21 sub strip_ec {
1723 :     # Get the parameters.
1724 :     my ($rawRole) = @_;
1725 :     # Declare the return variable.
1726 :     my $retVal = $rawRole;
1727 :     # Remove the EC numbers.
1728 :     $retVal =~ s/\s*\(EC\s+[0-9.\-]+\)//g;
1729 :     # Return the result.
1730 :     return $retVal;
1731 : parrello 1.15 }
1732 :    
1733 :     =head3 translate
1734 :    
1735 :     my $aa_seq = translate($dna_seq, $code, $fix_start);
1736 :    
1737 :     Translate a DNA sequence to a protein sequence using the specified genetic code.
1738 :     If I<$fix_start> is TRUE, will translate an initial C<TTG> or C<GTG> code to
1739 :     C<M>. (In the standard genetic code, these two combinations normally translate
1740 :     to C<V> and C<L>, respectively.)
1741 :    
1742 :     =over 4
1743 :    
1744 :     =item dna_seq
1745 :    
1746 :     DNA sequence to translate. Note that the DNA sequence can only contain
1747 :     known nucleotides.
1748 :    
1749 :     =item code
1750 :    
1751 :     Reference to a hash specifying the translation code. The hash is keyed by
1752 :     nucleotide triples, and the value for each key is the corresponding protein
1753 :     letter. If this parameter is omitted, the L</standard_genetic_code> will be
1754 :     used.
1755 :    
1756 :     =item fix_start
1757 :    
1758 :     TRUE if the first triple is to get special treatment, else FALSE. If TRUE,
1759 :     then a value of C<TTG> or C<GTG> in the first position will be translated to
1760 :     C<M> instead of the value specified in the translation code.
1761 :    
1762 :     =item RETURN
1763 :    
1764 :     Returns a string resulting from translating each nucleotide triple into a
1765 :     protein letter.
1766 :    
1767 :     =back
1768 :    
1769 :     =cut
1770 : gdpusch 1.47
1771 : parrello 1.15 #: Return Type $;
1772 :     sub translate {
1773 :    
1774 :     my( $dna,$code,$start ) = @_;
1775 :     my( $i,$j,$ln );
1776 :     my( $x,$y );
1777 :     my( $prot );
1778 :    
1779 :     if (! defined($code)) {
1780 : overbeek 1.45 $code = &standard_genetic_code;
1781 : parrello 1.15 }
1782 :     $ln = length($dna);
1783 :     $prot = "X" x ($ln/3);
1784 :     $dna =~ tr/a-z/A-Z/;
1785 :    
1786 :     for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) {
1787 :     $x = substr($dna,$i,3);
1788 :     if ($y = $code->{$x}) {
1789 :     substr($prot,$j,1) = $y;
1790 :     }
1791 :     }
1792 :    
1793 :     if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) {
1794 :     substr($prot,0,1) = 'M';
1795 :     }
1796 :     return $prot;
1797 :     }
1798 :    
1799 : parrello 1.36 =head3 type_of
1800 :    
1801 :     my $type = SeedUtils::type_of($fid);
1802 :    
1803 :     Return the type of a feature, given a FIG feature ID (e.g. C<fig|100226.1.peg.3361>).
1804 :    
1805 :     =over 4
1806 :    
1807 :     =item fid
1808 :    
1809 :     ID of a feature whose type is desired.
1810 :    
1811 :     =item RETURN
1812 :    
1813 :     Returns the type of the feature (e.g. C<peg>, C<rna>, ...).
1814 :    
1815 :     =back
1816 :    
1817 :     =cut
1818 :    
1819 :     sub type_of {
1820 :     # Get the parameter.
1821 :     my ($fid) = @_;
1822 :     # Declare the return variable. We return undefined if the ID is unparseable.
1823 :     my $retVal;
1824 :     # Parse the FIG ID.
1825 :     if ($fid =~ /fig\|\d+\.\d+\.(\w+)\./) {
1826 :     # Save the type segment.
1827 :     $retVal = $1;
1828 :     }
1829 :     # Return the result.
1830 :     return $retVal;
1831 :     }
1832 :    
1833 :    
1834 : parrello 1.21 =head3 verify_dir
1835 : parrello 1.18
1836 : parrello 1.21 verify_dir($dirName);
1837 : parrello 1.18
1838 : parrello 1.21 Insure that the specified directory exists. If the directory does not
1839 :     exist, it will be created.
1840 : parrello 1.18
1841 :     =over 4
1842 :    
1843 : parrello 1.21 =item dirName
1844 : parrello 1.18
1845 : parrello 1.21 Name of the relevant directory.
1846 : parrello 1.18
1847 :     =back
1848 :    
1849 :     =cut
1850 :    
1851 : parrello 1.21 sub verify_dir {
1852 : parrello 1.18 # Get the parameters.
1853 : parrello 1.21 my ($dirName) = @_;
1854 :     # Strip off the final slash, if any.
1855 :     $dirName =~ s#/$##;
1856 :     # Only proceed if the directory does NOT already exist.
1857 :     if (! -d $dirName) {
1858 :     # If there is a parent directory, recursively insure it is there.
1859 :     if ($dirName =~ m#(.+)/[^/]+$#) {
1860 :     verify_dir($1);
1861 : overbeek 1.20 }
1862 : parrello 1.21 # Create this particular directory with full permissions.
1863 :     mkdir $dirName, 0777;
1864 : overbeek 1.20 }
1865 :     }
1866 :    
1867 : olson 1.39 =head3 validate_fasta_file
1868 :    
1869 :     $sequence_type = validate_fasta_file($in_file, $out_file)
1870 :    
1871 :     Ensure the given file is in valid fasta format. If $out_file
1872 :     is given, write the data to $out_file as a normalized fasta file
1873 :     (with cleaned up line endings, upper case data).
1874 :    
1875 :     If successful, returns the string "dna" or "protein".
1876 :    
1877 :     Will invoke die() on failure; call inside eval{} to ensure full error catching.
1878 :    
1879 :     =cut
1880 :    
1881 :     sub validate_fasta_file
1882 :     {
1883 :     my($file, $norm) = @_;
1884 :    
1885 :     my($input_fh, $clean_fh);
1886 : olson 1.56
1887 :     if ($file =~ /\.gz$/)
1888 :     {
1889 :     open($input_fh, "-|", "gunzip", "-c", $file) or die "cannot unzip $file: $!";
1890 :     }
1891 :     else
1892 :     {
1893 :     open($input_fh, "<", $file) or die "cannot open $file: $!";
1894 :     }
1895 : olson 1.39
1896 :     if ($norm)
1897 :     {
1898 :     open($clean_fh, ">", $norm) or die "cannot write normalized file $norm: $!";
1899 :     }
1900 :    
1901 :     my $state = 'expect_header';
1902 :     my $cur_id;
1903 :     my $dna_chars;
1904 :     my $prot_chars;
1905 :    
1906 :     while (<$input_fh>)
1907 :     {
1908 :     chomp;
1909 :    
1910 :     if ($state eq 'expect_header')
1911 :     {
1912 :     if (/^>(\S+)/)
1913 :     {
1914 :     $cur_id = $1;
1915 :     $state = 'expect_data';
1916 :     print $clean_fh ">$cur_id\n" if $clean_fh;
1917 :     next;
1918 :     }
1919 :     else
1920 :     {
1921 :     die "Invalid fasta: Expected header at line $.\n";
1922 :     }
1923 :     }
1924 :     elsif ($state eq 'expect_data')
1925 :     {
1926 :     if (/^>(\S+)/)
1927 :     {
1928 :     $cur_id = $1;
1929 :     $state = 'expect_data';
1930 :     print $clean_fh ">$cur_id\n" if $clean_fh;
1931 :     next;
1932 :     }
1933 :     elsif (/^([acgtumrwsykbdhvn]*)\s*$/i)
1934 :     {
1935 :     print $clean_fh uc($1) . "\n" if $clean_fh;
1936 :     $dna_chars += length($1);
1937 :     next;
1938 :     }
1939 :     elsif (/^([*abcdefghijklmnopqrstuvwxyz]*)\s*$/i)
1940 :     {
1941 :     print $clean_fh uc($1) . "\n" if $clean_fh;
1942 :     $prot_chars += length($1);
1943 :     next;
1944 :     }
1945 :     else
1946 :     {
1947 :     my $str = $_;
1948 :     if (length($_) > 100)
1949 :     {
1950 :     $str = substr($_, 0, 50) . " [...] " . substr($_, -50);
1951 :     }
1952 :     die "Invalid fasta: Bad data at line $.\n$str\n";
1953 :     }
1954 :     }
1955 :     else
1956 :     {
1957 :     die "Internal error: invalid state $state\n";
1958 :     }
1959 :     }
1960 :     close($input_fh);
1961 :     close($clean_fh) if $clean_fh;
1962 :    
1963 :     my $what = ($prot_chars > 0) ? "protein" : "dna";
1964 :    
1965 :     return $what;
1966 :     }
1967 :    
1968 : disz 1.28 sub strip_func {
1969 :     my($func) = @_;
1970 : overbeek 1.20
1971 : disz 1.28 $func =~ s/^FIG\d{6}[^:]*:\s*//;
1972 :     $func =~ s/\s*\#.*$//;
1973 :     return($func);
1974 :     }
1975 :    
1976 : olson 1.49 sub strip_func_comment {
1977 :     my($func) = @_;
1978 :    
1979 :     $func =~ s/\s*\#.*$//;
1980 :     return($func);
1981 :     }
1982 :    
1983 : olson 1.32 sub verify_db {
1984 :     my($db,$type) = @_;
1985 :    
1986 :     #
1987 :     # Find formatdb; if we're operating in a SEED environment
1988 :     # use it from there.
1989 :     #
1990 :    
1991 :     my $path = '';
1992 :     if ($FIG_Config::blastbin ne '' && -d $FIG_Config::blastbin)
1993 :     {
1994 :     $path = "$FIG_Config::blastbin/";
1995 :     }
1996 :     elsif ($FIG_Config::ext_bin ne '' && -d $FIG_Config::ext_bin)
1997 :     {
1998 :     $path = "$FIG_Config::ext_bin/";
1999 :     }
2000 :    
2001 :    
2002 :     my @cmd;
2003 :     if ($type =~ /^p/i)
2004 :     {
2005 :     if ((! -s "$db.psq") || (-M "$db.psq" > -M $db))
2006 :     {
2007 :     @cmd = ("${path}formatdb", "-p", "T", "-i", $db);
2008 :     }
2009 :     }
2010 :     else
2011 :     {
2012 :     if ((! -s "$db.nsq") || (-M "$db.nsq" > -M $db))
2013 :     {
2014 :     @cmd = ("${path}formatdb", "-p", "F", "-i", $db);
2015 :     }
2016 :     }
2017 :     if (@cmd)
2018 :     {
2019 :     my $rc = system(@cmd);
2020 :     if ($rc != 0)
2021 :     {
2022 :     warn "SeedUtils::verify_db: formatdb failed with rc=$rc: @cmd\n";
2023 :     }
2024 :     }
2025 :     }
2026 :    
2027 : olson 1.38 #
2028 :     # Some berkeley-db building utilities.
2029 :     #
2030 :    
2031 :     sub create_berk_table
2032 :     {
2033 :     my($input_file, $key_columns, $value_columns, $db_file, %opts) = @_;
2034 :    
2035 :     local $DB_BTREE->{flags};
2036 :     if ($opts{-multiple_values})
2037 :     {
2038 :     $DB_BTREE->{flags} = R_DUP;
2039 :     }
2040 :    
2041 :     my $ifh;
2042 :    
2043 :     if ($opts{-sort})
2044 :     {
2045 :     my $sk = join(" ", map { "-k " . ($_ + 1) } @$key_columns);
2046 :     my $cmd = "sort $sk $input_file";
2047 :     print "Run $cmd\n";
2048 :    
2049 :     open($ifh, "$cmd |") or die "Cannot open sort $sk $input_file for reading: $!";
2050 :     }
2051 :     else
2052 :     {
2053 :     open($ifh, "<", $input_file) or die "Cannot open $input_file for reading: $!";
2054 :     }
2055 :    
2056 :     my $hash = {};
2057 :     my $tie = tie %$hash, "DB_File", $db_file, O_RDWR | O_CREAT, 0666, $DB_BTREE;
2058 :     $tie or die "Cannot create $db_file: $!";
2059 :    
2060 :     while (<$ifh>)
2061 :     {
2062 :     chomp;
2063 :     my @a = split(/\t/);
2064 :     my $k = join($;, @a[@$key_columns]);
2065 :     my $v = join($;, @a[@$value_columns]);
2066 :    
2067 :     $hash->{$k} = $v;
2068 :     }
2069 :     close($ifh);
2070 :     undef $hash;
2071 :     untie $tie;
2072 :     }
2073 :    
2074 :     sub open_berk_table
2075 :     {
2076 :     my($table, %opts) = @_;
2077 :    
2078 :     if (! -f $table)
2079 :     {
2080 :     warn "Cannot read table file $table\n";
2081 :     return undef;
2082 :     }
2083 :     my $h = {};
2084 :     tie %$h, 'BerkTable', $table, %opts;
2085 :     return $h;
2086 :     }
2087 : olson 1.32
2088 : redwards 1.54 our $AllColors;
2089 :    
2090 : olson 1.43 sub compare_region_color
2091 :     {
2092 :     my($n) = @_;
2093 :     my $nc = @$AllColors;
2094 :     my $c = $AllColors->[$n % $nc];
2095 :     return split(/-/, $c);
2096 :     }
2097 :    
2098 : olson 1.52 $AllColors =
2099 : olson 1.43 [
2100 :     '255-0-0', # red
2101 :     '0-255-0', # green
2102 :     '0-0-255', # blue
2103 :     '255-64-192',
2104 :     '255-128-64',
2105 :     '255-0-128',
2106 :     '255-192-64',
2107 :     '64-192-255',
2108 :     '64-255-192',
2109 :     '192-128-128',
2110 :     '192-255-0',
2111 :     '0-255-128',
2112 :     '0-192-64',
2113 :     '128-0-0',
2114 :     '255-0-192',
2115 :     '64-0-128',
2116 :     '128-64-64',
2117 :     '64-255-0',
2118 :     '128-0-64',
2119 :     '128-192-255',
2120 :     '128-192-0',
2121 :     '64-0-0',
2122 :     '128-128-0',
2123 :     '255-192-255',
2124 :     '128-64-255',
2125 :     '64-0-192',
2126 :     '0-64-64',
2127 :     '64-0-255',
2128 :     '192-64-255',
2129 :     '128-0-128',
2130 :     '192-255-64',
2131 :     '64-128-255',
2132 :     '255-128-192',
2133 :     '64-192-64',
2134 :     '0-128-128',
2135 :     '255-0-64',
2136 :     '128-64-0',
2137 :     '128-255-128',
2138 :     '255-64-128',
2139 :     '128-192-64',
2140 :     '128-128-64',
2141 :     '255-255-192',
2142 :     '192-192-128',
2143 :     '192-64-128',
2144 :     '64-128-192',
2145 :     '192-192-64',
2146 :     '192-0-128',
2147 :     '64-64-192',
2148 :     '0-128-192',
2149 :     '0-128-64',
2150 :     '255-192-128',
2151 :     '192-128-0',
2152 :     '64-255-255',
2153 :     '255-0-255',
2154 :     '128-255-255',
2155 :     '255-255-64',
2156 :     '0-128-0',
2157 :     '192-255-192',
2158 :     '0-192-0',
2159 :     '0-64-192',
2160 :     '0-64-128',
2161 :     '192-0-255',
2162 :     '192-192-255',
2163 :     '64-255-128',
2164 :     '0-0-128',
2165 :     '255-64-64',
2166 :     '192-192-0',
2167 :     '192-128-192',
2168 :     '128-64-192',
2169 :     '0-192-255',
2170 :     '128-192-192',
2171 :     '192-0-64',
2172 :     '192-255-255',
2173 :     '255-192-0',
2174 :     '255-255-128',
2175 :     '192-0-0',
2176 :     '64-64-0',
2177 :     '192-64-192',
2178 :     '192-128-255',
2179 :     '128-255-192',
2180 :     '64-64-255',
2181 :     '0-64-255',
2182 :     '128-64-128',
2183 :     '255-64-255',
2184 :     '192-128-64',
2185 :     '64-64-128',
2186 :     '0-128-255',
2187 :     '64-0-64',
2188 :     '128-0-192',
2189 :     '255-128-255',
2190 :     '64-128-0',
2191 :     '255-64-0',
2192 :     '64-192-192',
2193 :     '255-128-0',
2194 :     '0-0-64',
2195 :     '128-128-192',
2196 :     '128-128-255',
2197 :     '0-192-192',
2198 :     '0-255-192',
2199 :     '128-192-128',
2200 :     '192-0-192',
2201 :     '0-255-64',
2202 :     '64-192-0',
2203 :     '0-192-128',
2204 :     '128-255-64',
2205 :     '255-255-0',
2206 :     '64-255-64',
2207 :     '192-64-64',
2208 :     '192-64-0',
2209 :     '255-192-192',
2210 :     '192-255-128',
2211 :     '0-64-0',
2212 :     '0-0-192',
2213 :     '128-0-255',
2214 :     '64-128-64',
2215 :     '64-192-128',
2216 :     '0-255-255',
2217 :     '255-128-128',
2218 :     '64-128-128',
2219 :     '128-255-0'
2220 :     ];
2221 :    
2222 : overbeek 1.40 sub run {
2223 :     shift if UNIVERSAL::isa($_[0],__PACKAGE__);
2224 :     my($cmd) = @_;
2225 :    
2226 :     if ($ENV{FIG_VERBOSE}) {
2227 :     my @tmp = `date`;
2228 :     chomp @tmp;
2229 :     print STDERR "$tmp[0]: running $cmd\n";
2230 :     }
2231 :     (system($cmd) == 0) || die("FAILED: $cmd");
2232 :     }
2233 :    
2234 : olson 1.57 sub map_to_families
2235 :     {
2236 :     my($fam2c, $func) = @_;
2237 :    
2238 :     my $fh;
2239 :    
2240 :     if (ref($fam2c))
2241 :     {
2242 :     $fh = $fam2c;
2243 :     }
2244 :     else
2245 :     {
2246 :     if (!open($fh, "<", $fam2c))
2247 :     {
2248 :     die "Cannot open $fam2c: $!";
2249 :     }
2250 :     }
2251 :     $_ = <$fh>;
2252 :     chomp;
2253 :     my($fam, $peg) = split(/\t/);
2254 :     while (defined($fam))
2255 :     {
2256 :     my $cur = $fam;
2257 :     my @set;
2258 :     while (defined($fam) && $fam eq $cur)
2259 :     {
2260 :     push(@set, $peg);
2261 :     $_ = <$fh>;
2262 :     chomp;
2263 :     ($fam, $peg) = split(/\t/);
2264 :     }
2265 :     $func->($cur, \@set);
2266 :     }
2267 :    
2268 :     }
2269 :    
2270 :    
2271 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3