[Bio] / FigKernelPackages / SeedUtils.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/SeedUtils.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SeedUtils;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use base qw(Exporter);
25 :    
26 : parrello 1.6 our @EXPORT = qw(create_fasta_record rev_comp genome_of min max sims);
27 : parrello 1.1
28 :     =head1 SEED Utility Methods
29 :    
30 :     =head2 Introduction
31 :    
32 :     This is a simple utility package that performs functions useful for
33 :     bioinformatics, but that do not require access to the databases.
34 :    
35 :     =head2 Public Methods
36 :    
37 : parrello 1.5 =head3 max
38 :    
39 :     my $max = max(@nums);
40 :    
41 :     Return the maximum number from all the values in the specified list.
42 :    
43 :     =over 4
44 :    
45 :     =item nums
46 :    
47 :     List of numbers to examine.
48 :    
49 :     =item RETURN
50 :    
51 :     Returns the maximum numeric value from the specified parameters, or
52 :     an undefined value if an empty list is passed in.
53 :    
54 :     =back
55 :    
56 :     =cut
57 :    
58 :     sub max {
59 :     my ($retVal, @nums) = @_;
60 :     for my $num (@nums) {
61 :     if ($num > $retVal) {
62 :     $retVal = $num;
63 :     }
64 :     }
65 :     return $retVal;
66 :     }
67 :    
68 :     =head3 min
69 :    
70 :     my $min = min(@nums);
71 :    
72 :     Return the minimum number from all the values in the specified list.
73 :    
74 :     =over 4
75 :    
76 :     =item nums
77 :    
78 :     List of numbers to examine.
79 :    
80 :     =item RETURN
81 :    
82 :     Returns the minimum numeric value from the specified parameters, or
83 :     an undefined value if an empty list is passed in.
84 :    
85 :     =back
86 :    
87 :     =cut
88 :    
89 :     sub min {
90 :     my ($retVal, @nums) = @_;
91 :     for my $num (@nums) {
92 :     if ($num < $retVal) {
93 :     $retVal = $num;
94 :     }
95 :     }
96 :     return $retVal;
97 :     }
98 :    
99 : parrello 1.1 =head3 create_fasta_record
100 :    
101 :     my $fastaString = create_fasta_record($id, $comment, $sequence);
102 :    
103 :     Create a FASTA record from the specified DNA or protein sequence. The
104 :     sequence will be split into 60-character lines, and the record will
105 : parrello 1.4 include an identifier line.
106 : parrello 1.1
107 :     =over 4
108 :    
109 :     =item id
110 :    
111 :     ID for the sequence, to be placed at the beginning of the identifier
112 :     line.
113 :    
114 :     =item comment (optional)
115 :    
116 :     Comment text to place after the ID on the identifier line. If this parameter
117 :     is empty, undefined, or 0, no comment will be placed.
118 :    
119 :     =item sequence
120 :    
121 :     Sequence of letters to form into FASTA. For purposes of convenience, whitespace
122 :     characters in the sequence will be removed automatically.
123 :    
124 :     =item RETURN
125 :    
126 :     Returns the desired sequence in FASTA format.
127 :    
128 :     =back
129 :    
130 :     =cut
131 :    
132 :     sub create_fasta_record {
133 :     # Get the parameters.
134 :     my ($id, $comment, $sequence) = @_;
135 :     # Start with the ID.
136 :     my $header = ">$id";
137 :     # Add a comment, if any.
138 :     if ($comment) {
139 :     $header .= " $comment";
140 :     }
141 :     # Clean up the sequence.
142 :     $sequence =~ s/\s+//g;
143 :     # We need to format the sequence into 60-byte chunks. We use the infamous
144 :     # grep-split trick. The split, because of the presence of the parentheses,
145 :     # includes the matched delimiters in the output list. The grep strips out
146 :     # the empty list items that appear between the so-called delimiters, since
147 :     # the delimiters are what we want.
148 :     my @chunks = grep { $_ } split /(.{1,60})/, $sequence;
149 :     Trace(scalar(@chunks) . " chunks found in sequence of length " .
150 :     length($sequence) . ".") if T(3);
151 :     # Add the chunks and the trailer.
152 : parrello 1.4 my $retVal = join("\n", $header, @chunks);
153 : parrello 1.1 # Return the result.
154 :     return $retVal;
155 :     }
156 :    
157 : parrello 1.2 =head3 rev_comp
158 :    
159 :     my $revcmp = rev_comp($dna);
160 :    
161 :     or
162 :    
163 :     rev_comp(\$dna);
164 :    
165 :     Return the reverse complement of a DNA string.
166 :    
167 :     =over 4
168 :    
169 :     =item dna
170 :    
171 :     Either a DNA string, or a reference to a DNA string.
172 :    
173 :     =item RETURN
174 :    
175 :     If the input is a DNA string, returns the reverse complement. If the
176 :     input is a reference to a DNA string, the string itself is reverse
177 :     complemented.
178 :    
179 :     =back
180 :    
181 :     =cut
182 :    
183 :     sub rev_comp {
184 :     # Get the parameters.
185 :     my ($dna) = @_;
186 :     # Determine how we were called.
187 :     my ($retVal, $refMode);
188 :     if (ref $dna eq 'SCALAR') {
189 :     $retVal = lc reverse $dna;
190 :     $refMode = 0;
191 :     } else {
192 :     $retVal = lc reverse $$dna;
193 :     $refMode = 1;
194 :     }
195 :     # Now $retVal contains the reversed DNA string in all lower case, and
196 :     # $refMode is TRUE iff the user passed in a reference. The following
197 :     # translation step complements the string.
198 :     $retVal =~ tr/acgtumrwsykbdhv/tgcaakywsrmvhdb/;
199 :     # Return the result in the method corresponding to the way it came in.
200 :     if ($refMode) {
201 :     $$dna = $retVal;
202 :     return;
203 :     } else {
204 :     return $retVal;
205 :     }
206 :     }
207 :    
208 : parrello 1.3 =head3 genome_of
209 :    
210 :     my $genomeID = genome_of($fid);
211 :    
212 :     Return the Genome ID embedded in the specified FIG feature ID.
213 :    
214 :     =over 4
215 :    
216 :     =item fid
217 :    
218 :     Feature ID of interest.
219 :    
220 :     =item RETURN
221 :    
222 :     Returns the genome ID in the middle portion of the FIG feature ID. If the
223 :     feature ID is invalid, this method returns an undefined value.
224 :    
225 :     =back
226 :    
227 :     =cut
228 :    
229 :     sub genome_of {
230 :     # Get the parameters.
231 :     my ($fid) = @_;
232 :     # Declare the return variable.
233 :     my $retVal;
234 :     # Parse the feature ID.
235 :     if ($fid =~ /^fig\|(\d+\.\d+)\./) {
236 :     $retVal = $1;
237 :     }
238 :     # Return the result.
239 :     return $retVal;
240 :     }
241 :    
242 : parrello 1.6 =head3 sims
243 :    
244 :     my $sims = sims($id, \%seen, $maxN, $maxP, $select, $max_expand, $filters);
245 :    
246 :     Retrieve similarities from the network similarity server. The similarity retrieval
247 :     is performed using an HTTP user agent that returns similarity data in multiple
248 :     chunks. An anonymous subroutine is passed to the user agent that parses and
249 :     reformats the chunks as they come in. The similarites themselves are returned
250 :     as B<Sim> objects. Sim objects are actually list references with 15 elements.
251 :     The Sim object methods allow access to the elements by name.
252 :    
253 :     Similarities can be either raw or expanded. The raw similarities are basic
254 :     hits between features with similar DNA. Expanding a raw similarity drags in any
255 :     features considered substantially identical. So, for example, if features B<A1>,
256 :     B<A2>, and B<A3> are all substatially identical to B<A>, then a raw similarity
257 :     B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>.
258 :    
259 :     Specify the trace type C<nsims> to trace this method.
260 :    
261 :     =over 4
262 :    
263 :     =item id
264 :    
265 :     ID of the feature whose similarities are desired, or reference to a list
266 :     of the IDs of the features whose similarities are desired.
267 :    
268 :     =item maxN
269 :    
270 :     Maximum number of similarities to return.
271 :    
272 :     =item maxP
273 :    
274 :     The maximum allowable similarity score.
275 :    
276 :     =item select
277 :    
278 :     Selection criterion: C<raw> means only raw similarities are returned; C<fig>
279 :     means only similarities to FIG features are returned; C<all> means all expanded
280 :     similarities are returned; and C<figx> means similarities are expanded until the
281 :     number of FIG features equals the maximum.
282 :    
283 :     =item max_expand
284 :    
285 :     The maximum number of features to expand.
286 :    
287 :     =item filters
288 :    
289 :     Reference to a hash containing filter information, or a subroutine that can be
290 :     used to filter the sims.
291 :    
292 :     =item RETURN
293 :    
294 :     Returns a list of L<Sim> objects.
295 :    
296 :     =back
297 :    
298 :     =cut
299 :    
300 :     sub sims {
301 :     # Get the parameters.
302 :     my($id, $maxN, $maxP, $select, $max_expand, $filters) = @_;
303 :     # Get the URL for submitting to the sims server.
304 :     my $url = $FIG_Config::sim_server_url || "http://bioseed.mcs.anl.gov/simserver/perl/sims.pl";
305 :     # Get a list of the IDs to process.
306 :     my @ids;
307 :     if (ref($id) eq "ARRAY") {
308 :     @ids = @$id;
309 :     } else {
310 :     @ids = ($id);
311 :     }
312 :     # Form a list of the parameters to pass to the server.
313 :     my %args = ();
314 :     $args{id} = \@ids;
315 :     $args{maxN} = $maxN if defined($maxN);
316 :     $args{maxP} = $maxP if defined($maxP);
317 :     $args{select} = $select if defined($select);
318 :     $args{max_expand} = $max_expand if defined($max_expand);
319 :     # If the filter is a hash, put the filters in the argument list.
320 :     if (ref($filters) eq 'HASH') {
321 :     for my $k (keys(%$filters))
322 :     {
323 :     $args{"filter_$k"}= $filters->{$k};
324 :     }
325 :     }
326 :     # Dump the request data.
327 :     if (T(nsims => 3)) {
328 :     for my $arg (keys %args) {
329 :     if (ref $args{$arg} eq 'ARRAY') {
330 :     Trace("Request argument $arg has " . scalar(@{$args{$arg}}) . " values.");
331 :     } else {
332 :     Trace("Request argument $arg = $args{$arg}");
333 :     }
334 :     }
335 :     }
336 :     # Get the user agent.
337 :     require LWP::UserAgent;
338 :     my $ua = LWP::UserAgent->new();
339 :     # Insure we have the Sim module.
340 :     require Sim;
341 :     #
342 :     # Our next task is to create the anonymous subroutine that will process the
343 :     # chunks that come back from the server. We require three global variables:
344 :     # @sims to hold the similarities found, $tail to remember the unprocessed
345 :     # data from the previous chunk, and $chunks to count the chunks.
346 :     #
347 :     my @retVal;
348 :     my $tail;
349 :     my $chunks = 0;
350 :     #
351 :     # ANONYMOUS SUBROUTINE
352 :     #
353 :     my $cb = sub {
354 :     eval {
355 :     # Get the parameters.
356 :     my ($data, $command) = @_;
357 :     # Check for a reset command. If we get one, we discard any data
358 :     # in progress.
359 :     if ($command && $command eq 'reset') {
360 :     $tail = '';
361 :     } else {
362 :     $chunks++;
363 :     # Get the data to process. Note we concatenate it to the incoming
364 :     # tail from last time.
365 :     my $c = $tail . $data;
366 :     # Make sure the caller hasn't messed up the new-line character.
367 :     # FASTA readers in particular are notorious for doing things
368 :     # like that.
369 :     local $/ = "\n";
370 :     # Split the input into lines.
371 :     my @lines = split(/\n/, $c);
372 :     # If the input does not end with a new-line, we have a partial
373 :     # chunk and need to put it in the tail for next time. If not,
374 :     # there is no tail for next time.
375 :     if (substr($c, -1, 1) ne "\n") {
376 :     $tail = pop @lines;
377 :     } else {
378 :     $tail = '';
379 :     }
380 :     # Loop through the lines. Note there's no need to chomp because
381 :     # the SPLIT took out the new-line characters.
382 :     for my $l (@lines) {
383 :     # Split the line into fields.
384 :     my @s = split(/\t/, $l);
385 :     # Insure we have all the fields we need.
386 :     if (@s < 9) {
387 :     Trace("Insufficient fields in line $l.") if T(nsims => 1);
388 :     } else {
389 :     # Check to see if we've seen this SIM before.
390 :     my $id1 = $s[0];
391 :     my $id2 = $s[1];
392 :     # Add it to the result list.
393 :     push(@retVal, bless \@s, 'Sim');
394 :     }
395 :     }
396 :     }
397 :     };
398 :     if ($@) {
399 :     Trace("Network sims request failed with error: $@") if T(nsims => 0);
400 :     }
401 :     };
402 :     #
403 :     # END OF ANONYMOUS SUBROUTINE
404 :     #
405 :     # Now we're ready to start. Because networking is an iffy thing, we set up
406 :     # to try our request multiple times.
407 :     my $n_retries = 10;
408 :     my $attempts = 0;
409 :     # Set the timeout value, in seconds.
410 :     $ua->timeout(180);
411 :     # Loop until we succeed or run out of retries.
412 :     my $done = 0;
413 :     while (! $done && $attempts++ < $n_retries) {
414 :     # Reset the content processor. This clears the tail.
415 :     &$cb(undef, 'reset');
416 :     Trace("Sending request to $url.") if T(nsims => 3);
417 :     my $resp = $ua->post($url, \%args, ':content_cb' => $cb);
418 :     if ($resp->is_success) {
419 :     # If the response was successful, get the content. This triggers
420 :     # the anonymous subroutine.
421 :     Trace("Processing similarity results.") if T(nsims => 3);
422 :     my $x = $resp->content;
423 :     Trace("Response content is:\n$x") if T(nsims => 4);
424 :     # Denote we've been successful.
425 :     $done = 1;
426 :     } else {
427 :     Trace("Error getting sims (attempt $attempts of $n_retries): " . $resp->status_line) if T(nsims => 1);
428 :     }
429 :     }
430 :     if (! $done) {
431 :     # Here we failed.
432 :     Trace("Could not get network sims after $attempts retries; request url is $url.") if T(nsims => 0);
433 :     } else {
434 :     # Here everything worked.
435 :     Trace(scalar(@retVal) . " sims and $chunks chunks received from network.") if T(nsims => 3);
436 :     }
437 :     return @retVal;
438 :     }
439 :    
440 : parrello 1.1
441 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3