[Bio] / FigKernelPackages / ANNOserver.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/ANNOserver.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.20 - (view) (download) (as text)

1 : olson 1.1 package ANNOserver;
2 :    
3 :     #
4 :     # This is a SAS Component
5 :     #
6 :    
7 : olson 1.9 =head1 Annotation Support Server Object
8 : olson 1.1
9 :    
10 : olson 1.9 This file contains the functions and utilities used by the Annotation Support Server
11 :     (B<anno_server.cgi>). The various methods listed in the sections below represent
12 :     function calls direct to the server. These all have a signature similar to the
13 :     following.
14 :    
15 :     my $results = $annoObject->function_name($args);
16 :    
17 :     where C<$annoObject> is an object created by this module,
18 :     C<$args> is a parameter structure, and C<function_name> is the Annotation Support
19 :     Server function name. The output $results is a scalar, generally a hash
20 :     reference, but sometimes a string or a list reference.
21 :    
22 :     =head2 Constructor
23 :    
24 :     Use
25 :    
26 :     my $annoObject = ANNOserver->new();
27 :    
28 :     to create a new annotation support server function object. The function object
29 :     is used to invoke the L</Primary Methods> listed below. See L<SAPserver> for
30 :     more information on how to create this object and the options available.
31 : olson 1.1
32 :     =cut
33 :    
34 :     use LWP::UserAgent;
35 :     use Data::Dumper;
36 :     use YAML;
37 :    
38 : olson 1.4 use base qw(ClientThing);
39 :    
40 : olson 1.1 use strict;
41 :    
42 :     sub new
43 :     {
44 : parrello 1.18 my($class, @options) = @_;
45 :     my %options = ClientThing::FixOptions(@options);
46 : parrello 1.17 $options{url} = ClientThing::ComputeURL($options{url}, 'anno_server.cgi', 'anno');
47 : olson 1.1
48 : olson 1.4 # my $self = {
49 :     # server_url => $server_url,
50 :     # ua => LWP::UserAgent->new(),
51 :     # };
52 :     # $self->{ua}->timeout(20 * 60);
53 : olson 1.1
54 : olson 1.4 # return bless $self, $class;
55 : olson 1.1
56 : olson 1.4 my $self = $class->SUPER::new(ANNO => %options);
57 :     return $self;
58 : olson 1.1 }
59 :    
60 : olson 1.4 #
61 :     # Doc for stuff in ANNO.pm is actually here.
62 :     #
63 :    
64 : olson 1.9 =head1 Primary Methods
65 :    
66 : olson 1.1 =head2 Functions
67 :    
68 : olson 1.4 =head3 metabolic_reconstruction
69 :    
70 : parrello 1.13 my $results = $annoObject->metabolic_reconstruction({
71 : parrello 1.14 -roles => { [$role1, $id1],
72 :     [$role2, $id2],
73 :     ... });
74 : olson 1.4
75 :     This method will find for each subsystem, the subsystem variant that contains a
76 :     maximal subset of the roles in an incoming list, and output the ID of the
77 :     variant and a list of the roles in it.
78 :    
79 :     =over 4
80 :    
81 : parrello 1.16 =item parameters
82 :    
83 :     The single parameter is a reference to a hash containing the following
84 : parrello 1.14 key fields.
85 : olson 1.4
86 :     =over 8
87 :    
88 :     =item -roles
89 :    
90 : parrello 1.12 Reference to a list of 2-tuples, each consisting of the functional role followed
91 :     by an arbitrary ID of the caller's choosing (e.g., a gene name, a
92 :     sequence-project gene ID, a protein ID or whatever).
93 : olson 1.4
94 :     =back
95 :    
96 : parrello 1.12 For backward compatibility, instead of a hash reference you may specify a
97 :     simple reference to a list of 2-tuples.
98 :    
99 : olson 1.4 =item RETURN
100 :    
101 : parrello 1.8 Returns a list of tuples, each containing a variant name (subsystem name,
102 :     colon, variant code), a role ID, and optionally a caller-provided ID associated
103 :     with the role. The ability to specify arbitrary IDs to be associated with the
104 :     roles is normally used to associate arbitrary gene IDs with the roles they are
105 :     believed to implement.
106 : olson 1.4
107 :     =back
108 :    
109 : parrello 1.15 =head3 find_special_proteins
110 :    
111 :     my $proteinList = $annoObject->find_special_proteins({
112 :     -contigs => [[$contigID1, $contigNote1, $contigDNA1],
113 :     [$contigID2, $contigNote2, $contigDNA2],
114 :     ...],
115 :     -is_init => [$codon1a, $codon1b, ...],
116 :     -is_alt => [$codon2a, $codon2b, ...],
117 :     -is_term => [$codon3a, $codon3b, ...],
118 :     -comment => $commentString,
119 :     -templates => [[$protID1, $protNote1, $protSeq1],
120 :     [$protID2, $protNote2, $protSeq1],
121 :     ...]
122 :     });
123 :    
124 :     This method searches for special proteins in a list of contigs. The method is
125 :     specifically designed to find selenoproteins and pyrrolysoproteins, but custom
126 :     protein templates can be specified to allow searching for any type of protein
127 :     family.
128 :    
129 :     =over 4
130 :    
131 :     =item parameter
132 :    
133 :     The parameter is a reference to a hash with the following permissible keys.
134 :    
135 :     =over 8
136 :    
137 :     =item -contigs
138 :    
139 :     Reference to a list of contigs. Each contig is represented by a 3-tuple
140 :     consisting of a contig ID, a comment, and a DNA string.
141 :    
142 :     =item -is_init (optional)
143 :    
144 :     Reference to a list of DNA codons to be used as start codons. The default is
145 :     C<ATG> and C<GTG>.
146 :    
147 :     =item -is_alt (optional)
148 :    
149 :     Reference to a list of DNA codons to be used as alternative start codons. These are
150 :     employed if there are no results from the main start codons. The default is
151 :     C<TTG>.
152 :    
153 :     =item -is_term (optional)
154 :    
155 :     Reference to a list of DNA codons to be used as stop codons. The default is
156 :     C<TAA>, C<TAG>, and C<TGA>.
157 :    
158 :     =item -templates (optional)
159 :    
160 : parrello 1.19 Description of the type of special protein being sought. If C<pyrrolysoprotein>, then
161 : parrello 1.15 the method will search for pyrrolysines. If C<selenoprotein>, then the method will
162 :     search for selenoproteins. Otherwise, should be a reference to a list of 3-tuples
163 :     containing templates for the proteins in the desired family. Each 3-tuple must
164 :     consist of an ID, a functional role description, and a protein sequence. The default
165 :     is C<selenoprotein>.
166 :    
167 :     =item -comment (optional)
168 :    
169 :     A string that will be inserted as a comment in each element of the output list. The
170 : parrello 1.19 default is either C<pyrrolysoprotein> or C<selenoprotein>, depending on the template
171 : parrello 1.15 specification.
172 :    
173 :     =back
174 :    
175 :     =item RETURN
176 :    
177 :     Returns a reference to a list of hashes. Each hash contains the following keys.
178 :    
179 :     =over 8
180 :    
181 :     =item location
182 :    
183 :     A location string describing the contig, start, and end location of the protein
184 :     found.
185 :    
186 :     =item sequence
187 :    
188 :     The protein sequence found.
189 :    
190 :     =item reference_id
191 :    
192 :     ID of the relevant template protein sequence.
193 :    
194 :     =item reference_def
195 :    
196 :     Functional role of the relevant template protein sequence.
197 :    
198 :     =item comment
199 :    
200 :     Comment from the input parameters.
201 :    
202 :     =back
203 :    
204 :     =back
205 : olson 1.4
206 : olson 1.1
207 :     =head3 assign_function_to_prot
208 :    
209 : olson 1.9 my $resultHandle = $annoObject->assign_function_to_prot($args)
210 : olson 1.1
211 : overbeek 1.7 For each incoming protein sequence, attempt to assign a function.
212 :     There are two ways functions can get assigned. The first is based on
213 :     kmers, and these are normally viewed as the most reliable (at least
214 :     they give a consistent vocabulary!). If no kmer match is made,
215 :     you can optionally try to make an assignment based on similarity
216 :     computations.
217 :    
218 :     The attempt is made using kmer-technology. A pass through the sequence
219 :     will locate "signature kmers", and scores will be computed.
220 :     The scores are based on the number of nonoverlapping hits, the number of
221 :     overlapping hits, and the difference in counts between hits against the
222 :     most probable function's kmer-set and the next most probable function's
223 :     kmer set. Basically, we compute all matching kmers. Then, we split them
224 :     into sets based on the predictions each would make (each kmer, in effect,
225 :     predicts a single function). One threshhold (the B<scoreThreshold>) is
226 :     the difference between total number of overlapping hits for the "best function"
227 :     versus the total number for the "next best". B<hitTheshold> is the number
228 :     of overlapping hits required for the "best function". Similarly,
229 :     B<seqHitThreshold> is the minimum number of non-overlapping hits.
230 :    
231 :     Now, to add complexity, these thresholds are based on counting "1" for
232 :     each matched Kmer. That is, the B<scoreThreshold> is normally thought of
233 :     as a difference in the number of occurrences. However, you may wish to
234 :     "normalize" this threshold by dividing the count by the length of the sequence.
235 :     This then gives scores between 0 and 1, rather than between 0 and the length
236 :     of the sequence (-K if you wish to be pedantic).
237 : olson 1.1
238 :     =over 4
239 :    
240 :     =item args
241 :    
242 :     Reference to a hash containing the parameters. The allowable parameter fields
243 :     are as follows.
244 :    
245 :     =over 8
246 :    
247 :     =item -input
248 :    
249 :     Either (1) an open input handle to a file containing the proteins in FASTA format,
250 : parrello 1.19 or (2) A reference to a list of sequence data entries. Each entry is a triple of strings
251 :     [sequence-id, comment, protein-sequence-data].
252 : disz 1.11
253 : olson 1.1 =item -kmer
254 :    
255 : olson 1.3 Specify the kmer size to use for analysis (valid sizes are 7 - 12).
256 : olson 1.1
257 :     =item -assignToAll
258 :    
259 :     If TRUE, then if the standard matching algorithm fails to assign a protein,
260 : olson 1.3 a similarity-based assignment algorithm will be used instead.
261 : olson 1.1
262 :     =item -scoreThreshold N
263 :    
264 :     Require a Kmer score of at least N for a Kmer match to succeed.
265 :    
266 :     =item -hitThreshold N
267 :    
268 :     Require at least N (possibly overlapping) Kmer hits for a Kmer match to succeed.
269 :    
270 :     =item -seqHitThreshold N
271 :    
272 : overbeek 1.7 Require at least N (non-overlapping) Kmer hits for a Kmer match to succeed.
273 : olson 1.1
274 :     =item -normalizeScores 0|1
275 :    
276 :     Normalize the scores to the size of the protein.
277 :    
278 :     =item -detailed 0|1
279 :    
280 :     If true, return a detailed accounting of the kmers hit for each protein.
281 :    
282 : olson 1.2 =back
283 : olson 1.1
284 :     =item RETURN
285 :    
286 : olson 1.10 Returns a Result Handle. Call C<get_next> on the result handle to get
287 :     back a data item. Each item sent back by C<get_next> is a 7-tuple
288 :     containing the results. Each tuple is of the form
289 :    
290 :     [ sequence-id, assigned-function, genome-set-name, score, non-overlapping hit-count, overlapping hit-count, detailed-hits]
291 :    
292 :     where detailed-hits is undef unless the -detailed option was used.
293 :    
294 :     If details were requested, the detailed-hit list is a list of tuples,
295 :     one for each kmer hit. These tuples have the form
296 :    
297 :     [ offset, oligo, functional-role, genome-set-name]
298 : olson 1.1
299 :     =back
300 :    
301 :     =cut
302 :    
303 :     sub assign_function_to_prot
304 :     {
305 :     my($self, $args) = _handle_args(@_);
306 :    
307 :     my $wq;
308 :    
309 : olson 1.3 # my $params = [blast => $blast, min_hits => $min_hits, assign_to_all => ($assignToAll ? 1 : 0)];
310 :    
311 :     my $input = delete $args->{-input};
312 : olson 1.1
313 : olson 1.3 my $params = [ map { $_ => $args->{$_} } keys %$args ];
314 :    
315 : olson 1.1 if (ref($input) eq 'ARRAY')
316 :     {
317 :     $wq = SequenceListWorkQueue->new($input);
318 :     }
319 :     else
320 :     {
321 :     $wq = FastaWorkQueue->new($input);
322 :     }
323 :    
324 : olson 1.3 my $req_bytes = 1_000_000;
325 :     #my $req_bytes = $blast ? 1000 : 1_000_000;
326 : olson 1.1
327 :     return ResultHandler->new($wq, $self->{server_url}, 'assign_function_to_prot', \&id_seq_pair_bundler,
328 :     #\&tab_delimited_output_parser,
329 :     \&YAML::Load,
330 :     $params, $req_bytes);
331 :     }
332 :    
333 :     =head3 call_genes
334 :    
335 : overbeek 1.7 my $result = $annoObject->call_genes($args);
336 : olson 1.1
337 : overbeek 1.7 Call the protein-encoding genes for the specified DNA sequences.
338 : olson 1.1
339 :     =over 4
340 :    
341 :     =item args
342 :    
343 :     Reference to a hash containing the parameters. The allowable parameter fields
344 :     are as follows.
345 :    
346 :     =over 8
347 :    
348 :     =item -input
349 :    
350 : parrello 1.19 The DNA sequences to be analyzed. This may take one of two forms: (1) a file handle
351 :     that is open for reading from a file of DNA sequences in FASTA format, or
352 :     (2) a reference to a list of DNA data entries. Each entry is a triple of strings
353 :     [sequence-id, comment, dna-sequence-data].
354 : parrello 1.15
355 :     =item -trainingLocations (optional)
356 :    
357 :     Reference to a hash mapping gene IDs to location strings. The location strings in
358 :     this case are of the form I<contig>C<_>I<start>C<_>I<end>. The locations indicated
359 :     should be coding regions in the incoming sequences to be analyzed (or in the
360 :     training contigs if a I<-trainingContigs> parameter is specified). If this
361 : parrello 1.19 parameter is omitted, then the default GLIMMER training algorithm will be used.
362 : parrello 1.15
363 :     =item -trainingContigs (optional)
364 :    
365 :     The contigs in which the I<-trainingLocations> can be found. This may take one of
366 :     two forms: (1) a file handle that is open for reading from a file of DNA
367 :     sequences in FASTA format, or (2) a reference to a list of contig data entries.
368 :     Each entry is a triple of strings [contig-id, comment, contig-sequence-data].
369 : olson 1.6
370 : parrello 1.15 =item -minContigLen (optional)
371 : olson 1.6
372 : parrello 1.15 Shortest-length contig considered to be valid. This is used to prevent attempting
373 :     to call genes in contigs too short to have any complete ones. The default is C<2000>.
374 : olson 1.1
375 : parrello 1.15 =item -geneticCode (optional)
376 : olson 1.1
377 :     The numeric code for the mapping from DNA to amino acids. The default is C<11>,
378 :     which is the standard mapping and should be used in almost all cases. A complete
379 :     list of mapping codes can be found at
380 :     L<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>.
381 :    
382 : olson 1.2 =back
383 :    
384 : olson 1.1 =item RETURN
385 :    
386 : overbeek 1.7 Returns a 2-tuple consisting of 1) a string containing what would normally
387 :     be the contents of an entire FASTA file for all the proteins found
388 :     followed by 2) a reference to a list of genes found. Each gene found will be
389 : olson 1.1 represented by a 4-tuple containing an ID for the gene, the ID of the contig
390 : overbeek 1.7 containing it, the starting offset, and the ending offset.
391 : olson 1.1
392 :     =back
393 :    
394 :     =cut
395 :    
396 : parrello 1.15 use constant CALL_GENES_FILE_PARMS => { -input => 'id_seq', -trainingContigs => 'train_seq' };
397 :    
398 : olson 1.1 sub call_genes
399 :     {
400 : olson 1.3 my($self, $args) = _handle_args(@_);
401 :    
402 : parrello 1.15 my $params = {
403 :     -geneticCode => ($args->{-geneticCode} || 11),
404 :     -minContigLen => ($args->{-minContigLen} || 2000),
405 :     -verbose => ($args->{-verbose} || 0),
406 :     };
407 : olson 1.3
408 : parrello 1.15 for my $fileKey (keys %{CALL_GENES_FILE_PARMS()}) {
409 :     my $fileData = $args->{$fileKey};
410 :     if (defined $fileData) {
411 :     my $input = [];
412 :     if (ref($fileData) eq 'ARRAY') {
413 :     $input = $fileData;
414 :     } else {
415 :     my $fh;
416 :     if (ref($fileData)) {
417 :     $fh = $fileData;
418 :     } else {
419 :     my $fasta_file = $fileData;
420 :     open($fh, "<", $fasta_file);
421 :     }
422 :     while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh)) {
423 :     push(@$input, "$id,$$seqp");
424 :     }
425 :     close($fh);
426 :     }
427 :     $params->{CALL_GENES_FILE_PARMS->{$fileKey}} = $input;
428 :     }
429 :     }
430 :     my $trainingData = $args->{-trainingLocations};
431 :     if (ref $trainingData eq 'HASH') {
432 :     $params->{training_set} = [ map { $_, $trainingData->{$_} } keys %$trainingData ];
433 : olson 1.1 }
434 : parrello 1.15 my $parmList = [function => "call_genes", %$params];
435 :     return $self->run_query_form($parmList);
436 : olson 1.1 }
437 :    
438 :     =head3 find_rnas
439 :    
440 : olson 1.4 my $document = $annoObject->find_rnas($args)
441 : olson 1.1
442 : overbeek 1.7 Call the RNAs for the specified DNA sequences.
443 : olson 1.1
444 : olson 1.2 =over 4
445 :    
446 : olson 1.1 =item args
447 :    
448 :     Reference to a hash containing the parameters. The allowable parameter fields
449 :     are as follows.
450 :    
451 :     =over 8
452 :    
453 :     =item -input
454 :    
455 :     Open input handle to a file containing the DNA sequences in FASTA format.
456 :    
457 :     =item -genus
458 :    
459 :     Common name of the genus for this DNA.
460 :    
461 :     =item -species
462 :    
463 :     Common name of the species for this DNA.
464 :    
465 :     =item -domain
466 :    
467 :     Domain of this DNA. The default is C<Bacteria>.
468 :    
469 : olson 1.2 =back
470 :    
471 : olson 1.1 =item RETURN
472 :    
473 : overbeek 1.7 Returns a 2-tuple consisting of 1) a string containing what would normally
474 : parrello 1.19 be the contents of an entire FASTA file for all the RNA genes found
475 :     followed by 2) a reference to a list of RNA genes found. Each gene found will be
476 : overbeek 1.7 represented by a 5-tuple containing an ID for the gene, the ID of the contig
477 :     containing it, the starting offset, the ending offset, and the
478 :     name of the RNA found.
479 : olson 1.1
480 :     =back
481 :    
482 :     =cut
483 :    
484 :     sub find_rnas
485 :     {
486 : olson 1.3 my($self, $args) = _handle_args(@_);
487 :    
488 :     my $input = delete $args->{-input};
489 :    
490 :     my $params = [ map { $_ => $args->{$_} } keys %$args ];
491 : olson 1.1
492 :     if (ref($input) ne 'ARRAY')
493 :     {
494 :     my $fh;
495 :     if (ref($input))
496 :     {
497 :     $fh = $input;
498 :     }
499 :     else
500 :     {
501 :     my $fasta_file = $input;
502 :     open($fh, "<", $fasta_file);
503 :     }
504 :     $input = [];
505 :     while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh))
506 :     {
507 :     push(@$input, "$id,$$seqp");
508 :     }
509 :     close($fh);
510 :     }
511 :    
512 :     return $self->run_query_form([function => "find_rnas",
513 : olson 1.3 @$params,
514 : olson 1.1 id_seq => $input]);
515 :     }
516 :    
517 : parrello 1.20 =head3 assign_functions_to_dna
518 : olson 1.1
519 : parrello 1.20 my $result = $annoObject->assign_functions_to_dna($args)
520 : olson 1.1
521 :     Analyze DNA sequences and output regions that probably belong to FIGfams.
522 :     The selected regions will be high-probability candidates for protein
523 : overbeek 1.7 encoding sequences.
524 : olson 1.1
525 : olson 1.2 =over 4
526 :    
527 : olson 1.1 =item args
528 :    
529 :     Reference to a hash containing the parameters. The allowable parameter fields
530 :     are as follows.
531 :    
532 :     =over 8
533 :    
534 :     =item -input
535 :    
536 : olson 1.6 The sequences to be analyzed. This may take one of two forms:
537 :    
538 :     1. An file handle that is open for reading from a file of DNA sequences in FASTA format, or
539 :    
540 :     2. A reference to a list of sequence data entries. Each entry is a triple of strings
541 :     [sequence-id, comment, dna-sequence-data].
542 : olson 1.1
543 :     =item -kmer
544 :    
545 : olson 1.5 Specify the kmer size to use for analysis (valid sizes are 7 - 12).
546 : olson 1.1
547 :     =item -minHits
548 :    
549 :     A number from 1 to 10, indicating the minimum number of matches required to
550 :     consider a protein as a candidate for assignment to a FIGfam. A higher value
551 :     indicates a more reliable matching algorithm; the default is C<3>.
552 :    
553 :     =item -maxGap
554 :    
555 :     When looking for a match, if two sequence elements match and are closer than
556 :     this distance, then they will be considered part of a single match. Otherwise,
557 :     the match will be split. The default is C<600>.
558 :    
559 : olson 1.2 =back
560 :    
561 : olson 1.1 =item RETURN
562 :    
563 : olson 1.9 Returns a Result Handle. Call C<get_next> on the result handle to get back a data
564 :     item. Each item sent back by the result handle is a 2-tuple containing the
565 : olson 1.1 incoming protein sequence and a reference to a list of hit regions. Each hit
566 : overbeek 1.7 region is a 5-tuple consisting of the number of matches to the function, the start
567 :     location, the stop location, the proposed function, and the name of the
568 : olson 1.2 Genome Set from which the gene is likely to have originated.
569 : olson 1.1
570 :     =back
571 :    
572 :     =cut
573 :    
574 :     sub assign_functions_to_dna
575 :     {
576 : olson 1.3 my($self, $args) = _handle_args(@_);
577 : olson 1.1 my $wq;
578 :    
579 : olson 1.3 my $input = delete $args->{-input};
580 :    
581 :     my $params = [ map { $_ => $args->{$_} } keys %$args ];
582 :    
583 : olson 1.1 if (ref($input) eq 'ARRAY')
584 :     {
585 :     $wq = SequenceListWorkQueue->new($input);
586 :     }
587 :     else
588 :     {
589 :     $wq = FastaWorkQueue->new($input);
590 :     }
591 :    
592 : olson 1.3 my $req_bytes = 500_000;
593 :    
594 : olson 1.1 return ResultHandler->new($wq, $self->{server_url}, 'assign_functions_to_DNA',
595 :     \&id_seq_pair_bundler,
596 :     \&tab_delimited_output_parser, $params, $req_bytes);
597 :     }
598 :    
599 :     ###### Utility Methods ######
600 :    
601 :     sub run_query
602 :     {
603 :     my($self, $function, @args ) = @_;
604 :     my $form = [function => $function,
605 :     args => YAML::Dump(\@args),
606 :     ];
607 :     return $self->run_query_form($form);
608 :     }
609 :    
610 :     sub run_query_form
611 :     {
612 :     my($self, $form, $raw) = @_;
613 :    
614 :     my $res = $self->{ua}->post($self->{server_url}, $form);
615 :    
616 :     if ($res->is_success)
617 :     {
618 :     my $content = $res->content;
619 :     if ($raw)
620 :     {
621 :     return $content;
622 :     }
623 :    
624 :     # print "Got $content\n";
625 :     my $ret;
626 :     eval {
627 :     $ret = Load($content);
628 :     };
629 :     if ($@)
630 :     {
631 :     die "Query returned unparsable content ($@): " . $content;
632 :     }
633 :     return $ret;
634 :     }
635 :     else
636 :     {
637 :     die "error on post " . $res->status_line . " " . $res->content;
638 :     }
639 :     }
640 :    
641 :     sub id_seq_pair_bundler
642 :     {
643 :     my($item) = @_;
644 :     my($id, $seq) = @$item[0,2];
645 :     return "id_seq", join(",", $id, (ref($seq) eq 'SCALAR' ? $$seq : $seq));
646 :     }
647 :    
648 :     sub tab_delimited_output_parser
649 :     {
650 :     my($line) = @_;
651 :     chomp $line;
652 :     my @cols = split(/\t/, $line);
653 :     return \@cols;
654 :     }
655 :    
656 :    
657 :     sub tab_delimited_dna_data_output_parser
658 :     {
659 :     my($line) = @_;
660 :     chomp $line;
661 :     my ($id, $idbe, $fam) = split(/\t/, $line);
662 :     my ($beg, $end) = $idbe =~ /_(\d+)_(\d+)$/;
663 :     return [$id, $beg, $end, $fam];
664 :     }
665 :    
666 :    
667 :     #
668 :     # Turn an argument list into a $self ref and an argument hash.
669 :     # Code lifted from ClientThing.
670 :     #
671 :     sub _handle_args
672 :     {
673 :     my $self = shift;
674 :     my $args = $_[0];
675 :     if (defined $args)
676 :     {
677 :     if (scalar @_ gt 1)
678 :     {
679 :     # Here we have multiple arguments. We check the first one for a
680 :     # leading hyphen.
681 :     if ($args =~ /^-/) {
682 :     # This means we have hash-form parameters.
683 :     my %args = @_;
684 :     $args = \%args;
685 :     } else {
686 :     # This means we have list-form parameters.
687 :     my @args = @_;
688 :     $args = \@args;
689 :     }
690 :     } else {
691 :     # Here we have a single argument. If it's a scalar, we convert it
692 :     # to a singleton list.
693 :     if (! ref $args) {
694 :     $args = [$args];
695 :     }
696 :     }
697 :     }
698 :     return($self, $args);
699 :     }
700 :    
701 :    
702 :    
703 :    
704 :     package ResultHandler;
705 :     use strict;
706 :     use Data::Dumper;
707 :    
708 :     sub new
709 :     {
710 :     my($class, $work_queue, $server_url, $function, $input_bundler, $output_parser, $form_vars, $req_bytes) = @_;
711 :    
712 :     my $self = {
713 :     work_queue => $work_queue,
714 :     server_url => $server_url,
715 :     function => $function,
716 :     input_bundler => $input_bundler,
717 :     output_parser => $output_parser,
718 :     ua => LWP::UserAgent->new(),
719 :     cur_result => undef,
720 :     form_vars => $form_vars ? $form_vars : [],
721 :     req_bytes => ($req_bytes ? $req_bytes : 16000),
722 :     };
723 :     $self->{ua}->timeout(20 * 60);
724 :     return bless $self, $class;
725 :     }
726 :    
727 :     sub get_next
728 :     {
729 :     my($self) = @_;
730 :    
731 :     my $res = $self->get_next_from_result();
732 :     # print "gnfr returns: " , Dumper($res);
733 :    
734 :     if ($res)
735 :     {
736 :     return $res;
737 :     }
738 :     else
739 :     {
740 :    
741 :     while (my @inp = $self->{work_queue}->get_next_n_bytes($self->{req_bytes}))
742 :     {
743 :     my $form = [@{$self->{form_vars}}];
744 :     push(@$form, function => $self->{function},
745 :     map { &{$self->{input_bundler}}($_) } @inp);
746 :     # print "Invoke " .Dumper($form);
747 :    
748 :     my $res = $self->{ua}->post($self->{server_url}, $form);
749 :     if ($res->is_success)
750 :     {
751 :     eval {
752 :     $self->{cur_result} = [YAML::Load($res->content)];
753 :     };
754 :     if ($@)
755 :     {
756 :     die "Query returned unparsable content ($@): " . $res->content;
757 :     }
758 :     # print "res: " . Dumper($self->{cur_result});
759 :     my $oneres = $self->get_next_from_result();
760 :     if ($oneres)
761 :     {
762 :     return $oneres;
763 :     }
764 :     }
765 :     else
766 :     {
767 :     die "error " . $res->status_line . " on post " . $res->content;
768 :     }
769 :     }
770 :     return;
771 :     }
772 :     }
773 :    
774 :     sub get_next_from_result
775 :     {
776 :     my($self) = @_;
777 :     my $l = $self->{cur_result};
778 :     if ($l and @$l)
779 :     {
780 :     return shift(@$l);
781 :     }
782 :     else
783 :     {
784 :     delete $self->{cur_result};
785 :     return undef;
786 :     }
787 :     }
788 :    
789 :     package SequenceWorkQueue;
790 :     use strict;
791 :    
792 :     sub new
793 :     {
794 :     my($class) = @_;
795 :    
796 :     my $self = {};
797 :    
798 :     return bless $self, $class;
799 :     }
800 :    
801 :     sub get_next_n
802 :     {
803 :     my($self, $n) = @_;
804 :     my @out;
805 :    
806 :     for (my $i = 0;$i < $n; $i++)
807 :     {
808 :     my($id, $com, $seqp) = $self->get_next();
809 :     if (defined($id))
810 :     {
811 :     push(@out, [$id, $com, $seqp]);
812 :     }
813 :     else
814 :     {
815 :     last;
816 :     }
817 :     }
818 :     return @out;
819 :     }
820 :    
821 :     sub get_next_n_bytes
822 :     {
823 :     my($self, $n) = @_;
824 :     my @out;
825 :    
826 :     my $size = 0;
827 :     while ($size < $n)
828 :     {
829 :     my($id, $com, $seqp) = $self->get_next();
830 :     if (defined($id))
831 :     {
832 :     push(@out, [$id, $com, $seqp]);
833 :     $size += (ref($seqp) eq 'SCALAR') ? length($$seqp) : length($seqp);
834 :     }
835 :     else
836 :     {
837 :     last;
838 :     }
839 :     }
840 :     return @out;
841 :     }
842 :    
843 :     package FastaWorkQueue;
844 :     use strict;
845 :     use base 'SequenceWorkQueue';
846 :     use FileHandle;
847 :    
848 :     sub new
849 :     {
850 :     my($class, $input) = @_;
851 :    
852 :     my $fh;
853 :     if (ref($input))
854 :     {
855 :     $fh = $input;
856 :     }
857 :     else
858 :     {
859 :     $fh = new FileHandle("<$input");
860 :     }
861 :    
862 :     my $self = $class->SUPER::new();
863 :    
864 :     $self->{fh} = $fh;
865 :    
866 :     return bless $self, $class;
867 :     }
868 :    
869 :     sub get_next
870 :     {
871 :     my($self) = @_;
872 :    
873 :     my($id, $seqp, $com) = read_fasta_record($self->{fh});
874 :     return defined($id) ? ($id, $com, $seqp) : ();
875 :     }
876 :    
877 :     sub read_fasta_record {
878 :     my ($file_handle) = @_;
879 :     my ($old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record);
880 :    
881 :     if (not defined($file_handle)) { $file_handle = \*STDIN; }
882 :    
883 :     $old_end_of_record = $/;
884 :     $/ = "\n>";
885 :    
886 :     if (defined($fasta_record = <$file_handle>)) {
887 :     chomp $fasta_record;
888 :     @lines = split( /\n/, $fasta_record );
889 :     $head = shift @lines;
890 :     $head =~ s/^>?//;
891 :     $head =~ m/^(\S+)/;
892 :     $seq_id = $1;
893 :     if ($head =~ m/^\S+\s+(.*)$/) { $comment = $1; } else { $comment = ""; }
894 :     $sequence = join( "", @lines );
895 :     @parsed_fasta_record = ( $seq_id, \$sequence, $comment );
896 :     } else {
897 :     @parsed_fasta_record = ();
898 :     }
899 :    
900 :     $/ = $old_end_of_record;
901 :    
902 :     return @parsed_fasta_record;
903 :     }
904 :    
905 :     package SequenceListWorkQueue;
906 :     use strict;
907 :     use base 'SequenceWorkQueue';
908 :    
909 :     sub new
910 :     {
911 :     my($class, $input) = @_;
912 :    
913 :     my $fh;
914 :     if (ref($input) ne 'ARRAY')
915 :     {
916 :     die "SequenceWorkQueue requires a list as input";
917 :     }
918 :    
919 :     my $self = $class->SUPER::new();
920 :    
921 :     $self->{list} = $input;
922 :    
923 :     return bless $self, $class;
924 :     }
925 :    
926 :     sub get_next
927 :     {
928 :     my($self) = @_;
929 :    
930 :     my $top = shift @{$self->{list}};
931 :    
932 :     return defined($top) ? @$top : ();
933 :     }
934 :    
935 :    
936 :     1;
937 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3