[Bio] / FigKernelPackages / FFserver.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/FFserver.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.21 - (view) (download) (as text)

1 : olson 1.1 package FFserver;
2 :    
3 : olson 1.14 #
4 :     # This is a SAS Component
5 :     #
6 :    
7 : parrello 1.20 =head1 FIGfam Server Helper Object
8 :    
9 :     This module is used to call the FIGfam server, which is a general-purpose
10 :     server for extracting data from the FIGfams database. Each FIGfam server
11 :     function correspond to a method of this object.
12 :    
13 :     This package deliberately uses no internal SEED packages or scripts, only common
14 :     PERL modules.
15 :    
16 :     =cut
17 :    
18 : olson 1.1 use LWP::UserAgent;
19 :     use Data::Dumper;
20 : olson 1.3 use YAML;
21 : olson 1.1
22 :     use strict;
23 :    
24 :     sub new
25 :     {
26 :     my($class, $server_url) = @_;
27 :    
28 : olson 1.3 $server_url = "http://servers.nmpdr.org/figfam/server.cgi" unless $server_url;
29 : olson 1.1
30 :    
31 :     my $self = {
32 :     server_url => $server_url,
33 :     ua => LWP::UserAgent->new(),
34 :     };
35 : olson 1.14 $self->{ua}->timeout(20 * 60);
36 : olson 1.1
37 :     return bless $self, $class;
38 :     }
39 :    
40 : parrello 1.20 =head2 Functions
41 :    
42 :     =head3 members_of_families
43 :    
44 :     my $document = $ffObject->members_of_families(@ids);
45 :    
46 :     Return the function and a list of the members for each specified family.
47 :    
48 :     =over 4
49 :    
50 :     =item ids
51 :    
52 :     A list of FIGfam IDs.
53 :    
54 :     =item RETURN
55 :    
56 :     Returns a reference to a list of 3-tuples. Each 3-tuple will consist of a FIGfam
57 :     family ID followed by the family's function and a sub-list of all the FIG feature
58 :     IDs for the features in the family.
59 :    
60 :     =back
61 :    
62 :     =cut
63 :    
64 : olson 1.3 sub members_of_families
65 :     {
66 :     my($self, @ids) = @_;
67 : olson 1.4 return $self->run_query('members_of_families', @ids);
68 :     }
69 :    
70 : parrello 1.20 =head3 families_containing_peg
71 :    
72 :     my $document = $ffObject->families_containing_peg(@ids);
73 :    
74 :     Return the FIGfams containing the specified features.
75 :    
76 :     =over 4
77 :    
78 :     =item ids
79 :    
80 :     A list of FIG feature IDs.
81 :    
82 :     =item RETURN
83 :    
84 :     Returns a list of 2-tuples, each consisting of an incoming feature ID
85 :     followed by a list of FIGfam IDs for the families containing the incoming
86 :     feature.
87 :    
88 :     =back
89 :    
90 :     =cut
91 :    
92 : olson 1.6 sub families_containing_peg
93 :     {
94 :     my($self, @ids) = @_;
95 :     return $self->run_query('families_containing_peg', @ids);
96 :     }
97 :    
98 : parrello 1.20 =head3 function_of
99 :    
100 :     my $document = $ffObject->function_of(@ids);
101 :    
102 :     Return the current functional assignment for each specified feature.
103 :    
104 :     =over 4
105 :    
106 :     =item ids
107 :    
108 :     A list of FIG feature IDs.
109 :    
110 :     =item RETURN
111 :    
112 :     Returns a list of 2-tuples, each consisting of an incoming feature ID
113 :     followed by the feature's functional assignment, or an undefined value
114 :     if the feature is not found in the FIGfam database.
115 :    
116 :     =back
117 :    
118 :     =cut
119 :    
120 : arodri7 1.9 sub function_of
121 :     {
122 :     my($self, @ids) = @_;
123 :     return $self->run_query('function_of', @ids);
124 :     }
125 :    
126 : parrello 1.20 =head3 org_of
127 :    
128 :     my $document = $ffObject->org_of(@ids);
129 :    
130 :     Return the name of the genome containing each specified feature.
131 :    
132 :     =over 4
133 :    
134 :     =item ids
135 :    
136 :     A list of FIG feature IDs.
137 :    
138 :     =item RETURN
139 :    
140 :     Returns a list of 2-tuples, each consisting of an incoming feature ID
141 :     followed by the name of the genome containing the feature, or an undefined value
142 :     if the feature is not found in the FIGfam database.
143 :    
144 :     =back
145 :    
146 :     =cut
147 :    
148 : arodri7 1.9 sub org_of
149 :     {
150 :     my($self, @ids) = @_;
151 :     return $self->run_query('org_of', @ids);
152 :     }
153 :    
154 : parrello 1.20 =head3 seq_of
155 :    
156 :     my $document = $ffObject->seq_of(@ids);
157 :    
158 :     Return the protein sequence for each specified feature.
159 :    
160 :     =over 4
161 :    
162 :     =item ids
163 :    
164 :     A list of FIG feature IDs.
165 :    
166 :     =item RETURN
167 :    
168 :     Returns a list of 2-tuples, each consisting of an incoming feature ID
169 :     followed by the protein sequence for the feature, or an undefined value
170 :     if the feature is not found in the FIGfam database.
171 :    
172 :     =back
173 :    
174 :     =cut
175 :    
176 : arodri7 1.9 sub seq_of
177 :     {
178 :     my($self, @ids) = @_;
179 :     return $self->run_query('seq_of', @ids);
180 :     }
181 :    
182 : parrello 1.20 =head3 aliases_of
183 :    
184 :     my $document = $ffObject->aliases_of(@ids);
185 :    
186 :     Return the aliases for each specified feature.
187 :    
188 :     =over 4
189 :    
190 :     =item ids
191 :    
192 :     A list of FIG feature IDs.
193 :    
194 :     =item RETURN
195 :    
196 :     Returns a list of 2-tuples, each consisting of an incoming feature ID
197 :     followed by a comma-delimited list of the aliases for the feature, or an
198 :     undefined value if the feature is not found in the FIGfam database.
199 :    
200 :     =back
201 :    
202 :     =cut
203 :    
204 : arodri7 1.9 sub aliases_of
205 :     {
206 :     my($self, @ids) = @_;
207 :     return $self->run_query('aliases_of', @ids);
208 :     }
209 :    
210 : parrello 1.20 =head3 families_implementing_role
211 :    
212 :     my $document = $ffObject->families_implementing_role(@roles);
213 :    
214 :     Return the FIGfams that implement the specified roles. Each FIGfam has
215 :     a single function associated with it, but the function may involve
216 :     multiple roles, or may include comments. The role is therefore a more
217 :     compact string than the function.
218 :    
219 :     =over 4
220 :    
221 :     =item roles
222 :    
223 :     A list of role names.
224 :    
225 :     =item RETURN
226 :    
227 :     Returns a list of 2-tuples, each consisting of an incoming role name
228 :     followed by a list of FIGfam IDs for the families that implement the
229 :     incoming role.
230 :    
231 :     =back
232 :    
233 :     =cut
234 :    
235 : arodri7 1.9 sub families_implementing_role
236 :     {
237 :     my($self,@roles) = @_;
238 :     return $self->run_query('families_implementing_role', @roles);
239 :     }
240 :    
241 : parrello 1.20 =head3 families_with_function
242 :    
243 :     my $document = $ffObject->families_with_function(@functions);
244 :    
245 :     Return the FIGfams that belong to the specified functions. Each FIGfam has
246 :     a single function associated with it, but the function may involve
247 :     multiple roles, or may include comments. The function is therefore a
248 :     more specific string than the role.
249 :    
250 :     =over 4
251 :    
252 :     =item functions
253 :    
254 :     A list of functional roles.
255 :    
256 :     =item RETURN
257 :    
258 :     Returns a list of 2-tuples, each consisting of an incoming role name
259 :     followed by a list of FIGfam IDs for the families associated with the
260 :     incoming function.
261 :    
262 :     =back
263 :    
264 :     =cut
265 :    
266 : arodri7 1.9 sub families_with_function
267 :     {
268 :     my($self,@functions) = @_;
269 :     return $self->run_query('families_with_function', @functions);
270 :     }
271 :    
272 : parrello 1.20 =head3 families_in_genome
273 :    
274 :     my $document = $ffObject->families_in_genome(@genomes);
275 :    
276 :     Return the FIGfams that have members in the specified genomes.
277 :    
278 :     =over 4
279 :    
280 :     =item genomes
281 :    
282 :     A list of genome IDs.
283 :    
284 :     =item RETURN
285 :    
286 :     Returns a list of 2-tuples, each consisting of an incoming genome ID
287 :     followed by a list of FIGfam IDs for the families that have members in
288 :     that genome.
289 :    
290 :     =back
291 :    
292 :     =cut
293 :    
294 : arodri7 1.9 sub families_in_genome
295 :     {
296 :     my($self,@genomes) = @_;
297 :     return $self->run_query('families_in_genome', @genomes);
298 :     }
299 :    
300 : parrello 1.20 =head3 get_subsystem_based_figfams
301 :    
302 :     my $document = $ffObject->get_subsystem_based_figfams();
303 :    
304 :     Return a list of the FIGfams derived from subsystems.
305 :    
306 :     =over 4
307 :    
308 :     =item RETURN
309 :    
310 :     Returns a reference to a list of the IDs for the FIGfams derived from subsystems.
311 :    
312 :     =back
313 :    
314 :     =cut
315 :    
316 : arodri7 1.9 sub get_subsystem_based_figfams
317 :     {
318 :     my ($self) = @_;
319 :     return $self->run_query('get_subsystem_based_figfams');
320 :     }
321 :    
322 : disz 1.21 ##=head3 should_be_member
323 :     ##
324 :     ## my $document = $ffObject->should_be_member(@id_seq_pairs);
325 :     ##
326 :     ##Determine whether a particular protein sequence belongs in a particular
327 :     ##FIGfam. This method takes as input multiple FIGfam/sequence pairs and
328 :     ##performs a determination for each.
329 :     ##
330 :     ##=over 4
331 :     ##
332 :     ##=item id_seq_pairs
333 :     ##
334 :     ##A list of 2-tuples, each consisting of a FIGfam ID followed
335 :     ##by a protein sequence string.
336 :     ##
337 :     ##=item RETURN
338 :     ##
339 :     ##Returns a reference to a list of boolean flags, one per input pair. For each
340 :     ##input pair, the flag will be C<1> if the sequence should be in the FIGfam and
341 :     ##C<0> otherwise.
342 :     ##
343 :     ##=back
344 :     ##
345 :     ##=cut
346 :     ##
347 :     ##sub should_be_member
348 :     ##{
349 :     ## my($self, @id_seq_pairs) = @_;
350 :     ## return $self->run_query('should_be_member', @id_seq_pairs);
351 :     ##}
352 : olson 1.4
353 : parrello 1.20 =head3 all_families
354 :    
355 :     my $document = $ffObject->all_families();
356 :    
357 :     Return a list of the IDs for all the FIGfams in the system.
358 :    
359 :     =over 4
360 :    
361 :     =item RETURN
362 :    
363 :     Returns a reference to a list of the IDs for all the FIGfams in the system.
364 :    
365 :     =back
366 :    
367 :     =cut
368 :    
369 : olson 1.4 sub all_families
370 :     {
371 :     my($self) = @_;
372 :     return $self->run_query('all_families');
373 :     }
374 :    
375 : parrello 1.20 =head3 assign_function_to_prot
376 :    
377 :     my $document = $ffObject->assign_function_to_prot($input, $blast, $min_hits, $assignToAll);
378 :    
379 :     For each incoming protein sequence, attempt to place it in a FIGfam. If a
380 :     suitable FIGfam can be found for a particular sequence, the FIGfam ID and
381 :     its functional assignment will be returned.
382 :    
383 :     =over 4
384 :    
385 :     =item input
386 :    
387 :     Either (1) an open input handle to a file containing the proteins in FASTA format,
388 :     or (2) a reference to a list of FASTA strings for the proteins.
389 :    
390 :     =item blast
391 :    
392 :     If nonzero, then when a protein is placed into a FIGfam, a BLAST will be performed
393 :     afterward, and the top I<N> hits (where I<N> is the value of this parameter)
394 :     will be returned as part of the protein's output tuple.
395 :    
396 :     =item min_hits
397 :    
398 :     A number from 1 to 10, indicating the minimum number of matches required to
399 :     consider a protein as a candidate for assignment to a FIGfam. A higher value
400 :     indicates a more reliable matching algorithm; the default is C<3>.
401 :    
402 :     =item assign_to_all
403 :    
404 :     If TRUE, then if the standard matching algorithm fails to assign a protein,
405 :     a BLAST will be used. The BLAST is slower, but is capable of placing more
406 :     proteins than the normal algorithm.
407 :    
408 :     =item RETURN
409 :    
410 :     Returns a Result Handler. Call C<get_next> on the result handler to get back a data
411 :     item. Each item sent back by the result handler is a 2-tuple containing the
412 :     incoming protein sequence and a reference to a list consisting of the proposed
413 :     functional assignment for the protein, the name of the Genome Set from which the
414 :     protein is likely to have originated (if known), a list of BLAST hits (if
415 :     requested), and the number of matches for the protein found in the FIGfam. If no
416 :     assignment could be made for a particular protein, it will not appear in the
417 :     output stream.
418 : olson 1.3
419 : parrello 1.20 =back
420 : olson 1.3
421 : parrello 1.20 =cut
422 : olson 1.3
423 : olson 1.1 sub assign_function_to_prot
424 :     {
425 : olson 1.15 my($self, $input, $blast, $min_hits, $assignToAll) = @_;
426 : olson 1.1
427 : olson 1.2 my $wq;
428 : olson 1.10
429 : olson 1.15 my $params = [blast => $blast, min_hits => $min_hits, assign_to_all => ($assignToAll ? 1 : 0)];
430 : olson 1.2
431 : olson 1.1 if (ref($input) eq 'ARRAY')
432 :     {
433 : olson 1.2 $wq = SequenceListWorkQueue->new($input);
434 : olson 1.1 }
435 :     else
436 :     {
437 : olson 1.2 $wq = FastaWorkQueue->new($input);
438 : olson 1.1 }
439 : olson 1.2
440 : olson 1.19 my $req_bytes = $blast ? 1000 : 1_000_000;
441 : olson 1.11
442 : olson 1.10 return ResultHandler->new($wq, $self->{server_url}, 'assign_function_to_prot', \&id_seq_pair_bundler,
443 :     #\&tab_delimited_output_parser,
444 :     \&YAML::Load,
445 : olson 1.11 $params, $req_bytes);
446 : olson 1.1 }
447 :    
448 : parrello 1.20 =head3 call_genes
449 :    
450 :     my $document = $ffObject->call_genes($input, $genetic_code);
451 :    
452 :     Call the protein-encoding genes for the specified DNA sequences. The result will
453 :     be a multi-sequence FASTA string listing all the proteins found and a hash mapping
454 :     each gene found to its location string.
455 :    
456 :     =over 4
457 :    
458 :     =item input
459 :    
460 :     Open input handle to a file containing the DNA sequences in FASTA format.
461 :    
462 :     =item genetic_code
463 :    
464 :     The numeric code for the mapping from DNA to amino acids. The default is C<11>,
465 :     which is the standard mapping and should be used in almost all cases. A complete
466 :     list of mapping codes can be found at
467 :     L<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>.
468 :    
469 :     =item RETURN
470 :    
471 :     Returns a 2-tuple consisting of a FASTA string for all the proteins found
472 :     followed by a reference to a list of genes found. Each gene found will be
473 :     represented by a 4-tuple containing an ID for the gene, the ID of the contig
474 :     containing it, the starting offset, and the ending offset.
475 :    
476 :     =back
477 :    
478 :     =cut
479 :    
480 : olson 1.16 sub call_genes
481 :     {
482 :     my($self, $input, $genetic_code) = @_;
483 :    
484 :     if (ref($input) ne 'ARRAY')
485 :     {
486 :     my $fh;
487 :     if (ref($input))
488 :     {
489 :     $fh = $input;
490 :     }
491 :     else
492 :     {
493 :     my $fasta_file = $input;
494 :     open($fh, "<", $fasta_file);
495 :     }
496 :     $input = [];
497 :     while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh))
498 :     {
499 :     push(@$input, "$id,$$seqp");
500 :     }
501 :     close($fh);
502 :     }
503 :    
504 :     return $self->run_query_form([function => "call_genes",
505 :     genetic_code => $genetic_code,
506 : olson 1.17 id_seq => $input]);
507 : olson 1.16 }
508 :    
509 : parrello 1.20 =head3 find_rnas
510 :    
511 :     my $document = $ffObject->find_rnas($input, $genus, $species, $domain);
512 :    
513 :     Call the RNAs for the specified DNA sequences. The result will be a
514 :     multi-sequence FASTA string listing all the RNAs found and a hash mapping
515 :     each RNA to its location string.
516 :    
517 :     =over 4
518 :    
519 :     =item input
520 :    
521 :     Open input handle to a file containing the DNA sequences in FASTA format.
522 :    
523 :     =item genus
524 :    
525 :     Common name of the genus for this DNA.
526 :    
527 :     =item species
528 :    
529 :     Common name of the species for this DNA.
530 :    
531 :     =item domain
532 :    
533 :     Domain of this DNA. The default is C<Bacteria>.
534 :    
535 :     =item RETURN
536 :    
537 :     Returns a 2-tuple consisting of a FASTA string for all the RNAs found
538 :     followed by reference to a list of RNAs found. Each RNA will be represented by
539 :     a 4-tuple consisting of an ID for the RNA, the ID of the contig containing it, its
540 :     starting offset, and its ending offset.
541 :    
542 :     =back
543 :    
544 :     =cut
545 :    
546 : olson 1.18 sub find_rnas
547 :     {
548 :     my($self, $input, $genus, $species, $domain) = @_;
549 :    
550 :     if (ref($input) ne 'ARRAY')
551 :     {
552 :     my $fh;
553 :     if (ref($input))
554 :     {
555 :     $fh = $input;
556 :     }
557 :     else
558 :     {
559 :     my $fasta_file = $input;
560 :     open($fh, "<", $fasta_file);
561 :     }
562 :     $input = [];
563 :     while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh))
564 :     {
565 :     push(@$input, "$id,$$seqp");
566 :     }
567 :     close($fh);
568 :     }
569 :    
570 :     return $self->run_query_form([function => "find_rnas",
571 :     genus => $genus,
572 :     species => $species,
573 :     domain => $domain,
574 :     id_seq => $input]);
575 :     }
576 :    
577 : parrello 1.20 =head3 assign_functions_to_DNA
578 :    
579 :     my $document = $ffObject->assign_functions_to_DNA($input, $blast, $min_hits, $max_gap);
580 :    
581 :     Analyze DNA sequences and output regions that probably belong to FIGfams.
582 :     The selected regions will be high-probability candidates for protein
583 :     production.
584 :    
585 :     =over 4
586 :    
587 :     =item input
588 :    
589 :     Either (1) an open input handle to a file containing the DNA sequences in FASTA format,
590 :     or (2) a reference to a list of FASTA strings for the DNA sequences.
591 :    
592 :     =item blast
593 :    
594 :     If nonzero, then when a protein is placed into a FIGfam, a BLAST will be performed
595 :     afterward, and the top I<N> hits (where I<N> is the value of this parameter)
596 :     will be returned as part of each protein's output tuple.
597 :    
598 :     =item min_hits
599 :    
600 :     A number from 1 to 10, indicating the minimum number of matches required to
601 :     consider a protein as a candidate for assignment to a FIGfam. A higher value
602 :     indicates a more reliable matching algorithm; the default is C<3>.
603 :    
604 :     =item max_gap
605 :    
606 :     When looking for a match, if two sequence elements match and are closer than
607 :     this distance, then they will be considered part of a single match. Otherwise,
608 :     the match will be split. The default is C<600>.
609 :    
610 :     =item RETURN
611 :    
612 :     Returns a Result Handler. Call C<get_next> on the result handler to get back a data
613 :     item. Each item sent back by the result handler is a 2-tuple containing the
614 :     incoming protein sequence and a reference to a list of hit regions. Each hit
615 :     region is a 6-tuple consisting of the number of matches to the FIGfam, the start
616 :     location, the stop location, the proposed functional assignment, the name of the
617 :     Genome Set from which the gene is likely to have originated, and a list of BLAST
618 :     hits. If the I<blast> option is not specified, the list of BLAST hits will be
619 :     empty.
620 :    
621 :     =back
622 :    
623 :     =cut
624 :    
625 : olson 1.2 sub assign_functions_to_dna
626 : olson 1.1 {
627 : olson 1.10 my($self, $input, $min_hits, $max_gap, $blast) = @_;
628 : olson 1.1
629 : olson 1.12 $min_hits = 3 unless defined($min_hits);
630 :     $max_gap = 600 unless defined($max_gap);
631 : olson 1.13 $blast = 0 unless defined($blast);
632 : olson 1.12
633 : olson 1.2 my $wq;
634 : olson 1.1
635 : olson 1.2 if (ref($input) eq 'ARRAY')
636 : olson 1.1 {
637 : olson 1.2 $wq = SequenceListWorkQueue->new($input);
638 : olson 1.1 }
639 :     else
640 :     {
641 : olson 1.2 $wq = FastaWorkQueue->new($input);
642 : olson 1.1 }
643 : olson 1.2
644 : olson 1.13 my $req_bytes = $blast ? 1000 : 500000;
645 : olson 1.10 my $params = [min_hits => $min_hits, max_gap => $max_gap, blast => $blast];
646 :     return ResultHandler->new($wq, $self->{server_url}, 'assign_functions_to_DNA',
647 :     \&id_seq_pair_bundler,
648 : olson 1.11 \&tab_delimited_output_parser, $params, $req_bytes);
649 : olson 1.2 }
650 :    
651 : parrello 1.20 ###### Utility Methods ######
652 :    
653 :     sub run_query
654 :     {
655 :     my($self, $function, @args ) = @_;
656 :     my $form = [function => $function,
657 :     args => YAML::Dump(\@args),
658 :     ];
659 :     return $self->run_query_form($form);
660 :     }
661 :    
662 :     sub run_query_form
663 :     {
664 :     my($self, $form, $raw) = @_;
665 :    
666 :     my $res = $self->{ua}->post($self->{server_url}, $form);
667 :    
668 :     if ($res->is_success)
669 :     {
670 :     my $content = $res->content;
671 :     if ($raw)
672 :     {
673 :     return $content;
674 :     }
675 :    
676 :     # print "Got $content\n";
677 :     my $ret;
678 :     eval {
679 :     $ret = Load($content);
680 :     };
681 :     if ($@)
682 :     {
683 :     die "Query returned unparsable content ($@): " . $content;
684 :     }
685 :     return $ret;
686 :     }
687 :     else
688 :     {
689 :     die "error on post " . $res->status_line . " " . $res->content;
690 :     }
691 :     }
692 :    
693 : olson 1.2 sub id_seq_pair_bundler
694 :     {
695 :     my($item) = @_;
696 :     my($id, $seq) = @$item[0,2];
697 :     return "id_seq", join(",", $id, (ref($seq) eq 'SCALAR' ? $$seq : $seq));
698 :     }
699 :    
700 :     sub tab_delimited_output_parser
701 :     {
702 :     my($line) = @_;
703 :     chomp $line;
704 :     my @cols = split(/\t/, $line);
705 :     return \@cols;
706 :     }
707 :    
708 :    
709 :     sub tab_delimited_dna_data_output_parser
710 :     {
711 :     my($line) = @_;
712 :     chomp $line;
713 :     my ($id, $idbe, $fam) = split(/\t/, $line);
714 :     my ($beg, $end) = $idbe =~ /_(\d+)_(\d+)$/;
715 :     return [$id, $beg, $end, $fam];
716 : olson 1.1 }
717 :    
718 : olson 1.2 package ResultHandler;
719 : olson 1.1 use strict;
720 : olson 1.2 use Data::Dumper;
721 : olson 1.1
722 :     sub new
723 :     {
724 : olson 1.11 my($class, $work_queue, $server_url, $function, $input_bundler, $output_parser, $form_vars, $req_bytes) = @_;
725 : olson 1.2
726 : olson 1.1 my $self = {
727 : olson 1.2 work_queue => $work_queue,
728 :     server_url => $server_url,
729 :     function => $function,
730 :     input_bundler => $input_bundler,
731 :     output_parser => $output_parser,
732 :     ua => LWP::UserAgent->new(),
733 :     cur_result => undef,
734 : olson 1.5 form_vars => $form_vars ? $form_vars : [],
735 : olson 1.13 req_bytes => ($req_bytes ? $req_bytes : 16000),
736 : olson 1.1 };
737 : olson 1.14 $self->{ua}->timeout(20 * 60);
738 : olson 1.1 return bless $self, $class;
739 :     }
740 :    
741 :     sub get_next
742 :     {
743 :     my($self) = @_;
744 :    
745 : olson 1.10 my $res = $self->get_next_from_result();
746 : olson 1.13 # print "gnfr returns: " , Dumper($res);
747 : olson 1.10
748 :     if ($res)
749 : olson 1.2 {
750 : olson 1.10 return $res;
751 : olson 1.2 }
752 :     else
753 :     {
754 : olson 1.13
755 :     while (my @inp = $self->{work_queue}->get_next_n_bytes($self->{req_bytes}))
756 : olson 1.2 {
757 : olson 1.5 my $form = [@{$self->{form_vars}}];
758 :     push(@$form, function => $self->{function},
759 :     map { &{$self->{input_bundler}}($_) } @inp);
760 : olson 1.8 # print "Invoke " .Dumper($form);
761 : olson 1.2
762 :     my $res = $self->{ua}->post($self->{server_url}, $form);
763 :     if ($res->is_success)
764 :     {
765 : olson 1.13 eval {
766 :     $self->{cur_result} = [YAML::Load($res->content)];
767 :     };
768 :     if ($@)
769 :     {
770 :     die "Query returned unparsable content ($@): " . $res->content;
771 :     }
772 :     # print "res: " . Dumper($self->{cur_result});
773 :     my $oneres = $self->get_next_from_result();
774 :     if ($oneres)
775 :     {
776 :     return $oneres;
777 :     }
778 : olson 1.2 }
779 :     else
780 :     {
781 : olson 1.3 die "error " . $res->status_line . " on post " . $res->content;
782 : olson 1.2 }
783 :     }
784 : olson 1.13 return;
785 : olson 1.2 }
786 :     }
787 :    
788 :     sub get_next_from_result
789 :     {
790 :     my($self) = @_;
791 : olson 1.10 my $l = $self->{cur_result};
792 :     if ($l and @$l)
793 :     {
794 :     return shift(@$l);
795 :     }
796 :     else
797 : olson 1.2 {
798 : olson 1.10 delete $self->{cur_result};
799 :     return undef;
800 : olson 1.2 }
801 :     }
802 :    
803 :     package SequenceWorkQueue;
804 :     use strict;
805 :    
806 :     sub new
807 :     {
808 :     my($class) = @_;
809 :    
810 :     my $self = {};
811 :    
812 :     return bless $self, $class;
813 :     }
814 : olson 1.1
815 : olson 1.2 sub get_next_n
816 :     {
817 :     my($self, $n) = @_;
818 :     my @out;
819 :    
820 :     for (my $i = 0;$i < $n; $i++)
821 : olson 1.1 {
822 : olson 1.2 my($id, $com, $seqp) = $self->get_next();
823 :     if (defined($id))
824 :     {
825 :     push(@out, [$id, $com, $seqp]);
826 :     }
827 :     else
828 :     {
829 :     last;
830 :     }
831 : olson 1.1 }
832 : olson 1.2 return @out;
833 :     }
834 :    
835 :     sub get_next_n_bytes
836 :     {
837 :     my($self, $n) = @_;
838 :     my @out;
839 :    
840 :     my $size = 0;
841 :     while ($size < $n)
842 : olson 1.1 {
843 : olson 1.2 my($id, $com, $seqp) = $self->get_next();
844 :     if (defined($id))
845 :     {
846 :     push(@out, [$id, $com, $seqp]);
847 :     $size += (ref($seqp) eq 'SCALAR') ? length($$seqp) : length($seqp);
848 :     }
849 :     else
850 :     {
851 :     last;
852 :     }
853 : olson 1.1 }
854 : olson 1.2 return @out;
855 : olson 1.1 }
856 :    
857 : olson 1.2 package FastaWorkQueue;
858 : olson 1.1 use strict;
859 : olson 1.2 use base 'SequenceWorkQueue';
860 : olson 1.1 use FileHandle;
861 :    
862 :     sub new
863 :     {
864 : olson 1.2 my($class, $input) = @_;
865 : olson 1.1
866 :     my $fh;
867 :     if (ref($input))
868 :     {
869 :     $fh = $input;
870 :     }
871 :     else
872 :     {
873 :     $fh = new FileHandle("<$input");
874 :     }
875 : olson 1.2
876 :     my $self = $class->SUPER::new();
877 :    
878 :     $self->{fh} = $fh;
879 :    
880 : olson 1.1 return bless $self, $class;
881 :     }
882 :    
883 :     sub get_next
884 :     {
885 :     my($self) = @_;
886 :    
887 : olson 1.14 my($id, $seqp, $com) = read_fasta_record($self->{fh});
888 : olson 1.2 return defined($id) ? ($id, $com, $seqp) : ();
889 :     }
890 :    
891 : olson 1.14 sub read_fasta_record {
892 :     my ($file_handle) = @_;
893 :     my ($old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record);
894 :    
895 :     if (not defined($file_handle)) { $file_handle = \*STDIN; }
896 :    
897 :     $old_end_of_record = $/;
898 :     $/ = "\n>";
899 :    
900 :     if (defined($fasta_record = <$file_handle>)) {
901 :     chomp $fasta_record;
902 :     @lines = split( /\n/, $fasta_record );
903 :     $head = shift @lines;
904 :     $head =~ s/^>?//;
905 :     $head =~ m/^(\S+)/;
906 :     $seq_id = $1;
907 :     if ($head =~ m/^\S+\s+(.*)$/) { $comment = $1; } else { $comment = ""; }
908 :     $sequence = join( "", @lines );
909 :     @parsed_fasta_record = ( $seq_id, \$sequence, $comment );
910 :     } else {
911 :     @parsed_fasta_record = ();
912 :     }
913 :    
914 :     $/ = $old_end_of_record;
915 :    
916 :     return @parsed_fasta_record;
917 :     }
918 :    
919 : olson 1.2 package SequenceListWorkQueue;
920 :     use strict;
921 :     use base 'SequenceWorkQueue';
922 : olson 1.1
923 : olson 1.2 sub new
924 :     {
925 :     my($class, $input) = @_;
926 :    
927 :     my $fh;
928 :     if (ref($input) ne 'ARRAY')
929 : olson 1.1 {
930 : olson 1.2 die "SequenceWorkQueue requires a list as input";
931 : olson 1.1 }
932 : olson 1.2
933 :     my $self = $class->SUPER::new();
934 :    
935 :     $self->{list} = $input;
936 :    
937 :     return bless $self, $class;
938 :     }
939 :    
940 :     sub get_next
941 :     {
942 :     my($self) = @_;
943 :    
944 :     my $top = shift @{$self->{list}};
945 :    
946 :     return defined($top) ? @$top : ();
947 : olson 1.1 }
948 : olson 1.2
949 :    
950 : olson 1.1 1;
951 : olson 1.2

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3