[Bio] / FigKernelPackages / FFserver.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/FFserver.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.22 - (view) (download) (as text)

1 : olson 1.1 package FFserver;
2 :    
3 : olson 1.14 #
4 :     # This is a SAS Component
5 :     #
6 :    
7 : parrello 1.20 =head1 FIGfam Server Helper Object
8 :    
9 :     This module is used to call the FIGfam server, which is a general-purpose
10 :     server for extracting data from the FIGfams database. Each FIGfam server
11 :     function correspond to a method of this object.
12 :    
13 :     This package deliberately uses no internal SEED packages or scripts, only common
14 :     PERL modules.
15 :    
16 :     =cut
17 :    
18 : olson 1.1 use LWP::UserAgent;
19 :     use Data::Dumper;
20 : olson 1.3 use YAML;
21 : olson 1.1
22 :     use strict;
23 :    
24 :     sub new
25 :     {
26 :     my($class, $server_url) = @_;
27 :    
28 : olson 1.3 $server_url = "http://servers.nmpdr.org/figfam/server.cgi" unless $server_url;
29 : olson 1.1
30 :    
31 :     my $self = {
32 :     server_url => $server_url,
33 :     ua => LWP::UserAgent->new(),
34 :     };
35 : olson 1.14 $self->{ua}->timeout(20 * 60);
36 : olson 1.1
37 :     return bless $self, $class;
38 :     }
39 :    
40 : parrello 1.20 =head2 Functions
41 :    
42 :     =head3 members_of_families
43 :    
44 :     my $document = $ffObject->members_of_families(@ids);
45 :    
46 :     Return the function and a list of the members for each specified family.
47 :    
48 :     =over 4
49 :    
50 :     =item ids
51 :    
52 :     A list of FIGfam IDs.
53 :    
54 :     =item RETURN
55 :    
56 :     Returns a reference to a list of 3-tuples. Each 3-tuple will consist of a FIGfam
57 :     family ID followed by the family's function and a sub-list of all the FIG feature
58 :     IDs for the features in the family.
59 :    
60 :     =back
61 :    
62 :     =cut
63 :    
64 : olson 1.3 sub members_of_families
65 :     {
66 :     my($self, @ids) = @_;
67 : olson 1.4 return $self->run_query('members_of_families', @ids);
68 :     }
69 :    
70 : parrello 1.20 =head3 families_containing_peg
71 :    
72 :     my $document = $ffObject->families_containing_peg(@ids);
73 :    
74 :     Return the FIGfams containing the specified features.
75 :    
76 :     =over 4
77 :    
78 :     =item ids
79 :    
80 :     A list of FIG feature IDs.
81 :    
82 :     =item RETURN
83 :    
84 :     Returns a list of 2-tuples, each consisting of an incoming feature ID
85 :     followed by a list of FIGfam IDs for the families containing the incoming
86 :     feature.
87 :    
88 :     =back
89 :    
90 :     =cut
91 :    
92 : olson 1.6 sub families_containing_peg
93 :     {
94 :     my($self, @ids) = @_;
95 :     return $self->run_query('families_containing_peg', @ids);
96 :     }
97 :    
98 : parrello 1.20 =head3 families_implementing_role
99 :    
100 :     my $document = $ffObject->families_implementing_role(@roles);
101 :    
102 :     Return the FIGfams that implement the specified roles. Each FIGfam has
103 :     a single function associated with it, but the function may involve
104 :     multiple roles, or may include comments. The role is therefore a more
105 :     compact string than the function.
106 :    
107 :     =over 4
108 :    
109 :     =item roles
110 :    
111 :     A list of role names.
112 :    
113 :     =item RETURN
114 :    
115 :     Returns a list of 2-tuples, each consisting of an incoming role name
116 :     followed by a list of FIGfam IDs for the families that implement the
117 :     incoming role.
118 :    
119 :     =back
120 :    
121 :     =cut
122 :    
123 : arodri7 1.9 sub families_implementing_role
124 :     {
125 :     my($self,@roles) = @_;
126 :     return $self->run_query('families_implementing_role', @roles);
127 :     }
128 :    
129 : parrello 1.20 =head3 families_with_function
130 :    
131 :     my $document = $ffObject->families_with_function(@functions);
132 :    
133 :     Return the FIGfams that belong to the specified functions. Each FIGfam has
134 :     a single function associated with it, but the function may involve
135 :     multiple roles, or may include comments. The function is therefore a
136 :     more specific string than the role.
137 :    
138 :     =over 4
139 :    
140 :     =item functions
141 :    
142 :     A list of functional roles.
143 :    
144 :     =item RETURN
145 :    
146 :     Returns a list of 2-tuples, each consisting of an incoming role name
147 :     followed by a list of FIGfam IDs for the families associated with the
148 :     incoming function.
149 :    
150 :     =back
151 :    
152 :     =cut
153 :    
154 : arodri7 1.9 sub families_with_function
155 :     {
156 :     my($self,@functions) = @_;
157 :     return $self->run_query('families_with_function', @functions);
158 :     }
159 :    
160 : parrello 1.20 =head3 families_in_genome
161 :    
162 :     my $document = $ffObject->families_in_genome(@genomes);
163 :    
164 :     Return the FIGfams that have members in the specified genomes.
165 :    
166 :     =over 4
167 :    
168 :     =item genomes
169 :    
170 :     A list of genome IDs.
171 :    
172 :     =item RETURN
173 :    
174 :     Returns a list of 2-tuples, each consisting of an incoming genome ID
175 :     followed by a list of FIGfam IDs for the families that have members in
176 :     that genome.
177 :    
178 :     =back
179 :    
180 :     =cut
181 :    
182 : arodri7 1.9 sub families_in_genome
183 :     {
184 :     my($self,@genomes) = @_;
185 :     return $self->run_query('families_in_genome', @genomes);
186 :     }
187 :    
188 : parrello 1.20 =head3 get_subsystem_based_figfams
189 :    
190 :     my $document = $ffObject->get_subsystem_based_figfams();
191 :    
192 :     Return a list of the FIGfams derived from subsystems.
193 :    
194 :     =over 4
195 :    
196 :     =item RETURN
197 :    
198 :     Returns a reference to a list of the IDs for the FIGfams derived from subsystems.
199 :    
200 :     =back
201 :    
202 :     =cut
203 :    
204 : arodri7 1.9 sub get_subsystem_based_figfams
205 :     {
206 :     my ($self) = @_;
207 :     return $self->run_query('get_subsystem_based_figfams');
208 :     }
209 :    
210 : disz 1.21 ##=head3 should_be_member
211 :     ##
212 :     ## my $document = $ffObject->should_be_member(@id_seq_pairs);
213 :     ##
214 :     ##Determine whether a particular protein sequence belongs in a particular
215 :     ##FIGfam. This method takes as input multiple FIGfam/sequence pairs and
216 :     ##performs a determination for each.
217 :     ##
218 :     ##=over 4
219 :     ##
220 :     ##=item id_seq_pairs
221 :     ##
222 :     ##A list of 2-tuples, each consisting of a FIGfam ID followed
223 :     ##by a protein sequence string.
224 :     ##
225 :     ##=item RETURN
226 :     ##
227 :     ##Returns a reference to a list of boolean flags, one per input pair. For each
228 :     ##input pair, the flag will be C<1> if the sequence should be in the FIGfam and
229 :     ##C<0> otherwise.
230 :     ##
231 :     ##=back
232 :     ##
233 :     ##=cut
234 :     ##
235 :     ##sub should_be_member
236 :     ##{
237 :     ## my($self, @id_seq_pairs) = @_;
238 :     ## return $self->run_query('should_be_member', @id_seq_pairs);
239 :     ##}
240 : olson 1.4
241 : parrello 1.20 =head3 all_families
242 :    
243 :     my $document = $ffObject->all_families();
244 :    
245 :     Return a list of the IDs for all the FIGfams in the system.
246 :    
247 :     =over 4
248 :    
249 :     =item RETURN
250 :    
251 :     Returns a reference to a list of the IDs for all the FIGfams in the system.
252 :    
253 :     =back
254 :    
255 :     =cut
256 :    
257 : olson 1.4 sub all_families
258 :     {
259 :     my($self) = @_;
260 :     return $self->run_query('all_families');
261 :     }
262 :    
263 : parrello 1.20 =head3 assign_function_to_prot
264 :    
265 :     my $document = $ffObject->assign_function_to_prot($input, $blast, $min_hits, $assignToAll);
266 :    
267 :     For each incoming protein sequence, attempt to place it in a FIGfam. If a
268 :     suitable FIGfam can be found for a particular sequence, the FIGfam ID and
269 :     its functional assignment will be returned.
270 :    
271 :     =over 4
272 :    
273 :     =item input
274 :    
275 :     Either (1) an open input handle to a file containing the proteins in FASTA format,
276 :     or (2) a reference to a list of FASTA strings for the proteins.
277 :    
278 :     =item blast
279 :    
280 :     If nonzero, then when a protein is placed into a FIGfam, a BLAST will be performed
281 :     afterward, and the top I<N> hits (where I<N> is the value of this parameter)
282 :     will be returned as part of the protein's output tuple.
283 :    
284 :     =item min_hits
285 :    
286 :     A number from 1 to 10, indicating the minimum number of matches required to
287 :     consider a protein as a candidate for assignment to a FIGfam. A higher value
288 :     indicates a more reliable matching algorithm; the default is C<3>.
289 :    
290 :     =item assign_to_all
291 :    
292 :     If TRUE, then if the standard matching algorithm fails to assign a protein,
293 :     a BLAST will be used. The BLAST is slower, but is capable of placing more
294 :     proteins than the normal algorithm.
295 :    
296 :     =item RETURN
297 :    
298 :     Returns a Result Handler. Call C<get_next> on the result handler to get back a data
299 :     item. Each item sent back by the result handler is a 2-tuple containing the
300 :     incoming protein sequence and a reference to a list consisting of the proposed
301 :     functional assignment for the protein, the name of the Genome Set from which the
302 :     protein is likely to have originated (if known), a list of BLAST hits (if
303 :     requested), and the number of matches for the protein found in the FIGfam. If no
304 :     assignment could be made for a particular protein, it will not appear in the
305 :     output stream.
306 : olson 1.3
307 : parrello 1.20 =back
308 : olson 1.3
309 : parrello 1.20 =cut
310 : olson 1.3
311 : olson 1.1 sub assign_function_to_prot
312 :     {
313 : olson 1.15 my($self, $input, $blast, $min_hits, $assignToAll) = @_;
314 : olson 1.1
315 : olson 1.2 my $wq;
316 : olson 1.10
317 : olson 1.15 my $params = [blast => $blast, min_hits => $min_hits, assign_to_all => ($assignToAll ? 1 : 0)];
318 : olson 1.2
319 : olson 1.1 if (ref($input) eq 'ARRAY')
320 :     {
321 : olson 1.2 $wq = SequenceListWorkQueue->new($input);
322 : olson 1.1 }
323 :     else
324 :     {
325 : olson 1.2 $wq = FastaWorkQueue->new($input);
326 : olson 1.1 }
327 : olson 1.2
328 : olson 1.19 my $req_bytes = $blast ? 1000 : 1_000_000;
329 : olson 1.11
330 : olson 1.10 return ResultHandler->new($wq, $self->{server_url}, 'assign_function_to_prot', \&id_seq_pair_bundler,
331 :     #\&tab_delimited_output_parser,
332 :     \&YAML::Load,
333 : olson 1.11 $params, $req_bytes);
334 : olson 1.1 }
335 :    
336 : parrello 1.20 =head3 call_genes
337 :    
338 :     my $document = $ffObject->call_genes($input, $genetic_code);
339 :    
340 :     Call the protein-encoding genes for the specified DNA sequences. The result will
341 :     be a multi-sequence FASTA string listing all the proteins found and a hash mapping
342 :     each gene found to its location string.
343 :    
344 :     =over 4
345 :    
346 :     =item input
347 :    
348 :     Open input handle to a file containing the DNA sequences in FASTA format.
349 :    
350 :     =item genetic_code
351 :    
352 :     The numeric code for the mapping from DNA to amino acids. The default is C<11>,
353 :     which is the standard mapping and should be used in almost all cases. A complete
354 :     list of mapping codes can be found at
355 :     L<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>.
356 :    
357 :     =item RETURN
358 :    
359 :     Returns a 2-tuple consisting of a FASTA string for all the proteins found
360 :     followed by a reference to a list of genes found. Each gene found will be
361 :     represented by a 4-tuple containing an ID for the gene, the ID of the contig
362 :     containing it, the starting offset, and the ending offset.
363 :    
364 :     =back
365 :    
366 :     =cut
367 :    
368 : olson 1.16 sub call_genes
369 :     {
370 :     my($self, $input, $genetic_code) = @_;
371 :    
372 :     if (ref($input) ne 'ARRAY')
373 :     {
374 :     my $fh;
375 :     if (ref($input))
376 :     {
377 :     $fh = $input;
378 :     }
379 :     else
380 :     {
381 :     my $fasta_file = $input;
382 :     open($fh, "<", $fasta_file);
383 :     }
384 :     $input = [];
385 :     while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh))
386 :     {
387 :     push(@$input, "$id,$$seqp");
388 :     }
389 :     close($fh);
390 :     }
391 :    
392 :     return $self->run_query_form([function => "call_genes",
393 :     genetic_code => $genetic_code,
394 : olson 1.17 id_seq => $input]);
395 : olson 1.16 }
396 :    
397 : parrello 1.20 =head3 find_rnas
398 :    
399 :     my $document = $ffObject->find_rnas($input, $genus, $species, $domain);
400 :    
401 :     Call the RNAs for the specified DNA sequences. The result will be a
402 :     multi-sequence FASTA string listing all the RNAs found and a hash mapping
403 :     each RNA to its location string.
404 :    
405 :     =over 4
406 :    
407 :     =item input
408 :    
409 :     Open input handle to a file containing the DNA sequences in FASTA format.
410 :    
411 :     =item genus
412 :    
413 :     Common name of the genus for this DNA.
414 :    
415 :     =item species
416 :    
417 :     Common name of the species for this DNA.
418 :    
419 :     =item domain
420 :    
421 :     Domain of this DNA. The default is C<Bacteria>.
422 :    
423 :     =item RETURN
424 :    
425 :     Returns a 2-tuple consisting of a FASTA string for all the RNAs found
426 :     followed by reference to a list of RNAs found. Each RNA will be represented by
427 :     a 4-tuple consisting of an ID for the RNA, the ID of the contig containing it, its
428 :     starting offset, and its ending offset.
429 :    
430 :     =back
431 :    
432 :     =cut
433 :    
434 : olson 1.18 sub find_rnas
435 :     {
436 :     my($self, $input, $genus, $species, $domain) = @_;
437 :    
438 :     if (ref($input) ne 'ARRAY')
439 :     {
440 :     my $fh;
441 :     if (ref($input))
442 :     {
443 :     $fh = $input;
444 :     }
445 :     else
446 :     {
447 :     my $fasta_file = $input;
448 :     open($fh, "<", $fasta_file);
449 :     }
450 :     $input = [];
451 :     while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh))
452 :     {
453 :     push(@$input, "$id,$$seqp");
454 :     }
455 :     close($fh);
456 :     }
457 :    
458 :     return $self->run_query_form([function => "find_rnas",
459 :     genus => $genus,
460 :     species => $species,
461 :     domain => $domain,
462 :     id_seq => $input]);
463 :     }
464 :    
465 : parrello 1.20 =head3 assign_functions_to_DNA
466 :    
467 :     my $document = $ffObject->assign_functions_to_DNA($input, $blast, $min_hits, $max_gap);
468 :    
469 :     Analyze DNA sequences and output regions that probably belong to FIGfams.
470 :     The selected regions will be high-probability candidates for protein
471 :     production.
472 :    
473 :     =over 4
474 :    
475 :     =item input
476 :    
477 :     Either (1) an open input handle to a file containing the DNA sequences in FASTA format,
478 :     or (2) a reference to a list of FASTA strings for the DNA sequences.
479 :    
480 :     =item blast
481 :    
482 :     If nonzero, then when a protein is placed into a FIGfam, a BLAST will be performed
483 :     afterward, and the top I<N> hits (where I<N> is the value of this parameter)
484 :     will be returned as part of each protein's output tuple.
485 :    
486 :     =item min_hits
487 :    
488 :     A number from 1 to 10, indicating the minimum number of matches required to
489 :     consider a protein as a candidate for assignment to a FIGfam. A higher value
490 :     indicates a more reliable matching algorithm; the default is C<3>.
491 :    
492 :     =item max_gap
493 :    
494 :     When looking for a match, if two sequence elements match and are closer than
495 :     this distance, then they will be considered part of a single match. Otherwise,
496 :     the match will be split. The default is C<600>.
497 :    
498 :     =item RETURN
499 :    
500 :     Returns a Result Handler. Call C<get_next> on the result handler to get back a data
501 :     item. Each item sent back by the result handler is a 2-tuple containing the
502 :     incoming protein sequence and a reference to a list of hit regions. Each hit
503 :     region is a 6-tuple consisting of the number of matches to the FIGfam, the start
504 :     location, the stop location, the proposed functional assignment, the name of the
505 :     Genome Set from which the gene is likely to have originated, and a list of BLAST
506 :     hits. If the I<blast> option is not specified, the list of BLAST hits will be
507 :     empty.
508 :    
509 :     =back
510 :    
511 :     =cut
512 :    
513 : olson 1.2 sub assign_functions_to_dna
514 : olson 1.1 {
515 : olson 1.10 my($self, $input, $min_hits, $max_gap, $blast) = @_;
516 : olson 1.1
517 : olson 1.12 $min_hits = 3 unless defined($min_hits);
518 :     $max_gap = 600 unless defined($max_gap);
519 : olson 1.13 $blast = 0 unless defined($blast);
520 : olson 1.12
521 : olson 1.2 my $wq;
522 : olson 1.1
523 : olson 1.2 if (ref($input) eq 'ARRAY')
524 : olson 1.1 {
525 : olson 1.2 $wq = SequenceListWorkQueue->new($input);
526 : olson 1.1 }
527 :     else
528 :     {
529 : olson 1.2 $wq = FastaWorkQueue->new($input);
530 : olson 1.1 }
531 : olson 1.2
532 : olson 1.13 my $req_bytes = $blast ? 1000 : 500000;
533 : olson 1.10 my $params = [min_hits => $min_hits, max_gap => $max_gap, blast => $blast];
534 :     return ResultHandler->new($wq, $self->{server_url}, 'assign_functions_to_DNA',
535 :     \&id_seq_pair_bundler,
536 : olson 1.11 \&tab_delimited_output_parser, $params, $req_bytes);
537 : olson 1.2 }
538 :    
539 : parrello 1.20 ###### Utility Methods ######
540 :    
541 :     sub run_query
542 :     {
543 :     my($self, $function, @args ) = @_;
544 :     my $form = [function => $function,
545 :     args => YAML::Dump(\@args),
546 :     ];
547 :     return $self->run_query_form($form);
548 :     }
549 :    
550 :     sub run_query_form
551 :     {
552 :     my($self, $form, $raw) = @_;
553 :    
554 :     my $res = $self->{ua}->post($self->{server_url}, $form);
555 :    
556 :     if ($res->is_success)
557 :     {
558 :     my $content = $res->content;
559 :     if ($raw)
560 :     {
561 :     return $content;
562 :     }
563 :    
564 :     # print "Got $content\n";
565 :     my $ret;
566 :     eval {
567 :     $ret = Load($content);
568 :     };
569 :     if ($@)
570 :     {
571 :     die "Query returned unparsable content ($@): " . $content;
572 :     }
573 :     return $ret;
574 :     }
575 :     else
576 :     {
577 :     die "error on post " . $res->status_line . " " . $res->content;
578 :     }
579 :     }
580 :    
581 : olson 1.2 sub id_seq_pair_bundler
582 :     {
583 :     my($item) = @_;
584 :     my($id, $seq) = @$item[0,2];
585 :     return "id_seq", join(",", $id, (ref($seq) eq 'SCALAR' ? $$seq : $seq));
586 :     }
587 :    
588 :     sub tab_delimited_output_parser
589 :     {
590 :     my($line) = @_;
591 :     chomp $line;
592 :     my @cols = split(/\t/, $line);
593 :     return \@cols;
594 :     }
595 :    
596 :    
597 :     sub tab_delimited_dna_data_output_parser
598 :     {
599 :     my($line) = @_;
600 :     chomp $line;
601 :     my ($id, $idbe, $fam) = split(/\t/, $line);
602 :     my ($beg, $end) = $idbe =~ /_(\d+)_(\d+)$/;
603 :     return [$id, $beg, $end, $fam];
604 : olson 1.1 }
605 :    
606 : olson 1.2 package ResultHandler;
607 : olson 1.1 use strict;
608 : olson 1.2 use Data::Dumper;
609 : olson 1.1
610 :     sub new
611 :     {
612 : olson 1.11 my($class, $work_queue, $server_url, $function, $input_bundler, $output_parser, $form_vars, $req_bytes) = @_;
613 : olson 1.2
614 : olson 1.1 my $self = {
615 : olson 1.2 work_queue => $work_queue,
616 :     server_url => $server_url,
617 :     function => $function,
618 :     input_bundler => $input_bundler,
619 :     output_parser => $output_parser,
620 :     ua => LWP::UserAgent->new(),
621 :     cur_result => undef,
622 : olson 1.5 form_vars => $form_vars ? $form_vars : [],
623 : olson 1.13 req_bytes => ($req_bytes ? $req_bytes : 16000),
624 : olson 1.1 };
625 : olson 1.14 $self->{ua}->timeout(20 * 60);
626 : olson 1.1 return bless $self, $class;
627 :     }
628 :    
629 :     sub get_next
630 :     {
631 :     my($self) = @_;
632 :    
633 : olson 1.10 my $res = $self->get_next_from_result();
634 : olson 1.13 # print "gnfr returns: " , Dumper($res);
635 : olson 1.10
636 :     if ($res)
637 : olson 1.2 {
638 : olson 1.10 return $res;
639 : olson 1.2 }
640 :     else
641 :     {
642 : olson 1.13
643 :     while (my @inp = $self->{work_queue}->get_next_n_bytes($self->{req_bytes}))
644 : olson 1.2 {
645 : olson 1.5 my $form = [@{$self->{form_vars}}];
646 :     push(@$form, function => $self->{function},
647 :     map { &{$self->{input_bundler}}($_) } @inp);
648 : olson 1.8 # print "Invoke " .Dumper($form);
649 : olson 1.2
650 :     my $res = $self->{ua}->post($self->{server_url}, $form);
651 :     if ($res->is_success)
652 :     {
653 : olson 1.13 eval {
654 :     $self->{cur_result} = [YAML::Load($res->content)];
655 :     };
656 :     if ($@)
657 :     {
658 :     die "Query returned unparsable content ($@): " . $res->content;
659 :     }
660 :     # print "res: " . Dumper($self->{cur_result});
661 :     my $oneres = $self->get_next_from_result();
662 :     if ($oneres)
663 :     {
664 :     return $oneres;
665 :     }
666 : olson 1.2 }
667 :     else
668 :     {
669 : olson 1.3 die "error " . $res->status_line . " on post " . $res->content;
670 : olson 1.2 }
671 :     }
672 : olson 1.13 return;
673 : olson 1.2 }
674 :     }
675 :    
676 :     sub get_next_from_result
677 :     {
678 :     my($self) = @_;
679 : olson 1.10 my $l = $self->{cur_result};
680 :     if ($l and @$l)
681 :     {
682 :     return shift(@$l);
683 :     }
684 :     else
685 : olson 1.2 {
686 : olson 1.10 delete $self->{cur_result};
687 :     return undef;
688 : olson 1.2 }
689 :     }
690 :    
691 :     package SequenceWorkQueue;
692 :     use strict;
693 :    
694 :     sub new
695 :     {
696 :     my($class) = @_;
697 :    
698 :     my $self = {};
699 :    
700 :     return bless $self, $class;
701 :     }
702 : olson 1.1
703 : olson 1.2 sub get_next_n
704 :     {
705 :     my($self, $n) = @_;
706 :     my @out;
707 :    
708 :     for (my $i = 0;$i < $n; $i++)
709 : olson 1.1 {
710 : olson 1.2 my($id, $com, $seqp) = $self->get_next();
711 :     if (defined($id))
712 :     {
713 :     push(@out, [$id, $com, $seqp]);
714 :     }
715 :     else
716 :     {
717 :     last;
718 :     }
719 : olson 1.1 }
720 : olson 1.2 return @out;
721 :     }
722 :    
723 :     sub get_next_n_bytes
724 :     {
725 :     my($self, $n) = @_;
726 :     my @out;
727 :    
728 :     my $size = 0;
729 :     while ($size < $n)
730 : olson 1.1 {
731 : olson 1.2 my($id, $com, $seqp) = $self->get_next();
732 :     if (defined($id))
733 :     {
734 :     push(@out, [$id, $com, $seqp]);
735 :     $size += (ref($seqp) eq 'SCALAR') ? length($$seqp) : length($seqp);
736 :     }
737 :     else
738 :     {
739 :     last;
740 :     }
741 : olson 1.1 }
742 : olson 1.2 return @out;
743 : olson 1.1 }
744 :    
745 : olson 1.2 package FastaWorkQueue;
746 : olson 1.1 use strict;
747 : olson 1.2 use base 'SequenceWorkQueue';
748 : olson 1.1 use FileHandle;
749 :    
750 :     sub new
751 :     {
752 : olson 1.2 my($class, $input) = @_;
753 : olson 1.1
754 :     my $fh;
755 :     if (ref($input))
756 :     {
757 :     $fh = $input;
758 :     }
759 :     else
760 :     {
761 :     $fh = new FileHandle("<$input");
762 :     }
763 : olson 1.2
764 :     my $self = $class->SUPER::new();
765 :    
766 :     $self->{fh} = $fh;
767 :    
768 : olson 1.1 return bless $self, $class;
769 :     }
770 :    
771 :     sub get_next
772 :     {
773 :     my($self) = @_;
774 :    
775 : olson 1.14 my($id, $seqp, $com) = read_fasta_record($self->{fh});
776 : olson 1.2 return defined($id) ? ($id, $com, $seqp) : ();
777 :     }
778 :    
779 : olson 1.14 sub read_fasta_record {
780 :     my ($file_handle) = @_;
781 :     my ($old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record);
782 :    
783 :     if (not defined($file_handle)) { $file_handle = \*STDIN; }
784 :    
785 :     $old_end_of_record = $/;
786 :     $/ = "\n>";
787 :    
788 :     if (defined($fasta_record = <$file_handle>)) {
789 :     chomp $fasta_record;
790 :     @lines = split( /\n/, $fasta_record );
791 :     $head = shift @lines;
792 :     $head =~ s/^>?//;
793 :     $head =~ m/^(\S+)/;
794 :     $seq_id = $1;
795 :     if ($head =~ m/^\S+\s+(.*)$/) { $comment = $1; } else { $comment = ""; }
796 :     $sequence = join( "", @lines );
797 :     @parsed_fasta_record = ( $seq_id, \$sequence, $comment );
798 :     } else {
799 :     @parsed_fasta_record = ();
800 :     }
801 :    
802 :     $/ = $old_end_of_record;
803 :    
804 :     return @parsed_fasta_record;
805 :     }
806 :    
807 : olson 1.2 package SequenceListWorkQueue;
808 :     use strict;
809 :     use base 'SequenceWorkQueue';
810 : olson 1.1
811 : olson 1.2 sub new
812 :     {
813 :     my($class, $input) = @_;
814 :    
815 :     my $fh;
816 :     if (ref($input) ne 'ARRAY')
817 : olson 1.1 {
818 : olson 1.2 die "SequenceWorkQueue requires a list as input";
819 : olson 1.1 }
820 : olson 1.2
821 :     my $self = $class->SUPER::new();
822 :    
823 :     $self->{list} = $input;
824 :    
825 :     return bless $self, $class;
826 :     }
827 :    
828 :     sub get_next
829 :     {
830 :     my($self) = @_;
831 :    
832 :     my $top = shift @{$self->{list}};
833 :    
834 :     return defined($top) ? @$top : ();
835 : olson 1.1 }
836 : olson 1.2
837 :    
838 : olson 1.1 1;
839 : olson 1.2

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3