[Bio] / FigKernelPackages / FFserver.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/FFserver.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.23 - (view) (download) (as text)

1 : olson 1.1 package FFserver;
2 :    
3 : parrello 1.20 =head1 FIGfam Server Helper Object
4 :    
5 :     This module is used to call the FIGfam server, which is a general-purpose
6 :     server for extracting data from the FIGfams database. Each FIGfam server
7 :     function correspond to a method of this object.
8 :    
9 :     This package deliberately uses no internal SEED packages or scripts, only common
10 :     PERL modules.
11 :    
12 :     =cut
13 :    
14 : olson 1.1 use LWP::UserAgent;
15 :     use Data::Dumper;
16 : olson 1.3 use YAML;
17 : olson 1.1
18 :     use strict;
19 :    
20 :     sub new
21 :     {
22 :     my($class, $server_url) = @_;
23 :    
24 : olson 1.3 $server_url = "http://servers.nmpdr.org/figfam/server.cgi" unless $server_url;
25 : olson 1.1
26 :    
27 :     my $self = {
28 :     server_url => $server_url,
29 :     ua => LWP::UserAgent->new(),
30 :     };
31 : olson 1.14 $self->{ua}->timeout(20 * 60);
32 : olson 1.1
33 :     return bless $self, $class;
34 :     }
35 :    
36 : parrello 1.20 =head2 Functions
37 :    
38 :     =head3 members_of_families
39 :    
40 :     my $document = $ffObject->members_of_families(@ids);
41 :    
42 :     Return the function and a list of the members for each specified family.
43 :    
44 :     =over 4
45 :    
46 :     =item ids
47 :    
48 :     A list of FIGfam IDs.
49 :    
50 :     =item RETURN
51 :    
52 :     Returns a reference to a list of 3-tuples. Each 3-tuple will consist of a FIGfam
53 :     family ID followed by the family's function and a sub-list of all the FIG feature
54 :     IDs for the features in the family.
55 :    
56 :     =back
57 :    
58 :     =cut
59 :    
60 : olson 1.3 sub members_of_families
61 :     {
62 :     my($self, @ids) = @_;
63 : olson 1.4 return $self->run_query('members_of_families', @ids);
64 :     }
65 :    
66 : parrello 1.20 =head3 families_containing_peg
67 :    
68 :     my $document = $ffObject->families_containing_peg(@ids);
69 :    
70 :     Return the FIGfams containing the specified features.
71 :    
72 :     =over 4
73 :    
74 :     =item ids
75 :    
76 :     A list of FIG feature IDs.
77 :    
78 :     =item RETURN
79 :    
80 :     Returns a list of 2-tuples, each consisting of an incoming feature ID
81 :     followed by a list of FIGfam IDs for the families containing the incoming
82 :     feature.
83 :    
84 :     =back
85 :    
86 :     =cut
87 :    
88 : olson 1.6 sub families_containing_peg
89 :     {
90 :     my($self, @ids) = @_;
91 :     return $self->run_query('families_containing_peg', @ids);
92 :     }
93 :    
94 : parrello 1.20 =head3 families_implementing_role
95 :    
96 :     my $document = $ffObject->families_implementing_role(@roles);
97 :    
98 :     Return the FIGfams that implement the specified roles. Each FIGfam has
99 :     a single function associated with it, but the function may involve
100 :     multiple roles, or may include comments. The role is therefore a more
101 :     compact string than the function.
102 :    
103 :     =over 4
104 :    
105 :     =item roles
106 :    
107 :     A list of role names.
108 :    
109 :     =item RETURN
110 :    
111 :     Returns a list of 2-tuples, each consisting of an incoming role name
112 :     followed by a list of FIGfam IDs for the families that implement the
113 :     incoming role.
114 :    
115 :     =back
116 :    
117 :     =cut
118 :    
119 : arodri7 1.9 sub families_implementing_role
120 :     {
121 :     my($self,@roles) = @_;
122 :     return $self->run_query('families_implementing_role', @roles);
123 :     }
124 :    
125 : parrello 1.20 =head3 families_with_function
126 :    
127 :     my $document = $ffObject->families_with_function(@functions);
128 :    
129 :     Return the FIGfams that belong to the specified functions. Each FIGfam has
130 :     a single function associated with it, but the function may involve
131 :     multiple roles, or may include comments. The function is therefore a
132 :     more specific string than the role.
133 :    
134 :     =over 4
135 :    
136 :     =item functions
137 :    
138 :     A list of functional roles.
139 :    
140 :     =item RETURN
141 :    
142 :     Returns a list of 2-tuples, each consisting of an incoming role name
143 :     followed by a list of FIGfam IDs for the families associated with the
144 :     incoming function.
145 :    
146 :     =back
147 :    
148 :     =cut
149 :    
150 : arodri7 1.9 sub families_with_function
151 :     {
152 :     my($self,@functions) = @_;
153 :     return $self->run_query('families_with_function', @functions);
154 :     }
155 :    
156 : parrello 1.20 =head3 families_in_genome
157 :    
158 :     my $document = $ffObject->families_in_genome(@genomes);
159 :    
160 :     Return the FIGfams that have members in the specified genomes.
161 :    
162 :     =over 4
163 :    
164 :     =item genomes
165 :    
166 :     A list of genome IDs.
167 :    
168 :     =item RETURN
169 :    
170 :     Returns a list of 2-tuples, each consisting of an incoming genome ID
171 :     followed by a list of FIGfam IDs for the families that have members in
172 :     that genome.
173 :    
174 :     =back
175 :    
176 :     =cut
177 :    
178 : arodri7 1.9 sub families_in_genome
179 :     {
180 :     my($self,@genomes) = @_;
181 :     return $self->run_query('families_in_genome', @genomes);
182 :     }
183 :    
184 : parrello 1.20 =head3 get_subsystem_based_figfams
185 :    
186 :     my $document = $ffObject->get_subsystem_based_figfams();
187 :    
188 :     Return a list of the FIGfams derived from subsystems.
189 :    
190 :     =over 4
191 :    
192 :     =item RETURN
193 :    
194 :     Returns a reference to a list of the IDs for the FIGfams derived from subsystems.
195 :    
196 :     =back
197 :    
198 :     =cut
199 :    
200 : arodri7 1.9 sub get_subsystem_based_figfams
201 :     {
202 :     my ($self) = @_;
203 :     return $self->run_query('get_subsystem_based_figfams');
204 :     }
205 :    
206 : disz 1.21 ##=head3 should_be_member
207 :     ##
208 :     ## my $document = $ffObject->should_be_member(@id_seq_pairs);
209 :     ##
210 :     ##Determine whether a particular protein sequence belongs in a particular
211 :     ##FIGfam. This method takes as input multiple FIGfam/sequence pairs and
212 :     ##performs a determination for each.
213 :     ##
214 :     ##=over 4
215 :     ##
216 :     ##=item id_seq_pairs
217 :     ##
218 :     ##A list of 2-tuples, each consisting of a FIGfam ID followed
219 :     ##by a protein sequence string.
220 :     ##
221 :     ##=item RETURN
222 :     ##
223 :     ##Returns a reference to a list of boolean flags, one per input pair. For each
224 :     ##input pair, the flag will be C<1> if the sequence should be in the FIGfam and
225 :     ##C<0> otherwise.
226 :     ##
227 :     ##=back
228 :     ##
229 :     ##=cut
230 :     ##
231 :     ##sub should_be_member
232 :     ##{
233 :     ## my($self, @id_seq_pairs) = @_;
234 :     ## return $self->run_query('should_be_member', @id_seq_pairs);
235 :     ##}
236 : olson 1.4
237 : parrello 1.20 =head3 all_families
238 :    
239 :     my $document = $ffObject->all_families();
240 :    
241 :     Return a list of the IDs for all the FIGfams in the system.
242 :    
243 :     =over 4
244 :    
245 :     =item RETURN
246 :    
247 :     Returns a reference to a list of the IDs for all the FIGfams in the system.
248 :    
249 :     =back
250 :    
251 :     =cut
252 :    
253 : olson 1.4 sub all_families
254 :     {
255 :     my($self) = @_;
256 :     return $self->run_query('all_families');
257 :     }
258 :    
259 : parrello 1.20 =head3 assign_function_to_prot
260 :    
261 :     my $document = $ffObject->assign_function_to_prot($input, $blast, $min_hits, $assignToAll);
262 :    
263 :     For each incoming protein sequence, attempt to place it in a FIGfam. If a
264 :     suitable FIGfam can be found for a particular sequence, the FIGfam ID and
265 :     its functional assignment will be returned.
266 :    
267 :     =over 4
268 :    
269 :     =item input
270 :    
271 :     Either (1) an open input handle to a file containing the proteins in FASTA format,
272 :     or (2) a reference to a list of FASTA strings for the proteins.
273 :    
274 :     =item blast
275 :    
276 :     If nonzero, then when a protein is placed into a FIGfam, a BLAST will be performed
277 :     afterward, and the top I<N> hits (where I<N> is the value of this parameter)
278 :     will be returned as part of the protein's output tuple.
279 :    
280 :     =item min_hits
281 :    
282 :     A number from 1 to 10, indicating the minimum number of matches required to
283 :     consider a protein as a candidate for assignment to a FIGfam. A higher value
284 :     indicates a more reliable matching algorithm; the default is C<3>.
285 :    
286 :     =item assign_to_all
287 :    
288 :     If TRUE, then if the standard matching algorithm fails to assign a protein,
289 :     a BLAST will be used. The BLAST is slower, but is capable of placing more
290 :     proteins than the normal algorithm.
291 :    
292 :     =item RETURN
293 :    
294 :     Returns a Result Handler. Call C<get_next> on the result handler to get back a data
295 :     item. Each item sent back by the result handler is a 2-tuple containing the
296 :     incoming protein sequence and a reference to a list consisting of the proposed
297 :     functional assignment for the protein, the name of the Genome Set from which the
298 :     protein is likely to have originated (if known), a list of BLAST hits (if
299 :     requested), and the number of matches for the protein found in the FIGfam. If no
300 :     assignment could be made for a particular protein, it will not appear in the
301 :     output stream.
302 : olson 1.3
303 : parrello 1.20 =back
304 : olson 1.3
305 : parrello 1.20 =cut
306 : olson 1.3
307 : olson 1.1 sub assign_function_to_prot
308 :     {
309 : olson 1.15 my($self, $input, $blast, $min_hits, $assignToAll) = @_;
310 : olson 1.1
311 : olson 1.2 my $wq;
312 : olson 1.10
313 : olson 1.15 my $params = [blast => $blast, min_hits => $min_hits, assign_to_all => ($assignToAll ? 1 : 0)];
314 : olson 1.2
315 : olson 1.1 if (ref($input) eq 'ARRAY')
316 :     {
317 : olson 1.2 $wq = SequenceListWorkQueue->new($input);
318 : olson 1.1 }
319 :     else
320 :     {
321 : olson 1.2 $wq = FastaWorkQueue->new($input);
322 : olson 1.1 }
323 : olson 1.2
324 : olson 1.19 my $req_bytes = $blast ? 1000 : 1_000_000;
325 : olson 1.11
326 : olson 1.10 return ResultHandler->new($wq, $self->{server_url}, 'assign_function_to_prot', \&id_seq_pair_bundler,
327 :     #\&tab_delimited_output_parser,
328 :     \&YAML::Load,
329 : olson 1.11 $params, $req_bytes);
330 : olson 1.1 }
331 :    
332 : parrello 1.20 =head3 call_genes
333 :    
334 :     my $document = $ffObject->call_genes($input, $genetic_code);
335 :    
336 :     Call the protein-encoding genes for the specified DNA sequences. The result will
337 :     be a multi-sequence FASTA string listing all the proteins found and a hash mapping
338 :     each gene found to its location string.
339 :    
340 :     =over 4
341 :    
342 :     =item input
343 :    
344 :     Open input handle to a file containing the DNA sequences in FASTA format.
345 :    
346 :     =item genetic_code
347 :    
348 :     The numeric code for the mapping from DNA to amino acids. The default is C<11>,
349 :     which is the standard mapping and should be used in almost all cases. A complete
350 :     list of mapping codes can be found at
351 :     L<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>.
352 :    
353 :     =item RETURN
354 :    
355 :     Returns a 2-tuple consisting of a FASTA string for all the proteins found
356 :     followed by a reference to a list of genes found. Each gene found will be
357 :     represented by a 4-tuple containing an ID for the gene, the ID of the contig
358 :     containing it, the starting offset, and the ending offset.
359 :    
360 :     =back
361 :    
362 :     =cut
363 :    
364 : olson 1.16 sub call_genes
365 :     {
366 :     my($self, $input, $genetic_code) = @_;
367 :    
368 :     if (ref($input) ne 'ARRAY')
369 :     {
370 :     my $fh;
371 :     if (ref($input))
372 :     {
373 :     $fh = $input;
374 :     }
375 :     else
376 :     {
377 :     my $fasta_file = $input;
378 :     open($fh, "<", $fasta_file);
379 :     }
380 :     $input = [];
381 :     while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh))
382 :     {
383 :     push(@$input, "$id,$$seqp");
384 :     }
385 :     close($fh);
386 :     }
387 :    
388 :     return $self->run_query_form([function => "call_genes",
389 :     genetic_code => $genetic_code,
390 : olson 1.17 id_seq => $input]);
391 : olson 1.16 }
392 :    
393 : parrello 1.20 =head3 find_rnas
394 :    
395 :     my $document = $ffObject->find_rnas($input, $genus, $species, $domain);
396 :    
397 :     Call the RNAs for the specified DNA sequences. The result will be a
398 :     multi-sequence FASTA string listing all the RNAs found and a hash mapping
399 :     each RNA to its location string.
400 :    
401 :     =over 4
402 :    
403 :     =item input
404 :    
405 :     Open input handle to a file containing the DNA sequences in FASTA format.
406 :    
407 :     =item genus
408 :    
409 :     Common name of the genus for this DNA.
410 :    
411 :     =item species
412 :    
413 :     Common name of the species for this DNA.
414 :    
415 :     =item domain
416 :    
417 :     Domain of this DNA. The default is C<Bacteria>.
418 :    
419 :     =item RETURN
420 :    
421 :     Returns a 2-tuple consisting of a FASTA string for all the RNAs found
422 :     followed by reference to a list of RNAs found. Each RNA will be represented by
423 :     a 4-tuple consisting of an ID for the RNA, the ID of the contig containing it, its
424 :     starting offset, and its ending offset.
425 :    
426 :     =back
427 :    
428 :     =cut
429 :    
430 : olson 1.18 sub find_rnas
431 :     {
432 :     my($self, $input, $genus, $species, $domain) = @_;
433 :    
434 :     if (ref($input) ne 'ARRAY')
435 :     {
436 :     my $fh;
437 :     if (ref($input))
438 :     {
439 :     $fh = $input;
440 :     }
441 :     else
442 :     {
443 :     my $fasta_file = $input;
444 :     open($fh, "<", $fasta_file);
445 :     }
446 :     $input = [];
447 :     while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh))
448 :     {
449 :     push(@$input, "$id,$$seqp");
450 :     }
451 :     close($fh);
452 :     }
453 :    
454 :     return $self->run_query_form([function => "find_rnas",
455 :     genus => $genus,
456 :     species => $species,
457 :     domain => $domain,
458 :     id_seq => $input]);
459 :     }
460 :    
461 : parrello 1.20 =head3 assign_functions_to_DNA
462 :    
463 :     my $document = $ffObject->assign_functions_to_DNA($input, $blast, $min_hits, $max_gap);
464 :    
465 :     Analyze DNA sequences and output regions that probably belong to FIGfams.
466 :     The selected regions will be high-probability candidates for protein
467 :     production.
468 :    
469 :     =over 4
470 :    
471 :     =item input
472 :    
473 :     Either (1) an open input handle to a file containing the DNA sequences in FASTA format,
474 :     or (2) a reference to a list of FASTA strings for the DNA sequences.
475 :    
476 :     =item blast
477 :    
478 :     If nonzero, then when a protein is placed into a FIGfam, a BLAST will be performed
479 :     afterward, and the top I<N> hits (where I<N> is the value of this parameter)
480 :     will be returned as part of each protein's output tuple.
481 :    
482 :     =item min_hits
483 :    
484 :     A number from 1 to 10, indicating the minimum number of matches required to
485 :     consider a protein as a candidate for assignment to a FIGfam. A higher value
486 :     indicates a more reliable matching algorithm; the default is C<3>.
487 :    
488 :     =item max_gap
489 :    
490 :     When looking for a match, if two sequence elements match and are closer than
491 :     this distance, then they will be considered part of a single match. Otherwise,
492 :     the match will be split. The default is C<600>.
493 :    
494 :     =item RETURN
495 :    
496 :     Returns a Result Handler. Call C<get_next> on the result handler to get back a data
497 :     item. Each item sent back by the result handler is a 2-tuple containing the
498 :     incoming protein sequence and a reference to a list of hit regions. Each hit
499 :     region is a 6-tuple consisting of the number of matches to the FIGfam, the start
500 :     location, the stop location, the proposed functional assignment, the name of the
501 :     Genome Set from which the gene is likely to have originated, and a list of BLAST
502 :     hits. If the I<blast> option is not specified, the list of BLAST hits will be
503 :     empty.
504 :    
505 :     =back
506 :    
507 :     =cut
508 :    
509 : olson 1.2 sub assign_functions_to_dna
510 : olson 1.1 {
511 : olson 1.10 my($self, $input, $min_hits, $max_gap, $blast) = @_;
512 : olson 1.1
513 : olson 1.12 $min_hits = 3 unless defined($min_hits);
514 :     $max_gap = 600 unless defined($max_gap);
515 : olson 1.13 $blast = 0 unless defined($blast);
516 : olson 1.12
517 : olson 1.2 my $wq;
518 : olson 1.1
519 : olson 1.2 if (ref($input) eq 'ARRAY')
520 : olson 1.1 {
521 : olson 1.2 $wq = SequenceListWorkQueue->new($input);
522 : olson 1.1 }
523 :     else
524 :     {
525 : olson 1.2 $wq = FastaWorkQueue->new($input);
526 : olson 1.1 }
527 : olson 1.2
528 : olson 1.13 my $req_bytes = $blast ? 1000 : 500000;
529 : olson 1.10 my $params = [min_hits => $min_hits, max_gap => $max_gap, blast => $blast];
530 :     return ResultHandler->new($wq, $self->{server_url}, 'assign_functions_to_DNA',
531 :     \&id_seq_pair_bundler,
532 : olson 1.11 \&tab_delimited_output_parser, $params, $req_bytes);
533 : olson 1.2 }
534 :    
535 : parrello 1.20 ###### Utility Methods ######
536 :    
537 :     sub run_query
538 :     {
539 :     my($self, $function, @args ) = @_;
540 :     my $form = [function => $function,
541 :     args => YAML::Dump(\@args),
542 :     ];
543 :     return $self->run_query_form($form);
544 :     }
545 :    
546 :     sub run_query_form
547 :     {
548 :     my($self, $form, $raw) = @_;
549 :    
550 :     my $res = $self->{ua}->post($self->{server_url}, $form);
551 :    
552 :     if ($res->is_success)
553 :     {
554 :     my $content = $res->content;
555 :     if ($raw)
556 :     {
557 :     return $content;
558 :     }
559 :    
560 :     # print "Got $content\n";
561 :     my $ret;
562 :     eval {
563 :     $ret = Load($content);
564 :     };
565 :     if ($@)
566 :     {
567 :     die "Query returned unparsable content ($@): " . $content;
568 :     }
569 :     return $ret;
570 :     }
571 :     else
572 :     {
573 :     die "error on post " . $res->status_line . " " . $res->content;
574 :     }
575 :     }
576 :    
577 : olson 1.2 sub id_seq_pair_bundler
578 :     {
579 :     my($item) = @_;
580 :     my($id, $seq) = @$item[0,2];
581 :     return "id_seq", join(",", $id, (ref($seq) eq 'SCALAR' ? $$seq : $seq));
582 :     }
583 :    
584 :     sub tab_delimited_output_parser
585 :     {
586 :     my($line) = @_;
587 :     chomp $line;
588 :     my @cols = split(/\t/, $line);
589 :     return \@cols;
590 :     }
591 :    
592 :    
593 :     sub tab_delimited_dna_data_output_parser
594 :     {
595 :     my($line) = @_;
596 :     chomp $line;
597 :     my ($id, $idbe, $fam) = split(/\t/, $line);
598 :     my ($beg, $end) = $idbe =~ /_(\d+)_(\d+)$/;
599 :     return [$id, $beg, $end, $fam];
600 : olson 1.1 }
601 :    
602 : olson 1.2 package ResultHandler;
603 : olson 1.1 use strict;
604 : olson 1.2 use Data::Dumper;
605 : olson 1.1
606 :     sub new
607 :     {
608 : olson 1.11 my($class, $work_queue, $server_url, $function, $input_bundler, $output_parser, $form_vars, $req_bytes) = @_;
609 : olson 1.2
610 : olson 1.1 my $self = {
611 : olson 1.2 work_queue => $work_queue,
612 :     server_url => $server_url,
613 :     function => $function,
614 :     input_bundler => $input_bundler,
615 :     output_parser => $output_parser,
616 :     ua => LWP::UserAgent->new(),
617 :     cur_result => undef,
618 : olson 1.5 form_vars => $form_vars ? $form_vars : [],
619 : olson 1.13 req_bytes => ($req_bytes ? $req_bytes : 16000),
620 : olson 1.1 };
621 : olson 1.14 $self->{ua}->timeout(20 * 60);
622 : olson 1.1 return bless $self, $class;
623 :     }
624 :    
625 :     sub get_next
626 :     {
627 :     my($self) = @_;
628 :    
629 : olson 1.10 my $res = $self->get_next_from_result();
630 : olson 1.13 # print "gnfr returns: " , Dumper($res);
631 : olson 1.10
632 :     if ($res)
633 : olson 1.2 {
634 : olson 1.10 return $res;
635 : olson 1.2 }
636 :     else
637 :     {
638 : olson 1.13
639 :     while (my @inp = $self->{work_queue}->get_next_n_bytes($self->{req_bytes}))
640 : olson 1.2 {
641 : olson 1.5 my $form = [@{$self->{form_vars}}];
642 :     push(@$form, function => $self->{function},
643 :     map { &{$self->{input_bundler}}($_) } @inp);
644 : olson 1.8 # print "Invoke " .Dumper($form);
645 : olson 1.2
646 :     my $res = $self->{ua}->post($self->{server_url}, $form);
647 :     if ($res->is_success)
648 :     {
649 : olson 1.13 eval {
650 :     $self->{cur_result} = [YAML::Load($res->content)];
651 :     };
652 :     if ($@)
653 :     {
654 :     die "Query returned unparsable content ($@): " . $res->content;
655 :     }
656 :     # print "res: " . Dumper($self->{cur_result});
657 :     my $oneres = $self->get_next_from_result();
658 :     if ($oneres)
659 :     {
660 :     return $oneres;
661 :     }
662 : olson 1.2 }
663 :     else
664 :     {
665 : olson 1.3 die "error " . $res->status_line . " on post " . $res->content;
666 : olson 1.2 }
667 :     }
668 : olson 1.13 return;
669 : olson 1.2 }
670 :     }
671 :    
672 :     sub get_next_from_result
673 :     {
674 :     my($self) = @_;
675 : olson 1.10 my $l = $self->{cur_result};
676 :     if ($l and @$l)
677 :     {
678 :     return shift(@$l);
679 :     }
680 :     else
681 : olson 1.2 {
682 : olson 1.10 delete $self->{cur_result};
683 :     return undef;
684 : olson 1.2 }
685 :     }
686 :    
687 :     package SequenceWorkQueue;
688 :     use strict;
689 :    
690 :     sub new
691 :     {
692 :     my($class) = @_;
693 :    
694 :     my $self = {};
695 :    
696 :     return bless $self, $class;
697 :     }
698 : olson 1.1
699 : olson 1.2 sub get_next_n
700 :     {
701 :     my($self, $n) = @_;
702 :     my @out;
703 :    
704 :     for (my $i = 0;$i < $n; $i++)
705 : olson 1.1 {
706 : olson 1.2 my($id, $com, $seqp) = $self->get_next();
707 :     if (defined($id))
708 :     {
709 :     push(@out, [$id, $com, $seqp]);
710 :     }
711 :     else
712 :     {
713 :     last;
714 :     }
715 : olson 1.1 }
716 : olson 1.2 return @out;
717 :     }
718 :    
719 :     sub get_next_n_bytes
720 :     {
721 :     my($self, $n) = @_;
722 :     my @out;
723 :    
724 :     my $size = 0;
725 :     while ($size < $n)
726 : olson 1.1 {
727 : olson 1.2 my($id, $com, $seqp) = $self->get_next();
728 :     if (defined($id))
729 :     {
730 :     push(@out, [$id, $com, $seqp]);
731 :     $size += (ref($seqp) eq 'SCALAR') ? length($$seqp) : length($seqp);
732 :     }
733 :     else
734 :     {
735 :     last;
736 :     }
737 : olson 1.1 }
738 : olson 1.2 return @out;
739 : olson 1.1 }
740 :    
741 : olson 1.2 package FastaWorkQueue;
742 : olson 1.1 use strict;
743 : olson 1.2 use base 'SequenceWorkQueue';
744 : olson 1.1 use FileHandle;
745 :    
746 :     sub new
747 :     {
748 : olson 1.2 my($class, $input) = @_;
749 : olson 1.1
750 :     my $fh;
751 :     if (ref($input))
752 :     {
753 :     $fh = $input;
754 :     }
755 :     else
756 :     {
757 :     $fh = new FileHandle("<$input");
758 :     }
759 : olson 1.2
760 :     my $self = $class->SUPER::new();
761 :    
762 :     $self->{fh} = $fh;
763 :    
764 : olson 1.1 return bless $self, $class;
765 :     }
766 :    
767 :     sub get_next
768 :     {
769 :     my($self) = @_;
770 :    
771 : olson 1.14 my($id, $seqp, $com) = read_fasta_record($self->{fh});
772 : olson 1.2 return defined($id) ? ($id, $com, $seqp) : ();
773 :     }
774 :    
775 : olson 1.14 sub read_fasta_record {
776 :     my ($file_handle) = @_;
777 :     my ($old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record);
778 :    
779 :     if (not defined($file_handle)) { $file_handle = \*STDIN; }
780 :    
781 :     $old_end_of_record = $/;
782 :     $/ = "\n>";
783 :    
784 :     if (defined($fasta_record = <$file_handle>)) {
785 :     chomp $fasta_record;
786 :     @lines = split( /\n/, $fasta_record );
787 :     $head = shift @lines;
788 :     $head =~ s/^>?//;
789 :     $head =~ m/^(\S+)/;
790 :     $seq_id = $1;
791 :     if ($head =~ m/^\S+\s+(.*)$/) { $comment = $1; } else { $comment = ""; }
792 :     $sequence = join( "", @lines );
793 :     @parsed_fasta_record = ( $seq_id, \$sequence, $comment );
794 :     } else {
795 :     @parsed_fasta_record = ();
796 :     }
797 :    
798 :     $/ = $old_end_of_record;
799 :    
800 :     return @parsed_fasta_record;
801 :     }
802 :    
803 : olson 1.2 package SequenceListWorkQueue;
804 :     use strict;
805 :     use base 'SequenceWorkQueue';
806 : olson 1.1
807 : olson 1.2 sub new
808 :     {
809 :     my($class, $input) = @_;
810 :    
811 :     my $fh;
812 :     if (ref($input) ne 'ARRAY')
813 : olson 1.1 {
814 : olson 1.2 die "SequenceWorkQueue requires a list as input";
815 : olson 1.1 }
816 : olson 1.2
817 :     my $self = $class->SUPER::new();
818 :    
819 :     $self->{list} = $input;
820 :    
821 :     return bless $self, $class;
822 :     }
823 :    
824 :     sub get_next
825 :     {
826 :     my($self) = @_;
827 :    
828 :     my $top = shift @{$self->{list}};
829 :    
830 :     return defined($top) ? @$top : ();
831 : olson 1.1 }
832 : olson 1.2
833 :    
834 : olson 1.1 1;
835 : olson 1.2

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3