[Bio] / FigKernelPackages / FFserver.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FFserver.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.5, Mon May 11 20:55:05 2009 UTC revision 1.23, Thu Dec 17 20:48:21 2009 UTC
# Line 1  Line 1 
   
1  package FFserver;  package FFserver;
2    
3    =head1 FIGfam Server Helper Object
4    
5    This module is used to call the FIGfam server, which is a general-purpose
6    server for extracting data from the FIGfams database. Each FIGfam server
7    function correspond to a method of this object.
8    
9    This package deliberately uses no internal SEED packages or scripts, only common
10    PERL modules.
11    
12    =cut
13    
14  use LWP::UserAgent;  use LWP::UserAgent;
15  use Data::Dumper;  use Data::Dumper;
16  use YAML;  use YAML;
# Line 18  Line 28 
28          server_url => $server_url,          server_url => $server_url,
29          ua => LWP::UserAgent->new(),          ua => LWP::UserAgent->new(),
30      };      };
31        $self->{ua}->timeout(20 * 60);
32    
33      return bless $self, $class;      return bless $self, $class;
34  }  }
35    
36    =head2 Functions
37    
38    =head3 members_of_families
39    
40        my $document = $ffObject->members_of_families(@ids);
41    
42    Return the function and a list of the members for each specified family.
43    
44    =over 4
45    
46    =item ids
47    
48    A list of FIGfam IDs.
49    
50    =item RETURN
51    
52    Returns a reference to a list of 3-tuples. Each 3-tuple will consist of a FIGfam
53    family ID followed by the family's function and a sub-list of all the FIG feature
54    IDs for the features in the family.
55    
56    =back
57    
58    =cut
59    
60  sub members_of_families  sub members_of_families
61  {  {
62      my($self, @ids) = @_;      my($self, @ids) = @_;
63      return $self->run_query('members_of_families', @ids);      return $self->run_query('members_of_families', @ids);
64  }  }
65    
66  sub should_be_member  =head3 families_containing_peg
67    
68        my $document = $ffObject->families_containing_peg(@ids);
69    
70    Return the FIGfams containing the specified features.
71    
72    =over 4
73    
74    =item ids
75    
76    A list of FIG feature IDs.
77    
78    =item RETURN
79    
80    Returns a list of 2-tuples, each consisting of an incoming feature ID
81    followed by a list of FIGfam IDs for the families containing the incoming
82    feature.
83    
84    =back
85    
86    =cut
87    
88    sub families_containing_peg
89  {  {
90      my($self, @id_seq_pairs) = @_;      my($self, @ids) = @_;
91      return $self->run_query('should_be_member', @id_seq_pairs);      return $self->run_query('families_containing_peg', @ids);
92  }  }
93    
94  sub all_families  =head3 families_implementing_role
95    
96        my $document = $ffObject->families_implementing_role(@roles);
97    
98    Return the FIGfams that implement the specified roles. Each FIGfam has
99    a single function associated with it, but the function may involve
100    multiple roles, or may include comments. The role is therefore a more
101    compact string than the function.
102    
103    =over 4
104    
105    =item roles
106    
107    A list of role names.
108    
109    =item RETURN
110    
111    Returns a list of 2-tuples, each consisting of an incoming role name
112    followed by a list of FIGfam IDs for the families that implement the
113    incoming role.
114    
115    =back
116    
117    =cut
118    
119    sub families_implementing_role
120  {  {
121      my($self) = @_;      my($self,@roles) = @_;
122      return $self->run_query('all_families');      return $self->run_query('families_implementing_role', @roles);
123  }  }
124    
125  sub run_query  =head3 families_with_function
126    
127        my $document = $ffObject->families_with_function(@functions);
128    
129    Return the FIGfams that belong to the specified functions. Each FIGfam has
130    a single function associated with it, but the function may involve
131    multiple roles, or may include comments. The function is therefore a
132    more specific string than the role.
133    
134    =over 4
135    
136    =item functions
137    
138    A list of functional roles.
139    
140    =item RETURN
141    
142    Returns a list of 2-tuples, each consisting of an incoming role name
143    followed by a list of FIGfam IDs for the families associated with the
144    incoming function.
145    
146    =back
147    
148    =cut
149    
150    sub families_with_function
151  {  {
152      my($self, $function, @args ) = @_;      my($self,@functions) = @_;
153      my $form = [function  => $function,      return $self->run_query('families_with_function', @functions);
                 args => YAML::Dump(@args),  
                 ];  
     return $self->run_query_form($form);  
154  }  }
155    
156  sub run_query_form  =head3 families_in_genome
 {  
     my($self, $form) = @_;  
157    
158      my $res = $self->{ua}->post($self->{server_url}, $form);      my $document = $ffObject->families_in_genome(@genomes);
159    
160      if ($res->is_success)  Return the FIGfams that have members in the specified genomes.
161    
162    =over 4
163    
164    =item genomes
165    
166    A list of genome IDs.
167    
168    =item RETURN
169    
170    Returns a list of 2-tuples, each consisting of an incoming genome ID
171    followed by a list of FIGfam IDs for the families that have members in
172    that genome.
173    
174    =back
175    
176    =cut
177    
178    sub families_in_genome
179      {      {
180          return Load($res->content);      my($self,@genomes) = @_;
181        return $self->run_query('families_in_genome', @genomes);
182      }      }
183      else  
184    =head3 get_subsystem_based_figfams
185    
186        my $document = $ffObject->get_subsystem_based_figfams();
187    
188    Return a list of the FIGfams derived from subsystems.
189    
190    =over 4
191    
192    =item RETURN
193    
194    Returns a reference to a list of the IDs for the FIGfams derived from subsystems.
195    
196    =back
197    
198    =cut
199    
200    sub get_subsystem_based_figfams
201      {      {
202          die "error on post " . $res->content;      my ($self) = @_;
203        return $self->run_query('get_subsystem_based_figfams');
204      }      }
205    
206    ##=head3 should_be_member
207    ##
208    ##    my $document = $ffObject->should_be_member(@id_seq_pairs);
209    ##
210    ##Determine whether a particular protein sequence belongs in a particular
211    ##FIGfam. This method takes as input multiple FIGfam/sequence pairs and
212    ##performs a determination for each.
213    ##
214    ##=over 4
215    ##
216    ##=item id_seq_pairs
217    ##
218    ##A list of 2-tuples, each consisting of a FIGfam ID followed
219    ##by a protein sequence string.
220    ##
221    ##=item RETURN
222    ##
223    ##Returns a reference to a list of boolean flags, one per input pair. For each
224    ##input pair, the flag will be C<1> if the sequence should be in the FIGfam and
225    ##C<0> otherwise.
226    ##
227    ##=back
228    ##
229    ##=cut
230    ##
231    ##sub should_be_member
232    ##{
233    ##    my($self, @id_seq_pairs) = @_;
234    ##    return $self->run_query('should_be_member', @id_seq_pairs);
235    ##}
236    
237    =head3 all_families
238    
239        my $document = $ffObject->all_families();
240    
241    Return a list of the IDs for all the FIGfams in the system.
242    
243    =over 4
244    
245    =item RETURN
246    
247    Returns a reference to a list of the IDs for all the FIGfams in the system.
248    
249    =back
250    
251    =cut
252    
253    sub all_families
254    {
255        my($self) = @_;
256        return $self->run_query('all_families');
257  }  }
258    
259    =head3 assign_function_to_prot
260    
261        my $document = $ffObject->assign_function_to_prot($input, $blast, $min_hits, $assignToAll);
262    
263    For each incoming protein sequence, attempt to place it in a FIGfam. If a
264    suitable FIGfam can be found for a particular sequence, the FIGfam ID and
265    its functional assignment will be returned.
266    
267    =over 4
268    
269    =item input
270    
271    Either (1) an open input handle to a file containing the proteins in FASTA format,
272    or (2) a reference to a list of FASTA strings for the proteins.
273    
274    =item blast
275    
276    If nonzero, then when a protein is placed into a FIGfam, a BLAST will be performed
277    afterward, and the top I<N> hits (where I<N> is the value of this parameter)
278    will be returned as part of the protein's output tuple.
279    
280    =item min_hits
281    
282    A number from 1 to 10, indicating the minimum number of matches required to
283    consider a protein as a candidate for assignment to a FIGfam. A higher value
284    indicates a more reliable matching algorithm; the default is C<3>.
285    
286    =item assign_to_all
287    
288    If TRUE, then if the standard matching algorithm fails to assign a protein,
289    a BLAST will be used. The BLAST is slower, but is capable of placing more
290    proteins than the normal algorithm.
291    
292    =item RETURN
293    
294    Returns a Result Handler. Call C<get_next> on the result handler to get back a data
295    item. Each item sent back by the result handler is a 2-tuple containing the
296    incoming protein sequence and a reference to a list consisting of the proposed
297    functional assignment for the protein, the name of the Genome Set from which the
298    protein is likely to have originated (if known), a list of BLAST hits (if
299    requested), and the number of matches for the protein found in the FIGfam. If no
300    assignment could be made for a particular protein, it will not appear in the
301    output stream.
302    
303    =back
304    
305    =cut
306    
307  sub assign_function_to_prot  sub assign_function_to_prot
308  {  {
309      my($self, $input) = @_;      my($self, $input, $blast, $min_hits, $assignToAll) = @_;
310    
311      my $wq;      my $wq;
312    
313        my $params = [blast => $blast, min_hits => $min_hits, assign_to_all => ($assignToAll ? 1 : 0)];
314    
315      if (ref($input) eq 'ARRAY')      if (ref($input) eq 'ARRAY')
316      {      {
317          $wq = SequenceListWorkQueue->new($input);          $wq = SequenceListWorkQueue->new($input);
# Line 80  Line 321 
321          $wq = FastaWorkQueue->new($input);          $wq = FastaWorkQueue->new($input);
322      }      }
323    
324      return ResultHandler->new($wq, $self->{server_url}, 'assign_function_to_prot', \&id_seq_pair_bundler, \&tab_delimited_output_parser);      my $req_bytes = $blast ? 1000 : 1_000_000;
325    
326        return ResultHandler->new($wq, $self->{server_url}, 'assign_function_to_prot', \&id_seq_pair_bundler,
327                                  #\&tab_delimited_output_parser,
328                                  \&YAML::Load,
329                                  $params, $req_bytes);
330  }  }
331    
332    =head3 call_genes
333    
334        my $document = $ffObject->call_genes($input, $genetic_code);
335    
336    Call the protein-encoding genes for the specified DNA sequences. The result will
337    be a multi-sequence FASTA string listing all the proteins found and a hash mapping
338    each gene found to its location string.
339    
340    =over 4
341    
342    =item input
343    
344    Open input handle to a file containing the DNA sequences in FASTA format.
345    
346    =item genetic_code
347    
348    The numeric code for the mapping from DNA to amino acids. The default is C<11>,
349    which is the standard mapping and should be used in almost all cases. A complete
350    list of mapping codes can be found at
351    L<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>.
352    
353    =item RETURN
354    
355    Returns a 2-tuple consisting of a FASTA string for all the proteins found
356    followed by a reference to a list of genes found. Each gene found will be
357    represented by a 4-tuple containing an ID for the gene, the ID of the contig
358    containing it, the starting offset, and the ending offset.
359    
360    =back
361    
362    =cut
363    
364    sub call_genes
365    {
366        my($self, $input, $genetic_code) = @_;
367    
368        if (ref($input) ne 'ARRAY')
369        {
370            my $fh;
371            if (ref($input))
372            {
373                $fh = $input;
374            }
375            else
376            {
377                my $fasta_file = $input;
378                open($fh, "<", $fasta_file);
379            }
380            $input = [];
381            while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh))
382            {
383                push(@$input, "$id,$$seqp");
384            }
385            close($fh);
386        }
387    
388        return $self->run_query_form([function => "call_genes",
389                                      genetic_code => $genetic_code,
390                                      id_seq => $input]);
391    }
392    
393    =head3 find_rnas
394    
395        my $document = $ffObject->find_rnas($input, $genus, $species, $domain);
396    
397    Call the RNAs for the specified DNA sequences. The result will be a
398    multi-sequence FASTA string listing all the RNAs found and a hash mapping
399    each RNA to its location string.
400    
401    =over 4
402    
403    =item input
404    
405    Open input handle to a file containing the DNA sequences in FASTA format.
406    
407    =item genus
408    
409    Common name of the genus for this DNA.
410    
411    =item species
412    
413    Common name of the species for this DNA.
414    
415    =item domain
416    
417    Domain of this DNA. The default is C<Bacteria>.
418    
419    =item RETURN
420    
421    Returns a 2-tuple consisting of a FASTA string for all the RNAs found
422    followed by reference to a list of RNAs found. Each RNA will be represented by
423    a 4-tuple consisting of an ID for the RNA, the ID of the contig containing it, its
424    starting offset, and its ending offset.
425    
426    =back
427    
428    =cut
429    
430    sub find_rnas
431    {
432        my($self, $input, $genus, $species, $domain) = @_;
433    
434        if (ref($input) ne 'ARRAY')
435        {
436            my $fh;
437            if (ref($input))
438            {
439                $fh = $input;
440            }
441            else
442            {
443                my $fasta_file = $input;
444                open($fh, "<", $fasta_file);
445            }
446            $input = [];
447            while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh))
448            {
449                push(@$input, "$id,$$seqp");
450            }
451            close($fh);
452        }
453    
454        return $self->run_query_form([function => "find_rnas",
455                                      genus => $genus,
456                                      species => $species,
457                                      domain => $domain,
458                                      id_seq => $input]);
459    }
460    
461    =head3 assign_functions_to_DNA
462    
463        my $document = $ffObject->assign_functions_to_DNA($input, $blast, $min_hits, $max_gap);
464    
465    Analyze DNA sequences and output regions that probably belong to FIGfams.
466    The selected regions will be high-probability candidates for protein
467    production.
468    
469    =over 4
470    
471    =item input
472    
473    Either (1) an open input handle to a file containing the DNA sequences in FASTA format,
474    or (2) a reference to a list of FASTA strings for the DNA sequences.
475    
476    =item blast
477    
478    If nonzero, then when a protein is placed into a FIGfam, a BLAST will be performed
479    afterward, and the top I<N> hits (where I<N> is the value of this parameter)
480    will be returned as part of each protein's output tuple.
481    
482    =item min_hits
483    
484    A number from 1 to 10, indicating the minimum number of matches required to
485    consider a protein as a candidate for assignment to a FIGfam. A higher value
486    indicates a more reliable matching algorithm; the default is C<3>.
487    
488    =item max_gap
489    
490    When looking for a match, if two sequence elements match and are closer than
491    this distance, then they will be considered part of a single match. Otherwise,
492    the match will be split. The default is C<600>.
493    
494    =item RETURN
495    
496    Returns a Result Handler. Call C<get_next> on the result handler to get back a data
497    item. Each item sent back by the result handler is a 2-tuple containing the
498    incoming protein sequence and a reference to a list of hit regions. Each hit
499    region is a 6-tuple consisting of the number of matches to the FIGfam, the start
500    location, the stop location, the proposed functional assignment, the name of the
501    Genome Set from which the gene is likely to have originated, and a list of BLAST
502    hits. If the I<blast> option is not specified, the list of BLAST hits will be
503    empty.
504    
505    =back
506    
507    =cut
508    
509  sub assign_functions_to_dna  sub assign_functions_to_dna
510  {  {
511      my($self, $input, $min_hits, $max_gap) = @_;      my($self, $input, $min_hits, $max_gap, $blast) = @_;
512    
513        $min_hits = 3 unless defined($min_hits);
514        $max_gap = 600 unless defined($max_gap);
515        $blast = 0 unless defined($blast);
516    
517      my $wq;      my $wq;
518    
# Line 98  Line 525 
525          $wq = FastaWorkQueue->new($input);          $wq = FastaWorkQueue->new($input);
526      }      }
527    
528      return ResultHandler->new($wq, $self->{server_url}, 'assign_functions_to_DNA', \&id_seq_pair_bundler, \&tab_delimited_output_parser, [min_hits => $min_hits, max_gap => $max_gap]);      my $req_bytes = $blast ? 1000 : 500000;
529        my $params = [min_hits => $min_hits, max_gap => $max_gap, blast => $blast];
530        return ResultHandler->new($wq, $self->{server_url}, 'assign_functions_to_DNA',
531                                  \&id_seq_pair_bundler,
532                                  \&tab_delimited_output_parser, $params, $req_bytes);
533    }
534    
535    ###### Utility Methods ######
536    
537    sub run_query
538    {
539        my($self, $function, @args ) = @_;
540        my $form = [function  => $function,
541                    args => YAML::Dump(\@args),
542                    ];
543        return $self->run_query_form($form);
544    }
545    
546    sub run_query_form
547    {
548        my($self, $form, $raw) = @_;
549    
550        my $res = $self->{ua}->post($self->{server_url}, $form);
551    
552        if ($res->is_success)
553        {
554            my $content = $res->content;
555            if ($raw)
556            {
557                return $content;
558            }
559    
560    #       print "Got $content\n";
561            my $ret;
562            eval {
563                $ret = Load($content);
564            };
565            if ($@)
566            {
567                die "Query returned unparsable content ($@): " . $content;
568            }
569            return $ret;
570        }
571        else
572        {
573            die "error on post " . $res->status_line . " " . $res->content;
574        }
575  }  }
576    
577  sub id_seq_pair_bundler  sub id_seq_pair_bundler
# Line 132  Line 605 
605    
606  sub new  sub new
607  {  {
608      my($class, $work_queue, $server_url, $function, $input_bundler, $output_parser, $form_vars) = @_;      my($class, $work_queue, $server_url, $function, $input_bundler, $output_parser, $form_vars, $req_bytes) = @_;
609    
610      my $self = {      my $self = {
611          work_queue => $work_queue,          work_queue => $work_queue,
# Line 143  Line 616 
616          ua => LWP::UserAgent->new(),          ua => LWP::UserAgent->new(),
617          cur_result => undef,          cur_result => undef,
618          form_vars => $form_vars ? $form_vars : [],          form_vars => $form_vars ? $form_vars : [],
619            req_bytes => ($req_bytes ? $req_bytes : 16000),
620      };      };
621        $self->{ua}->timeout(20 * 60);
622      return bless $self, $class;      return bless $self, $class;
623  }  }
624    
# Line 151  Line 626 
626  {  {
627      my($self) = @_;      my($self) = @_;
628    
629      if ($self->{cur_result})      my $res =  $self->get_next_from_result();
630        # print "gnfr returns: " , Dumper($res);
631    
632        if ($res)
633      {      {
634          return $self->get_next_from_result();          return $res;
635      }      }
636      else      else
637      {      {
638          my @inp = $self->{work_queue}->get_next_n_bytes(16000);  
639          if (@inp)          while (my @inp = $self->{work_queue}->get_next_n_bytes($self->{req_bytes}))
640          {          {
641              my $form = [@{$self->{form_vars}}];              my $form = [@{$self->{form_vars}}];
642              push(@$form, function => $self->{function},              push(@$form, function => $self->{function},
643                           map { &{$self->{input_bundler}}($_) } @inp);                           map { &{$self->{input_bundler}}($_) } @inp);
644              print "Invoke " .Dumper($form);              # print "Invoke " .Dumper($form);
645    
646              my $res = $self->{ua}->post($self->{server_url}, $form);              my $res = $self->{ua}->post($self->{server_url}, $form);
647              if ($res->is_success)              if ($res->is_success)
648              {              {
649                  $self->{cur_result} = $res->content;                  eval {
650                  #print "res: $self->{cur_result}\n";                      $self->{cur_result} = [YAML::Load($res->content)];
651                  return $self->get_next_from_result();                  };
652                    if ($@)
653                    {
654                        die "Query returned unparsable content ($@): " . $res->content;
655              }              }
656              else                  # print "res: " . Dumper($self->{cur_result});
657                    my $oneres =  $self->get_next_from_result();
658                    if ($oneres)
659              {              {
660                  die "error " . $res->status_line . " on post " . $res->content;                      return $oneres;
661              }              }
662          }          }
663          else          else
664          {          {
665              return;                  die "error " . $res->status_line . " on post " . $res->content;
666          }          }
667      }      }
668            return;
669        }
670  }  }
671    
672  sub get_next_from_result  sub get_next_from_result
673  {  {
674      my($self) = @_;      my($self) = @_;
675      if ($self->{cur_result} =~ s/^([^\n]*)\n//)      my $l = $self->{cur_result};
676        if ($l and @$l)
677        {
678            return shift(@$l);
679        }
680        else
681      {      {
682          return &{$self->{output_parser}}($1);          delete $self->{cur_result};
683            return undef;
684      }      }
685  }  }
686    
# Line 251  Line 742 
742  use strict;  use strict;
743  use base 'SequenceWorkQueue';  use base 'SequenceWorkQueue';
744  use FileHandle;  use FileHandle;
 use FIG;  
745    
746  sub new  sub new
747  {  {
# Line 278  Line 768 
768  {  {
769      my($self) = @_;      my($self) = @_;
770    
771      my($id, $seqp, $com) = &FIG::read_fasta_record($self->{fh});      my($id, $seqp, $com) = read_fasta_record($self->{fh});
772      return defined($id) ? ($id, $com, $seqp) : ();      return defined($id) ? ($id, $com, $seqp) : ();
773  }  }
774    
775    sub read_fasta_record {
776        my ($file_handle) = @_;
777        my ($old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record);
778    
779        if (not defined($file_handle))  { $file_handle = \*STDIN; }
780    
781        $old_end_of_record = $/;
782        $/ = "\n>";
783    
784        if (defined($fasta_record = <$file_handle>)) {
785            chomp $fasta_record;
786            @lines  =  split( /\n/, $fasta_record );
787            $head   =  shift @lines;
788            $head   =~ s/^>?//;
789            $head   =~ m/^(\S+)/;
790            $seq_id = $1;
791            if ($head  =~ m/^\S+\s+(.*)$/)  { $comment = $1; } else { $comment = ""; }
792            $sequence  =  join( "", @lines );
793            @parsed_fasta_record = ( $seq_id, \$sequence, $comment );
794        } else {
795            @parsed_fasta_record = ();
796        }
797    
798        $/ = $old_end_of_record;
799    
800        return @parsed_fasta_record;
801    }
802    
803  package SequenceListWorkQueue;  package SequenceListWorkQueue;
804  use strict;  use strict;
805  use base 'SequenceWorkQueue';  use base 'SequenceWorkQueue';

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.23

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3