[Bio] / FigKernelPackages / FFserver.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FFserver.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.13, Thu Jun 4 21:23:01 2009 UTC revision 1.23, Thu Dec 17 20:48:21 2009 UTC
# Line 1  Line 1 
   
1  package FFserver;  package FFserver;
2    
3    =head1 FIGfam Server Helper Object
4    
5    This module is used to call the FIGfam server, which is a general-purpose
6    server for extracting data from the FIGfams database. Each FIGfam server
7    function correspond to a method of this object.
8    
9    This package deliberately uses no internal SEED packages or scripts, only common
10    PERL modules.
11    
12    =cut
13    
14  use LWP::UserAgent;  use LWP::UserAgent;
15  use Data::Dumper;  use Data::Dumper;
16  use YAML;  use YAML;
# Line 18  Line 28 
28          server_url => $server_url,          server_url => $server_url,
29          ua => LWP::UserAgent->new(),          ua => LWP::UserAgent->new(),
30      };      };
31      $self->{ua}->timeout(10 * 60);      $self->{ua}->timeout(20 * 60);
32    
33      return bless $self, $class;      return bless $self, $class;
34  }  }
35    
36    =head2 Functions
37    
38    =head3 members_of_families
39    
40        my $document = $ffObject->members_of_families(@ids);
41    
42    Return the function and a list of the members for each specified family.
43    
44    =over 4
45    
46    =item ids
47    
48    A list of FIGfam IDs.
49    
50    =item RETURN
51    
52    Returns a reference to a list of 3-tuples. Each 3-tuple will consist of a FIGfam
53    family ID followed by the family's function and a sub-list of all the FIG feature
54    IDs for the features in the family.
55    
56    =back
57    
58    =cut
59    
60  sub members_of_families  sub members_of_families
61  {  {
62      my($self, @ids) = @_;      my($self, @ids) = @_;
63      return $self->run_query('members_of_families', @ids);      return $self->run_query('members_of_families', @ids);
64  }  }
65    
66    =head3 families_containing_peg
67    
68        my $document = $ffObject->families_containing_peg(@ids);
69    
70    Return the FIGfams containing the specified features.
71    
72    =over 4
73    
74    =item ids
75    
76    A list of FIG feature IDs.
77    
78    =item RETURN
79    
80    Returns a list of 2-tuples, each consisting of an incoming feature ID
81    followed by a list of FIGfam IDs for the families containing the incoming
82    feature.
83    
84    =back
85    
86    =cut
87    
88  sub families_containing_peg  sub families_containing_peg
89  {  {
90      my($self, @ids) = @_;      my($self, @ids) = @_;
91      return $self->run_query('families_containing_peg', @ids);      return $self->run_query('families_containing_peg', @ids);
92  }  }
93    
94  sub function_of  =head3 families_implementing_role
 {  
     my($self, @ids) = @_;  
     return $self->run_query('function_of', @ids);  
 }  
95    
96  sub org_of      my $document = $ffObject->families_implementing_role(@roles);
 {  
     my($self, @ids) = @_;  
     return $self->run_query('org_of', @ids);  
 }  
97    
98  sub seq_of  Return the FIGfams that implement the specified roles. Each FIGfam has
99  {  a single function associated with it, but the function may involve
100      my($self, @ids) = @_;  multiple roles, or may include comments. The role is therefore a more
101      return $self->run_query('seq_of', @ids);  compact string than the function.
 }  
102    
103  sub aliases_of  =over 4
104  {  
105      my($self, @ids) = @_;  =item roles
106      return $self->run_query('aliases_of', @ids);  
107  }  A list of role names.
108    
109    =item RETURN
110    
111    Returns a list of 2-tuples, each consisting of an incoming role name
112    followed by a list of FIGfam IDs for the families that implement the
113    incoming role.
114    
115    =back
116    
117    =cut
118    
119  sub families_implementing_role  sub families_implementing_role
120  {  {
# Line 65  Line 122 
122      return $self->run_query('families_implementing_role', @roles);      return $self->run_query('families_implementing_role', @roles);
123  }  }
124    
125    =head3 families_with_function
126    
127        my $document = $ffObject->families_with_function(@functions);
128    
129    Return the FIGfams that belong to the specified functions. Each FIGfam has
130    a single function associated with it, but the function may involve
131    multiple roles, or may include comments. The function is therefore a
132    more specific string than the role.
133    
134    =over 4
135    
136    =item functions
137    
138    A list of functional roles.
139    
140    =item RETURN
141    
142    Returns a list of 2-tuples, each consisting of an incoming role name
143    followed by a list of FIGfam IDs for the families associated with the
144    incoming function.
145    
146    =back
147    
148    =cut
149    
150  sub families_with_function  sub families_with_function
151  {  {
152      my($self,@functions) = @_;      my($self,@functions) = @_;
153      return $self->run_query('families_with_function', @functions);      return $self->run_query('families_with_function', @functions);
154  }  }
155    
156    =head3 families_in_genome
157    
158        my $document = $ffObject->families_in_genome(@genomes);
159    
160    Return the FIGfams that have members in the specified genomes.
161    
162    =over 4
163    
164    =item genomes
165    
166    A list of genome IDs.
167    
168    =item RETURN
169    
170    Returns a list of 2-tuples, each consisting of an incoming genome ID
171    followed by a list of FIGfam IDs for the families that have members in
172    that genome.
173    
174    =back
175    
176    =cut
177    
178  sub families_in_genome  sub families_in_genome
179  {  {
180      my($self,@genomes) = @_;      my($self,@genomes) = @_;
181      return $self->run_query('families_in_genome', @genomes);      return $self->run_query('families_in_genome', @genomes);
182  }  }
183    
184    =head3 get_subsystem_based_figfams
185    
186        my $document = $ffObject->get_subsystem_based_figfams();
187    
188    Return a list of the FIGfams derived from subsystems.
189    
190    =over 4
191    
192    =item RETURN
193    
194    Returns a reference to a list of the IDs for the FIGfams derived from subsystems.
195    
196    =back
197    
198    =cut
199    
200  sub get_subsystem_based_figfams  sub get_subsystem_based_figfams
201  {  {
202      my ($self) = @_;      my ($self) = @_;
203      return $self->run_query('get_subsystem_based_figfams');      return $self->run_query('get_subsystem_based_figfams');
204  }  }
205    
206  sub should_be_member  ##=head3 should_be_member
207  {  ##
208      my($self, @id_seq_pairs) = @_;  ##    my $document = $ffObject->should_be_member(@id_seq_pairs);
209      return $self->run_query('should_be_member', @id_seq_pairs);  ##
210  }  ##Determine whether a particular protein sequence belongs in a particular
211    ##FIGfam. This method takes as input multiple FIGfam/sequence pairs and
212    ##performs a determination for each.
213    ##
214    ##=over 4
215    ##
216    ##=item id_seq_pairs
217    ##
218    ##A list of 2-tuples, each consisting of a FIGfam ID followed
219    ##by a protein sequence string.
220    ##
221    ##=item RETURN
222    ##
223    ##Returns a reference to a list of boolean flags, one per input pair. For each
224    ##input pair, the flag will be C<1> if the sequence should be in the FIGfam and
225    ##C<0> otherwise.
226    ##
227    ##=back
228    ##
229    ##=cut
230    ##
231    ##sub should_be_member
232    ##{
233    ##    my($self, @id_seq_pairs) = @_;
234    ##    return $self->run_query('should_be_member', @id_seq_pairs);
235    ##}
236    
237    =head3 all_families
238    
239        my $document = $ffObject->all_families();
240    
241    Return a list of the IDs for all the FIGfams in the system.
242    
243    =over 4
244    
245    =item RETURN
246    
247    Returns a reference to a list of the IDs for all the FIGfams in the system.
248    
249    =back
250    
251    =cut
252    
253  sub all_families  sub all_families
254  {  {
# Line 95  Line 256 
256      return $self->run_query('all_families');      return $self->run_query('all_families');
257  }  }
258    
259  sub run_query  =head3 assign_function_to_prot
 {  
     my($self, $function, @args ) = @_;  
     my $form = [function  => $function,  
                 args => YAML::Dump(@args),  
                 ];  
     return $self->run_query_form($form);  
 }  
260    
261  sub run_query_form      my $document = $ffObject->assign_function_to_prot($input, $blast, $min_hits, $assignToAll);
 {  
     my($self, $form) = @_;  
262    
263      my $res = $self->{ua}->post($self->{server_url}, $form);  For each incoming protein sequence, attempt to place it in a FIGfam. If a
264    suitable FIGfam can be found for a particular sequence, the FIGfam ID and
265    its functional assignment will be returned.
266    
267      if ($res->is_success)  =over 4
268      {  
269          my $content = $res->content;  =item input
270  #       print "Got $content\n";  
271          my $ret;  Either (1) an open input handle to a file containing the proteins in FASTA format,
272          eval {  or (2) a reference to a list of FASTA strings for the proteins.
273              $ret = Load($content);  
274          };  =item blast
275          if ($@)  
276          {  If nonzero, then when a protein is placed into a FIGfam, a BLAST will be performed
277              die "Query returned unparsable content ($@): " . $content;  afterward, and the top I<N> hits (where I<N> is the value of this parameter)
278          }  will be returned as part of the protein's output tuple.
279          return $ret;  
280      }  =item min_hits
281      else  
282      {  A number from 1 to 10, indicating the minimum number of matches required to
283          die "error on post " . $res->status_line . " " . $res->content;  consider a protein as a candidate for assignment to a FIGfam. A higher value
284      }  indicates a more reliable matching algorithm; the default is C<3>.
285  }  
286    =item assign_to_all
287    
288    If TRUE, then if the standard matching algorithm fails to assign a protein,
289    a BLAST will be used. The BLAST is slower, but is capable of placing more
290    proteins than the normal algorithm.
291    
292    =item RETURN
293    
294    Returns a Result Handler. Call C<get_next> on the result handler to get back a data
295    item. Each item sent back by the result handler is a 2-tuple containing the
296    incoming protein sequence and a reference to a list consisting of the proposed
297    functional assignment for the protein, the name of the Genome Set from which the
298    protein is likely to have originated (if known), a list of BLAST hits (if
299    requested), and the number of matches for the protein found in the FIGfam. If no
300    assignment could be made for a particular protein, it will not appear in the
301    output stream.
302    
303    =back
304    
305    =cut
306    
307  sub assign_function_to_prot  sub assign_function_to_prot
308  {  {
309      my($self, $input, $blast, $min_hits) = @_;      my($self, $input, $blast, $min_hits, $assignToAll) = @_;
310    
311      my $wq;      my $wq;
312    
313      my $params = [blast => $blast, min_hits => $min_hits];      my $params = [blast => $blast, min_hits => $min_hits, assign_to_all => ($assignToAll ? 1 : 0)];
314    
315      if (ref($input) eq 'ARRAY')      if (ref($input) eq 'ARRAY')
316      {      {
# Line 147  Line 321 
321          $wq = FastaWorkQueue->new($input);          $wq = FastaWorkQueue->new($input);
322      }      }
323    
324      my $req_bytes = $blast ? 1000 : 16000;      my $req_bytes = $blast ? 1000 : 1_000_000;
325    
326      return ResultHandler->new($wq, $self->{server_url}, 'assign_function_to_prot', \&id_seq_pair_bundler,      return ResultHandler->new($wq, $self->{server_url}, 'assign_function_to_prot', \&id_seq_pair_bundler,
327                                #\&tab_delimited_output_parser,                                #\&tab_delimited_output_parser,
# Line 155  Line 329 
329                                $params, $req_bytes);                                $params, $req_bytes);
330  }  }
331    
332    =head3 call_genes
333    
334        my $document = $ffObject->call_genes($input, $genetic_code);
335    
336    Call the protein-encoding genes for the specified DNA sequences. The result will
337    be a multi-sequence FASTA string listing all the proteins found and a hash mapping
338    each gene found to its location string.
339    
340    =over 4
341    
342    =item input
343    
344    Open input handle to a file containing the DNA sequences in FASTA format.
345    
346    =item genetic_code
347    
348    The numeric code for the mapping from DNA to amino acids. The default is C<11>,
349    which is the standard mapping and should be used in almost all cases. A complete
350    list of mapping codes can be found at
351    L<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>.
352    
353    =item RETURN
354    
355    Returns a 2-tuple consisting of a FASTA string for all the proteins found
356    followed by a reference to a list of genes found. Each gene found will be
357    represented by a 4-tuple containing an ID for the gene, the ID of the contig
358    containing it, the starting offset, and the ending offset.
359    
360    =back
361    
362    =cut
363    
364    sub call_genes
365    {
366        my($self, $input, $genetic_code) = @_;
367    
368        if (ref($input) ne 'ARRAY')
369        {
370            my $fh;
371            if (ref($input))
372            {
373                $fh = $input;
374            }
375            else
376            {
377                my $fasta_file = $input;
378                open($fh, "<", $fasta_file);
379            }
380            $input = [];
381            while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh))
382            {
383                push(@$input, "$id,$$seqp");
384            }
385            close($fh);
386        }
387    
388        return $self->run_query_form([function => "call_genes",
389                                      genetic_code => $genetic_code,
390                                      id_seq => $input]);
391    }
392    
393    =head3 find_rnas
394    
395        my $document = $ffObject->find_rnas($input, $genus, $species, $domain);
396    
397    Call the RNAs for the specified DNA sequences. The result will be a
398    multi-sequence FASTA string listing all the RNAs found and a hash mapping
399    each RNA to its location string.
400    
401    =over 4
402    
403    =item input
404    
405    Open input handle to a file containing the DNA sequences in FASTA format.
406    
407    =item genus
408    
409    Common name of the genus for this DNA.
410    
411    =item species
412    
413    Common name of the species for this DNA.
414    
415    =item domain
416    
417    Domain of this DNA. The default is C<Bacteria>.
418    
419    =item RETURN
420    
421    Returns a 2-tuple consisting of a FASTA string for all the RNAs found
422    followed by reference to a list of RNAs found. Each RNA will be represented by
423    a 4-tuple consisting of an ID for the RNA, the ID of the contig containing it, its
424    starting offset, and its ending offset.
425    
426    =back
427    
428    =cut
429    
430    sub find_rnas
431    {
432        my($self, $input, $genus, $species, $domain) = @_;
433    
434        if (ref($input) ne 'ARRAY')
435        {
436            my $fh;
437            if (ref($input))
438            {
439                $fh = $input;
440            }
441            else
442            {
443                my $fasta_file = $input;
444                open($fh, "<", $fasta_file);
445            }
446            $input = [];
447            while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh))
448            {
449                push(@$input, "$id,$$seqp");
450            }
451            close($fh);
452        }
453    
454        return $self->run_query_form([function => "find_rnas",
455                                      genus => $genus,
456                                      species => $species,
457                                      domain => $domain,
458                                      id_seq => $input]);
459    }
460    
461    =head3 assign_functions_to_DNA
462    
463        my $document = $ffObject->assign_functions_to_DNA($input, $blast, $min_hits, $max_gap);
464    
465    Analyze DNA sequences and output regions that probably belong to FIGfams.
466    The selected regions will be high-probability candidates for protein
467    production.
468    
469    =over 4
470    
471    =item input
472    
473    Either (1) an open input handle to a file containing the DNA sequences in FASTA format,
474    or (2) a reference to a list of FASTA strings for the DNA sequences.
475    
476    =item blast
477    
478    If nonzero, then when a protein is placed into a FIGfam, a BLAST will be performed
479    afterward, and the top I<N> hits (where I<N> is the value of this parameter)
480    will be returned as part of each protein's output tuple.
481    
482    =item min_hits
483    
484    A number from 1 to 10, indicating the minimum number of matches required to
485    consider a protein as a candidate for assignment to a FIGfam. A higher value
486    indicates a more reliable matching algorithm; the default is C<3>.
487    
488    =item max_gap
489    
490    When looking for a match, if two sequence elements match and are closer than
491    this distance, then they will be considered part of a single match. Otherwise,
492    the match will be split. The default is C<600>.
493    
494    =item RETURN
495    
496    Returns a Result Handler. Call C<get_next> on the result handler to get back a data
497    item. Each item sent back by the result handler is a 2-tuple containing the
498    incoming protein sequence and a reference to a list of hit regions. Each hit
499    region is a 6-tuple consisting of the number of matches to the FIGfam, the start
500    location, the stop location, the proposed functional assignment, the name of the
501    Genome Set from which the gene is likely to have originated, and a list of BLAST
502    hits. If the I<blast> option is not specified, the list of BLAST hits will be
503    empty.
504    
505    =back
506    
507    =cut
508    
509  sub assign_functions_to_dna  sub assign_functions_to_dna
510  {  {
511      my($self, $input, $min_hits, $max_gap, $blast) = @_;      my($self, $input, $min_hits, $max_gap, $blast) = @_;
# Line 181  Line 532 
532                                \&tab_delimited_output_parser, $params, $req_bytes);                                \&tab_delimited_output_parser, $params, $req_bytes);
533  }  }
534    
535    ###### Utility Methods ######
536    
537    sub run_query
538    {
539        my($self, $function, @args ) = @_;
540        my $form = [function  => $function,
541                    args => YAML::Dump(\@args),
542                    ];
543        return $self->run_query_form($form);
544    }
545    
546    sub run_query_form
547    {
548        my($self, $form, $raw) = @_;
549    
550        my $res = $self->{ua}->post($self->{server_url}, $form);
551    
552        if ($res->is_success)
553        {
554            my $content = $res->content;
555            if ($raw)
556            {
557                return $content;
558            }
559    
560    #       print "Got $content\n";
561            my $ret;
562            eval {
563                $ret = Load($content);
564            };
565            if ($@)
566            {
567                die "Query returned unparsable content ($@): " . $content;
568            }
569            return $ret;
570        }
571        else
572        {
573            die "error on post " . $res->status_line . " " . $res->content;
574        }
575    }
576    
577  sub id_seq_pair_bundler  sub id_seq_pair_bundler
578  {  {
579      my($item) = @_;      my($item) = @_;
# Line 225  Line 618 
618          form_vars => $form_vars ? $form_vars : [],          form_vars => $form_vars ? $form_vars : [],
619          req_bytes => ($req_bytes ? $req_bytes : 16000),          req_bytes => ($req_bytes ? $req_bytes : 16000),
620      };      };
621        $self->{ua}->timeout(20 * 60);
622      return bless $self, $class;      return bless $self, $class;
623  }  }
624    
# Line 348  Line 742 
742  use strict;  use strict;
743  use base 'SequenceWorkQueue';  use base 'SequenceWorkQueue';
744  use FileHandle;  use FileHandle;
 use FIG;  
745    
746  sub new  sub new
747  {  {
# Line 375  Line 768 
768  {  {
769      my($self) = @_;      my($self) = @_;
770    
771      my($id, $seqp, $com) = &FIG::read_fasta_record($self->{fh});      my($id, $seqp, $com) = read_fasta_record($self->{fh});
772      return defined($id) ? ($id, $com, $seqp) : ();      return defined($id) ? ($id, $com, $seqp) : ();
773  }  }
774    
775    sub read_fasta_record {
776        my ($file_handle) = @_;
777        my ($old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record);
778    
779        if (not defined($file_handle))  { $file_handle = \*STDIN; }
780    
781        $old_end_of_record = $/;
782        $/ = "\n>";
783    
784        if (defined($fasta_record = <$file_handle>)) {
785            chomp $fasta_record;
786            @lines  =  split( /\n/, $fasta_record );
787            $head   =  shift @lines;
788            $head   =~ s/^>?//;
789            $head   =~ m/^(\S+)/;
790            $seq_id = $1;
791            if ($head  =~ m/^\S+\s+(.*)$/)  { $comment = $1; } else { $comment = ""; }
792            $sequence  =  join( "", @lines );
793            @parsed_fasta_record = ( $seq_id, \$sequence, $comment );
794        } else {
795            @parsed_fasta_record = ();
796        }
797    
798        $/ = $old_end_of_record;
799    
800        return @parsed_fasta_record;
801    }
802    
803  package SequenceListWorkQueue;  package SequenceListWorkQueue;
804  use strict;  use strict;
805  use base 'SequenceWorkQueue';  use base 'SequenceWorkQueue';

Legend:
Removed from v.1.13  
changed lines
  Added in v.1.23

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3