[Bio] / FigKernelPackages / FFserver.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FFserver.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.11, Thu May 21 21:53:17 2009 UTC revision 1.22, Mon Dec 7 21:11:52 2009 UTC
# Line 1  Line 1 
   
1  package FFserver;  package FFserver;
2    
3    #
4    # This is a SAS Component
5    #
6    
7    =head1 FIGfam Server Helper Object
8    
9    This module is used to call the FIGfam server, which is a general-purpose
10    server for extracting data from the FIGfams database. Each FIGfam server
11    function correspond to a method of this object.
12    
13    This package deliberately uses no internal SEED packages or scripts, only common
14    PERL modules.
15    
16    =cut
17    
18  use LWP::UserAgent;  use LWP::UserAgent;
19  use Data::Dumper;  use Data::Dumper;
20  use YAML;  use YAML;
# Line 18  Line 32 
32          server_url => $server_url,          server_url => $server_url,
33          ua => LWP::UserAgent->new(),          ua => LWP::UserAgent->new(),
34      };      };
35      $self->{ua}->timeout(10 * 60);      $self->{ua}->timeout(20 * 60);
36    
37      return bless $self, $class;      return bless $self, $class;
38  }  }
39    
40    =head2 Functions
41    
42    =head3 members_of_families
43    
44        my $document = $ffObject->members_of_families(@ids);
45    
46    Return the function and a list of the members for each specified family.
47    
48    =over 4
49    
50    =item ids
51    
52    A list of FIGfam IDs.
53    
54    =item RETURN
55    
56    Returns a reference to a list of 3-tuples. Each 3-tuple will consist of a FIGfam
57    family ID followed by the family's function and a sub-list of all the FIG feature
58    IDs for the features in the family.
59    
60    =back
61    
62    =cut
63    
64  sub members_of_families  sub members_of_families
65  {  {
66      my($self, @ids) = @_;      my($self, @ids) = @_;
67      return $self->run_query('members_of_families', @ids);      return $self->run_query('members_of_families', @ids);
68  }  }
69    
70    =head3 families_containing_peg
71    
72        my $document = $ffObject->families_containing_peg(@ids);
73    
74    Return the FIGfams containing the specified features.
75    
76    =over 4
77    
78    =item ids
79    
80    A list of FIG feature IDs.
81    
82    =item RETURN
83    
84    Returns a list of 2-tuples, each consisting of an incoming feature ID
85    followed by a list of FIGfam IDs for the families containing the incoming
86    feature.
87    
88    =back
89    
90    =cut
91    
92  sub families_containing_peg  sub families_containing_peg
93  {  {
94      my($self, @ids) = @_;      my($self, @ids) = @_;
95      return $self->run_query('families_containing_peg', @ids);      return $self->run_query('families_containing_peg', @ids);
96  }  }
97    
98  sub function_of  =head3 families_implementing_role
 {  
     my($self, @ids) = @_;  
     return $self->run_query('function_of', @ids);  
 }  
99    
100  sub org_of      my $document = $ffObject->families_implementing_role(@roles);
 {  
     my($self, @ids) = @_;  
     return $self->run_query('org_of', @ids);  
 }  
101    
102  sub seq_of  Return the FIGfams that implement the specified roles. Each FIGfam has
103  {  a single function associated with it, but the function may involve
104      my($self, @ids) = @_;  multiple roles, or may include comments. The role is therefore a more
105      return $self->run_query('seq_of', @ids);  compact string than the function.
 }  
106    
107  sub aliases_of  =over 4
108  {  
109      my($self, @ids) = @_;  =item roles
110      return $self->run_query('aliases_of', @ids);  
111  }  A list of role names.
112    
113    =item RETURN
114    
115    Returns a list of 2-tuples, each consisting of an incoming role name
116    followed by a list of FIGfam IDs for the families that implement the
117    incoming role.
118    
119    =back
120    
121    =cut
122    
123  sub families_implementing_role  sub families_implementing_role
124  {  {
# Line 65  Line 126 
126      return $self->run_query('families_implementing_role', @roles);      return $self->run_query('families_implementing_role', @roles);
127  }  }
128    
129    =head3 families_with_function
130    
131        my $document = $ffObject->families_with_function(@functions);
132    
133    Return the FIGfams that belong to the specified functions. Each FIGfam has
134    a single function associated with it, but the function may involve
135    multiple roles, or may include comments. The function is therefore a
136    more specific string than the role.
137    
138    =over 4
139    
140    =item functions
141    
142    A list of functional roles.
143    
144    =item RETURN
145    
146    Returns a list of 2-tuples, each consisting of an incoming role name
147    followed by a list of FIGfam IDs for the families associated with the
148    incoming function.
149    
150    =back
151    
152    =cut
153    
154  sub families_with_function  sub families_with_function
155  {  {
156      my($self,@functions) = @_;      my($self,@functions) = @_;
157      return $self->run_query('families_with_function', @functions);      return $self->run_query('families_with_function', @functions);
158  }  }
159    
160    =head3 families_in_genome
161    
162        my $document = $ffObject->families_in_genome(@genomes);
163    
164    Return the FIGfams that have members in the specified genomes.
165    
166    =over 4
167    
168    =item genomes
169    
170    A list of genome IDs.
171    
172    =item RETURN
173    
174    Returns a list of 2-tuples, each consisting of an incoming genome ID
175    followed by a list of FIGfam IDs for the families that have members in
176    that genome.
177    
178    =back
179    
180    =cut
181    
182  sub families_in_genome  sub families_in_genome
183  {  {
184      my($self,@genomes) = @_;      my($self,@genomes) = @_;
185      return $self->run_query('families_in_genome', @genomes);      return $self->run_query('families_in_genome', @genomes);
186  }  }
187    
188    =head3 get_subsystem_based_figfams
189    
190        my $document = $ffObject->get_subsystem_based_figfams();
191    
192    Return a list of the FIGfams derived from subsystems.
193    
194    =over 4
195    
196    =item RETURN
197    
198    Returns a reference to a list of the IDs for the FIGfams derived from subsystems.
199    
200    =back
201    
202    =cut
203    
204  sub get_subsystem_based_figfams  sub get_subsystem_based_figfams
205  {  {
206      my ($self) = @_;      my ($self) = @_;
207      return $self->run_query('get_subsystem_based_figfams');      return $self->run_query('get_subsystem_based_figfams');
208  }  }
209    
210  sub should_be_member  ##=head3 should_be_member
211  {  ##
212      my($self, @id_seq_pairs) = @_;  ##    my $document = $ffObject->should_be_member(@id_seq_pairs);
213      return $self->run_query('should_be_member', @id_seq_pairs);  ##
214  }  ##Determine whether a particular protein sequence belongs in a particular
215    ##FIGfam. This method takes as input multiple FIGfam/sequence pairs and
216    ##performs a determination for each.
217    ##
218    ##=over 4
219    ##
220    ##=item id_seq_pairs
221    ##
222    ##A list of 2-tuples, each consisting of a FIGfam ID followed
223    ##by a protein sequence string.
224    ##
225    ##=item RETURN
226    ##
227    ##Returns a reference to a list of boolean flags, one per input pair. For each
228    ##input pair, the flag will be C<1> if the sequence should be in the FIGfam and
229    ##C<0> otherwise.
230    ##
231    ##=back
232    ##
233    ##=cut
234    ##
235    ##sub should_be_member
236    ##{
237    ##    my($self, @id_seq_pairs) = @_;
238    ##    return $self->run_query('should_be_member', @id_seq_pairs);
239    ##}
240    
241    =head3 all_families
242    
243        my $document = $ffObject->all_families();
244    
245    Return a list of the IDs for all the FIGfams in the system.
246    
247    =over 4
248    
249    =item RETURN
250    
251    Returns a reference to a list of the IDs for all the FIGfams in the system.
252    
253    =back
254    
255    =cut
256    
257  sub all_families  sub all_families
258  {  {
# Line 95  Line 260 
260      return $self->run_query('all_families');      return $self->run_query('all_families');
261  }  }
262    
263  sub run_query  =head3 assign_function_to_prot
 {  
     my($self, $function, @args ) = @_;  
     my $form = [function  => $function,  
                 args => YAML::Dump(@args),  
                 ];  
     return $self->run_query_form($form);  
 }  
264    
265  sub run_query_form      my $document = $ffObject->assign_function_to_prot($input, $blast, $min_hits, $assignToAll);
 {  
     my($self, $form) = @_;  
266    
267      my $res = $self->{ua}->post($self->{server_url}, $form);  For each incoming protein sequence, attempt to place it in a FIGfam. If a
268    suitable FIGfam can be found for a particular sequence, the FIGfam ID and
269    its functional assignment will be returned.
270    
271      if ($res->is_success)  =over 4
272      {  
273          return Load($res->content);  =item input
274      }  
275      else  Either (1) an open input handle to a file containing the proteins in FASTA format,
276      {  or (2) a reference to a list of FASTA strings for the proteins.
277          die "error on post " . $res->status_line . " " . $res->content;  
278      }  =item blast
279  }  
280    If nonzero, then when a protein is placed into a FIGfam, a BLAST will be performed
281    afterward, and the top I<N> hits (where I<N> is the value of this parameter)
282    will be returned as part of the protein's output tuple.
283    
284    =item min_hits
285    
286    A number from 1 to 10, indicating the minimum number of matches required to
287    consider a protein as a candidate for assignment to a FIGfam. A higher value
288    indicates a more reliable matching algorithm; the default is C<3>.
289    
290    =item assign_to_all
291    
292    If TRUE, then if the standard matching algorithm fails to assign a protein,
293    a BLAST will be used. The BLAST is slower, but is capable of placing more
294    proteins than the normal algorithm.
295    
296    =item RETURN
297    
298    Returns a Result Handler. Call C<get_next> on the result handler to get back a data
299    item. Each item sent back by the result handler is a 2-tuple containing the
300    incoming protein sequence and a reference to a list consisting of the proposed
301    functional assignment for the protein, the name of the Genome Set from which the
302    protein is likely to have originated (if known), a list of BLAST hits (if
303    requested), and the number of matches for the protein found in the FIGfam. If no
304    assignment could be made for a particular protein, it will not appear in the
305    output stream.
306    
307    =back
308    
309    =cut
310    
311  sub assign_function_to_prot  sub assign_function_to_prot
312  {  {
313      my($self, $input, $blast, $min_hits) = @_;      my($self, $input, $blast, $min_hits, $assignToAll) = @_;
314    
315      my $wq;      my $wq;
316    
317      my $params = [blast => $blast, min_hits => $min_hits];      my $params = [blast => $blast, min_hits => $min_hits, assign_to_all => ($assignToAll ? 1 : 0)];
318    
319      if (ref($input) eq 'ARRAY')      if (ref($input) eq 'ARRAY')
320      {      {
# Line 137  Line 325 
325          $wq = FastaWorkQueue->new($input);          $wq = FastaWorkQueue->new($input);
326      }      }
327    
328      my $req_bytes = $blast ? 1000 : 16000;      my $req_bytes = $blast ? 1000 : 1_000_000;
329    
330      return ResultHandler->new($wq, $self->{server_url}, 'assign_function_to_prot', \&id_seq_pair_bundler,      return ResultHandler->new($wq, $self->{server_url}, 'assign_function_to_prot', \&id_seq_pair_bundler,
331                                #\&tab_delimited_output_parser,                                #\&tab_delimited_output_parser,
# Line 145  Line 333 
333                                $params, $req_bytes);                                $params, $req_bytes);
334  }  }
335    
336    =head3 call_genes
337    
338        my $document = $ffObject->call_genes($input, $genetic_code);
339    
340    Call the protein-encoding genes for the specified DNA sequences. The result will
341    be a multi-sequence FASTA string listing all the proteins found and a hash mapping
342    each gene found to its location string.
343    
344    =over 4
345    
346    =item input
347    
348    Open input handle to a file containing the DNA sequences in FASTA format.
349    
350    =item genetic_code
351    
352    The numeric code for the mapping from DNA to amino acids. The default is C<11>,
353    which is the standard mapping and should be used in almost all cases. A complete
354    list of mapping codes can be found at
355    L<http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>.
356    
357    =item RETURN
358    
359    Returns a 2-tuple consisting of a FASTA string for all the proteins found
360    followed by a reference to a list of genes found. Each gene found will be
361    represented by a 4-tuple containing an ID for the gene, the ID of the contig
362    containing it, the starting offset, and the ending offset.
363    
364    =back
365    
366    =cut
367    
368    sub call_genes
369    {
370        my($self, $input, $genetic_code) = @_;
371    
372        if (ref($input) ne 'ARRAY')
373        {
374            my $fh;
375            if (ref($input))
376            {
377                $fh = $input;
378            }
379            else
380            {
381                my $fasta_file = $input;
382                open($fh, "<", $fasta_file);
383            }
384            $input = [];
385            while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh))
386            {
387                push(@$input, "$id,$$seqp");
388            }
389            close($fh);
390        }
391    
392        return $self->run_query_form([function => "call_genes",
393                                      genetic_code => $genetic_code,
394                                      id_seq => $input]);
395    }
396    
397    =head3 find_rnas
398    
399        my $document = $ffObject->find_rnas($input, $genus, $species, $domain);
400    
401    Call the RNAs for the specified DNA sequences. The result will be a
402    multi-sequence FASTA string listing all the RNAs found and a hash mapping
403    each RNA to its location string.
404    
405    =over 4
406    
407    =item input
408    
409    Open input handle to a file containing the DNA sequences in FASTA format.
410    
411    =item genus
412    
413    Common name of the genus for this DNA.
414    
415    =item species
416    
417    Common name of the species for this DNA.
418    
419    =item domain
420    
421    Domain of this DNA. The default is C<Bacteria>.
422    
423    =item RETURN
424    
425    Returns a 2-tuple consisting of a FASTA string for all the RNAs found
426    followed by reference to a list of RNAs found. Each RNA will be represented by
427    a 4-tuple consisting of an ID for the RNA, the ID of the contig containing it, its
428    starting offset, and its ending offset.
429    
430    =back
431    
432    =cut
433    
434    sub find_rnas
435    {
436        my($self, $input, $genus, $species, $domain) = @_;
437    
438        if (ref($input) ne 'ARRAY')
439        {
440            my $fh;
441            if (ref($input))
442            {
443                $fh = $input;
444            }
445            else
446            {
447                my $fasta_file = $input;
448                open($fh, "<", $fasta_file);
449            }
450            $input = [];
451            while (my($id, $seqp, $com) = FastaWorkQueue::read_fasta_record($fh))
452            {
453                push(@$input, "$id,$$seqp");
454            }
455            close($fh);
456        }
457    
458        return $self->run_query_form([function => "find_rnas",
459                                      genus => $genus,
460                                      species => $species,
461                                      domain => $domain,
462                                      id_seq => $input]);
463    }
464    
465    =head3 assign_functions_to_DNA
466    
467        my $document = $ffObject->assign_functions_to_DNA($input, $blast, $min_hits, $max_gap);
468    
469    Analyze DNA sequences and output regions that probably belong to FIGfams.
470    The selected regions will be high-probability candidates for protein
471    production.
472    
473    =over 4
474    
475    =item input
476    
477    Either (1) an open input handle to a file containing the DNA sequences in FASTA format,
478    or (2) a reference to a list of FASTA strings for the DNA sequences.
479    
480    =item blast
481    
482    If nonzero, then when a protein is placed into a FIGfam, a BLAST will be performed
483    afterward, and the top I<N> hits (where I<N> is the value of this parameter)
484    will be returned as part of each protein's output tuple.
485    
486    =item min_hits
487    
488    A number from 1 to 10, indicating the minimum number of matches required to
489    consider a protein as a candidate for assignment to a FIGfam. A higher value
490    indicates a more reliable matching algorithm; the default is C<3>.
491    
492    =item max_gap
493    
494    When looking for a match, if two sequence elements match and are closer than
495    this distance, then they will be considered part of a single match. Otherwise,
496    the match will be split. The default is C<600>.
497    
498    =item RETURN
499    
500    Returns a Result Handler. Call C<get_next> on the result handler to get back a data
501    item. Each item sent back by the result handler is a 2-tuple containing the
502    incoming protein sequence and a reference to a list of hit regions. Each hit
503    region is a 6-tuple consisting of the number of matches to the FIGfam, the start
504    location, the stop location, the proposed functional assignment, the name of the
505    Genome Set from which the gene is likely to have originated, and a list of BLAST
506    hits. If the I<blast> option is not specified, the list of BLAST hits will be
507    empty.
508    
509    =back
510    
511    =cut
512    
513  sub assign_functions_to_dna  sub assign_functions_to_dna
514  {  {
515      my($self, $input, $min_hits, $max_gap, $blast) = @_;      my($self, $input, $min_hits, $max_gap, $blast) = @_;
516    
517        $min_hits = 3 unless defined($min_hits);
518        $max_gap = 600 unless defined($max_gap);
519        $blast = 0 unless defined($blast);
520    
521      my $wq;      my $wq;
522    
523      if (ref($input) eq 'ARRAY')      if (ref($input) eq 'ARRAY')
# Line 160  Line 529 
529          $wq = FastaWorkQueue->new($input);          $wq = FastaWorkQueue->new($input);
530      }      }
531    
532      my $req_bytes = $blast ? 1000 : 16000;      my $req_bytes = $blast ? 1000 : 500000;
533      my $params = [min_hits => $min_hits, max_gap => $max_gap, blast => $blast];      my $params = [min_hits => $min_hits, max_gap => $max_gap, blast => $blast];
534      return ResultHandler->new($wq, $self->{server_url}, 'assign_functions_to_DNA',      return ResultHandler->new($wq, $self->{server_url}, 'assign_functions_to_DNA',
535                                \&id_seq_pair_bundler,                                \&id_seq_pair_bundler,
536                                \&tab_delimited_output_parser, $params, $req_bytes);                                \&tab_delimited_output_parser, $params, $req_bytes);
537  }  }
538    
539    ###### Utility Methods ######
540    
541    sub run_query
542    {
543        my($self, $function, @args ) = @_;
544        my $form = [function  => $function,
545                    args => YAML::Dump(\@args),
546                    ];
547        return $self->run_query_form($form);
548    }
549    
550    sub run_query_form
551    {
552        my($self, $form, $raw) = @_;
553    
554        my $res = $self->{ua}->post($self->{server_url}, $form);
555    
556        if ($res->is_success)
557        {
558            my $content = $res->content;
559            if ($raw)
560            {
561                return $content;
562            }
563    
564    #       print "Got $content\n";
565            my $ret;
566            eval {
567                $ret = Load($content);
568            };
569            if ($@)
570            {
571                die "Query returned unparsable content ($@): " . $content;
572            }
573            return $ret;
574        }
575        else
576        {
577            die "error on post " . $res->status_line . " " . $res->content;
578        }
579    }
580    
581  sub id_seq_pair_bundler  sub id_seq_pair_bundler
582  {  {
583      my($item) = @_;      my($item) = @_;
# Line 209  Line 620 
620          ua => LWP::UserAgent->new(),          ua => LWP::UserAgent->new(),
621          cur_result => undef,          cur_result => undef,
622          form_vars => $form_vars ? $form_vars : [],          form_vars => $form_vars ? $form_vars : [],
623          req_byts => ($req_bytes ? $req_bytes : 16000),          req_bytes => ($req_bytes ? $req_bytes : 16000),
624      };      };
625        $self->{ua}->timeout(20 * 60);
626      return bless $self, $class;      return bless $self, $class;
627  }  }
628    
# Line 219  Line 631 
631      my($self) = @_;      my($self) = @_;
632    
633      my $res =  $self->get_next_from_result();      my $res =  $self->get_next_from_result();
634        # print "gnfr returns: " , Dumper($res);
635    
636      if ($res)      if ($res)
637      {      {
# Line 226  Line 639 
639      }      }
640      else      else
641      {      {
642          my @inp = $self->{work_queue}->get_next_n_bytes(16000);  
643          if (@inp)          while (my @inp = $self->{work_queue}->get_next_n_bytes($self->{req_bytes}))
644          {          {
645              my $form = [@{$self->{form_vars}}];              my $form = [@{$self->{form_vars}}];
646              push(@$form, function => $self->{function},              push(@$form, function => $self->{function},
# Line 237  Line 650 
650              my $res = $self->{ua}->post($self->{server_url}, $form);              my $res = $self->{ua}->post($self->{server_url}, $form);
651              if ($res->is_success)              if ($res->is_success)
652              {              {
653                    eval {
654                  $self->{cur_result} = [YAML::Load($res->content)];                  $self->{cur_result} = [YAML::Load($res->content)];
655                  #print "res: $self->{cur_result}\n";                  };
656                  return $self->get_next_from_result();                  if ($@)
657                    {
658                        die "Query returned unparsable content ($@): " . $res->content;
659              }              }
660              else                  # print "res: " . Dumper($self->{cur_result});
661                    my $oneres =  $self->get_next_from_result();
662                    if ($oneres)
663              {              {
664                  die "error " . $res->status_line . " on post " . $res->content;                      return $oneres;
665              }              }
666          }          }
667          else          else
668          {          {
669              return;                  die "error " . $res->status_line . " on post " . $res->content;
670          }          }
671      }      }
672            return;
673        }
674  }  }
675    
676  sub get_next_from_result  sub get_next_from_result
# Line 326  Line 746 
746  use strict;  use strict;
747  use base 'SequenceWorkQueue';  use base 'SequenceWorkQueue';
748  use FileHandle;  use FileHandle;
 use FIG;  
749    
750  sub new  sub new
751  {  {
# Line 353  Line 772 
772  {  {
773      my($self) = @_;      my($self) = @_;
774    
775      my($id, $seqp, $com) = &FIG::read_fasta_record($self->{fh});      my($id, $seqp, $com) = read_fasta_record($self->{fh});
776      return defined($id) ? ($id, $com, $seqp) : ();      return defined($id) ? ($id, $com, $seqp) : ();
777  }  }
778    
779    sub read_fasta_record {
780        my ($file_handle) = @_;
781        my ($old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record);
782    
783        if (not defined($file_handle))  { $file_handle = \*STDIN; }
784    
785        $old_end_of_record = $/;
786        $/ = "\n>";
787    
788        if (defined($fasta_record = <$file_handle>)) {
789            chomp $fasta_record;
790            @lines  =  split( /\n/, $fasta_record );
791            $head   =  shift @lines;
792            $head   =~ s/^>?//;
793            $head   =~ m/^(\S+)/;
794            $seq_id = $1;
795            if ($head  =~ m/^\S+\s+(.*)$/)  { $comment = $1; } else { $comment = ""; }
796            $sequence  =  join( "", @lines );
797            @parsed_fasta_record = ( $seq_id, \$sequence, $comment );
798        } else {
799            @parsed_fasta_record = ();
800        }
801    
802        $/ = $old_end_of_record;
803    
804        return @parsed_fasta_record;
805    }
806    
807  package SequenceListWorkQueue;  package SequenceListWorkQueue;
808  use strict;  use strict;
809  use base 'SequenceWorkQueue';  use base 'SequenceWorkQueue';

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.22

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3