[Bio] / FigKernelPackages / SAPtutorial.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/SAPtutorial.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4, Wed Aug 5 19:33:24 2009 UTC revision 1.8, Thu Oct 1 23:43:30 2009 UTC
# Line 7  Line 7 
7    
8  The B<Sapling Server> is a web service that allows you to access data stored in  The B<Sapling Server> is a web service that allows you to access data stored in
9  the Sapling database, a large-scale MySQL database of genetic data. The Sapling  the Sapling database, a large-scale MySQL database of genetic data. The Sapling
10  database contains data on public genomes imported from L<RAST|http://rast.nmpdr.org>  database contains data on public genomes imported from I<RAST> (L<http://rast.nmpdr.org>)
11  and curated by the annotation team of the Fellowship for Interpretation of  and curated by the annotation team of the Fellowship for Interpretation of
12  Genomes.  Genomes.
13    
# Line 22  Line 22 
22  where C<$document> is usually a hash reference and C<$args> is B<always> a hash  where C<$document> is usually a hash reference and C<$args> is B<always> a hash
23  reference. The method description includes a section called I<Parameter Hash  reference. The method description includes a section called I<Parameter Hash
24  Fields> that describes the fields in C<$args>. For example, L<SAP/taxonomy_of>  Fields> that describes the fields in C<$args>. For example, L<SAP/taxonomy_of>
25  has a field called C<ids> that is to be a list of genome IDs and an optional  has a field called C<-ids> that is to be a list of genome IDs and an optional
26  field called C<format> that indicates whether you want taxonomy groups  field called C<-format> that indicates whether you want taxonomy groups
27  represented by numbers, names, or both. To call the I<taxonomy_of> service,  represented by numbers, names, or both. To call the I<taxonomy_of> service,
28  you create a B<SAPserver> object and call a method with the same name as the  you create a B<SAPserver> object and call a method with the same name as the
29  service.  service.
# Line 32  Line 32 
32      use SAPserver;      use SAPserver;
33    
34      my $sapServer = SAPserver->new();      my $sapServer = SAPserver->new();
35      my $resultHash = $sapServer->taxonomy_of({ ids => ['360108.3', '100226.1'],      my $resultHash = $sapServer->taxonomy_of({ -ids => ['360108.3', '100226.1'],
36                                                 format => 'names' });                                                 -format => 'names' });
37      for my $id (keys %$resultHash) {      for my $id (keys %$resultHash) {
38          my $taxonomy = $resultHash->{$id};          my $taxonomy = $resultHash->{$id};
39          print "$id: " . join(" ", @$taxonomy) . "\n";          print "$id: " . join(" ", @$taxonomy) . "\n";
# Line 53  Line 53 
53  than a hash reference. So, for example, the above I<taxonomy_of> call could  than a hash reference. So, for example, the above I<taxonomy_of> call could
54  also be written like this.  also be written like this.
55    
56      my $resultHash = $sapServer->taxonomy_of(ids => ['360108.3', '100226.1'],      my $resultHash = $sapServer->taxonomy_of(-ids => ['360108.3', '100226.1'],
57                                               format => 'names');                                               -format => 'names');
58    
59  =head2 A Simple Example: Genome Taxonomies  =head2 A Simple Example: Genome Taxonomies
60    
# Line 87  Line 87 
87    
88      my $sapServer = SAPserver->new();      my $sapServer = SAPserver->new();
89    
90  Now we use I<all_genomes> to get a list of the IDs for complete genomes.  Now we use I<all_genomes> to get a list of the complete genomes.
91  I<all_genomes> will normally return B<all> genome IDs, but we use the  I<all_genomes> will normally return B<all> genomes, but we use the
92  C<complete> option to restrict the output to complete genomes.  C<-complete> option to restrict the output to those that are complete.
93    
94      my $genomeIDs = $sapServer->all_genomes(complete => 1);      my $genomeIDs = $sapServer->all_genomes(-complete => 1);
95    
96  All we want are the genome IDs, so we use a PERL trick to convert the  All we want are the genome IDs, so we use a PERL trick to convert the
97  hash reference to a list reference.  hash reference in C<$genomeIDs> to a list reference.
98    
99      $genomeIDs = [ keys %$genomeIDs ];      $genomeIDs = [ keys %$genomeIDs ];
100    
101  Now we ask for the representatives and the taxonomies.  Now we ask for the representatives and the taxonomies.
102    
103      my $representativeHash = $sapServer->representative(ids => $genomeIDs);      my $representativeHash = $sapServer->representative(-ids => $genomeIDs);
104      my $taxonomyHash = $sapServer->taxonomy_of(ids => $genomeIDs,      my $taxonomyHash = $sapServer->taxonomy_of(-ids => $genomeIDs,
105                                                 format => 'names');                                                 -format => 'names');
106    
107  Our data is now contained in a pair of hash tables. The following loop  Our data is now contained in a pair of hash tables. The following loop
108  stiches them together to produce the output.  stiches them together to produce the output.
# Line 119  Line 119 
119  An excerpt from the output of this script is shown below. The first column contains  An excerpt from the output of this script is shown below. The first column contains
120  a genome ID, the second contains the representative genome's ID, and the third is  a genome ID, the second contains the representative genome's ID, and the third is
121  the full taxonomy. Note that the two genomes with very close taxonomies have the  the full taxonomy. Note that the two genomes with very close taxonomies have the
122  same representative genome: this is the expected vehavior.  same representative genome: this is the expected behavior.
123    
124      221109.1    221109.1    Bacteria Firmicutes Bacilli Bacillales Bacillaceae Oceanobacillus Oceanobacillus iheyensis Oceanobacillus iheyensis HTE831      221109.1    221109.1    Bacteria Firmicutes Bacilli Bacillales Bacillaceae Oceanobacillus Oceanobacillus iheyensis Oceanobacillus iheyensis HTE831
125      204722.1    204722.1    Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Brucellaceae Brucella Brucella suis Brucella suis 1330      204722.1    204722.1    Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Brucellaceae Brucella Brucella suis Brucella suis 1330
# Line 147  Line 147 
147      use SAPserver;      use SAPserver;
148    
149      my $sapServer = SAPserver->new(singleton => 1);      my $sapServer = SAPserver->new(singleton => 1);
150      my $genomeIDs = $sapServer->all_genomes(complete => 1);      my $genomeIDs = $sapServer->all_genomes(-complete => 1);
151      $genomeIDs = [ keys %$genomeIDs ];      $genomeIDs = [ keys %$genomeIDs ];
152      for my $genomeID (@$genomeIDs) {      for my $genomeID (@$genomeIDs) {
153          my $repID = $sapServer->representative(ids => $genomeID);          my $repID = $sapServer->representative(-ids => $genomeID);
154          my $taxonomy = $sapServer->taxonomy_of(ids => $genomeID,          my $taxonomy = $sapServer->taxonomy_of(-ids => $genomeID,
155                                                 format => 'names');                                                 -format => 'names');
156          # Format the taxonomy string.          # Format the taxonomy string.
157          my $taxonomyString = join(" ", @$taxonomy);          my $taxonomyString = join(" ", @$taxonomy);
158          # Print the result.          # Print the result.
# Line 168  Line 168 
168  =head2 Specifying Gene IDs  =head2 Specifying Gene IDs
169    
170  Many of the Sapling Server services return data on genes (a term we use rather  Many of the Sapling Server services return data on genes (a term we use rather
171  loosely to include any kind of genetic I<locus> or C<feature>). The standard  loosely to include any kind of genetic I<locus> or I<feature>). The standard
172  method for identifying a gene is the I<FIG ID>, an identifying string that  method for identifying a gene is the I<FIG ID>, an identifying string that
173  begins with the characters C<fig|> and includes the genome ID, the gene type,  begins with the characters C<fig|> and includes the genome ID, the gene type,
174  and an additional number for uniqueness. For example, the FIG ID  and an additional number for uniqueness. For example, the FIG ID
# Line 176  Line 176 
176  Bacillus halodurans C-125 (I<272558.1>).  Bacillus halodurans C-125 (I<272558.1>).
177    
178  Frequently, however, you will have a list of gene IDs from some other  Frequently, however, you will have a list of gene IDs from some other
179  database (e.g. L<NCBI|http://www.ncbi.nlm.nih.gov>, L<UniProt|http://www.uniprot.org>)  database (e.g. I<NCBI>, I<UniProt>) or in a community format such as Locus Tags
180  or in a community format such as Locus Tags or gene names. Most services that  or gene names. Most services that take gene IDs as input allow you to specify a
181  take gene IDs as input allow you to specify a C<source> option that indicates  C<-source> option that indicates the type of IDs being used. The acceptable
182  the type of IDs being used. The acceptable formats are as follows.  formats are as follows.
183    
184  =over 4  =over 4
185    
186  =item CMR  =item CMR
187    
188  L<Comprehensive Microbial Resource ID|http://cmr.jcvi.org>. The CMR IDs for  I<Comprehensive Microbial Resource> (L<http://cmr.jcvi.org>) ID. The CMR IDs for
189  C<fig|272558.1.peg.203> are C<10172815> and C<NTL01BH0204>.  C<fig|272558.1.peg.203> are C<10172815> and C<NTL01BH0204>.
190    
191  =item GENE  =item GENE
# Line 201  Line 201 
201    
202  =item KEGG  =item KEGG
203    
204  L<Kyoto Encyclopedia of Genes and Genomes|http://www.genome.jp/kegg> identifier.  I<Kyoto Encyclopedia of Genes and Genomes> (L<http://www.genome.jp/kegg>) identifier.
205  For example, the KEGG identifier for C<fig|158878.1.peg.2821> is C<sav:SAV2628>.  For example, the KEGG identifier for C<fig|158878.1.peg.2821> is C<sav:SAV2628>.
206    
207  =item LocusTag  =item LocusTag
# Line 211  Line 211 
211    
212  =item NCBI  =item NCBI
213    
214  L<NCBI|http://www.ncbi.nlm.nih.gov> accession number. The accession numbers for  I<NCBI> (L<http://www.ncbi.nlm.nih.gov>) accession number. The accession numbers for
215  C<fig|272558.1.peg.203> are C<10172815>, C<15612766>, and C<81788207>.  C<fig|272558.1.peg.203> are C<10172815>, C<15612766>, and C<81788207>.
216    
217  =item RefSeq  =item RefSeq
218    
219  L<NCBI|http://www.ncbi.nlm.nih.gov> reference sequence identifier. The RefSeq  I<NCBI> (L<http://www.ncbi.nlm.nih.gov>) reference sequence identifier. The RefSeq
220  identifier for C<fig|272558.1.peg.203> is C<NP_241069.1>.  identifier for C<fig|272558.1.peg.203> is C<NP_241069.1>.
221    
222  =item SEED  =item SEED
# Line 225  Line 225 
225    
226  =item SwissProt  =item SwissProt
227    
228  L<SwissProt|http://www.expasy.ch/sprot> identifier. For example, the SwissProt  I<SwissProt> (L<http://www.expasy.ch/sprot>) identifier. For example, the SwissProt
229  identifier for C<fig|243277.1.peg.3952> is C<O31153>.  identifier for C<fig|243277.1.peg.3952> is C<O31153>.
230    
231  =item UniProt  =item UniProt
232    
233  L<UniProt|http://www.uniprot.org> identifier. The UniProt identifiers for  I<UniProt> (L<http://www.uniprot.org>) identifier. The UniProt identifiers for
234  C<fig|272558.1.peg.203> are C<Q9KGA9> and C<Q9KGA9_BACHD>.  C<fig|272558.1.peg.203> are C<Q9KGA9> and C<Q9KGA9_BACHD>.
235    
236  =back  =back
# Line 239  Line 239 
239  for the source type. In this case, however, care must be taken, because the same  for the source type. In this case, however, care must be taken, because the same
240  identifier can have different meanings in different databases.  identifier can have different meanings in different databases.
241    
242  =head2 Normal Mode Examples  =head2 Two Normal-Mode Examples
243    
244  The following examples use the Sapling Server in normal mode: that is, data  The following examples use the Sapling Server in normal mode: that is, data
245  is sent to the server in batches and the results are stitched together  is sent to the server in batches and the results are stitched together
# Line 247  Line 247 
247  also a risk that the server request might time out. If this happens, you may  also a risk that the server request might time out. If this happens, you may
248  want to consider breaking the input into smaller batches. At some point, the  want to consider breaking the input into smaller batches. At some point, the
249  server system will perform sophisticated flow control to reduce the risk of  server system will perform sophisticated flow control to reduce the risk of
250  timeout errors, but we are not yet at that point.  timeout errors, but we are not yet there.
251    
252  =head3 Retrieving Functional Roles  =head3 Retrieving Functional Roles
253    
# Line 282  Line 282 
282      # Read all the gene IDs.      # Read all the gene IDs.
283      my @genes = map { chomp; $_ } <STDIN>;      my @genes = map { chomp; $_ } <STDIN>;
284      # Compute the functional roles.      # Compute the functional roles.
285      my $results = $sapServer->ids_to_functions(ids => \@genes, source => 'UniProt');      my $results = $sapServer->ids_to_functions(-ids => \@genes, -source => 'UniProt');
286      # Loop through the genes.      # Loop through the genes.
287      for my $gene (@genes) {      for my $gene (@genes) {
288          # Did we find a result?          # Did we find a result?
# Line 296  Line 296 
296          }          }
297      }      }
298    
299  Sample output from this script is shown below. Note that one of the input IDs was  Sample output from this script is shown below. Note that one of the input IDs
300  not found.  was not found.
301    
302      HYPA_ECO57      [NiFe] hydrogenase nickel incorporation protein HypA      HYPA_ECO57      [NiFe] hydrogenase nickel incorporation protein HypA
303      17KD_RICBR      rickettsial 17 kDa surface antigen precursor      17KD_RICBR      rickettsial 17 kDa surface antigen precursor
# Line 311  Line 311 
311      Q8YY27_ANASP was not found.      Q8YY27_ANASP was not found.
312    
313  B<ids_to_subsystems> returns roles in subsystems. Roles in subsystems have  B<ids_to_subsystems> returns roles in subsystems. Roles in subsystems have
314  several differences from general functional roles. A single gene may be in  several differences from general functional roles. Only half of the genes in the
315  multiple subsystems and may have multiple roles in a subsystem. In addition,  database are currently associated with subsystems.A single gene may be in In
316  only half of the genes in the database are currently associated with subsystems.  addition, multiple subsystems and may have multiple roles in a subsystem.
317    
318  As a result, instead of a single string per incoming gene, B<ids_to_subsystems>  As a result, instead of a single string per incoming gene, B<ids_to_subsystems>
319  returns a list. Each element of the list consists of the role name followed by  returns a list. Each element of the list consists of the role name followed by
320  the subsystem name. This makes the processing of the results a little more  the subsystem name. This makes the processing of the results a little more
# Line 327  Line 328 
328      # Read all the gene IDs.      # Read all the gene IDs.
329      my @genes = map { chomp; $_ } <STDIN>;      my @genes = map { chomp; $_ } <STDIN>;
330      # Compute the functional roles.      # Compute the functional roles.
331      my $results = $sapServer->ids_to_subsystems(ids => \@genes, source => 'UniProt');      my $results = $sapServer->ids_to_subsystems(-ids => \@genes, -source => 'UniProt');
332      # Loop through the genes.      # Loop through the genes.
333      for my $gene (@genes) {      for my $gene (@genes) {
334          # Did we find a result?          # Did we find a result?
# Line 407  Line 408 
408      while (my $genomeID = <STDIN>) {      while (my $genomeID = <STDIN>) {
409          chomp $genomeID;          chomp $genomeID;
410          # Get the subsystems for this genome.          # Get the subsystems for this genome.
411          my $subHash = $sapServer->genomes_to_subsystems(ids => $genomeID);          my $subHash = $sapServer->genomes_to_subsystems(-ids => $genomeID);
412          # The data returned for each genome (and in our case there's only one)          # The data returned for each genome (and in our case there's only one)
413          # includes the subsystem name and the variant code. The following          # includes the subsystem name and the variant code. The following
414          # statement strips away the variant codes, leaving only the subsystem          # statement strips away the variant codes, leaving only the subsystem
415          # names.          # names.
416          my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];          my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];
417          # Ask for the genes in each subsystem, using NCBI identifiers.          # Ask for the genes in each subsystem, using NCBI identifiers.
418          my $roleHashes = $sapServer->ids_in_subsystems(subsystems => $subList,          my $roleHashes = $sapServer->ids_in_subsystems(-subsystems => $subList,
419                                                       genome => $genomeID,                                                         -genome => $genomeID,
420                                                       source => 'NCBI',                                                         -source => 'NCBI',
421                                                       roleForm => 'full');                                                         -roleForm => 'full');
422          # The hash maps each subsystem ID to a hash of roles to lists of feature          # The hash maps each subsystem ID to a hash of roles to lists of feature
423          # IDs. We therefore use three levels of nested loops to produce our          # IDs. We therefore use three levels of nested loops to produce our
424          # output lines. At the top level we have a hash mapping subsystem IDs          # output lines. At the top level we have a hash mapping subsystem IDs
# Line 467  Line 468 
468      while (my $genomeID = <STDIN>) {      while (my $genomeID = <STDIN>) {
469          chomp $genomeID;          chomp $genomeID;
470          # Get the subsystems for this genome.          # Get the subsystems for this genome.
471          my $subHash = $sapServer->genomes_to_subsystems(ids => $genomeID);          my $subHash = $sapServer->genomes_to_subsystems(-ids => $genomeID);
472          # The data returned for each genome (and in our case there's only one)          # The data returned for each genome (and in our case there's only one)
473          # includes the subsystem name and the variant code. The following          # includes the subsystem name and the variant code. The following
474          # statement strips away the variant codes, leaving only the subsystem          # statement strips away the variant codes, leaving only the subsystem
475          # names.          # names.
476          my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];          my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];
477          # Ask for the genes in each subsystem, using NCBI identifiers.          # Ask for the genes in each subsystem, using NCBI identifiers.
478          my $genesHash = $sapServer->ids_in_subsystems(subsystems => $subList,          my $genesHash = $sapServer->ids_in_subsystems(-subsystems => $subList,
479                                                       genome => $genomeID,                                                        -genome => $genomeID,
480                                                       source => 'NCBI',                                                        -source => 'NCBI',
481                                                       roleForm => 'none',                                                        -roleForm => 'none',
482                                                       grouped => 1);                                                        -grouped => 1);
483          # The hash maps each subsystem ID to a comma-delimited list of gene IDs.          # The hash maps each subsystem ID to a comma-delimited list of gene IDs.
484          for my $subsystem (sort keys %$genesHash) {          for my $subsystem (sort keys %$genesHash) {
485              my $genes = $genesHash->{$subsystem};              my $genes = $genesHash->{$subsystem};
# Line 495  Line 496 
496      360108.3    RNA processing and degradation, bacterial   85840820, 85841283      360108.3    RNA processing and degradation, bacterial   85840820, 85841283
497      360108.3    Recycling of Peptidoglycan Amino Acids      85842019, 85842272      360108.3    Recycling of Peptidoglycan Amino Acids      85842019, 85842272
498    
499    =head2 A More Complicated Example: Operon Upstream Regions
500    
501    In this example we'll string several services together to perform a more
502    complex task: locating the upstream regions of operons involved in a particular
503    metabolic pathway. The theory is that we can look for a common pattern in
504    these regions.
505    
506    A metabolic pathway is a subsystem, so we'll enter our database via the
507    subsystems. To keep the data manageable, we'll limit our results to
508    specific genomes. The program we write will take as input a subsystem name
509    and a file of genome IDs.
510    
511    The worst part of the task is finding the operon for a gene. This involves
512    finding all the genes in a neighborhood and isolating the ones that point in
513    the correct direction. Fortunately, there is a Sapling Server function--
514    L<SAP/make_runs>-- that specifcally performs this task.
515    
516    To start our program, we create a L<SAPserver> object and pull the subsystem
517    name from the command-line parameters. This program is going to be doing a
518    lot of complicated, long-running stuff, so we'll usually want to deal with one
519    result at a time. To facilitate that, we construct the server helper in
520    singleton mode.
521    
522        use strict;
523        use SAPserver;
524    
525        my $sapServer = SAPserver->new(singleton => 1);
526        # Get the subsystem name.
527        my $ssName = $ARGV[0];
528    
529    Our main loop will read through the list of genomes from the standard input
530    and call a method I<PrintUpstream> to process each one. We're going to be a bit
531    cheesy here and allow our genome loop to stop on the first blank line.
532    
533        while (my $genomeID = <STDIN>) {
534            chomp $genomeID;
535            PrintUpstream($sapServer, $ssName, $genomeID);
536        }
537    
538    Now we need to write I<PrintUpstream>. Its first task is to find all the
539    genes in the genome that belong to the subsystem. A single call to
540    L<SAP/ids_in_subsystems> will get this information. We then feed the
541    results into L<SAP/make_runs> to get operons and call L<SAP/upstream> for
542    each operon. The program is given below.
543    
544        sub PrintUpstream {
545            my ($sapServer, $ssName, $genomeID) = @_;
546            # Because we specify "roleForm => none", we get back one long
547            # gene list.
548            my $geneList = $sapServer->ids_in_subsystems(-subsystems => $ssName,
549                                                         -genome => $genomeID,
550                                                         -roleForm => 'none');
551            # Convert the gene list to a comma-delimited string.
552            my $geneString = join(", ", @$geneList);
553            # Get a list of operons.
554            my $opList = $sapServer->make_runs(-groups => $geneString);
555            # Loop through the operons.
556            for my $opString (@$opList) {
557                # Get the first feature's ID.
558                my ($front) = split /\s*,/, $opString, 2;
559                # Grab its upstream region. We'll include the operon string as the
560                # comment text.
561                my $fasta = $sapServer->upstream(-ids => $front,
562                                                 -comments => { $front => $opString },
563                                                 -skipGene => 1);
564                # Print the result.
565                print "$fasta\n";
566            }
567        }
568    
569    
570  =cut  =cut
571    
572  1;  1;

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.8

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3