[Bio] / FigKernelPackages / SAPtutorial.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/SAPtutorial.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Tue Aug 4 21:31:29 2009 UTC revision 1.10, Wed Feb 17 15:00:27 2010 UTC
# Line 7  Line 7 
7    
8  The B<Sapling Server> is a web service that allows you to access data stored in  The B<Sapling Server> is a web service that allows you to access data stored in
9  the Sapling database, a large-scale MySQL database of genetic data. The Sapling  the Sapling database, a large-scale MySQL database of genetic data. The Sapling
10  database contains data on public genomes imported from L<RAST|http://rast.nmpdr.org>  database contains data on public genomes imported from I<RAST> (L<http://rast.nmpdr.org>)
11  and curated by the annotation team of the Fellowship for Interpretation of  and curated by the annotation team of the Fellowship for Interpretation of
12  Genomes.  Genomes.
13    
# Line 22  Line 22 
22  where C<$document> is usually a hash reference and C<$args> is B<always> a hash  where C<$document> is usually a hash reference and C<$args> is B<always> a hash
23  reference. The method description includes a section called I<Parameter Hash  reference. The method description includes a section called I<Parameter Hash
24  Fields> that describes the fields in C<$args>. For example, L<SAP/taxonomy_of>  Fields> that describes the fields in C<$args>. For example, L<SAP/taxonomy_of>
25  has a field called C<ids> that is to be a list of genome IDs and an optional  has a field called C<-ids> that is to be a list of genome IDs and an optional
26  field called C<format> that indicates whether you want taxonomy groups  field called C<-format> that indicates whether you want taxonomy groups
27  represented by numbers, names, or both. To call the I<taxonomy_of> service,  represented by numbers, names, or both. To call the I<taxonomy_of> service,
28  you create a B<SAPserver> object and call a method with the same name as the  you create a B<SAPserver> object and call a method with the same name as the
29  service.  service.
30    
31        use strict;
32      use SAPserver;      use SAPserver;
33    
34      my $sapServer = SAPserver->new();      my $sapServer = SAPserver->new();
35      my $resultHash = $sapServer->taxonomy_of({ ids => ['360108.3', '100226.1'],      my $resultHash = $sapServer->taxonomy_of({ -ids => ['360108.3', '100226.1'],
36                                                 format => 'names' });                                                 -format => 'names' });
37      for my $id (keys %$resultHash) {      for my $id (keys %$resultHash) {
38          my $taxonomy = $resultHash->{$id};          my $taxonomy = $resultHash->{$id};
39          print $id, join(" ", @$taxonomy);          print "$id: " . join(" ", @$taxonomy) . "\n";
40      }      }
41    
42    The output from this program (reformatted slightly for readability) is shown below.
43    
44        360108.3: Bacteria, Proteobacteria, delta/epsilon subdivisions, Epsilonproteobacteria,
45                  Campylobacterales, Campylobacteraceae, Campylobacter, Campylobacter jejuni,
46                  Campylobacter jejuni subsp. jejuni, Campylobacter jejuni subsp. jejuni 260.94
47    
48        100226.1: Bacteria, Actinobacteria, Actinobacteria (class), Actinobacteridae,
49                  Actinomycetales, Streptomycineae, Streptomycetaceae, Streptomyces,
50                  Streptomyces coelicolor, Streptomyces coelicolor A3(2)
51    
52  For convenience, you can specify the parameters as a simple hash rather  For convenience, you can specify the parameters as a simple hash rather
53  than a hash reference. So, for example, the above I<taxonomy_of> call could  than a hash reference. So, for example, the above I<taxonomy_of> call could
54  also be written like this.  also be written like this.
55    
56      my $resultHash = $sapServer->taxonomy_of(ids => ['360108.3', '100226.1'],      my $resultHash = $sapServer->taxonomy_of(-ids => ['360108.3', '100226.1'],
57                                               format => 'names');                                               -format => 'names');
58    
59  =head2 A Simple Example: Genome Taxonomies  =head2 A Simple Example: Genome Taxonomies
60    
# Line 76  Line 87 
87    
88      my $sapServer = SAPserver->new();      my $sapServer = SAPserver->new();
89    
90  Now we use I<all_genomes> to get a list of the IDs for complete genomes.  Now we use I<all_genomes> to get a list of the complete genomes.
91  I<all_genomes> will normally return B<all> genome IDs, but we use the  I<all_genomes> will normally return B<all> genomes, but we use the
92  C<complete> option to restrict the output to complete genomes.  C<-complete> option to restrict the output to those that are complete.
93    
94      my $genomeIDs = $sapServer->all_genomes(complete => 1);      my $genomeIDs = $sapServer->all_genomes(-complete => 1);
95    
96  All we want are the genome IDs, so we use a PERL trick to convert the  All we want are the genome IDs, so we use a PERL trick to convert the
97  hash reference to a list reference.  hash reference in C<$genomeIDs> to a list reference.
98    
99      $genomeIDs = [ keys %$genomeIDs ];      $genomeIDs = [ keys %$genomeIDs ];
100    
101  Now we ask for the representatives and the taxonomies.  Now we ask for the representatives and the taxonomies.
102    
103      my $representativeHash = $sapServer->representative(ids => $genomeIDs);      my $representativeHash = $sapServer->representative(-ids => $genomeIDs);
104      my $taxonomyHash = $sapServer->taxonomy_of(ids => $genomeIDs,      my $taxonomyHash = $sapServer->taxonomy_of(-ids => $genomeIDs,
105                                                 format => 'names');                                                 -format => 'names');
106    
107  Our data is now contained in a pair of hash tables. The following loop  Our data is now contained in a pair of hash tables. The following loop
108  stiches them together to produce the output.  stiches them together to produce the output.
# Line 105  Line 116 
116          print join("\t", $genomeID, $repID, $taxonomyString) . "\n";          print join("\t", $genomeID, $repID, $taxonomyString) . "\n";
117      }      }
118    
119    An excerpt from the output of this script is shown below. The first column contains
120    a genome ID, the second contains the representative genome's ID, and the third is
121    the full taxonomy. Note that the two genomes with very close taxonomies have the
122    same representative genome: this is the expected behavior.
123    
124        221109.1    221109.1    Bacteria Firmicutes Bacilli Bacillales Bacillaceae Oceanobacillus Oceanobacillus iheyensis Oceanobacillus iheyensis HTE831
125        204722.1    204722.1    Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Brucellaceae Brucella Brucella suis Brucella suis 1330
126        391037.3    369723.3    Bacteria Actinobacteria Actinobacteria (class) Actinobacteridae Actinomycetales Micromonosporineae Micromonosporaceae Salinispora Salinispora arenicola Salinispora arenicola CNS205
127        339670.3    216591.1    Bacteria Proteobacteria Betaproteobacteria Burkholderiales Burkholderiaceae Burkholderia Burkholderia cepacia complex Burkholderia cepacia Burkholderia cepacia AMMD
128        272560.3    216591.1    Bacteria Proteobacteria Betaproteobacteria Burkholderiales Burkholderiaceae Burkholderia pseudomallei group Burkholderia pseudomallei Burkholderia pseudomallei K96243
129        262768.1    262768.1    Bacteria Firmicutes Mollicutes Acholeplasmatales Acholeplasmataceae Candidatus Phytoplasma Candidatus Phytoplasma asteris Onion yellows phytoplasma Onion yellows phytoplasma OY-M
130        272624.3    272624.3    Bacteria Proteobacteria Gammaproteobacteria Legionellales Legionellaceae Legionella Legionella pneumophila Legionella pneumophila subsp. pneumophila Legionella pneumophila subsp. pneumophila str. Philadelphia 1
131        150340.3    150340.3    Bacteria Proteobacteria Gammaproteobacteria Vibrionales Vibrionaceae Vibrio Vibrio sp. Ex25
132        205914.1    205914.1    Bacteria Proteobacteria Gammaproteobacteria Pasteurellales Pasteurellaceae Histophilus Histophilus somni Haemophilus somnus 129PT
133        393117.3    169963.1    Bacteria Firmicutes Bacilli Bacillales Listeriaceae Listeria Listeria monocytogenes Listeria monocytogenes FSL J1-194
134        203119.1    203119.1    Bacteria Firmicutes Clostridia Clostridiales Clostridiaceae Clostridium Clostridium thermocellum Clostridium thermocellum ATCC 27405
135        380703.5    380703.5    Bacteria Proteobacteria Gammaproteobacteria Aeromonadales Aeromonadaceae Aeromonas Aeromonas hydrophila Aeromonas hydrophila subsp. hydrophila Aeromonas hydrophila subsp. hydrophila ATCC 7966
136        259536.4    259536.4    Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Moraxellaceae Psychrobacter Psychrobacter arcticus Psychrobacter arcticus 273-4
137    
138  The Sapling Server processes lists of data (in this case a list of genome IDs)  The Sapling Server processes lists of data (in this case a list of genome IDs)
139  so that you can minimize the overhead of sending requests across the web. You  so that you can minimize the overhead of sending requests across the web. You
140  can, however, specify a single ID instead of a list to a method call, and  can, however, specify a single ID instead of a list to a method call, and
# Line 117  Line 147 
147      use SAPserver;      use SAPserver;
148    
149      my $sapServer = SAPserver->new(singleton => 1);      my $sapServer = SAPserver->new(singleton => 1);
150      my $genomeIDs = $sapServer->all_genomes(complete => 1);      my $genomeIDs = $sapServer->all_genomes(-complete => 1);
151      $genomeIDs = [ keys %$genomeIDs ];      $genomeIDs = [ keys %$genomeIDs ];
152      for my $genomeID (@$genomeIDs) {      for my $genomeID (@$genomeIDs) {
153          my $repID = $sapServer->representative(ids => $genomeID);          my $repID = $sapServer->representative(-ids => $genomeID);
154          my $taxonomy = $sapServer->taxonomy_of(ids => $genomeID,          my $taxonomy = $sapServer->taxonomy_of(-ids => $genomeID,
155                                                 format => 'names');                                                 -format => 'names');
156          # Format the taxonomy string.          # Format the taxonomy string.
157          my $taxonomyString = join(" ", @$taxonomy);          my $taxonomyString = join(" ", @$taxonomy);
158          # Print the result.          # Print the result.
# Line 138  Line 168 
168  =head2 Specifying Gene IDs  =head2 Specifying Gene IDs
169    
170  Many of the Sapling Server services return data on genes (a term we use rather  Many of the Sapling Server services return data on genes (a term we use rather
171  loosely to include any kind of genetic I<locus> or C<feature>). The standard  loosely to include any kind of genetic I<locus> or I<feature>). The standard
172  method for identifying a gene is the I<FIG ID>, an identifying string that  method for identifying a gene is the I<FIG ID>, an identifying string that
173  begins with the characters C<fig|> and includes the genome ID, the gene type,  begins with the characters C<fig|> and includes the genome ID, the gene type,
174  and an additional number for uniqueness. For example, the FIG ID  and an additional number for uniqueness. For example, the FIG ID
# Line 146  Line 176 
176  Bacillus halodurans C-125 (I<272558.1>).  Bacillus halodurans C-125 (I<272558.1>).
177    
178  Frequently, however, you will have a list of gene IDs from some other  Frequently, however, you will have a list of gene IDs from some other
179  database (e.g. L<NCBI|http://www.ncbi.nlm.nih.gov>, L<UniProt|http://www.uniprot.org>)  database (e.g. I<NCBI>, I<UniProt>) or in a community format such as Locus Tags
180  or in a community format such as Locus Tags or gene names. Most services that  or gene names. Most services that take gene IDs as input allow you to specify a
181  take gene IDs as input allow you to specify a C<source> option that indicates  C<-source> option that indicates the type of IDs being used. The acceptable
182  the type of IDs being used. The acceptable formats are as follows.  formats are as follows.
183    
184  =over 4  =over 4
185    
186  =item CMR  =item CMR
187    
188  L<Comprehensive Microbial Resource ID|http://cmr.jcvi.org>. The CMR IDs for  I<Comprehensive Microbial Resource> (L<http://cmr.jcvi.org>) ID. The CMR IDs for
189  C<fig|272558.1.peg.203> are C<10172815> and C<NTL01BH0204>.  C<fig|272558.1.peg.203> are C<10172815> and C<NTL01BH0204>.
190    
191  =item GENE  =item GENE
# Line 171  Line 201 
201    
202  =item KEGG  =item KEGG
203    
204  L<Kyoto Encyclopedia of Genes and Genomes|http://www.genome.jp/kegg> identifier.  I<Kyoto Encyclopedia of Genes and Genomes> (L<http://www.genome.jp/kegg>) identifier.
205  For example, the KEGG identifier for C<fig|158878.1.peg.2821> is C<sav:SAV2628>.  For example, the KEGG identifier for C<fig|158878.1.peg.2821> is C<sav:SAV2628>.
206    
207  =item LocusTag  =item LocusTag
208    
209  Common locus tag. For example, the locus tag of C<fig|380703.5.peg.250> is  Common locus tag. For example, the locus tag of C<fig|243160.4.peg.200> is
210  C<ABK38410.1>.  C<BMA0002>.
211    
212  =item NCBI  =item NCBI
213    
214  L<NCBI|http://www.ncbi.nlm.nih.gov> accession number. The accession numbers for  I<NCBI> (L<http://www.ncbi.nlm.nih.gov>) number. The NCBI ID numbers for
215  C<fig|272558.1.peg.203> are C<10172815>, C<15612766>, and C<81788207>.  C<fig|272558.1.peg.203> are C<10172815>, C<15612766>, and C<81788207>.
216    
217  =item RefSeq  =item RefSeq
218    
219  L<NCBI|http://www.ncbi.nlm.nih.gov> reference sequence identifier. The RefSeq  I<NCBI> (L<http://www.ncbi.nlm.nih.gov>) reference sequence identifier. The RefSeq
220  identifier for C<fig|272558.1.peg.203> is C<NP_241069.1>.  identifier for C<fig|272558.1.peg.203> is C<NP_241069.1>.
221    
222  =item SEED  =item SEED
# Line 195  Line 225 
225    
226  =item SwissProt  =item SwissProt
227    
228  L<SwissProt|http://www.expasy.ch/sprot> identifier. For example, the SwissProt  I<SwissProt> (L<http://www.expasy.ch/sprot>) identifier. For example, the SwissProt
229  identifier for C<fig|243277.1.peg.3952> is C<O31153>.  identifier for C<fig|243277.1.peg.3952> is C<O31153>.
230    
231  =item UniProt  =item UniProt
232    
233  L<UniProt|http://www.uniprot.org> identifier. The UniProt identifiers for  I<UniProt> (L<http://www.uniprot.org>) identifier. The UniProt identifiers for
234  C<fig|272558.1.peg.203> are C<Q9KGA9> and C<Q9KGA9_BACHD>.  C<fig|272558.1.peg.203> are C<Q9KGA9> and C<Q9KGA9_BACHD>.
235    
236  =back  =back
237    
238  You can also mix identifiers of different types by specifying C<mixed>  You can also mix identifiers of different types by specifying C<mixed>
239  for the source type. In this case, however, care must be taken, because the same  for the source type. In this case, however, care must be taken, because the same
240  identifier can have different meanings in different databases.  identifier can have different meanings in different databases. You can also
241    specify C<prefixed> to use IDs with type prefixes. For RefSeq and SEED identifiers,
242    the prefixes are built-in; however, for other identifiers, the following
243    prefixes are used.
244    
245    =over 4
246    
247    =item CMR
248    
249    C<cmr|> -- for example, C<cmr|10172815> and C<cmr|NTL01BH0204>.
250    
251    =item GENE (Common gene name)
252    
253    C<GENE:> -- for example, C<GENE:accD>
254    
255    =item GeneID (Common gene number)
256    
257  =head2 Normal Mode Examples  C<GeneID:> -- for example, C<GeneID:891745>
258    
259    =item KEGG
260    
261    C<kegg|> -- for example, C<kegg|sav:SAV2628>
262    
263    =item LocusTag
264    
265    C<LocusTag:> -- for example, C<LocusTag:ABK38410.1>
266    
267    =item NBCI
268    
269    C<gi|> -- for example, C<gi|10172815>, C<gi|15612766>, C<gi|81788207>.
270    
271    =item UniProt
272    
273    C<uni|> -- for example, C<uni|Q9KGA9>, C<uni|Q9KGA9_BACHD>
274    
275    =back
276    
277    =head2 Two Normal-Mode Examples
278    
279  The following examples use the Sapling Server in normal mode: that is, data  The following examples use the Sapling Server in normal mode: that is, data
280  is sent to the server in batches and the results are stitched together  is sent to the server in batches and the results are stitched together
# Line 217  Line 282 
282  also a risk that the server request might time out. If this happens, you may  also a risk that the server request might time out. If this happens, you may
283  want to consider breaking the input into smaller batches. At some point, the  want to consider breaking the input into smaller batches. At some point, the
284  server system will perform sophisticated flow control to reduce the risk of  server system will perform sophisticated flow control to reduce the risk of
285  timeout errors, but we are not yet at that point.  timeout errors, but we are not yet there.
286    
287  =head3 Retrieving Functional Roles  =head3 Retrieving Functional Roles
288    
# Line 252  Line 317 
317      # Read all the gene IDs.      # Read all the gene IDs.
318      my @genes = map { chomp; $_ } <STDIN>;      my @genes = map { chomp; $_ } <STDIN>;
319      # Compute the functional roles.      # Compute the functional roles.
320      my $results = $sapServer->ids_to_functions(ids => \@genes, source => 'UniProt');      my $results = $sapServer->ids_to_functions(-ids => \@genes, -source => 'UniProt');
321      # Loop through the genes.      # Loop through the genes.
322      for my $gene (@genes) {      for my $gene (@genes) {
323          # Did we find a result?          # Did we find a result?
# Line 266  Line 331 
331          }          }
332      }      }
333    
334    Sample output from this script is shown below. Note that one of the input IDs
335    was not found.
336    
337        HYPA_ECO57      [NiFe] hydrogenase nickel incorporation protein HypA
338        17KD_RICBR      rickettsial 17 kDa surface antigen precursor
339        18KD_MYCLE      18 KDA antigen (HSP 16.7)
340        P72620_SYNY3    [NiFe] hydrogenase metallocenter assembly protein HypD
341        1A14_ARATH      1-aminocyclopropane-1-carboxylate synthase 4 / ACC synthase 4 (ACS4) / identical to gi:940370 [GB:U23481]; go_function: 1-aminocyclopropane-1-carboxylate synthase activity [goid 0016847]; go_process: ethylene biosynthesis [goid 0009693]; go_process: response to auxin stimulus [goid 0009733]
342        Q2RXN5_RHORT    [NiFe] hydrogenase metallocenter assembly protein HypE
343        O29118          Glutamate N-acetyltransferase (EC 2.3.1.35) / N-acetylglutamate synthase (EC 2.3.1.1)
344        Q8PZM3          tRNA nucleotidyltransferase (EC 2.7.7.25)
345    
346        Q8YY27_ANASP was not found.
347    
348  B<ids_to_subsystems> returns roles in subsystems. Roles in subsystems have  B<ids_to_subsystems> returns roles in subsystems. Roles in subsystems have
349  several differences from general functional roles. A single gene may be in  several differences from general functional roles. Only half of the genes in the
350  multiple subsystems and may have multiple roles in a subsystem. In addition,  database are currently associated with subsystems.A single gene may be in In
351  only half of the genes in the database are currently associated with subsystems.  addition, multiple subsystems and may have multiple roles in a subsystem.
352    
353  As a result, instead of a single string per incoming gene, B<ids_to_subsystems>  As a result, instead of a single string per incoming gene, B<ids_to_subsystems>
354  returns a list. Each element of the list consists of the role name followed by  returns a list. Each element of the list consists of the role name followed by
355  the subsystem name. This makes the processing of the results a little more  the subsystem name. This makes the processing of the results a little more
# Line 283  Line 363 
363      # Read all the gene IDs.      # Read all the gene IDs.
364      my @genes = map { chomp; $_ } <STDIN>;      my @genes = map { chomp; $_ } <STDIN>;
365      # Compute the functional roles.      # Compute the functional roles.
366      my $results = $sapServer->ids_to_subsystems(ids => \@genes, source => 'UniProt');      my $results = $sapServer->ids_to_subsystems(-ids => \@genes, -source => 'UniProt');
367      # Loop through the genes.      # Loop through the genes.
368      for my $gene (@genes) {      for my $gene (@genes) {
369          # Did we find a result?          # Did we find a result?
# Line 299  Line 379 
379          }          }
380      }      }
381    
382    Sample output from this script is shown below. In this case, four of the input IDs
383    failed to find a result; however, two of them (C<O29118> and C<Q8PZM3>) produced multiple
384    results.
385    
386        HYPA_ECO57      [NiFe] hydrogenase nickel incorporation protein HypA    (NiFe hydrogenase maturation)
387        P72620_SYNY3    [NiFe] hydrogenase metallocenter assembly protein HypD  (NiFe hydrogenase maturation)
388        Q2RXN5_RHORT    [NiFe] hydrogenase metallocenter assembly protein HypE  (NiFe hydrogenase maturation)
389        O29118  N-acetylglutamate synthase (EC 2.3.1.1) (Arginine Biosynthesis extended)
390        O29118  Glutamate N-acetyltransferase (EC 2.3.1.35)     (Arginine Biosynthesis extended)
391        O29118  N-acetylglutamate synthase (EC 2.3.1.1) (Arginine Biosynthesis)
392        O29118  Glutamate N-acetyltransferase (EC 2.3.1.35)     (Arginine Biosynthesis)
393        Q8PZM3  tRNA nucleotidyltransferase (EC 2.7.7.25)       (CBSS-316057.3.peg.1294)
394        Q8PZM3  tRNA nucleotidyltransferase (EC 2.7.7.25)       (tRNA nucleotidyltransferase)
395    
396        17KD_RICBR is not in a subsystem.
397        Q8YY27_ANASP is not in a subsystem.
398        18KD_MYCLE is not in a subsystem.
399        1A14_ARATH is not in a subsystem.
400    
401  =head3 Genes in Subsystems for Genomes  =head3 Genes in Subsystems for Genomes
402    
403  This next example finds all genes in subsystems for a specific genome. We will  This next example finds all genes in subsystems for a specific genome. We will
# Line 344  Line 443 
443      while (my $genomeID = <STDIN>) {      while (my $genomeID = <STDIN>) {
444          chomp $genomeID;          chomp $genomeID;
445          # Get the subsystems for this genome.          # Get the subsystems for this genome.
446          my $subHash = $sapServer->genomes_to_subsystems(ids => $genomeID);          my $subHash = $sapServer->genomes_to_subsystems(-ids => $genomeID);
447          # The data returned for each genome (and in our case there's only one)          # The data returned for each genome (and in our case there's only one)
448          # includes the subsystem name and the variant code. The following          # includes the subsystem name and the variant code. The following
449          # statement strips away the variant codes, leaving only the subsystem          # statement strips away the variant codes, leaving only the subsystem
450          # names.          # names.
451          my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];          my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];
452          # Ask for the genes in each subsystem, using NCBI identifiers.          # Ask for the genes in each subsystem, using NCBI identifiers.
453          my $roleHashes = $sapServer->ids_in_subsystems(subsystems => $subList,          my $roleHashes = $sapServer->ids_in_subsystems(-subsystems => $subList,
454                                                       genome => $genomeID,                                                         -genome => $genomeID,
455                                                       source => 'NCBI',                                                         -source => 'NCBI',
456                                                       roleForm => 'full');                                                         -roleForm => 'full');
457          # The hash maps each subsystem ID to a hash of roles to lists of feature          # The hash maps each subsystem ID to a hash of roles to lists of feature
458          # IDs. We therefore use three levels of nested loops to produce our          # IDs. We therefore use three levels of nested loops to produce our
459          # output lines. At the top level we have a hash mapping subsystem IDs          # output lines. At the top level we have a hash mapping subsystem IDs
# Line 366  Line 465 
465                  my $geneList = $geneHash->{$role};                  my $geneList = $geneHash->{$role};
466                  # Finally, we loop through the gene IDs.                  # Finally, we loop through the gene IDs.
467                  for my $gene (@$geneList) {                  for my $gene (@$geneList) {
468                      print "$genomeID\t$subsystem\t$role\t$gene\n";                      print "$genomeID\t$gene\t$subsystem\t$role\n";
469                  }                  }
470              }              }
471          }          }
472      }      }
473    
474    An excerpt of the output is shown here.
475    
476        360108.3    85840651    Queuosine-Archaeosine Biosynthesis          Queuosine Biosynthesis QueC ATPase
477        360108.3    85841812    Queuosine-Archaeosine Biosynthesis          Queuosine Biosynthesis QueE Radical SAM
478        360108.3    85841520    Queuosine-Archaeosine Biosynthesis          Queuosine biosynthesis QueD, PTPS-I
479        360108.3    85841162    Queuosine-Archaeosine Biosynthesis          S-adenosylmethionine:tRNA ribosyltransferase-isomerase (EC 5.-.-.-)
480        360108.3    85842244    Queuosine-Archaeosine Biosynthesis          tRNA-guanine transglycosylase (EC 2.4.2.29)
481        360108.3    85841653    Quinate degradation                         3-dehydroquinate dehydratase II (EC 4.2.1.10)
482        360108.3    85840760    RNA polymerase bacterial                    DNA-directed RNA polymerase alpha subunit (EC 2.7.7.6)
483        360108.3    85841269    RNA polymerase bacterial                    DNA-directed RNA polymerase beta subunit (EC 2.7.7.6)
484        360108.3    85841348    RNA polymerase bacterial                    DNA-directed RNA polymerase beta' subunit (EC 2.7.7.6)
485        360108.3    85841887    RNA polymerase bacterial                    DNA-directed RNA polymerase omega subunit (EC 2.7.7.6)
486        360108.3    85841283    RNA processing and degradation, bacterial   3'-to-5' exoribonuclease RNase R
487        360108.3    85840820    RNA processing and degradation, bacterial   Ribonuclease III (EC 3.1.26.3)
488        360108.3    85842272    Recycling of Peptidoglycan Amino Acids      Aminoacyl-histidine dipeptidase (Peptidase D) (EC 3.4.13.3)
489    
490  The I<ids_in_subsystems> service has several useful options for changing the nature  The I<ids_in_subsystems> service has several useful options for changing the nature
491  of the output. For example, in the above program each role is represented by a  of the output. For example, in the above program each role is represented by a
492  full description (C<roleForm> set to C<full>). If you don't need the roles, you  full description (C<roleForm> set to C<full>). If you don't need the roles, you
# Line 388  Line 503 
503      while (my $genomeID = <STDIN>) {      while (my $genomeID = <STDIN>) {
504          chomp $genomeID;          chomp $genomeID;
505          # Get the subsystems for this genome.          # Get the subsystems for this genome.
506          my $subHash = $sapServer->genomes_to_subsystems(ids => $genomeID);          my $subHash = $sapServer->genomes_to_subsystems(-ids => $genomeID);
507          # The data returned for each genome (and in our case there's only one)          # The data returned for each genome (and in our case there's only one)
508          # includes the subsystem name and the variant code. The following          # includes the subsystem name and the variant code. The following
509          # statement strips away the variant codes, leaving only the subsystem          # statement strips away the variant codes, leaving only the subsystem
510          # names.          # names.
511          my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];          my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];
512          # Ask for the genes in each subsystem, using NCBI identifiers.          # Ask for the genes in each subsystem, using NCBI identifiers.
513          my $genesHash = $sapServer->ids_in_subsystems(subsystems => $subList,          my $genesHash = $sapServer->ids_in_subsystems(-subsystems => $subList,
514                                                       genome => $genomeID,                                                        -genome => $genomeID,
515                                                       source => 'NCBI',                                                        -source => 'NCBI',
516                                                       roleForm => 'none',                                                        -roleForm => 'none',
517                                                       grouped => 1);                                                        -grouped => 1);
518          # The hash maps each subsystem ID to a comma-delimited list of gene IDs.          # The hash maps each subsystem ID to a comma-delimited list of gene IDs.
519          for my $subsystem (sort keys %$genesHash) {          for my $subsystem (sort keys %$genesHash) {
520              my $genes = $genesHash->{$subsystem};              my $genes = $genesHash->{$subsystem};
# Line 407  Line 522 
522          }          }
523      }      }
524    
525    The sample output in this case looks quite different. The role information is missing,
526    and all the data for a subsystem is in a single line.
527    
528        360108.3    Queuosine-Archaeosine Biosynthesis          85841622, 85841791, 85840661, 85841162, 85841520, 85842244, 85840651, 85841812
529        360108.3    Quinate degradation                         85841653
530        360108.3    RNA polymerase bacterial                    85840760, 85841269, 85841348, 85841887
531        360108.3    RNA processing and degradation, bacterial   85840820, 85841283
532        360108.3    Recycling of Peptidoglycan Amino Acids      85842019, 85842272
533    
534    =head2 A More Complicated Example: Operon Upstream Regions
535    
536    In this example we'll string several services together to perform a more
537    complex task: locating the upstream regions of operons involved in a particular
538    metabolic pathway. The theory is that we can look for a common pattern in
539    these regions.
540    
541    A metabolic pathway is a subsystem, so we'll enter our database via the
542    subsystems. To keep the data manageable, we'll limit our results to
543    specific genomes. The program we write will take as input a subsystem name
544    and a file of genome IDs.
545    
546    The worst part of the task is finding the operon for a gene. This involves
547    finding all the genes in a neighborhood and isolating the ones that point in
548    the correct direction. Fortunately, there is a Sapling Server function--
549    L<SAP/make_runs>-- that specifcally performs this task.
550    
551    To start our program, we create a L<SAPserver> object and pull the subsystem
552    name from the command-line parameters. This program is going to be doing a
553    lot of complicated, long-running stuff, so we'll usually want to deal with one
554    result at a time. To facilitate that, we construct the server helper in
555    singleton mode.
556    
557        use strict;
558        use SAPserver;
559    
560        my $sapServer = SAPserver->new(singleton => 1);
561        # Get the subsystem name.
562        my $ssName = $ARGV[0];
563    
564    Our main loop will read through the list of genomes from the standard input
565    and call a method I<PrintUpstream> to process each one. We're going to be a bit
566    cheesy here and allow our genome loop to stop on the first blank line.
567    
568        while (my $genomeID = <STDIN>) {
569            chomp $genomeID;
570            PrintUpstream($sapServer, $ssName, $genomeID);
571        }
572    
573    Now we need to write I<PrintUpstream>. Its first task is to find all the
574    genes in the genome that belong to the subsystem. A single call to
575    L<SAP/ids_in_subsystems> will get this information. We then feed the
576    results into L<SAP/make_runs> to get operons and call L<SAP/upstream> for
577    each operon. The program is given below.
578    
579        sub PrintUpstream {
580            my ($sapServer, $ssName, $genomeID) = @_;
581            # Because we specify "roleForm => none", we get back one long
582            # gene list.
583            my $geneList = $sapServer->ids_in_subsystems(-subsystems => $ssName,
584                                                         -genome => $genomeID,
585                                                         -roleForm => 'none');
586            # Convert the gene list to a comma-delimited string.
587            my $geneString = join(", ", @$geneList);
588            # Get a list of operons.
589            my $opList = $sapServer->make_runs(-groups => $geneString);
590            # Loop through the operons.
591            for my $opString (@$opList) {
592                # Get the first feature's ID.
593                my ($front) = split /\s*,/, $opString, 2;
594                # Grab its upstream region. We'll include the operon string as the
595                # comment text.
596                my $fasta = $sapServer->upstream(-ids => $front,
597                                                 -comments => { $front => $opString },
598                                                 -skipGene => 1);
599                # Print the result.
600                print "$fasta\n";
601            }
602        }
603    
604    
605  =cut  =cut
606    
607  1;  1;

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.10

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3