[Bio] / FigKernelPackages / SAPtutorial.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/SAPtutorial.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4, Wed Aug 5 19:33:24 2009 UTC revision 1.7, Wed Aug 19 17:04:38 2009 UTC
# Line 7  Line 7 
7    
8  The B<Sapling Server> is a web service that allows you to access data stored in  The B<Sapling Server> is a web service that allows you to access data stored in
9  the Sapling database, a large-scale MySQL database of genetic data. The Sapling  the Sapling database, a large-scale MySQL database of genetic data. The Sapling
10  database contains data on public genomes imported from L<RAST|http://rast.nmpdr.org>  database contains data on public genomes imported from I<RAST> (L<http://rast.nmpdr.org>)
11  and curated by the annotation team of the Fellowship for Interpretation of  and curated by the annotation team of the Fellowship for Interpretation of
12  Genomes.  Genomes.
13    
# Line 32  Line 32 
32      use SAPserver;      use SAPserver;
33    
34      my $sapServer = SAPserver->new();      my $sapServer = SAPserver->new();
35      my $resultHash = $sapServer->taxonomy_of({ ids => ['360108.3', '100226.1'],      my $resultHash = $sapServer->taxonomy_of({ -ids => ['360108.3', '100226.1'],
36                                                 format => 'names' });                                                 -format => 'names' });
37      for my $id (keys %$resultHash) {      for my $id (keys %$resultHash) {
38          my $taxonomy = $resultHash->{$id};          my $taxonomy = $resultHash->{$id};
39          print "$id: " . join(" ", @$taxonomy) . "\n";          print "$id: " . join(" ", @$taxonomy) . "\n";
# Line 53  Line 53 
53  than a hash reference. So, for example, the above I<taxonomy_of> call could  than a hash reference. So, for example, the above I<taxonomy_of> call could
54  also be written like this.  also be written like this.
55    
56      my $resultHash = $sapServer->taxonomy_of(ids => ['360108.3', '100226.1'],      my $resultHash = $sapServer->taxonomy_of(-ids => ['360108.3', '100226.1'],
57                                               format => 'names');                                               -format => 'names');
58    
59  =head2 A Simple Example: Genome Taxonomies  =head2 A Simple Example: Genome Taxonomies
60    
# Line 100  Line 100 
100    
101  Now we ask for the representatives and the taxonomies.  Now we ask for the representatives and the taxonomies.
102    
103      my $representativeHash = $sapServer->representative(ids => $genomeIDs);      my $representativeHash = $sapServer->representative(-ids => $genomeIDs);
104      my $taxonomyHash = $sapServer->taxonomy_of(ids => $genomeIDs,      my $taxonomyHash = $sapServer->taxonomy_of(-ids => $genomeIDs,
105                                                 format => 'names');                                                 -format => 'names');
106    
107  Our data is now contained in a pair of hash tables. The following loop  Our data is now contained in a pair of hash tables. The following loop
108  stiches them together to produce the output.  stiches them together to produce the output.
# Line 147  Line 147 
147      use SAPserver;      use SAPserver;
148    
149      my $sapServer = SAPserver->new(singleton => 1);      my $sapServer = SAPserver->new(singleton => 1);
150      my $genomeIDs = $sapServer->all_genomes(complete => 1);      my $genomeIDs = $sapServer->all_genomes(-complete => 1);
151      $genomeIDs = [ keys %$genomeIDs ];      $genomeIDs = [ keys %$genomeIDs ];
152      for my $genomeID (@$genomeIDs) {      for my $genomeID (@$genomeIDs) {
153          my $repID = $sapServer->representative(ids => $genomeID);          my $repID = $sapServer->representative(-ids => $genomeID);
154          my $taxonomy = $sapServer->taxonomy_of(ids => $genomeID,          my $taxonomy = $sapServer->taxonomy_of(-ids => $genomeID,
155                                                 format => 'names');                                                 -format => 'names');
156          # Format the taxonomy string.          # Format the taxonomy string.
157          my $taxonomyString = join(" ", @$taxonomy);          my $taxonomyString = join(" ", @$taxonomy);
158          # Print the result.          # Print the result.
# Line 176  Line 176 
176  Bacillus halodurans C-125 (I<272558.1>).  Bacillus halodurans C-125 (I<272558.1>).
177    
178  Frequently, however, you will have a list of gene IDs from some other  Frequently, however, you will have a list of gene IDs from some other
179  database (e.g. L<NCBI|http://www.ncbi.nlm.nih.gov>, L<UniProt|http://www.uniprot.org>)  database (e.g. I<NCBI> (L<http://www.ncbi.nlm.nih.gov>), I<UniProt> (L<http://www.uniprot.org>))
180  or in a community format such as Locus Tags or gene names. Most services that  or in a community format such as Locus Tags or gene names. Most services that
181  take gene IDs as input allow you to specify a C<source> option that indicates  take gene IDs as input allow you to specify a C<source> option that indicates
182  the type of IDs being used. The acceptable formats are as follows.  the type of IDs being used. The acceptable formats are as follows.
# Line 185  Line 185 
185    
186  =item CMR  =item CMR
187    
188  L<Comprehensive Microbial Resource ID|http://cmr.jcvi.org>. The CMR IDs for  I<Comprehensive Microbial Resource> (L<http://cmr.jcvi.org>) ID. The CMR IDs for
189  C<fig|272558.1.peg.203> are C<10172815> and C<NTL01BH0204>.  C<fig|272558.1.peg.203> are C<10172815> and C<NTL01BH0204>.
190    
191  =item GENE  =item GENE
# Line 201  Line 201 
201    
202  =item KEGG  =item KEGG
203    
204  L<Kyoto Encyclopedia of Genes and Genomes|http://www.genome.jp/kegg> identifier.  I<Kyoto Encyclopedia of Genes and Genomes> (L<http://www.genome.jp/kegg>) identifier.
205  For example, the KEGG identifier for C<fig|158878.1.peg.2821> is C<sav:SAV2628>.  For example, the KEGG identifier for C<fig|158878.1.peg.2821> is C<sav:SAV2628>.
206    
207  =item LocusTag  =item LocusTag
# Line 211  Line 211 
211    
212  =item NCBI  =item NCBI
213    
214  L<NCBI|http://www.ncbi.nlm.nih.gov> accession number. The accession numbers for  I<NCBI> (L<http://www.ncbi.nlm.nih.gov>) accession number. The accession numbers for
215  C<fig|272558.1.peg.203> are C<10172815>, C<15612766>, and C<81788207>.  C<fig|272558.1.peg.203> are C<10172815>, C<15612766>, and C<81788207>.
216    
217  =item RefSeq  =item RefSeq
218    
219  L<NCBI|http://www.ncbi.nlm.nih.gov> reference sequence identifier. The RefSeq  I<NCBI> (L<http://www.ncbi.nlm.nih.gov>) reference sequence identifier. The RefSeq
220  identifier for C<fig|272558.1.peg.203> is C<NP_241069.1>.  identifier for C<fig|272558.1.peg.203> is C<NP_241069.1>.
221    
222  =item SEED  =item SEED
# Line 225  Line 225 
225    
226  =item SwissProt  =item SwissProt
227    
228  L<SwissProt|http://www.expasy.ch/sprot> identifier. For example, the SwissProt  I<SwissProt> (L<http://www.expasy.ch/sprot>) identifier. For example, the SwissProt
229  identifier for C<fig|243277.1.peg.3952> is C<O31153>.  identifier for C<fig|243277.1.peg.3952> is C<O31153>.
230    
231  =item UniProt  =item UniProt
232    
233  L<UniProt|http://www.uniprot.org> identifier. The UniProt identifiers for  I<UniProt> (L<http://www.uniprot.org>) identifier. The UniProt identifiers for
234  C<fig|272558.1.peg.203> are C<Q9KGA9> and C<Q9KGA9_BACHD>.  C<fig|272558.1.peg.203> are C<Q9KGA9> and C<Q9KGA9_BACHD>.
235    
236  =back  =back
# Line 239  Line 239 
239  for the source type. In this case, however, care must be taken, because the same  for the source type. In this case, however, care must be taken, because the same
240  identifier can have different meanings in different databases.  identifier can have different meanings in different databases.
241    
242  =head2 Normal Mode Examples  =head2 Two Normal-Mode Examples
243    
244  The following examples use the Sapling Server in normal mode: that is, data  The following examples use the Sapling Server in normal mode: that is, data
245  is sent to the server in batches and the results are stitched together  is sent to the server in batches and the results are stitched together
# Line 282  Line 282 
282      # Read all the gene IDs.      # Read all the gene IDs.
283      my @genes = map { chomp; $_ } <STDIN>;      my @genes = map { chomp; $_ } <STDIN>;
284      # Compute the functional roles.      # Compute the functional roles.
285      my $results = $sapServer->ids_to_functions(ids => \@genes, source => 'UniProt');      my $results = $sapServer->ids_to_functions(-ids => \@genes, -source => 'UniProt');
286      # Loop through the genes.      # Loop through the genes.
287      for my $gene (@genes) {      for my $gene (@genes) {
288          # Did we find a result?          # Did we find a result?
# Line 327  Line 327 
327      # Read all the gene IDs.      # Read all the gene IDs.
328      my @genes = map { chomp; $_ } <STDIN>;      my @genes = map { chomp; $_ } <STDIN>;
329      # Compute the functional roles.      # Compute the functional roles.
330      my $results = $sapServer->ids_to_subsystems(ids => \@genes, source => 'UniProt');      my $results = $sapServer->ids_to_subsystems(-ids => \@genes, -source => 'UniProt');
331      # Loop through the genes.      # Loop through the genes.
332      for my $gene (@genes) {      for my $gene (@genes) {
333          # Did we find a result?          # Did we find a result?
# Line 407  Line 407 
407      while (my $genomeID = <STDIN>) {      while (my $genomeID = <STDIN>) {
408          chomp $genomeID;          chomp $genomeID;
409          # Get the subsystems for this genome.          # Get the subsystems for this genome.
410          my $subHash = $sapServer->genomes_to_subsystems(ids => $genomeID);          my $subHash = $sapServer->genomes_to_subsystems(-ids => $genomeID);
411          # The data returned for each genome (and in our case there's only one)          # The data returned for each genome (and in our case there's only one)
412          # includes the subsystem name and the variant code. The following          # includes the subsystem name and the variant code. The following
413          # statement strips away the variant codes, leaving only the subsystem          # statement strips away the variant codes, leaving only the subsystem
414          # names.          # names.
415          my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];          my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];
416          # Ask for the genes in each subsystem, using NCBI identifiers.          # Ask for the genes in each subsystem, using NCBI identifiers.
417          my $roleHashes = $sapServer->ids_in_subsystems(subsystems => $subList,          my $roleHashes = $sapServer->ids_in_subsystems(-subsystems => $subList,
418                                                       genome => $genomeID,                                                         -genome => $genomeID,
419                                                       source => 'NCBI',                                                         -source => 'NCBI',
420                                                       roleForm => 'full');                                                         -roleForm => 'full');
421          # The hash maps each subsystem ID to a hash of roles to lists of feature          # The hash maps each subsystem ID to a hash of roles to lists of feature
422          # IDs. We therefore use three levels of nested loops to produce our          # IDs. We therefore use three levels of nested loops to produce our
423          # output lines. At the top level we have a hash mapping subsystem IDs          # output lines. At the top level we have a hash mapping subsystem IDs
# Line 467  Line 467 
467      while (my $genomeID = <STDIN>) {      while (my $genomeID = <STDIN>) {
468          chomp $genomeID;          chomp $genomeID;
469          # Get the subsystems for this genome.          # Get the subsystems for this genome.
470          my $subHash = $sapServer->genomes_to_subsystems(ids => $genomeID);          my $subHash = $sapServer->genomes_to_subsystems(-ids => $genomeID);
471          # The data returned for each genome (and in our case there's only one)          # The data returned for each genome (and in our case there's only one)
472          # includes the subsystem name and the variant code. The following          # includes the subsystem name and the variant code. The following
473          # statement strips away the variant codes, leaving only the subsystem          # statement strips away the variant codes, leaving only the subsystem
474          # names.          # names.
475          my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];          my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];
476          # Ask for the genes in each subsystem, using NCBI identifiers.          # Ask for the genes in each subsystem, using NCBI identifiers.
477          my $genesHash = $sapServer->ids_in_subsystems(subsystems => $subList,          my $genesHash = $sapServer->ids_in_subsystems(-subsystems => $subList,
478                                                       genome => $genomeID,                                                        -genome => $genomeID,
479                                                       source => 'NCBI',                                                        -source => 'NCBI',
480                                                       roleForm => 'none',                                                        -roleForm => 'none',
481                                                       grouped => 1);                                                        -grouped => 1);
482          # The hash maps each subsystem ID to a comma-delimited list of gene IDs.          # The hash maps each subsystem ID to a comma-delimited list of gene IDs.
483          for my $subsystem (sort keys %$genesHash) {          for my $subsystem (sort keys %$genesHash) {
484              my $genes = $genesHash->{$subsystem};              my $genes = $genesHash->{$subsystem};
# Line 495  Line 495 
495      360108.3    RNA processing and degradation, bacterial   85840820, 85841283      360108.3    RNA processing and degradation, bacterial   85840820, 85841283
496      360108.3    Recycling of Peptidoglycan Amino Acids      85842019, 85842272      360108.3    Recycling of Peptidoglycan Amino Acids      85842019, 85842272
497    
498    =head2 A More Complicated Example: Operon Upstream Regions
499    
500    In this example we'll string several services together to perform a more
501    complex task: locating the upstream regions of operons involved in a particular
502    metabolic pathway. The theory is that we can look for a common pattern in
503    these regions.
504    
505    A metabolic pathway is a subsystem, so we'll enter our database via the
506    subsystems. To keep the data manageable, we'll limit our results to
507    specific genomes. The program we write will take as input a subsystem name
508    and a file of genome IDs.
509    
510    The worst part of the task is finding the operon for a gene. This involves
511    finding all the genes in a neighborhood and isolating the ones that point in
512    the correct direction. Fortunately, there is a Sapling Server function--
513    L<SAP/make_runs>-- that specifcally performs this task.
514    
515    To start our program, we create a L<SAPserver> object and pull the subsystem
516    name from the command-line parameters. This program is going to be doing a
517    lot of complicated, long-running stuff, so we'll usually want to deal with one
518    result at a time. To facilitate that, we construct the server helper in
519    singleton mode.
520    
521        use strict;
522        use SAPserver;
523    
524        my $sapServer = SAPserver->new(singleton => 1);
525        # Get the subsystem name.
526        my $ssName = $ARGV[0];
527    
528    Our main loop will read through the list of genomes from the standard input
529    and call a method I<PrintUpstream> to process each one. We're going to be a bit
530    cheesy here and allow our genome loop to stop on the first blank line.
531    
532        while (my $genomeID = <STDIN>) {
533            chomp $genomeID;
534            PrintUpstream($sapServer, $ssName, $genomeID);
535        }
536    
537    Now we need to write I<PrintUpstream>. Its first task is to find all the
538    genes in the genome that belong to the subsystem. A single call to
539    L<SAP/ids_in_subsystems> will get this information. We then feed the
540    results into L<SAP/make_runs> to get operons and call L<SAP/upstream> for
541    each operon. The program is given below.
542    
543        sub PrintUpstream {
544            my ($sapServer, $ssName, $genomeID) = @_;
545            # Because we specify "roleForm => none", we get back one long
546            # gene list.
547            my $geneList = $sapServer->ids_in_subsystems(-subsystems => $ssName,
548                                                         -genome => $genomeID,
549                                                         -roleForm => 'none');
550            # Convert the gene list to a comma-delimited string.
551            my $geneString = join(", ", @$geneList);
552            # Get a list of operons.
553            my $opList = $sapServer->make_runs(-groups => $geneString);
554            # Loop through the operons.
555            for my $opString (@$opList) {
556                # Get the first feature's ID.
557                my ($front) = split /\s*,/, $opString, 2;
558                # Grab its upstream region. We'll include the operon string as the
559                # comment text.
560                my $fasta = $sapServer->upstream(-ids => $front,
561                                                 -comments => { $front => $opString },
562                                                 -skipGene => 1);
563                # Print the result.
564                print "$fasta\n";
565            }
566        }
567    
568    
569  =cut  =cut
570    
571  1;  1;

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.7

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3