[Bio] / FigKernelPackages / SAPtutorial.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/SAPtutorial.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Mon Aug 3 21:31:42 2009 UTC revision 1.6, Mon Aug 10 17:42:33 2009 UTC
# Line 7  Line 7 
7    
8  The B<Sapling Server> is a web service that allows you to access data stored in  The B<Sapling Server> is a web service that allows you to access data stored in
9  the Sapling database, a large-scale MySQL database of genetic data. The Sapling  the Sapling database, a large-scale MySQL database of genetic data. The Sapling
10  database contains data on public genomes imported from L<RAST|http://rast.nmpdr.org>  database contains data on public genomes imported from I<RAST> (L<http://rast.nmpdr.org>)
11  and curated by the annotation team of the Fellowship for Interpretation of  and curated by the annotation team of the Fellowship for Interpretation of
12  Genomes.  Genomes.
13    
# Line 28  Line 28 
28  you create a B<SAPserver> object and call a method with the same name as the  you create a B<SAPserver> object and call a method with the same name as the
29  service.  service.
30    
31        use strict;
32      use SAPserver;      use SAPserver;
33    
34      my $sapServer = SAPserver->new();      my $sapServer = SAPserver->new();
# Line 35  Line 36 
36                                                 format => 'names' });                                                 format => 'names' });
37      for my $id (keys %$resultHash) {      for my $id (keys %$resultHash) {
38          my $taxonomy = $resultHash->{$id};          my $taxonomy = $resultHash->{$id};
39          print $id, join(" ", @$taxonomy);          print "$id: " . join(" ", @$taxonomy) . "\n";
40      }      }
41    
42    The output from this program (reformatted slightly for readability) is shown below.
43    
44        360108.3: Bacteria, Proteobacteria, delta/epsilon subdivisions, Epsilonproteobacteria,
45                  Campylobacterales, Campylobacteraceae, Campylobacter, Campylobacter jejuni,
46                  Campylobacter jejuni subsp. jejuni, Campylobacter jejuni subsp. jejuni 260.94
47    
48        100226.1: Bacteria, Actinobacteria, Actinobacteria (class), Actinobacteridae,
49                  Actinomycetales, Streptomycineae, Streptomycetaceae, Streptomyces,
50                  Streptomyces coelicolor, Streptomyces coelicolor A3(2)
51    
52  For convenience, you can specify the parameters as a simple hash rather  For convenience, you can specify the parameters as a simple hash rather
53  than a hash reference. So, for example, the above I<taxonomy_of> call could  than a hash reference. So, for example, the above I<taxonomy_of> call could
54  also be written like this.  also be written like this.
# Line 105  Line 116 
116          print join("\t", $genomeID, $repID, $taxonomyString) . "\n";          print join("\t", $genomeID, $repID, $taxonomyString) . "\n";
117      }      }
118    
119    An excerpt from the output of this script is shown below. The first column contains
120    a genome ID, the second contains the representative genome's ID, and the third is
121    the full taxonomy. Note that the two genomes with very close taxonomies have the
122    same representative genome: this is the expected vehavior.
123    
124        221109.1    221109.1    Bacteria Firmicutes Bacilli Bacillales Bacillaceae Oceanobacillus Oceanobacillus iheyensis Oceanobacillus iheyensis HTE831
125        204722.1    204722.1    Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Brucellaceae Brucella Brucella suis Brucella suis 1330
126        391037.3    369723.3    Bacteria Actinobacteria Actinobacteria (class) Actinobacteridae Actinomycetales Micromonosporineae Micromonosporaceae Salinispora Salinispora arenicola Salinispora arenicola CNS205
127        339670.3    216591.1    Bacteria Proteobacteria Betaproteobacteria Burkholderiales Burkholderiaceae Burkholderia Burkholderia cepacia complex Burkholderia cepacia Burkholderia cepacia AMMD
128        272560.3    216591.1    Bacteria Proteobacteria Betaproteobacteria Burkholderiales Burkholderiaceae Burkholderia pseudomallei group Burkholderia pseudomallei Burkholderia pseudomallei K96243
129        262768.1    262768.1    Bacteria Firmicutes Mollicutes Acholeplasmatales Acholeplasmataceae Candidatus Phytoplasma Candidatus Phytoplasma asteris Onion yellows phytoplasma Onion yellows phytoplasma OY-M
130        272624.3    272624.3    Bacteria Proteobacteria Gammaproteobacteria Legionellales Legionellaceae Legionella Legionella pneumophila Legionella pneumophila subsp. pneumophila Legionella pneumophila subsp. pneumophila str. Philadelphia 1
131        150340.3    150340.3    Bacteria Proteobacteria Gammaproteobacteria Vibrionales Vibrionaceae Vibrio Vibrio sp. Ex25
132        205914.1    205914.1    Bacteria Proteobacteria Gammaproteobacteria Pasteurellales Pasteurellaceae Histophilus Histophilus somni Haemophilus somnus 129PT
133        393117.3    169963.1    Bacteria Firmicutes Bacilli Bacillales Listeriaceae Listeria Listeria monocytogenes Listeria monocytogenes FSL J1-194
134        203119.1    203119.1    Bacteria Firmicutes Clostridia Clostridiales Clostridiaceae Clostridium Clostridium thermocellum Clostridium thermocellum ATCC 27405
135        380703.5    380703.5    Bacteria Proteobacteria Gammaproteobacteria Aeromonadales Aeromonadaceae Aeromonas Aeromonas hydrophila Aeromonas hydrophila subsp. hydrophila Aeromonas hydrophila subsp. hydrophila ATCC 7966
136        259536.4    259536.4    Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Moraxellaceae Psychrobacter Psychrobacter arcticus Psychrobacter arcticus 273-4
137    
138  The Sapling Server processes lists of data (in this case a list of genome IDs)  The Sapling Server processes lists of data (in this case a list of genome IDs)
139  so that you can minimize the overhead of sending requests across the web. You  so that you can minimize the overhead of sending requests across the web. You
140  can, however, specify a single ID instead of a list to a method call, and  can, however, specify a single ID instead of a list to a method call, and
# Line 146  Line 176 
176  Bacillus halodurans C-125 (I<272558.1>).  Bacillus halodurans C-125 (I<272558.1>).
177    
178  Frequently, however, you will have a list of gene IDs from some other  Frequently, however, you will have a list of gene IDs from some other
179  database (e.g. L<NCBI|http://www.ncbi.nlm.nih.gov>, L<UniProt|http://www.uniprot.org>)  database (e.g. I<NCBI> (L<http://www.ncbi.nlm.nih.gov>), I<UniProt> (L<http://www.uniprot.org>))
180  or in a community format such as Locus Tags or gene names. Most services that  or in a community format such as Locus Tags or gene names. Most services that
181  take gene IDs as input allow you to specify a C<source> option that indicates  take gene IDs as input allow you to specify a C<source> option that indicates
182  the type of IDs being used. The acceptable formats are as follows.  the type of IDs being used. The acceptable formats are as follows.
# Line 155  Line 185 
185    
186  =item CMR  =item CMR
187    
188  L<Comprehensive Microbial Resource ID|http://cmr.jcvi.org>. The CMR IDs for  I<Comprehensive Microbial Resource> (L<http://cmr.jcvi.org>) ID. The CMR IDs for
189  C<fig|272558.1.peg.203> are C<10172815> and C<NTL01BH0204>.  C<fig|272558.1.peg.203> are C<10172815> and C<NTL01BH0204>.
190    
191  =item GENE  =item GENE
# Line 171  Line 201 
201    
202  =item KEGG  =item KEGG
203    
204  L<Kyoto Encyclopedia of Genes and Genomes|http://www.genome.jp/kegg> identifier.  I<Kyoto Encyclopedia of Genes and Genomes> (L<http://www.genome.jp/kegg>) identifier.
205  For example, the KEGG identifier for C<fig|158878.1.peg.2821> is C<sav:SAV2628>.  For example, the KEGG identifier for C<fig|158878.1.peg.2821> is C<sav:SAV2628>.
206    
207  =item LocusTag  =item LocusTag
# Line 181  Line 211 
211    
212  =item NCBI  =item NCBI
213    
214  L<NCBI|http://www.ncbi.nlm.nih.gov> accession number. The accession numbers for  I<NCBI> (L<http://www.ncbi.nlm.nih.gov>) accession number. The accession numbers for
215  C<fig|272558.1.peg.203> are C<10172815>, C<15612766>, and C<81788207>.  C<fig|272558.1.peg.203> are C<10172815>, C<15612766>, and C<81788207>.
216    
217  =item RefSeq  =item RefSeq
218    
219  L<NCBI|http://www.ncbi.nlm.nih.gov> reference sequence identifier. The RefSeq  I<NCBI> (L<http://www.ncbi.nlm.nih.gov>) reference sequence identifier. The RefSeq
220  identifier for C<fig|272558.1.peg.203> is C<NP_241069.1>.  identifier for C<fig|272558.1.peg.203> is C<NP_241069.1>.
221    
222  =item SEED  =item SEED
# Line 195  Line 225 
225    
226  =item SwissProt  =item SwissProt
227    
228  L<SwissProt|http://www.expasy.ch/sprot> identifier. For example, the SwissProt  I<SwissProt> (L<http://www.expasy.ch/sprot>) identifier. For example, the SwissProt
229  identifier for C<fig|243277.1.peg.3952> is C<O31153>.  identifier for C<fig|243277.1.peg.3952> is C<O31153>.
230    
231  =item UniProt  =item UniProt
232    
233  L<UniProt|http://www.uniprot.org> identifier. The UniProt identifiers for  I<UniProt> (L<http://www.uniprot.org>) identifier. The UniProt identifiers for
234  C<fig|272558.1.peg.203> are C<Q9KGA9> and C<Q9KGA9_BACHD>.  C<fig|272558.1.peg.203> are C<Q9KGA9> and C<Q9KGA9_BACHD>.
235    
236  =back  =back
# Line 209  Line 239 
239  for the source type. In this case, however, care must be taken, because the same  for the source type. In this case, however, care must be taken, because the same
240  identifier can have different meanings in different databases.  identifier can have different meanings in different databases.
241    
242  =head2 Normal Mode Examples  =head2 Two Normal-Mode Examples
243    
244  The following examples use the Sapling Server in normal mode: that is, data  The following examples use the Sapling Server in normal mode: that is, data
245  is sent to the server in batches and the results are stitched together  is sent to the server in batches and the results are stitched together
# Line 266  Line 296 
296          }          }
297      }      }
298    
299    Sample output from this script is shown below. Note that one of the input IDs was
300    not found.
301    
302        HYPA_ECO57      [NiFe] hydrogenase nickel incorporation protein HypA
303        17KD_RICBR      rickettsial 17 kDa surface antigen precursor
304        18KD_MYCLE      18 KDA antigen (HSP 16.7)
305        P72620_SYNY3    [NiFe] hydrogenase metallocenter assembly protein HypD
306        1A14_ARATH      1-aminocyclopropane-1-carboxylate synthase 4 / ACC synthase 4 (ACS4) / identical to gi:940370 [GB:U23481]; go_function: 1-aminocyclopropane-1-carboxylate synthase activity [goid 0016847]; go_process: ethylene biosynthesis [goid 0009693]; go_process: response to auxin stimulus [goid 0009733]
307        Q2RXN5_RHORT    [NiFe] hydrogenase metallocenter assembly protein HypE
308        O29118          Glutamate N-acetyltransferase (EC 2.3.1.35) / N-acetylglutamate synthase (EC 2.3.1.1)
309        Q8PZM3          tRNA nucleotidyltransferase (EC 2.7.7.25)
310    
311        Q8YY27_ANASP was not found.
312    
313  B<ids_to_subsystems> returns roles in subsystems. Roles in subsystems have  B<ids_to_subsystems> returns roles in subsystems. Roles in subsystems have
314  several differences from general functional roles. A single gene may be in  several differences from general functional roles. A single gene may be in
315  multiple subsystems and may have multiple roles in a subsystem. In addition,  multiple subsystems and may have multiple roles in a subsystem. In addition,
# Line 299  Line 343 
343          }          }
344      }      }
345    
346    Sample output from this script is shown below. In this case, four of the input IDs
347    failed to find a result; however, two of them (C<O29118> and C<Q8PZM3>) produced multiple
348    results.
349    
350        HYPA_ECO57      [NiFe] hydrogenase nickel incorporation protein HypA    (NiFe hydrogenase maturation)
351        P72620_SYNY3    [NiFe] hydrogenase metallocenter assembly protein HypD  (NiFe hydrogenase maturation)
352        Q2RXN5_RHORT    [NiFe] hydrogenase metallocenter assembly protein HypE  (NiFe hydrogenase maturation)
353        O29118  N-acetylglutamate synthase (EC 2.3.1.1) (Arginine Biosynthesis extended)
354        O29118  Glutamate N-acetyltransferase (EC 2.3.1.35)     (Arginine Biosynthesis extended)
355        O29118  N-acetylglutamate synthase (EC 2.3.1.1) (Arginine Biosynthesis)
356        O29118  Glutamate N-acetyltransferase (EC 2.3.1.35)     (Arginine Biosynthesis)
357        Q8PZM3  tRNA nucleotidyltransferase (EC 2.7.7.25)       (CBSS-316057.3.peg.1294)
358        Q8PZM3  tRNA nucleotidyltransferase (EC 2.7.7.25)       (tRNA nucleotidyltransferase)
359    
360        17KD_RICBR is not in a subsystem.
361        Q8YY27_ANASP is not in a subsystem.
362        18KD_MYCLE is not in a subsystem.
363        1A14_ARATH is not in a subsystem.
364    
365    =head3 Genes in Subsystems for Genomes
366    
367    This next example finds all genes in subsystems for a specific genome. We will
368    perform this operation in two phases. First, we will find the subsystems for
369    each genome, and then the genes for those subsystems. This requires two Sapling
370    Server functions.
371    
372    =over 4
373    
374    =item *
375    
376    L<SAP/genomes_to_subsystems> returns a list of the subsystems for each
377    specified genome.
378    
379    =item *
380    
381    L<SAP/ids_in_subsystems> returns a list of the genes in each listed
382    subsystem for a specified genome.
383    
384    =back
385    
386    Our example program will loop through the genome IDs in an input file. For
387    each genome, it will call I<genomes_to_subsystems> to get the subsystem list,
388    and then feed the list to I<ids_in_subsystems> to get the result.
389    
390    L<SAP/ids_in_subsystems> returns gene IDs rather than taking them as input.
391    Like L<SAP/ids_to_subsystems> and L<SAP/ids_to_functions>, it takes a C<source>
392    parameter that indicates the type of ID desired (e.g. C<NCBI>, C<CMR>, C<LocusTag>).
393    In this case, however, the type describes how the gene IDs will be formatted in
394    the output rather than the type expected upon input. If a gene does not have an
395    ID for a particular source type (e.g. C<fig|100226.1.peg.3361> does not have a
396    locus tag), then the FIG identifier is used. The default source type (C<SEED>)
397    means that FIG identifiers will be used for everything.
398    
399    The program is given below.
400    
401        use strict;
402        use SAPserver;
403    
404        my $sapServer = SAPserver->new();
405        # Loop through the input file. Note that this loop will stop on the first
406        # blank line.
407        while (my $genomeID = <STDIN>) {
408            chomp $genomeID;
409            # Get the subsystems for this genome.
410            my $subHash = $sapServer->genomes_to_subsystems(ids => $genomeID);
411            # The data returned for each genome (and in our case there's only one)
412            # includes the subsystem name and the variant code. The following
413            # statement strips away the variant codes, leaving only the subsystem
414            # names.
415            my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];
416            # Ask for the genes in each subsystem, using NCBI identifiers.
417            my $roleHashes = $sapServer->ids_in_subsystems(subsystems => $subList,
418                                                         genome => $genomeID,
419                                                         source => 'NCBI',
420                                                         roleForm => 'full');
421            # The hash maps each subsystem ID to a hash of roles to lists of feature
422            # IDs. We therefore use three levels of nested loops to produce our
423            # output lines. At the top level we have a hash mapping subsystem IDs
424            # to role hashes.
425            for my $subsystem (sort keys %$roleHashes) {
426                my $geneHash = $roleHashes->{$subsystem};
427                # The gene hash maps each role to a list of gene IDs.
428                for my $role (sort keys %$geneHash) {
429                    my $geneList = $geneHash->{$role};
430                    # Finally, we loop through the gene IDs.
431                    for my $gene (@$geneList) {
432                        print "$genomeID\t$gene\t$subsystem\t$role\n";
433                    }
434                }
435            }
436        }
437    
438    An excerpt of the output is shown here.
439    
440        360108.3    85840651    Queuosine-Archaeosine Biosynthesis          Queuosine Biosynthesis QueC ATPase
441        360108.3    85841812    Queuosine-Archaeosine Biosynthesis          Queuosine Biosynthesis QueE Radical SAM
442        360108.3    85841520    Queuosine-Archaeosine Biosynthesis          Queuosine biosynthesis QueD, PTPS-I
443        360108.3    85841162    Queuosine-Archaeosine Biosynthesis          S-adenosylmethionine:tRNA ribosyltransferase-isomerase (EC 5.-.-.-)
444        360108.3    85842244    Queuosine-Archaeosine Biosynthesis          tRNA-guanine transglycosylase (EC 2.4.2.29)
445        360108.3    85841653    Quinate degradation                         3-dehydroquinate dehydratase II (EC 4.2.1.10)
446        360108.3    85840760    RNA polymerase bacterial                    DNA-directed RNA polymerase alpha subunit (EC 2.7.7.6)
447        360108.3    85841269    RNA polymerase bacterial                    DNA-directed RNA polymerase beta subunit (EC 2.7.7.6)
448        360108.3    85841348    RNA polymerase bacterial                    DNA-directed RNA polymerase beta' subunit (EC 2.7.7.6)
449        360108.3    85841887    RNA polymerase bacterial                    DNA-directed RNA polymerase omega subunit (EC 2.7.7.6)
450        360108.3    85841283    RNA processing and degradation, bacterial   3'-to-5' exoribonuclease RNase R
451        360108.3    85840820    RNA processing and degradation, bacterial   Ribonuclease III (EC 3.1.26.3)
452        360108.3    85842272    Recycling of Peptidoglycan Amino Acids      Aminoacyl-histidine dipeptidase (Peptidase D) (EC 3.4.13.3)
453    
454    The I<ids_in_subsystems> service has several useful options for changing the nature
455    of the output. For example, in the above program each role is represented by a
456    full description (C<roleForm> set to C<full>). If you don't need the roles, you
457    can specify C<none> for the role form. You can also request that the gene IDs
458    be returned in a comma-separated list instead of a list data structure. These
459    two changes can drastically simplify the above program.
460    
461        use strict;
462        use SAPserver;
463    
464        my $sapServer = SAPserver->new();
465        # Loop through the input file. Note that this loop will stop on the first
466        # blank line.
467        while (my $genomeID = <STDIN>) {
468            chomp $genomeID;
469            # Get the subsystems for this genome.
470            my $subHash = $sapServer->genomes_to_subsystems(ids => $genomeID);
471            # The data returned for each genome (and in our case there's only one)
472            # includes the subsystem name and the variant code. The following
473            # statement strips away the variant codes, leaving only the subsystem
474            # names.
475            my $subList = [ map { $_->[0] } @{$subHash->{$genomeID}} ];
476            # Ask for the genes in each subsystem, using NCBI identifiers.
477            my $genesHash = $sapServer->ids_in_subsystems(subsystems => $subList,
478                                                         genome => $genomeID,
479                                                         source => 'NCBI',
480                                                         roleForm => 'none',
481                                                         grouped => 1);
482            # The hash maps each subsystem ID to a comma-delimited list of gene IDs.
483            for my $subsystem (sort keys %$genesHash) {
484                my $genes = $genesHash->{$subsystem};
485                print "$genomeID\t$subsystem\t$genes\n";
486            }
487        }
488    
489    The sample output in this case looks quite different. The role information is missing,
490    and all the data for a subsystem is in a single line.
491    
492        360108.3    Queuosine-Archaeosine Biosynthesis          85841622, 85841791, 85840661, 85841162, 85841520, 85842244, 85840651, 85841812
493        360108.3    Quinate degradation                         85841653
494        360108.3    RNA polymerase bacterial                    85840760, 85841269, 85841348, 85841887
495        360108.3    RNA processing and degradation, bacterial   85840820, 85841283
496        360108.3    Recycling of Peptidoglycan Amino Acids      85842019, 85842272
497    
498    =head2 A More Complicated Example: Operon Upstream Regions
499    
500    In this example we'll string several services together to perform a more
501    complex task: locating the upstream regions of operons involved in a particular
502    metabolic pathway. The theory is that we can look for a common pattern in
503    these regions.
504    
505    A metabolic pathway is a subsystem, so we'll enter our database via the
506    subsystems. To keep the data manageable, we'll limit our results to
507    specific genomes. The program we write will take as input a subsystem name
508    and a file of genome IDs.
509    
510    The worst part of the task is finding the operon for a gene. This involves
511    finding all the genes in a neighborhood and isolating the ones that point in
512    the correct direction. Fortunately, there is a Sapling Server function--
513    L<SAP/make_runs>-- that specifcally performs this task.
514    
515    To start our program, we create a L<SAPserver> object and pull the subsystem
516    name from the command-line parameters. This program is going to be doing a
517    lot of complicated, long-running stuff, so we'll usually want to deal with one
518    result at a time. To facilitate that, we construct the server helper in
519    singleton mode.
520    
521        use strict;
522        use SAPserver;
523    
524        my $sapServer = SAPserver->new(singleton => 1);
525        # Get the subsystem name.
526        my $ssName = $ARGV[0];
527    
528    Our main loop will read through the list of genomes from the standard input
529    and call a method I<PrintUpstream> to process each one. We're going to be a bit
530    cheesy here and allow our genome loop to stop on the first blank line.
531    
532        while (my $genomeID = <STDIN>) {
533            chomp $genomeID;
534            PrintUpstream($sapServer, $ssName, $genomeID);
535        }
536    
537    Now we need to write I<PrintUpstream>. Its first task is to find all the
538    genes in the genome that belong to the subsystem. A single call to
539    L<SAP/ids_in_subsystems> will get this information. We then feed the
540    results into L<SAP/make_runs> to get operons and call L<SAP/upstream> for
541    each operon. The program is given below.
542    
543        sub PrintUpstream {
544            my ($sapServer, $ssName, $genomeID) = @_;
545            # Because we specify "roleForm => none", we get back one long
546            # gene list.
547            my $geneList = $sapServer->ids_in_subsystems(subsystems => $ssName,
548                                                         genome => $genomeID,
549                                                         roleForm => 'none');
550            # Convert the gene list to a comma-delimited string.
551            my $geneString = join(", ", @$geneList);
552            # Get a list of operons.
553            my $opList = $sapServer->make_runs(groups => $geneString);
554            # Loop through the operons.
555            for my $opString (@$opList) {
556                # Get the first feature's ID.
557                my ($front) = split /\s*,/, $opString, 2;
558                # Grab its upstream region. We'll include the operon string as the
559                # comment text.
560                my $fasta = $sapServer->upstream(ids => $front,
561                                                 comments => { $front => $opString },
562                                                 skipGene => 1);
563                # Print the result.
564                print "$fasta\n";
565            }
566        }
567    
568    
569  =cut  =cut

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.6

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3