[Bio] / Sprout / GenomeSaplingLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/GenomeSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package GenomeSaplingLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use base 'BaseSaplingLoader';
26 :    
27 :     =head1 Sapling Genome Load Group Class
28 :    
29 :     =head2 Introduction
30 :    
31 :     The Load Group includes all of the major genome-related tables.
32 :    
33 :     =head3 new
34 :    
35 :     my $sl = GenomeSaplingLoader->new($erdb, $options);
36 :    
37 :     Construct a new GenomeSaplingLoader object.
38 :    
39 :     =over 4
40 :    
41 :     =item erdb
42 :    
43 :     [[SaplingPm]] object for the database being loaded.
44 :    
45 :     =item options
46 :    
47 :     Reference to a hash of command-line options.
48 :    
49 :     =item tables
50 :    
51 :     List of tables in this load group.
52 :    
53 :     =back
54 :    
55 :     =cut
56 :    
# Constructor for the genome load group.
#
# Parameters:
#   $class   - name of the class being constructed.
#   $erdb    - database object, handed unchanged to the base-class constructor.
#   $options - reference to a hash of command-line options, also handed through.
#
# Returns the object produced by BaseSaplingLoader::new for this class.
sub new {
    my ($class, $erdb, $options) = @_;
    # The tables owned by this load group are fixed. They are handed to the
    # base class in sorted order.
    my @groupTables = qw(Genome IsMadeUpOf IsTaxonomyOf TaxonomicGrouping
                         IsGroupContaining DnaSequence DnaSequenceBases);
    @groupTables = sort @groupTables;
    # All real construction work is delegated to the base class.
    my $loader = BaseSaplingLoader::new($class, $erdb, $options, @groupTables);
    return $loader;
}
68 :    
69 :     =head2 Public Methods
70 :    
71 :     =head3 Generate
72 :    
73 :     $sl->Generate();
74 :    
75 :     Generate the data for the genome-related files.
76 :    
77 :     =cut
78 :    
# Produce the load data for the section this loader is currently working on.
#
# The global section yields the taxonomic hierarchy; every other section is
# treated as a genome section whose section ID is the genome ID.
sub Generate {
    my ($self) = @_;
    # Global section: build the taxonomy data.
    return $self->CreateTaxonomies() if $self->global();
    # Otherwise the section ID names the genome to be generated.
    my $sectionID = $self->section();
    return $self->PlaceGenome($sectionID);
}
93 :    
94 :     =head3 CreateTaxonomies
95 :    
96 :     $sl->CreateTaxonomies();
97 :    
98 :     Generate the taxonomy hierarchy. This includes the TaxonomicGrouping,
99 :     IsClassOf, and IsTaxonomyOf tables.
100 :    
101 :     =cut
102 :    
# Emit the taxonomy data: one IsTaxonomyOf record per (group, genome) pair,
# one TaxonomicGrouping record per distinct group, and one IsGroupContaining
# record linking each non-top-level group to its parent.
sub CreateTaxonomies {
    my ($self) = @_;
    # The source object is reached through the database object.
    my $sapling = $self->db();
    my $fig = $sapling->GetSourceObject();
    # Maps each taxonomic group name to the name of the group that preceded
    # it in the most recently processed taxonomy, or undef when the group
    # came first in that taxonomy.
    my %parentOf;
    # First pass: walk every genome's taxonomy string in genome-ID order.
    my @genomeIDs = sort keys %{$sapling->GenomeHash()};
    foreach my $genomeID (@genomeIDs) {
        $self->Track(Organisms => $genomeID, 100);
        # The genome's own name can show up inside its taxonomy string, so
        # it is fetched here in order to be skipped below.
        my $organismName = $fig->genus_species($genomeID);
        my @lineage = split /\s*;\s*/, $fig->taxonomy_of($genomeID);
        # Each surviving group is tied to the genome with its ordinal
        # position, and the group seen just before it is remembered as its
        # parent.
        my $enclosing = undef;
        my $position = 0;
        foreach my $group (@lineage) {
            next if $group eq $organismName;
            $parentOf{$group} = $enclosing;
            $enclosing = $group;
            $self->PutR(IsTaxonomyOf => $group, $genomeID,
                        sequence => $position++);
        }
    }
    # Second pass: write the TaxonomicGrouping and IsGroupContaining
    # records, visiting the groups in name order.
    foreach my $group (sort keys %parentOf) {
        $self->Track(TaxonomicGroupings => $group, 100);
        my $container = $parentOf{$group};
        # A group with no recorded parent is flagged as a domain.
        my $topLevel = 1;
        if (defined $container) {
            $topLevel = 0;
            $self->PutR(IsGroupContaining => $container, $group);
        }
        $self->PutE(TaxonomicGrouping => $group, domain => $topLevel);
    }
}
154 :    
155 :    
156 :     =head3 PlaceGenome
157 :    
158 :     $sl->PlaceGenome($genomeID);
159 :    
160 :     Generate the data for a specific genome. This method generates data for
161 :     the Genome, IsMadeUpOf, DnaSequence and DnaSequenceBases
162 :     tables.
163 :    
164 :     =over 4
165 :    
166 :     =item genomeID
167 :    
168 :     ID of the genome whose data is to be generated.
169 :    
170 :     =back
171 :    
172 :     =cut
173 :    
# Emit every load record that belongs to one genome: the Genome record
# itself, then a DnaSequence, IsMadeUpOf and DnaSequenceBases record for each
# of its contigs.
#
# Parameters:
#   $genomeID - ID of the genome whose data is to be generated.
sub PlaceGenome {
    my ($self, $genomeID) = @_;
    # The source object is reached through the database object.
    my $sapling = $self->db();
    my $fig = $sapling->GetSourceObject();
    # Collect the genome-level attributes from the source object.
    my %attr;
    $attr{'complete'}  = $fig->is_complete($genomeID);
    $attr{'dna-size'}  = $fig->genome_szdna($genomeID);
    $attr{'domain'}    = $fig->genome_domain($genomeID);
    $attr{'full-name'} = $fig->genus_species($genomeID);
    $attr{'pegs'}      = $fig->genome_pegs($genomeID);
    $attr{'rnas'}      = $fig->genome_rnas($genomeID);
    # When the source reports no (or a false) version, the genome ID is used.
    $attr{'version'}   = $fig->genome_version($genomeID) || $genomeID;
    # The contig count is derived from the list of contig IDs.
    my @contigNames = $fig->contigs_of($genomeID);
    $attr{'contigs'} = scalar(@contigNames);
    # Write the genome record with its fields in a fixed order.
    my @fieldOrder = ('complete', 'contigs', 'dna-size', 'domain',
                      'full-name', 'pegs', 'rnas', 'version');
    $self->PutE(Genome => $genomeID, map { ($_, $attr{$_}) } @fieldOrder);
    # Each contig becomes one DNA sequence.
    foreach my $contigName (@contigNames) {
        $self->Track(Contigs => $contigName, 100);
        my $span = $fig->contig_ln($genomeID, $contigName);
        # Prefix the contig ID with the genome ID so that sequence IDs stay
        # unique across genomes.
        my $sequenceID = join(':', $genomeID, $contigName);
        $self->PutE(DnaSequence => $sequenceID, length => $span);
        $self->PutR(IsMadeUpOf => $genomeID, $sequenceID);
        # Pull the whole contig (positions 1 through its length) in one
        # request, count its letters, and store the bases.
        my $bases = $fig->get_dna($genomeID, $contigName, 1, $span);
        $self->Add('dna-letters' => length($bases));
        $self->PutE(DnaSequenceBases => $sequenceID, bases => $bases);
    }
}
213 :    
214 :    
215 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3