[Bio] / Sprout / GenomeSaplingLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/GenomeSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package GenomeSaplingLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use base 'BaseSaplingLoader';
26 :    
27 :     =head1 Sapling Genome Load Group Class
28 :    
29 :     =head2 Introduction
30 :    
31 :     The Load Group includes all of the major genome-related tables.
32 :    
33 :     =head3 new
34 :    
35 :     my $sl = GenomeSaplingLoader->new($erdb, $source, $options, @tables);
36 :    
37 :     Construct a new GenomeSaplingLoader object.
38 :    
39 :     =over 4
40 :    
41 :     =item erdb
42 :    
43 : parrello 1.3 L<Sapling> object for the database being loaded.
44 : parrello 1.1
45 :     =item options
46 :    
47 :     Reference to a hash of command-line options.
48 :    
49 :     =item tables
50 :    
51 :     List of tables in this load group.
52 :    
53 :     =back
54 :    
55 :     =cut
56 :    
57 :     sub new {
58 :     # Get the parameters.
59 :     my ($class, $erdb, $options) = @_;
60 :     # Create the table list.
61 : parrello 1.2 my @tables = sort qw(GenomeSet IsMadeUpOf IsCollectionOf Genome IsTaxonomyOf TaxonomicGrouping
62 :     TaxonomicGroupingAlias IsGroupFor Contig HasSection DNASequence);
63 : parrello 1.1 # Create the BaseSaplingLoader object.
64 :     my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
65 :     # Return it.
66 :     return $retVal;
67 :     }
68 :    
69 :     =head2 Public Methods
70 :    
71 :     =head3 Generate
72 :    
73 :     $sl->Generate();
74 :    
75 :     Generate the data for the genome-related files.
76 :    
77 :     =cut
78 :    
79 :     sub Generate {
80 :     # Get the parameters.
81 :     my ($self) = @_;
82 :     # Process according to the type of section.
83 :     if ($self->global()) {
84 :     # This is the global section. Create the taxonomic hierarchy.
85 :     $self->CreateTaxonomies();
86 : parrello 1.2 # Create the genome sets.
87 :     $self->CreateGenomeSets();
88 : parrello 1.1 } else {
89 :     # Get the section ID.
90 :     my $genomeID = $self->section();
91 :     # This is a genome section. Create the data for the genome.
92 :     $self->PlaceGenome($genomeID);
93 :     }
94 :     }
95 :    
96 : parrello 1.2 =head3 CreateGenomeSets
97 :    
98 :     $sl->CreateGenomeSets();
99 :    
100 :     Generate the genome sets. This includes the GenomeSet and IsCollectionOf
101 :     tables.
102 :    
103 :     =cut
104 :    
105 :     sub CreateGenomeSets {
106 :     # Get the parameters.
107 :     my ($self) = @_;
108 :     # Get the genome hash. Only genomes in this hash will be put into a set.
109 :     my $sapling = $self->db();
110 :     my $genomeHash = $sapling->GenomeHash();
111 :     # We'll track genome set names in here. The set name is the most common
112 :     # genus in the set with an optional number for uniqueness.
113 :     my %setNames;
114 :     # Get the genome set file.
115 :     my $ih = Open(undef, "<$FIG_Config::global/genome.sets");
116 :     # We will accumulate set data and output a set at the end of each set group.
117 :     # This will be a list of genome IDs for the set.
118 :     my @genomes;
119 :     # This will contain the genus counts.
120 :     my %names;
121 :     # This will be the set ID number.
122 :     my $setID;
123 :     # Loop through the set file.
124 :     while (! eof $ih) {
125 :     # Get the next record.
126 :     $self->Add("set-records" => 1);
127 :     my ($newSetID, $genomeID, $name) = Tracer::GetLine($ih);
128 :     # Is this a new set?
129 :     if ($newSetID != $setID) {
130 :     # Yes. Output the old set.
131 :     $self->OutputGenomeSet(\%names, \%setNames, \@genomes);
132 :     # Clear the set data.
133 :     %names = ();
134 :     @genomes = ();
135 :     # Save the new set ID.
136 :     $setID = $newSetID;
137 :     }
138 :     # Only proceed if this is one of our genomes.
139 :     if ($genomeHash->{$genomeID}) {
140 :     $self->Add("set-genomes" => 1);
141 :     # Save the genome ID.
142 :     push @genomes, $genomeID;
143 :     # Remember it as the representative if it's the first in the set.
144 :     # Count the genus.
145 :     my ($genus) = split /\s/, $name, 2;
146 :     $names{$genus}++;
147 :     }
148 :     }
149 :     # Close the input file.
150 :     close $ih;
151 :     # Output the last set.
152 :     $self->OutputGenomeSet(\%names, \%setNames, \@genomes);
153 :     }
154 :    
155 :     =head3 OutputGenomeSet
156 :    
157 :     $sl->OutputGenomeSet(\%names, \%setNames, \@genomes);
158 :    
159 :     Output the data for a genome set. A name will be computed from the genus
160 :     information and the appropriate GenomeSet and IsCollectionOf records will
161 :     be generated for the genomes in the set.
162 :    
163 :     =over 4
164 :    
165 :     =item names
166 :    
167 :     Reference to a hash of the genus names used in the set. The hash maps each name
168 :     to the number of times it appeared.
169 :    
170 :     =item setNames
171 :    
172 :     Reference to a hash of set names already used. This hash will be updated to include
173 :     the set name chosen.
174 :    
175 :     =item genomes
176 :    
177 :     Reference to a list of the IDs for the genomes in the set.
178 :    
179 :     =back
180 :    
181 :     =cut
182 :    
183 :     sub OutputGenomeSet {
184 :     # Get the parameters.
185 :     my ($self, $names, $setNames, $genomes) = @_;
186 :     # Only proceed if there is at least one genome.
187 :     my $count = scalar @$genomes;
188 :     if ($count) {
189 :     # First we compute the name. Sort the genus names by occurrence count.
190 :     my ($setName) = sort { $names->{$a} <=> $names->{$b} } keys %$names;
191 :     # Apply a suffix to make it unique.
192 :     my $i = 1;
193 :     $i++ while $setNames->{"$setName/$i"};
194 :     $setName .= "/$i";
195 :     # Insure we don't reuse this set name.
196 :     $setNames->{$setName} = 1;
197 :     # Create the set record.
198 :     $self->PutE(GenomeSet => $setName);
199 :     # This will be TRUE for the first genome and FALSE thereafter, insuring that
200 :     # the first genome is used for the representative.
201 :     my $repFlag = 1;
202 :     # Connect all the genomes to it.
203 :     for my $genome (@$genomes) {
204 :     $self->PutR(IsCollectionOf => $setName, $genome, representative => $repFlag);
205 :     $repFlag = 0;
206 :     }
207 :     }
208 :     }
209 :    
210 :    
211 : parrello 1.1 =head3 CreateTaxonomies
212 :    
213 :     $sl->CreateTaxonomies();
214 :    
215 :     Generate the taxonomy hierarchy. This includes the TaxonomicGrouping,
216 : parrello 1.2 IsGroupFor, TaxonomicGroupingAlias, and IsTaxonomyOf relationships. The
217 :     taxonomy hierarchy is computed from the NCBI taxonomy dump.
218 : parrello 1.1
219 :     =cut
220 :    
221 :     sub CreateTaxonomies {
222 :     # Get the parameters.
223 :     my ($self) = @_;
224 :     # Get the Sapling object.
225 :     my $sapling = $self->db();
226 : parrello 1.2 # Get the name of the taxonomy dump directory.
227 :     my $taxDir = "$FIG_Config::global/Taxonomy";
228 :     # The first step is to read in all the names. We will build a hash that maps
229 :     # each taxonomy ID to a list of its names. The first scientific name encountered
230 :     # will be saved as the primary name. Only scientific names, synonoyms, and
231 :     # equivalent names will be kept.
232 :     my (%nameLists, %primaryNames);
233 :     my $ih = Open(undef, "<$taxDir/names.dmp");
234 :     while (! eof $ih) {
235 :     # Get the next name.
236 :     my ($taxID, $name, undef, $type) = GetTaxData($ih);
237 :     $self->Add('taxnames-in' => 1);
238 :     # Is this a scientific name?
239 :     if ($type =~ /scientific/i) {
240 :     # Yes. Save it if it is the first for this ID.
241 :     if (! exists $primaryNames{$taxID}) {
242 :     $primaryNames{$taxID} = $name;
243 :     }
244 :     # Add it to the name list.
245 :     push @{$nameLists{$taxID}}, $name;
246 :     $self->Add('taxnames-scientific' => 1);
247 :     } elsif ($type =~ /synonym|equivalent/i) {
248 :     # Here it's not scientific, but it's generally useful, so we keep it.
249 :     push @{$nameLists{$taxID}}, $name;
250 :     $self->Add('taxnames-other' => 1);
251 :     }
252 :     }
253 :     # Now we read in the taxonomy nodes. For each node, we generate a TaxonomicGrouping
254 :     # record, and we connect it to its parent using IsGroupFor.
255 :     close $ih;
256 :     $ih = Open(undef, "<$taxDir/nodes.dmp");
257 :     while (! eof $ih) {
258 :     # Get the data for this group.
259 :     my ($taxID, $parent, $type, undef, undef,
260 :     undef, undef, undef, undef, undef, $hidden) = GetTaxData($ih);
261 : parrello 1.4 # Determine whether or not this is a domain group. A domain group is
262 :     # terminal when doing taxonomy searches. The NCBI indicates a top-level
263 :     # node by making it a child of the root node 1. We also include
264 :     # super-kingdoms (archaea, eukaryota, bacteria), which are below cellular
265 :     # organisms but are still terminal in our book.
266 :     my $domain = ($type eq 'superkingdom') || ($parent == 1);
267 : parrello 1.2 # Get the node's name.
268 :     my $name = $primaryNames{$taxID};
269 :     # It's an error if there's no name.
270 :     Confess("No name found for tax ID $taxID.") if ! $name;
271 :     # Create the taxonomy group record.
272 :     $self->PutE(TaxonomicGrouping => $taxID, domain => $domain, hidden => $hidden,
273 :     scientific_name => $name);
274 :     # Create the alias records.
275 :     for my $alias (@{$nameLists{$taxID}}) {
276 :     $self->PutE(TaxonomicGroupingAlias => $taxID, alias => $alias);
277 : parrello 1.1 }
278 : parrello 1.2 # Connect the group to its parent.
279 :     $self->PutR(IsGroupFor => $parent, $taxID);
280 :     }
281 :     # Now we need to connect each genome to its taxonomic grouping.
282 :     # Get the genome hash. This gives us our list of genome IDs.
283 :     my $genomeHash = $sapling->GenomeHash();
284 :     # Loop through the genomes.
285 :     for my $genomeID (keys %$genomeHash) {
286 :     # Get this genome's taxonomic group.
287 :     my ($taxID) = split /\./, $genomeID, 2;
288 :     # Connect the genome and the group.
289 :     $self->PutR(IsTaxonomyOf => $taxID, $genomeID);
290 : parrello 1.1 }
291 :     }
292 :    
293 :    
294 :     =head3 PlaceGenome
295 :    
296 :     $sl->PlaceGenome($genomeID);
297 :    
298 :     Generate the data for a specific genome. This method generates data for
299 : parrello 1.2 the Genome, IsMadeUpOf, Contig, HasSection, and DNASequence tables.
300 : parrello 1.1
301 :     =over 4
302 :    
303 :     =item genomeID
304 :    
305 :     ID of the genome whose data is to be generated.
306 :    
307 :     =back
308 :    
309 :     =cut
310 :    
311 :     sub PlaceGenome {
312 :     # Get the parameters.
313 :     my ($self, $genomeID) = @_;
314 :     # Get the Sapling object.
315 :     my $sapling = $self->db();
316 :     # Get the source object.
317 :     my $fig = $sapling->GetSourceObject();
318 :     # We start with the genome record itself, asking the FIG object
319 :     # for its various properties.
320 : parrello 1.2 my $scientific_name = $fig->genus_species($genomeID);
321 : parrello 1.1 my $complete = $fig->is_complete($genomeID);
322 :     my $dna_size = $fig->genome_szdna($genomeID);
323 :     my $pegs = $fig->genome_pegs($genomeID);
324 :     my $rnas = $fig->genome_rnas($genomeID);
325 : parrello 1.5 my $domain = $fig->genome_domain($genomeID);
326 :     my $prokaryotic = ($domain =~ /bacter|archae/i);
327 : parrello 1.1 # We need to compute the number of contigs from the list of contig IDs.
328 :     my @contigIDs = $fig->contigs_of($genomeID);
329 :     my $contigs = scalar(@contigIDs);
330 :     # Write the genome record.
331 :     $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,
332 : parrello 1.2 dna_size => $dna_size, scientific_name => $scientific_name,
333 : parrello 1.5 pegs => $pegs, rnas => $rnas, prokaryotic => $prokaryotic,
334 :     domain => $domain);
335 : parrello 1.2 # Now we create the Contigs. Each one needs to be split into DNA sequences.
336 : parrello 1.1 for my $contigID (@contigIDs) {
337 :     $self->Track(Contigs => $contigID, 100);
338 :     # Get the contig length.
339 :     my $length = $fig->contig_ln($genomeID, $contigID);
340 :     # Generate the contig record. Note that the contig ID includes
341 :     # the genome ID as a prefix. Otherwise, it would be non-unique.
342 :     my $realContigID = "$genomeID:$contigID";
343 : parrello 1.2 $self->PutE(Contig => $realContigID, length => $length);
344 : parrello 1.1 $self->PutR(IsMadeUpOf => $genomeID, $realContigID);
345 : parrello 1.2 # Now we loop through the DNA chunks.
346 :     my $loc = 1;
347 :     my $ordinal = 0;
348 :     my $segmentLength = $sapling->TuningParameter('maxSequenceLength');
349 :     while ($loc < $length) {
350 :     # Get this segment's true length.
351 :     my $trueLength = Tracer::Min($length - $loc, $segmentLength);
352 :     # Compute the index of this segment's last base pair.
353 :     my $endPoint = $loc + $trueLength - 1;
354 :     # Get the DNA.
355 :     my $chunkDNA = $fig->get_dna($genomeID, $contigID, $loc, $endPoint);
356 :     # Create its sequence record.
357 :     my $paddedOrdinal = Tracer::Pad($ordinal, 7, 1, '0');
358 :     my $seqID = "$realContigID:$paddedOrdinal";
359 :     $self->PutE(DNASequence => $seqID, sequence => $chunkDNA);
360 :     $self->Add('dna-letters' => $trueLength);
361 :     # Connect it to the contig.
362 :     $self->PutR(HasSection => $realContigID, $seqID);
363 :     # Move to the next section of the contig.
364 :     $loc = $endPoint + 1;
365 :     $ordinal++;
366 :     }
367 :     }
368 :     }
369 :    
370 :     =head3 GetTaxData
371 :    
372 :     my @fields = GenomeSaplingLoader::GetTaxData($ih);
373 :    
374 :     Read a taxonomy dump record and return its fields in a list. Taxonomy
375 :     dump records end in a tab-bar-newline sequence, and fields are separated
376 :     by a tab-bar-tab sequence, a more complex arrangement than is used in
377 :     standard tab-delimited files.
378 :    
379 :     =over 4
380 :    
381 :     =item ih
382 :    
383 :     Open input handle for the taxonomy dump file.
384 :    
385 :     =item RETURN
386 :    
387 :     Returns a list of the fields in the record read.
388 :    
389 :     =back
390 :    
391 :     =cut
392 :    
393 :     sub GetTaxData {
394 :     # Get the parameters.
395 :     my ($ih) = @_;
396 :     # Temporarily change the end-of-record character.
397 :     local $/ = "\t|\n";
398 :     # Read the next record.
399 :     my $line = <$ih>;
400 :     # Chop off the end, if any.
401 :     if ($line =~ /(.+)\t\|\n$/) {
402 :     $line = $1;
403 : parrello 1.1 }
404 : parrello 1.2 # Split the line into fields.
405 :     my @retVal = split /\t\|\t/, $line;
406 :     # Return the result.
407 :     return @retVal;
408 : parrello 1.1 }
409 :    
410 :    
411 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3