[Bio] / Sprout / GenomeSaplingLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/GenomeSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.16 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package GenomeSaplingLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 : parrello 1.13 use MD5Computer;
26 : parrello 1.1 use base 'BaseSaplingLoader';
27 :    
28 :     =head1 Sapling Genome Load Group Class
29 :    
30 :     =head2 Introduction
31 :    
32 :     The Load Group includes all of the major genome-related tables.
33 :    
34 :     =head3 new
35 :    
36 :     my $sl = GenomeSaplingLoader->new($erdb, $source, $options, @tables);
37 :    
38 :     Construct a new GenomeSaplingLoader object.
39 :    
40 :     =over 4
41 :    
42 :     =item erdb
43 :    
44 : parrello 1.3 L<Sapling> object for the database being loaded.
45 : parrello 1.1
46 :     =item options
47 :    
48 :     Reference to a hash of command-line options.
49 :    
50 :     =item tables
51 :    
52 :     List of tables in this load group.
53 :    
54 :     =back
55 :    
56 :     =cut
57 :    
58 :     sub new {
59 :     # Get the parameters.
60 :     my ($class, $erdb, $options) = @_;
61 :     # Create the table list.
62 : parrello 1.2 my @tables = sort qw(GenomeSet IsMadeUpOf IsCollectionOf Genome IsTaxonomyOf TaxonomicGrouping
63 :     TaxonomicGroupingAlias IsGroupFor Contig HasSection DNASequence);
64 : parrello 1.1 # Create the BaseSaplingLoader object.
65 :     my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
66 :     # Return it.
67 :     return $retVal;
68 :     }
69 :    
70 :     =head2 Public Methods
71 :    
72 :     =head3 Generate
73 :    
74 :     $sl->Generate();
75 :    
76 :     Generate the data for the genome-related files.
77 :    
78 :     =cut
79 :    
80 :     sub Generate {
81 :     # Get the parameters.
82 :     my ($self) = @_;
83 :     # Process according to the type of section.
84 :     if ($self->global()) {
85 :     # This is the global section. Create the taxonomic hierarchy.
86 :     $self->CreateTaxonomies();
87 : parrello 1.2 # Create the genome sets.
88 :     $self->CreateGenomeSets();
89 : parrello 1.1 } else {
90 :     # Get the section ID.
91 :     my $genomeID = $self->section();
92 :     # This is a genome section. Create the data for the genome.
93 :     $self->PlaceGenome($genomeID);
94 :     }
95 :     }
96 :    
97 : parrello 1.2 =head3 CreateGenomeSets
98 :    
99 :     $sl->CreateGenomeSets();
100 :    
101 :     Generate the genome sets. This includes the GenomeSet and IsCollectionOf
102 :     tables.
103 :    
104 :     =cut
105 :    
106 :     sub CreateGenomeSets {
107 :     # Get the parameters.
108 :     my ($self) = @_;
109 :     # Get the genome hash. Only genomes in this hash will be put into a set.
110 :     my $sapling = $self->db();
111 :     my $genomeHash = $sapling->GenomeHash();
112 :     # We'll track genome set names in here. The set name is the most common
113 :     # genus in the set with an optional number for uniqueness.
114 :     my %setNames;
115 :     # Get the genome set file.
116 :     my $ih = Open(undef, "<$FIG_Config::global/genome.sets");
117 :     # We will accumulate set data and output a set at the end of each set group.
118 :     # This will be a list of genome IDs for the set.
119 :     my @genomes;
120 :     # This will contain the genus counts.
121 :     my %names;
122 :     # This will be the set ID number.
123 :     my $setID;
124 :     # Loop through the set file.
125 :     while (! eof $ih) {
126 :     # Get the next record.
127 :     $self->Add("set-records" => 1);
128 :     my ($newSetID, $genomeID, $name) = Tracer::GetLine($ih);
129 : parrello 1.7 # Only accept the genome if it's one of ours.
130 : parrello 1.2 if ($genomeHash->{$genomeID}) {
131 : parrello 1.7 # Is this a new set?
132 :     if ($newSetID != $setID) {
133 :     # Yes. Output the old set.
134 :     $self->OutputGenomeSet(\%names, $setID, \@genomes);
135 :     # Clear the set data.
136 :     %names = ();
137 :     @genomes = ();
138 :     # Save the new set ID.
139 :     $setID = $newSetID;
140 :     }
141 :     # Only proceed if this is one of our genomes.
142 :     if ($genomeHash->{$genomeID}) {
143 :     $self->Add("set-genomes" => 1);
144 :     # Save the genome ID.
145 :     push @genomes, $genomeID;
146 :     # Remember it as the representative if it's the first in the set.
147 :     # Count the genus.
148 : parrello 1.16 my ($genus) = split m/\s/, $name, 2;
149 : parrello 1.7 $names{$genus}++;
150 :     }
151 : parrello 1.2 }
152 :     }
153 :     # Close the input file.
154 :     close $ih;
155 :     # Output the last set.
156 : parrello 1.6 $self->OutputGenomeSet(\%names, $setID, \@genomes);
157 : parrello 1.2 }
158 :    
159 :     =head3 OutputGenomeSet
160 :    
161 : parrello 1.6 $sl->OutputGenomeSet(\%names, $setID, \@genomes);
162 : parrello 1.2
163 : parrello 1.6 Output the data for a genome set. The appropriate GenomeSet and IsCollectionOf
164 :     records will be generated for the genomes in the set.
165 : parrello 1.2
166 :     =over 4
167 :    
168 :     =item names
169 :    
170 :     Reference to a hash of the genus names used in the set. The hash maps each name
171 :     to the number of times it appeared.
172 :    
173 : parrello 1.6 =item setID
174 : parrello 1.2
175 : parrello 1.6 The ID to use for this set.
176 : parrello 1.2
177 :     =item genomes
178 :    
179 :     Reference to a list of the IDs for the genomes in the set.
180 :    
181 :     =back
182 :    
183 :     =cut
184 :    
185 :     sub OutputGenomeSet {
186 :     # Get the parameters.
187 : parrello 1.6 my ($self, $names, $setID, $genomes) = @_;
188 : parrello 1.2 # Only proceed if there is at least one genome.
189 :     my $count = scalar @$genomes;
190 :     if ($count) {
191 :     # Create the set record.
192 : parrello 1.6 $self->PutE(GenomeSet => $setID);
193 : parrello 1.2 # This will be TRUE for the first genome and FALSE thereafter, insuring that
194 :     # the first genome is used for the representative.
195 :     my $repFlag = 1;
196 :     # Connect all the genomes to it.
197 :     for my $genome (@$genomes) {
198 : parrello 1.6 $self->PutR(IsCollectionOf => $setID, $genome, representative => $repFlag);
199 : parrello 1.2 $repFlag = 0;
200 :     }
201 :     }
202 :     }
203 :    
204 :    
205 : parrello 1.1 =head3 CreateTaxonomies
206 :    
207 :     $sl->CreateTaxonomies();
208 :    
209 :     Generate the taxonomy hierarchy. This includes the TaxonomicGrouping,
210 : parrello 1.2 IsGroupFor, TaxonomicGroupingAlias, and IsTaxonomyOf relationships. The
211 :     taxonomy hierarchy is computed from the NCBI taxonomy dump.
212 : parrello 1.1
213 :     =cut
214 :    
215 :     sub CreateTaxonomies {
216 :     # Get the parameters.
217 :     my ($self) = @_;
218 :     # Get the Sapling object.
219 :     my $sapling = $self->db();
220 : parrello 1.2 # Get the name of the taxonomy dump directory.
221 : parrello 1.14 my $taxDir = "/homes/parrello/Taxonomy"; # "/vol/biodb/ncbi/taxonomy";
222 : parrello 1.2 # The first step is to read in all the names. We will build a hash that maps
223 :     # each taxonomy ID to a list of its names. The first scientific name encountered
224 :     # will be saved as the primary name. Only scientific names, synonoyms, and
225 :     # equivalent names will be kept.
226 :     my (%nameLists, %primaryNames);
227 :     my $ih = Open(undef, "<$taxDir/names.dmp");
228 :     while (! eof $ih) {
229 :     # Get the next name.
230 :     my ($taxID, $name, undef, $type) = GetTaxData($ih);
231 :     $self->Add('taxnames-in' => 1);
232 :     # Is this a scientific name?
233 :     if ($type =~ /scientific/i) {
234 :     # Yes. Save it if it is the first for this ID.
235 :     if (! exists $primaryNames{$taxID}) {
236 :     $primaryNames{$taxID} = $name;
237 :     }
238 :     # Add it to the name list.
239 :     push @{$nameLists{$taxID}}, $name;
240 :     $self->Add('taxnames-scientific' => 1);
241 :     } elsif ($type =~ /synonym|equivalent/i) {
242 :     # Here it's not scientific, but it's generally useful, so we keep it.
243 :     push @{$nameLists{$taxID}}, $name;
244 :     $self->Add('taxnames-other' => 1);
245 :     }
246 :     }
247 :     # Now we read in the taxonomy nodes. For each node, we generate a TaxonomicGrouping
248 : parrello 1.10 # record, and we connect it to its parent using IsGroupFor. We also keep the node ID
249 :     # for later so we know what's available.
250 : parrello 1.2 close $ih;
251 :     $ih = Open(undef, "<$taxDir/nodes.dmp");
252 :     while (! eof $ih) {
253 :     # Get the data for this group.
254 :     my ($taxID, $parent, $type, undef, undef,
255 :     undef, undef, undef, undef, undef, $hidden) = GetTaxData($ih);
256 : parrello 1.4 # Determine whether or not this is a domain group. A domain group is
257 :     # terminal when doing taxonomy searches. The NCBI indicates a top-level
258 :     # node by making it a child of the root node 1. We also include
259 :     # super-kingdoms (archaea, eukaryota, bacteria), which are below cellular
260 :     # organisms but are still terminal in our book.
261 :     my $domain = ($type eq 'superkingdom') || ($parent == 1);
262 : parrello 1.2 # Get the node's name.
263 :     my $name = $primaryNames{$taxID};
264 :     # It's an error if there's no name.
265 :     Confess("No name found for tax ID $taxID.") if ! $name;
266 :     # Create the taxonomy group record.
267 :     $self->PutE(TaxonomicGrouping => $taxID, domain => $domain, hidden => $hidden,
268 :     scientific_name => $name);
269 :     # Create the alias records.
270 :     for my $alias (@{$nameLists{$taxID}}) {
271 :     $self->PutE(TaxonomicGroupingAlias => $taxID, alias => $alias);
272 : parrello 1.1 }
273 : parrello 1.2 # Connect the group to its parent.
274 :     $self->PutR(IsGroupFor => $parent, $taxID);
275 :     }
276 : parrello 1.10 # Read in the merge file. The merge file tells us which old IDs are mapped to
277 :     # new IDs. We need this to connect genomes with old IDs to the correct group.
278 :     my %merges;
279 :     $ih = Open(undef, "<$taxDir/merged.dmp");
280 :     while (! eof $ih) {
281 :     # Get this merge record.
282 :     my ($oldID, $newID) = GetTaxData($ih);
283 :     # Store it in the hash.
284 :     $merges{$oldID} = $newID;
285 :     }
286 : parrello 1.2 # Now we need to connect each genome to its taxonomic grouping.
287 :     # Get the genome hash. This gives us our list of genome IDs.
288 :     my $genomeHash = $sapling->GenomeHash();
289 :     # Loop through the genomes.
290 :     for my $genomeID (keys %$genomeHash) {
291 :     # Get this genome's taxonomic group.
292 : parrello 1.16 my ($taxID) = split m/\./, $genomeID, 2;
293 : parrello 1.10 # Check to see if we have this tax ID. If we don't, we check for a merge.
294 :     if (! $primaryNames{$taxID}) {
295 :     if ($merges{$taxID}) {
296 :     $taxID = $merges{$taxID};
297 :     $self->Add('merged-names' => 1);
298 : parrello 1.11 Trace("$genomeID has alternate taxonomy ID $taxID.") if T(ERDBLoadGroup => 2);
299 : parrello 1.10 } else {
300 :     $taxID = undef;
301 :     $self->Add('missing-groups' => 1);
302 : parrello 1.11 Trace("$genomeID has no taxonomy group.") if T(ERDBLoadGroup => 1);
303 : parrello 1.10 }
304 :     }
305 :     # Connect the genome and the group if the group is real.
306 :     if (defined $taxID) {
307 :     $self->PutR(IsTaxonomyOf => $taxID, $genomeID);
308 :     }
309 : parrello 1.1 }
310 :     }
311 :    
312 :    
313 :     =head3 PlaceGenome
314 :    
315 :     $sl->PlaceGenome($genomeID);
316 :    
317 :     Generate the data for a specific genome. This method generates data for
318 : parrello 1.2 the Genome, IsMadeUpOf, Contig, HasSection, and DNASequence tables.
319 : parrello 1.1
320 :     =over 4
321 :    
322 :     =item genomeID
323 :    
324 :     ID of the genome whose data is to be generated.
325 :    
326 :     =back
327 :    
328 :     =cut
329 :    
330 :     sub PlaceGenome {
331 :     # Get the parameters.
332 :     my ($self, $genomeID) = @_;
333 :     # Get the Sapling object.
334 :     my $sapling = $self->db();
335 :     # Get the source object.
336 :     my $fig = $sapling->GetSourceObject();
337 : parrello 1.8 # Get the DNA chunk size.
338 :     my $segmentLength = $sapling->TuningParameter('maxSequenceLength');
339 :     Trace("DNA chunk size is $segmentLength.") if T(ERDBLoadGroup => 3);
340 : parrello 1.1 # We start with the genome record itself, asking the FIG object
341 :     # for its various properties.
342 : parrello 1.2 my $scientific_name = $fig->genus_species($genomeID);
343 : parrello 1.1 my $complete = $fig->is_complete($genomeID);
344 :     my $dna_size = $fig->genome_szdna($genomeID);
345 :     my $pegs = $fig->genome_pegs($genomeID);
346 :     my $rnas = $fig->genome_rnas($genomeID);
347 : parrello 1.5 my $domain = $fig->genome_domain($genomeID);
348 :     my $prokaryotic = ($domain =~ /bacter|archae/i);
349 : parrello 1.1 # We need to compute the number of contigs from the list of contig IDs.
350 :     my @contigIDs = $fig->contigs_of($genomeID);
351 :     my $contigs = scalar(@contigIDs);
352 : parrello 1.15 # This will be used to compute the GC content.
353 :     my $gc_count = 0;
354 : parrello 1.8 # Compute the genetic code. Normally, it's 11, but it may be overridden
355 :     # by a GENETIC_CODE file.
356 :     my $gcFile = "$FIG_Config::organisms/$genomeID/GENETIC_CODE";
357 :     my $genetic_code = 11;
358 :     if (-f $gcFile) {
359 :     $genetic_code = Tracer::GetFile($gcFile);
360 :     chomp $genetic_code;
361 :     }
362 : parrello 1.13 # Start a genome descriptor.
363 :     my $genomeMD5Thing = MD5Computer->new();
364 :     # First we create the Contigs. Each one needs to be split into DNA sequences.
365 : parrello 1.1 for my $contigID (@contigIDs) {
366 :     $self->Track(Contigs => $contigID, 100);
367 :     # Get the contig length.
368 :     my $length = $fig->contig_ln($genomeID, $contigID);
369 : parrello 1.13 # Compute the contig ID. Note that the contig ID includes
370 : parrello 1.1 # the genome ID as a prefix. Otherwise, it would be non-unique.
371 :     my $realContigID = "$genomeID:$contigID";
372 : parrello 1.13 # The contig chunks will be gathered in here. At a later point we'll use
373 :     # the chunks to compute the contig's MD5.
374 :     my @dnaChunks;
375 : parrello 1.2 # Now we loop through the DNA chunks.
376 :     my $loc = 1;
377 :     my $ordinal = 0;
378 : parrello 1.9 while ($loc <= $length) {
379 : parrello 1.2 # Get this segment's true length.
380 : parrello 1.9 my $trueLength = Tracer::Min($length + 1 - $loc, $segmentLength);
381 : parrello 1.2 # Compute the index of this segment's last base pair.
382 :     my $endPoint = $loc + $trueLength - 1;
383 :     # Get the DNA.
384 :     my $chunkDNA = $fig->get_dna($genomeID, $contigID, $loc, $endPoint);
385 : parrello 1.13 push @dnaChunks, $chunkDNA;
386 : parrello 1.15 # Count the GC content.
387 :     $gc_count += ($chunkDNA =~ tr/gcGC//);
388 : parrello 1.2 # Create its sequence record.
389 :     my $paddedOrdinal = Tracer::Pad($ordinal, 7, 1, '0');
390 :     my $seqID = "$realContigID:$paddedOrdinal";
391 :     $self->PutE(DNASequence => $seqID, sequence => $chunkDNA);
392 :     $self->Add('dna-letters' => $trueLength);
393 :     # Connect it to the contig.
394 :     $self->PutR(HasSection => $realContigID, $seqID);
395 :     # Move to the next section of the contig.
396 :     $loc = $endPoint + 1;
397 :     $ordinal++;
398 :     }
399 : parrello 1.13 # Compute the contig MD5.
400 :     my $contigMD5 = $genomeMD5Thing->ProcessContig($contigID, \@dnaChunks);
401 :     # Create the contig record.
402 :     $self->PutE(Contig => $realContigID, length => $length,
403 :     md5_identifier => $contigMD5);
404 :     $self->PutR(IsMadeUpOf => $genomeID, $realContigID);
405 : parrello 1.2 }
406 : parrello 1.13 # Compute the genome MD5.
407 :     my $genomeMD5 = $genomeMD5Thing->CloseGenome();
408 :     # Write the genome record.
409 :     $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,
410 :     dna_size => $dna_size, scientific_name => $scientific_name,
411 :     pegs => $pegs, rnas => $rnas, prokaryotic => $prokaryotic,
412 :     domain => $domain, genetic_code => $genetic_code,
413 : parrello 1.15 md5_identifier => $genomeMD5,
414 :     gc_content => ($gc_count * 100 / $dna_size));
415 : parrello 1.2 }
416 :    
417 :     =head3 GetTaxData
418 :    
419 :     my @fields = GenomeSaplingLoader::GetTaxData($ih);
420 :    
421 :     Read a taxonomy dump record and return its fields in a list. Taxonomy
422 :     dump records end in a tab-bar-newline sequence, and fields are separated
423 :     by a tab-bar-tab sequence, a more complex arrangement than is used in
424 :     standard tab-delimited files.
425 :    
426 :     =over 4
427 :    
428 :     =item ih
429 :    
430 :     Open input handle for the taxonomy dump file.
431 :    
432 :     =item RETURN
433 :    
434 :     Returns a list of the fields in the record read.
435 :    
436 :     =back
437 :    
438 :     =cut
439 :    
440 :     sub GetTaxData {
441 :     # Get the parameters.
442 :     my ($ih) = @_;
443 :     # Temporarily change the end-of-record character.
444 :     local $/ = "\t|\n";
445 :     # Read the next record.
446 :     my $line = <$ih>;
447 :     # Chop off the end, if any.
448 :     if ($line =~ /(.+)\t\|\n$/) {
449 :     $line = $1;
450 : parrello 1.1 }
451 : parrello 1.2 # Split the line into fields.
452 :     my @retVal = split /\t\|\t/, $line;
453 :     # Return the result.
454 :     return @retVal;
455 : parrello 1.1 }
456 :    
457 :    
458 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3