[Bio] / Sprout / SaplingTaxonomyLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingTaxonomyLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingTaxonomyLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 :     use base qw(SaplingDataLoader);
29 :    
30 :     =head1 Sapling Taxonomy Loader
31 :    
32 :     This class loads taxonomy information into the Sapling database. This is global information
33 :     that involves deleting and recreating entire tables. In particular, the following tables
34 :     will be truncated and rebuilt.
35 :    
36 :     The taxonomy process involves relationships with genomes. Only genomes currently in the
37 :     database will be connected to the taxonomy groups and genome sets; after new genomes are
38 :     added this process needs to be rerun or the genomes need to be connected manually.
39 :    
40 :     =over 4
41 :    
42 :     =item GenomeSet
43 :    
44 :     OTU sets
45 :    
46 :     =item IsCollectionOf
47 :    
48 :     relationship from GenomeSet to Genome
49 :    
50 :     =item TaxonomicGrouping
51 :    
52 :     organism classifications
53 :    
54 :     =item IsGroupFor
55 :    
56 :     relationship that organizes taxonomic groupings into a hierarchy
57 :    
58 :     =item IsTaxonomyOf
59 :    
60 :     relationship that connects a genome to its parent in the taxonomy tree
61 :    
62 :     =back
63 :    
64 :     =head2 Main Methods
65 :    
66 :     =head3 ClearTaxonomies
67 :    
68 :     my $stats = SaplingTaxonomyLoader::ClearTaxonomies($sap);
69 :    
70 :     Delete the taxonomy information from the database. This involves removing entire tables,
71 :     since the information is global. As a result, the statistics are fairly uninformative.
72 :    
73 :     =over 4
74 :    
75 :     =item sap
76 :    
77 :     L<Sapling> object used to access the target database.
78 :    
79 :     =item RETURN
80 :    
81 :     Returns a L<Stats> object describing the events during the operation.
82 :    
83 :     =back
84 :    
85 :     =cut
86 :    
87 :     sub ClearTaxonomies {
88 :     # Get the parameters.
89 :     my ($sap) = @_;
90 :     # Create the return object.
91 :     my $retVal = Stats->new('tables');
92 :     # Create a list of the tables to clear.
93 :     my @tables = qw(GenomeSet IsCollectionOf TaxonomicGrouping IsGroupFor IsTaxonomyOf);
94 :     # Loop through the list, truncating the tables.
95 :     for my $table (@tables) {
96 :     $sap->TruncateTable($table);
97 :     $retVal->Add(tables => 1);
98 :     }
99 :     # Return the statistics.
100 :     return $retVal;
101 :     }
102 :    
103 :     =head3 LoadTaxonomies
104 :    
105 :     my $stats = SaplingTaxonomyLoader::LoadTaxonomies($sap, $dumpDirectory, $setFile);
106 :    
107 :     Load the taxonomic information from the specified files.
108 :    
109 :     =over 4
110 :    
111 :     =item sap
112 :    
113 :     L<Sapling> object used to access the target database.
114 :    
115 :     =item dumpDirectory
116 :    
117 :     Name of the directory containing the taxonomic dump files from the NCBI.
118 :    
119 : parrello 1.4 =item setFile (optional)
120 : parrello 1.1
121 : parrello 1.4 Name of the file containing the OTU information. If omitted, the OTU information is not loaded.
122 : parrello 1.1
123 :     =item RETURN
124 :    
125 :     Returns a L<Stats> object describing the events during the operation.
126 :    
127 :     =back
128 :    
129 :     =cut
130 :    
131 :     sub LoadTaxonomies {
132 :     # Get the parameters.
133 :     my ($sap, $dumpDirectory, $setFile) = @_;
134 :     # Create the helper object.
135 :     my $loaderObject = SaplingTaxonomyLoader->new($sap);
136 :     # Load the OTUs.
137 : parrello 1.4 if ($setFile) {
138 :     $loaderObject->ReadGenomeSets($setFile);
139 :     }
140 : parrello 1.1 # Load the taxonomy data.
141 :     $loaderObject->ReadTaxonomies($dumpDirectory);
142 :     # Return the statistics.
143 :     return $loaderObject->{stats};
144 :     }
145 :    
146 :     =head3 Process
147 :    
148 :     my $stats = SaplingTaxonomyLoader::Process($sap, $dumpDirectory, $setFile);
149 :    
150 :     Load taxonomy data from the specified directory and OTU set file. If the taxonomy data
151 :     already exists, it will be deleted first.
152 :    
153 :     =over 4
154 :    
155 :     =item sap
156 :    
157 :     L</Sapling> object for accessing the database.
158 :    
159 :     =item dumpDirectory
160 :    
161 :     name of the directory containing the taxonomy data.
162 :    
163 :     =item setFile
164 :    
165 :     name of the file describing the OTUs
166 :    
167 :     =item RETURN
168 :    
169 :     Returns a statistics object describing the activity during the reload.
170 :    
171 :     =back
172 :    
173 :     =cut
174 :    
175 :     sub Process {
176 :     # Get the parameters.
177 :     my ($sap, $dumpDirectory, $setFile) = @_;
178 :     # Clear the existing taxonomy data.
179 :     my $stats = ClearTaxonomies($sap);
180 :     # Load the new taxonomy data from the specified directory and OTU file.
181 :     my $newStats = LoadTaxonomies($sap, $dumpDirectory, $setFile);
182 :     # Merge the statistics.
183 :     $stats->Accumulate($newStats);
184 :     # Return the result.
185 :     return $stats;
186 :     }
187 :    
188 :    
189 :     =head2 Loader Object Methods
190 :    
191 :     =head3 new
192 :    
193 :     my $loaderObject = SaplingTaxonomyLoader->new($sap);
194 :    
195 :     Create a loader object that can be used to facilitate loading taxonomy information.
196 :    
197 :     =over 4
198 :    
199 :     =item sap
200 :    
201 :     L<Sapling> object used to access the target database.
202 :    
203 :     =back
204 :    
205 :     The object created contains the following fields.
206 :    
207 :     =over 4
208 :    
209 :     =item supportRecords
210 :    
211 :     A hash of hashes, used to track the support records known to exist in the database.
212 :    
213 :     =item sap
214 :    
215 :     L<Sapling> object used to access the database.
216 :    
217 :     =item stats
218 :    
219 :     L<Stats> object for tracking statistical information about the load.
220 :    
221 :     =item gHash
222 :    
223 :     Reference to a hash containing the IDs of all the genomes in the database.
224 :    
225 :     =back
226 :    
227 :     =cut
228 :    
229 :     sub new {
230 :     # Get the parameters.
231 :     my ($class, $sap) = @_;
232 :     # Create the object.
233 :     my $retVal = SaplingDataLoader::new($class, $sap, qw(groups sets));
234 :     # Create the hash of genome IDs.
235 :     my %gHash = map { $_ => 1 } $sap->GetFlat('Genome', "", [], 'id');
236 :     # Store it in the object.
237 :     $retVal->{gHash} = \%gHash;
238 :     # Return the result.
239 :     return $retVal;
240 :     }
241 :    
242 :     =head2 Internal Utility Methods
243 :    
244 :     =head3 ReadGenomeSets
245 :    
246 :     $loaderObject->ReadGenomeSets($fileName);
247 :    
248 :     Read the OTU data from the specified set file and load it into the database.
249 :    
250 :     =over 4
251 :    
252 :     =item fileName
253 :    
254 :     Name of the file containing the OTU data. The file is tab-delimited, and each record consists of
255 :     a genome set number followed by a genome ID. The first genome in each set is the representative
256 :     genome.
257 :    
258 :     =back
259 :    
260 :     =cut
261 :    
262 :     sub ReadGenomeSets {
263 :     # Get the parameters.
264 :     my ($self, $fileName) = @_;
265 :     # Get the statistics object.
266 :     my $stats = $self->{stats};
267 :     # Get the Sapling database.
268 :     my $sap = $self->{sap};
269 :     # Get the genome hash.
270 :     my $gHash = $self->{gHash};
271 :     # Open the set file.
272 :     my $ih = Open(undef, "<$fileName");
273 :     # This will be the ID of the current genome set.
274 :     my $currentSet = "";
275 :     # This will be the number of genomes stored in the current set.
276 :     my $storedInSet = 0;
277 :     # Loop through the file records, creating the sets and links.
278 :     while (! eof $ih) {
279 :     # Get this record.
280 :     my ($setID, $genome) = Tracer::GetLine($ih);
281 :     # Is this a new set?
282 :     if ($setID ne $currentSet) {
283 :     # Yes. Create the set record.
284 :     $sap->InsertObject('GenomeSet', id => $setID);
285 :     # Denote no genomes in this set have been stored.
286 :     $storedInSet = 0;
287 :     # Record that we have a new set.
288 :     $stats->Add(sets => 1);
289 :     $currentSet = $setID;
290 :     }
291 :     # Is this a genome we're keeping?
292 :     if (! $gHash->{$genome}) {
293 :     $stats->Add(setGenomeSkipped => 1);
294 :     } else {
295 :     # Yes. If this is the first one for this set, make it the representative.
296 :     my $repFlag = ($storedInSet ? 0 : 1);
297 :     # Connect the genome to the set.
298 :     $sap->InsertObject('IsCollectionOf', from_link => $setID, to_link => $genome,
299 :     representative => $repFlag);
300 :     # Record that we've stored this genome.
301 :     $stats->Add(genomesInSets => 1);
302 :     $storedInSet++;
303 :     }
304 :     }
305 :     }
306 :    
307 :    
308 :     =head3 ReadTaxonomies
309 :    
310 :     $loaderObject->ReadTaxonomies($directoryName);
311 :    
312 :     Read the NCBI taxonomy groupings from the specified directory.
313 :    
314 :     =over 4
315 :    
316 :     =item directoryName
317 :    
318 :     Name of the directory containing the NCBI taxonomy dump files.
319 :    
320 :     =back
321 :    
322 :     =cut
323 :    
324 :     sub ReadTaxonomies {
325 :     # Get the parameters.
326 :     my ($self, $directoryName) = @_;
327 :     # Get the statistics object.
328 :     my $stats = $self->{stats};
329 :     # Get the Sapling database.
330 :     my $sap = $self->{sap};
331 :     # Get the hash of genomes in the database.
332 :     my $gHash = $self->{gHash};
333 :     # The first step is to read in all the names. We will build a hash that maps
334 :     # each taxonomy ID to a list of its names. The first scientific name encountered
335 :     # will be saved as the primary name. Only scientific names, synonoyms, and
336 :     # equivalent names will be kept.
337 :     my (%nameLists, %primaryNames);
338 :     my $ih = Open(undef, "<$directoryName/names.dmp");
339 :     while (! eof $ih) {
340 :     # Get the next name.
341 :     my ($taxID, $name, undef, $type) = $self->GetTaxData($ih);
342 : parrello 1.2 $stats->Add('taxnames-in' => 1);
343 : parrello 1.1 # Is this a scientific name?
344 :     if ($type =~ /scientific/i) {
345 :     # Yes. Save it if it is the first for this ID.
346 :     if (! exists $primaryNames{$taxID}) {
347 :     $primaryNames{$taxID} = $name;
348 :     }
349 :     # Add it to the name list.
350 :     push @{$nameLists{$taxID}}, $name;
351 : parrello 1.2 $stats->Add('taxnames-scientific' => 1);
352 : parrello 1.1 } elsif ($type =~ /synonym|equivalent/i) {
353 :     # Here it's not scientific, but it's generally useful, so we keep it.
354 :     push @{$nameLists{$taxID}}, $name;
355 : parrello 1.2 $stats->Add('taxnames-other' => 1);
356 : parrello 1.1 }
357 :     }
358 :     # Now we read in the taxonomy nodes. For each node, we generate a TaxonomicGrouping
359 :     # record, and we connect it to its parent using IsGroupFor. We also keep the node ID
360 :     # for later so we know what's available.
361 :     close $ih;
362 :     $ih = Open(undef, "<$directoryName/nodes.dmp");
363 :     while (! eof $ih) {
364 :     # Get the data for this group.
365 :     my ($taxID, $parent, $type, undef, undef,
366 :     undef, undef, undef, undef, undef, $hidden) = $self->GetTaxData($ih);
367 :     # Determine whether or not this is a domain group. A domain group is
368 :     # terminal when doing taxonomy searches. The NCBI indicates a top-level
369 :     # node by making it a child of the root node 1. We also include
370 :     # super-kingdoms (archaea, eukaryota, bacteria), which are below cellular
371 :     # organisms but are still terminal in our book.
372 :     my $domain = ($type eq 'superkingdom') || ($parent == 1);
373 :     # Get the node's name.
374 :     my $name = $primaryNames{$taxID};
375 :     # It's an error if there's no name.
376 :     Confess("No name found for tax ID $taxID.") if ! $name;
377 :     # Create the taxonomy group record.
378 :     $sap->InsertObject('TaxonomicGrouping', id => $taxID, domain => $domain, hidden => $hidden,
379 :     scientific_name => $name);
380 :     # Create the aliases.
381 :     for my $alias (@{$nameLists{$taxID}}) {
382 :     $sap->InsertValue($taxID, 'TaxonomicGrouping(alias)', $alias);
383 :     }
384 :     # Connect the group to its parent.
385 :     $sap->InsertObject('IsGroupFor', from_link => $parent, to_link => $taxID);
386 :     }
387 :     # Read in the merge file. The merge file tells us which old IDs are mapped to
388 :     # new IDs. We need this to connect genomes with old IDs to the correct group.
389 :     my %merges;
390 :     $ih = Open(undef, "<$directoryName/merged.dmp");
391 :     while (! eof $ih) {
392 :     # Get this merge record.
393 :     my ($oldID, $newID) = $self->GetTaxData($ih);
394 :     # Store it in the hash.
395 :     $merges{$oldID} = $newID;
396 :     }
397 :     # Now we need to connect each genome to its taxonomic grouping.
398 :     # Loop through the genomes.
399 :     for my $genomeID (keys %$gHash) {
400 :     # Get this genome's taxonomic group.
401 :     my ($taxID) = split /\./, $genomeID, 2;
402 :     # Check to see if we have this tax ID. If we don't, we check for a merge.
403 :     if (! $primaryNames{$taxID}) {
404 :     if ($merges{$taxID}) {
405 :     $taxID = $merges{$taxID};
406 : parrello 1.2 $stats->Add('merged-names' => 1);
407 : parrello 1.3 Trace("$genomeID has alternate taxonomy ID $taxID.") if T(SaplingDataLoader => 2);
408 : parrello 1.1 } else {
409 :     $taxID = undef;
410 : parrello 1.2 $stats->Add('missing-groups' => 1);
411 : parrello 1.3 Trace("$genomeID has no taxonomy group.") if T(SaplingDataLoader => 1);
412 : parrello 1.1 }
413 :     }
414 :     # Connect the genome and the group if the group is real.
415 :     if (defined $taxID) {
416 :     $sap->InsertObject('IsTaxonomyOf', from_link => $taxID, to_link => $genomeID);
417 :     }
418 :     }
419 :     }
420 :    
421 :    
422 :     =head3 GetTaxData
423 :    
424 :     my @fields = $loaderObject->GetTaxData($ih);
425 :    
426 :     Read a taxonomy dump record and return its fields in a list. Taxonomy
427 :     dump records end in a tab-bar-newline sequence, and fields are separated
428 :     by a tab-bar-tab sequence, a more complex arrangement than is used in
429 :     standard tab-delimited files.
430 :    
431 :     =over 4
432 :    
433 :     =item ih
434 :    
435 :     Open input handle for the taxonomy dump file.
436 :    
437 :     =item RETURN
438 :    
439 :     Returns a list of the fields in the record read.
440 :    
441 :     =back
442 :    
443 :     =cut
444 :    
445 :     sub GetTaxData {
446 :     # Get the parameters.
447 :     my ($self, $ih) = @_;
448 :     # Get the statistics object.
449 :     my $stats = $self->{stats};
450 :     # Temporarily change the end-of-record character.
451 :     local $/ = "\t|\n";
452 :     # Read the next record.
453 :     my $line = <$ih>;
454 :     $stats->Add(taxDumpRecords => 1);
455 :     # Chop off the end, if any.
456 :     if ($line =~ /(.+)\t\|\n$/) {
457 :     $line = $1;
458 :     }
459 :     # Split the line into fields.
460 :     my @retVal = split /\t\|\t/, $line;
461 :     $stats->Add(taxDumpFields => scalar(@retVal));
462 :     # Return the result.
463 :     return @retVal;
464 :     }
465 :    
466 :    
467 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3