[Bio] / Sprout / GenomeSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/GenomeSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Mon Jan 19 21:43:27 2009 UTC revision 1.12, Thu Oct 14 17:27:37 2010 UTC
# Line 40  Line 40 
40    
41  =item erdb  =item erdb
42    
43  [[SaplingPm]] object for the database being loaded.  L<Sapling> object for the database being loaded.
44    
45  =item options  =item options
46    
# Line 58  Line 58 
58      # Get the parameters.      # Get the parameters.
59      my ($class, $erdb, $options) = @_;      my ($class, $erdb, $options) = @_;
60      # Create the table list.      # Create the table list.
61      my @tables = sort qw(Genome IsMadeUpOf IsTaxonomyOf TaxonomicGrouping      my @tables = sort qw(GenomeSet IsMadeUpOf IsCollectionOf Genome IsTaxonomyOf TaxonomicGrouping
62                           IsGroupContaining DnaSequence DnaSequenceBases);                           TaxonomicGroupingAlias IsGroupFor Contig HasSection DNASequence);
63      # Create the BaseSaplingLoader object.      # Create the BaseSaplingLoader object.
64      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
65      # Return it.      # Return it.
# Line 83  Line 83 
83      if ($self->global()) {      if ($self->global()) {
84          # This is the global section. Create the taxonomic hierarchy.          # This is the global section. Create the taxonomic hierarchy.
85          $self->CreateTaxonomies();          $self->CreateTaxonomies();
86            # Create the genome sets.
87            $self->CreateGenomeSets();
88      } else {      } else {
89          # Get the section ID.          # Get the section ID.
90          my $genomeID = $self->section();          my $genomeID = $self->section();
# Line 91  Line 93 
93      }      }
94  }  }
95    
96    =head3 CreateGenomeSets
97    
98        $sl->CreateGenomeSets();
99    
100    Generate the genome sets. This includes the GenomeSet and IsCollectionOf
101    tables.
102    
103    =cut
104    
105    sub CreateGenomeSets {
106        # Get the parameters.
107        my ($self) = @_;
108        # Get the genome hash. Only genomes in this hash will be put into a set.
109        my $sapling = $self->db();
110        my $genomeHash = $sapling->GenomeHash();
111        # We'll track genome set names in here. The set name is the most common
112        # genus in the set with an optional number for uniqueness.
113        my %setNames;
114        # Get the genome set file.
115        my $ih = Open(undef, "<$FIG_Config::global/genome.sets");
116        # We will accumulate set data and output a set at the end of each set group.
117        # This will be a list of genome IDs for the set.
118        my @genomes;
119        # This will contain the genus counts.
120        my %names;
121        # This will be the set ID number.
122        my $setID;
123        # Loop through the set file.
124        while (! eof $ih) {
125            # Get the next record.
126            $self->Add("set-records" => 1);
127            my ($newSetID, $genomeID, $name) = Tracer::GetLine($ih);
128            # Only accept the genome if it's one of ours.
129            if ($genomeHash->{$genomeID}) {
130                # Is this a new set?
131                if ($newSetID != $setID) {
132                    # Yes. Output the old set.
133                    $self->OutputGenomeSet(\%names, $setID, \@genomes);
134                    # Clear the set data.
135                    %names = ();
136                    @genomes = ();
137                    # Save the new set ID.
138                    $setID = $newSetID;
139                }
140                # Only proceed if this is one of our genomes.
141                if ($genomeHash->{$genomeID}) {
142                    $self->Add("set-genomes" => 1);
143                    # Save the genome ID.
144                    push @genomes, $genomeID;
145                    # Remember it as the representative if it's the first in the set.
146                    # Count the genus.
147                    my ($genus) = split /\s/, $name, 2;
148                    $names{$genus}++;
149                }
150            }
151        }
152        # Close the input file.
153        close $ih;
154        # Output the last set.
155        $self->OutputGenomeSet(\%names, $setID, \@genomes);
156    }
157    
158    =head3 OutputGenomeSet
159    
160        $sl->OutputGenomeSet(\%names, $setID, \@genomes);
161    
162    Output the data for a genome set. The appropriate GenomeSet and IsCollectionOf
163    records will be generated for the genomes in the set.
164    
165    =over 4
166    
167    =item names
168    
169    Reference to a hash of the genus names used in the set. The hash maps each name
170    to the number of times it appeared.
171    
172    =item setID
173    
174    The ID to use for this set.
175    
176    =item genomes
177    
178    Reference to a list of the IDs for the genomes in the set.
179    
180    =back
181    
182    =cut
183    
184    sub OutputGenomeSet {
185        # Get the parameters.
186        my ($self, $names, $setID, $genomes) = @_;
187        # Only proceed if there is at least one genome.
188        my $count = scalar @$genomes;
189        if ($count) {
190            # Create the set record.
191            $self->PutE(GenomeSet => $setID);
192            # This will be TRUE for the first genome and FALSE thereafter, insuring that
193            # the first genome is used for the representative.
194            my $repFlag = 1;
195            # Connect all the genomes to it.
196            for my $genome (@$genomes) {
197                $self->PutR(IsCollectionOf => $setID, $genome, representative => $repFlag);
198                $repFlag = 0;
199            }
200        }
201    }
202    
203    
204  =head3 CreateTaxonomies  =head3 CreateTaxonomies
205    
206      $sl->CreateTaxonomies();      $sl->CreateTaxonomies();
207    
208  Generate the taxonomy hierarchy. This includes the TaxonomicGrouping,  Generate the taxonomy hierarchy. This includes the TaxonomicGrouping,
209  IsClassOf, and IsTaxonomyOf tables.  IsGroupFor, TaxonomicGroupingAlias, and IsTaxonomyOf relationships. The
210    taxonomy hierarchy is computed from the NCBI taxonomy dump.
211    
212  =cut  =cut
213    
# Line 105  Line 216 
216      my ($self) = @_;      my ($self) = @_;
217      # Get the Sapling object.      # Get the Sapling object.
218      my $sapling = $self->db();      my $sapling = $self->db();
219      # Get the source object.      # Get the name of the taxonomy dump directory.
220      my $fig = $sapling->GetSourceObject();      my $taxDir = "/vol/biodb/ncbi/taxonomy";
221      # Create the taxonomy hash. For each taxonomic grouping, the hash will map      # The first step is to read in all the names. We will build a hash that maps
222      # to its parent grouping.      # each taxonomy ID to a list of its names. The first scientific name encountered
223      my %taxTree;      # will be saved as the primary name. Only scientific names, synonoyms, and
224      # Get the genome list.      # equivalent names will be kept.
225      my @genomes = sort keys %{$sapling->GenomeHash()};      my (%nameLists, %primaryNames);
226      # Loop through them, processing the taxonomy of each.      my $ih = Open(undef, "<$taxDir/names.dmp");
227      for my $genome (@genomes) {      while (! eof $ih) {
228          $self->Track(Organisms => $genome, 100);          # Get the next name.
229          # Get the name of this genome. Genome names sometimes get          my ($taxID, $name, undef, $type) = GetTaxData($ih);
230          # stored incorrectly in the taxonomy.          $self->Add('taxnames-in' => 1);
231          my $genomeName = $fig->genus_species($genome);          # Is this a scientific name?
232          # Get the taxonomy list.          if ($type =~ /scientific/i) {
233          my @taxClasses = grep { $_ ne $genomeName }              # Yes. Save it if it is the first for this ID.
234              split /\s*;\s*/, $fig->taxonomy_of($genome);              if (! exists $primaryNames{$taxID}) {
235          # Loop through the taxonomy. For each class found, we connect                  $primaryNames{$taxID} = $name;
236          # it to the genome with a sequence number indicating its position              }
237          # in the genome's taxonomy, and we store its parent class name              # Add it to the name list.
238          # so we can connect the groups. As a result, the genome is in              push @{$nameLists{$taxID}}, $name;
239          # every taxonomic group it belongs to, and we have enough data              $self->Add('taxnames-scientific' => 1);
240          # to produce the taxonomy tree as well.          } elsif ($type =~ /synonym|equivalent/i) {
241          my $parent = undef;              # Here it's not scientific, but it's generally useful, so we keep it.
242          my $sequence = 0;              push @{$nameLists{$taxID}}, $name;
243          for my $taxClass (@taxClasses) {              $self->Add('taxnames-other' => 1);
244              $taxTree{$taxClass} = $parent;          }
245              $parent = $taxClass;      }
246              $self->PutR(IsTaxonomyOf => $taxClass, $genome,      # Now we read in the taxonomy nodes. For each node, we generate a TaxonomicGrouping
247                          sequence => $sequence++);      # record, and we connect it to its parent using IsGroupFor. We also keep the node ID
248          }      # for later so we know what's available.
249      }      close $ih;
250      # Now we loop through the taxonomy hash, creating the TaxonomicGrouping      $ih = Open(undef, "<$taxDir/nodes.dmp");
251      # and IsClassOf records.      while (! eof $ih) {
252      for my $taxClass (sort keys %taxTree) {          # Get the data for this group.
253          $self->Track(TaxonomicGroupings => $taxClass, 100);          my ($taxID, $parent, $type, undef, undef,
254          # Determine whether or not this is a domain.              undef,  undef,   undef, undef, undef, $hidden) = GetTaxData($ih);
255          my $parent = $taxTree{$taxClass};          # Determine whether or not this is a domain group. A domain group is
256          my $isDomain = (defined $parent ? 0 : 1);          # terminal when doing taxonomy searches. The NCBI indicates a top-level
257          if (! $isDomain) {          # node by making it a child of the root node 1. We also include
258              # It isn't a domain, so link it to its parent.          # super-kingdoms (archaea, eukaryota, bacteria), which are below cellular
259              $self->PutR(IsGroupContaining => $parent, $taxClass);          # organisms but are still terminal in our book.
260            my $domain = ($type eq 'superkingdom') || ($parent == 1);
261            # Get the node's name.
262            my $name = $primaryNames{$taxID};
263            # It's an error if there's no name.
264            Confess("No name found for tax ID $taxID.") if ! $name;
265            # Create the taxonomy group record.
266            $self->PutE(TaxonomicGrouping => $taxID, domain => $domain, hidden => $hidden,
267                        scientific_name => $name);
268            # Create the alias records.
269            for my $alias (@{$nameLists{$taxID}}) {
270                $self->PutE(TaxonomicGroupingAlias => $taxID, alias => $alias);
271            }
272            # Connect the group to its parent.
273            $self->PutR(IsGroupFor => $parent, $taxID);
274        }
275        # Read in the merge file. The merge file tells us which old IDs are mapped to
276        # new IDs. We need this to connect genomes with old IDs to the correct group.
277        my %merges;
278        $ih = Open(undef, "<$taxDir/merged.dmp");
279        while (! eof $ih) {
280            # Get this merge record.
281            my ($oldID, $newID) = GetTaxData($ih);
282            # Store it in the hash.
283            $merges{$oldID} = $newID;
284        }
285        # Now we need to connect each genome to its taxonomic grouping.
286        # Get the genome hash. This gives us our list of genome IDs.
287        my $genomeHash = $sapling->GenomeHash();
288        # Loop through the genomes.
289        for my $genomeID (keys %$genomeHash) {
290            # Get this genome's taxonomic group.
291            my ($taxID) = split /\./, $genomeID, 2;
292            # Check to see if we have this tax ID. If we don't, we check for a merge.
293            if (! $primaryNames{$taxID}) {
294                if ($merges{$taxID}) {
295                    $taxID = $merges{$taxID};
296                    $self->Add('merged-names' => 1);
297                    Trace("$genomeID has alternate taxonomy ID $taxID.") if T(ERDBLoadGroup => 2);
298                } else {
299                    $taxID = undef;
300                    $self->Add('missing-groups' => 1);
301                    Trace("$genomeID has no taxonomy group.") if T(ERDBLoadGroup => 1);
302                }
303            }
304            # Connect the genome and the group if the group is real.
305            if (defined $taxID) {
306                $self->PutR(IsTaxonomyOf => $taxID, $genomeID);
307          }          }
         # Create the group record.  
         $self->PutE(TaxonomicGrouping => $taxClass, domain => $isDomain);  
308      }      }
309  }  }
310    
# Line 158  Line 314 
314      $sl->PlaceGenome($genomeID);      $sl->PlaceGenome($genomeID);
315    
316  Generate the data for a specific genome. This method generates data for  Generate the data for a specific genome. This method generates data for
317  the Genome, IsMadeUpOf, DnaSequence and DnaSequenceBases  the Genome, IsMadeUpOf, Contig, HasSection, and DNASequence tables.
 tables.  
318    
319  =over 4  =over 4
320    
# Line 178  Line 333 
333      my $sapling = $self->db();      my $sapling = $self->db();
334      # Get the source object.      # Get the source object.
335      my $fig = $sapling->GetSourceObject();      my $fig = $sapling->GetSourceObject();
336        # Get the DNA chunk size.
337        my $segmentLength = $sapling->TuningParameter('maxSequenceLength');
338        Trace("DNA chunk size is $segmentLength.") if T(ERDBLoadGroup => 3);
339      # We start with the genome record itself, asking the FIG object      # We start with the genome record itself, asking the FIG object
340      # for its various properties.      # for its various properties.
341        my $scientific_name = $fig->genus_species($genomeID);
342      my $complete = $fig->is_complete($genomeID);      my $complete = $fig->is_complete($genomeID);
343      my $dna_size = $fig->genome_szdna($genomeID);      my $dna_size = $fig->genome_szdna($genomeID);
     my $domain = $fig->genome_domain($genomeID);  
     my $full_name = $fig->genus_species($genomeID);  
344      my $pegs = $fig->genome_pegs($genomeID);      my $pegs = $fig->genome_pegs($genomeID);
345      my $rnas = $fig->genome_rnas($genomeID);      my $rnas = $fig->genome_rnas($genomeID);
346      my $version = $fig->genome_version($genomeID) || $genomeID;      my $domain = $fig->genome_domain($genomeID);
347        my $prokaryotic = ($domain =~ /bacter|archae/i);
348      # We need to compute the number of contigs from the list of contig IDs.      # We need to compute the number of contigs from the list of contig IDs.
349      my @contigIDs = $fig->contigs_of($genomeID);      my @contigIDs = $fig->contigs_of($genomeID);
350      my $contigs = scalar(@contigIDs);      my $contigs = scalar(@contigIDs);
351        # Compute the genetic code. Normally, it's 11, but it may be overridden
352        # by a GENETIC_CODE file.
353        my $gcFile = "$FIG_Config::organisms/$genomeID/GENETIC_CODE";
354        my $genetic_code = 11;
355        if (-f $gcFile) {
356            $genetic_code = Tracer::GetFile($gcFile);
357            chomp $genetic_code;
358        }
359      # Write the genome record.      # Write the genome record.
360      $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,      $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,
361                  'dna-size' => $dna_size, domain => $domain, 'full-name' => $full_name,                  dna_size => $dna_size, scientific_name => $scientific_name,
362                  pegs => $pegs, rnas => $rnas, version => $version);                  pegs => $pegs, rnas => $rnas, prokaryotic => $prokaryotic,
363      # Now we create the DNA sequences. These correspond to the FIG contigs.                  domain => $domain, genetic_code => $genetic_code);
364        # Now we create the Contigs. Each one needs to be split into DNA sequences.
365      for my $contigID (@contigIDs) {      for my $contigID (@contigIDs) {
366          $self->Track(Contigs => $contigID, 100);          $self->Track(Contigs => $contigID, 100);
367          # Get the contig length.          # Get the contig length.
# Line 202  Line 369 
369          # Generate the contig record. Note that the contig ID includes          # Generate the contig record. Note that the contig ID includes
370          # the genome ID as a prefix. Otherwise, it would be non-unique.          # the genome ID as a prefix. Otherwise, it would be non-unique.
371          my $realContigID = "$genomeID:$contigID";          my $realContigID = "$genomeID:$contigID";
372          $self->PutE(DnaSequence => $realContigID, length => $length);          $self->PutE(Contig => $realContigID, length => $length);
373          $self->PutR(IsMadeUpOf => $genomeID, $realContigID);          $self->PutR(IsMadeUpOf => $genomeID, $realContigID);
374          # May God have mercy, because here we yank in the DNA.          # Now we loop through the DNA chunks.
375          my $contigDNA = $fig->get_dna($genomeID, $contigID, 1, $length);          my $loc = 1;
376          $self->Add('dna-letters' => length($contigDNA));          my $ordinal = 0;
377          $self->PutE(DnaSequenceBases => $realContigID, bases => $contigDNA);          while ($loc <= $length) {
378                # Get this segment's true length.
379                my $trueLength = Tracer::Min($length + 1 - $loc, $segmentLength);
380                # Compute the index of this segment's last base pair.
381                my $endPoint = $loc + $trueLength - 1;
382                # Get the DNA.
383                my $chunkDNA = $fig->get_dna($genomeID, $contigID, $loc, $endPoint);
384                # Create its sequence record.
385                my $paddedOrdinal = Tracer::Pad($ordinal, 7, 1, '0');
386                my $seqID = "$realContigID:$paddedOrdinal";
387                $self->PutE(DNASequence => $seqID, sequence => $chunkDNA);
388                $self->Add('dna-letters' => $trueLength);
389                # Connect it to the contig.
390                $self->PutR(HasSection => $realContigID, $seqID);
391                # Move to the next section of the contig.
392                $loc = $endPoint + 1;
393                $ordinal++;
394            }
395        }
396    }
397    
398    =head3 GetTaxData
399    
400        my @fields = GenomeSaplingLoader::GetTaxData($ih);
401    
402    Read a taxonomy dump record and return its fields in a list. Taxonomy
403    dump records end in a tab-bar-newline sequence, and fields are separated
404    by a tab-bar-tab sequence, a more complex arrangement than is used in
405    standard tab-delimited files.
406    
407    =over 4
408    
409    =item ih
410    
411    Open input handle for the taxonomy dump file.
412    
413    =item RETURN
414    
415    Returns a list of the fields in the record read.
416    
417    =back
418    
419    =cut
420    
421    sub GetTaxData {
422        # Get the parameters.
423        my ($ih) = @_;
424        # Temporarily change the end-of-record character.
425        local $/ = "\t|\n";
426        # Read the next record.
427        my $line = <$ih>;
428        # Chop off the end, if any.
429        if ($line =~ /(.+)\t\|\n$/) {
430            $line = $1;
431      }      }
432        # Split the line into fields.
433        my @retVal = split /\t\|\t/, $line;
434        # Return the result.
435        return @retVal;
436  }  }
437    
438    

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.12

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3