[Bio] / Sprout / GenomeSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/GenomeSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Mon Jan 19 21:43:27 2009 UTC revision 1.3, Tue Jun 30 19:53:01 2009 UTC
# Line 40  Line 40 
40    
41  =item erdb  =item erdb
42    
43  [[SaplingPm]] object for the database being loaded.  L<Sapling> object for the database being loaded.
44    
45  =item options  =item options
46    
# Line 58  Line 58 
58      # Get the parameters.      # Get the parameters.
59      my ($class, $erdb, $options) = @_;      my ($class, $erdb, $options) = @_;
60      # Create the table list.      # Create the table list.
61      my @tables = sort qw(Genome IsMadeUpOf IsTaxonomyOf TaxonomicGrouping      my @tables = sort qw(GenomeSet IsMadeUpOf IsCollectionOf Genome IsTaxonomyOf TaxonomicGrouping
62                           IsGroupContaining DnaSequence DnaSequenceBases);                           TaxonomicGroupingAlias IsGroupFor Contig HasSection DNASequence);
63      # Create the BaseSaplingLoader object.      # Create the BaseSaplingLoader object.
64      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);      my $retVal = BaseSaplingLoader::new($class, $erdb, $options, @tables);
65      # Return it.      # Return it.
# Line 83  Line 83 
83      if ($self->global()) {      if ($self->global()) {
84          # This is the global section. Create the taxonomic hierarchy.          # This is the global section. Create the taxonomic hierarchy.
85          $self->CreateTaxonomies();          $self->CreateTaxonomies();
86            # Create the genome sets.
87            $self->CreateGenomeSets();
88      } else {      } else {
89          # Get the section ID.          # Get the section ID.
90          my $genomeID = $self->section();          my $genomeID = $self->section();
# Line 91  Line 93 
93      }      }
94  }  }
95    
96    =head3 CreateGenomeSets
97    
98        $sl->CreateGenomeSets();
99    
100    Generate the genome sets. This includes the GenomeSet and IsCollectionOf
101    tables.
102    
103    =cut
104    
105    sub CreateGenomeSets {
106        # Get the parameters.
107        my ($self) = @_;
108        # Get the genome hash. Only genomes in this hash will be put into a set.
109        my $sapling = $self->db();
110        my $genomeHash = $sapling->GenomeHash();
111        # We'll track genome set names in here. The set name is the most common
112        # genus in the set with an optional number for uniqueness.
113        my %setNames;
114        # Get the genome set file.
115        my $ih = Open(undef, "<$FIG_Config::global/genome.sets");
116        # We will accumulate set data and output a set at the end of each set group.
117        # This will be a list of genome IDs for the set.
118        my @genomes;
119        # This will contain the genus counts.
120        my %names;
121        # This will be the set ID number.
122        my $setID;
123        # Loop through the set file.
124        while (! eof $ih) {
125            # Get the next record.
126            $self->Add("set-records" => 1);
127            my ($newSetID, $genomeID, $name) = Tracer::GetLine($ih);
128            # Is this a new set?
129            if ($newSetID != $setID) {
130                # Yes. Output the old set.
131                $self->OutputGenomeSet(\%names, \%setNames, \@genomes);
132                # Clear the set data.
133                %names = ();
134                @genomes = ();
135                # Save the new set ID.
136                $setID = $newSetID;
137            }
138            # Only proceed if this is one of our genomes.
139            if ($genomeHash->{$genomeID}) {
140                $self->Add("set-genomes" => 1);
141                # Save the genome ID.
142                push @genomes, $genomeID;
143                # Remember it as the representative if it's the first in the set.
144                # Count the genus.
145                my ($genus) = split /\s/, $name, 2;
146                $names{$genus}++;
147            }
148        }
149        # Close the input file.
150        close $ih;
151        # Output the last set.
152        $self->OutputGenomeSet(\%names, \%setNames, \@genomes);
153    }
154    
155    =head3 OutputGenomeSet
156    
157        $sl->OutputGenomeSet(\%names, \%setNames, \@genomes);
158    
159    Output the data for a genome set. A name will be computed from the genus
160    information and the appropriate GenomeSet and IsCollectionOf records will
161    be generated for the genomes in the set.
162    
163    =over 4
164    
165    =item names
166    
167    Reference to a hash of the genus names used in the set. The hash maps each name
168    to the number of times it appeared.
169    
170    =item setNames
171    
172    Reference to a hash of set names already used. This hash will be updated to include
173    the set name chosen.
174    
175    =item genomes
176    
177    Reference to a list of the IDs for the genomes in the set.
178    
179    =back
180    
181    =cut
182    
183    sub OutputGenomeSet {
184        # Get the parameters.
185        my ($self, $names, $setNames, $genomes) = @_;
186        # Only proceed if there is at least one genome.
187        my $count = scalar @$genomes;
188        if ($count) {
189            # First we compute the name. Sort the genus names by occurrence count.
190            my ($setName) = sort { $names->{$a} <=> $names->{$b} } keys %$names;
191            # Apply a suffix to make it unique.
192            my $i = 1;
193            $i++ while $setNames->{"$setName/$i"};
194            $setName .= "/$i";
195            # Insure we don't reuse this set name.
196            $setNames->{$setName} = 1;
197            # Create the set record.
198            $self->PutE(GenomeSet => $setName);
199            # This will be TRUE for the first genome and FALSE thereafter, insuring that
200            # the first genome is used for the representative.
201            my $repFlag = 1;
202            # Connect all the genomes to it.
203            for my $genome (@$genomes) {
204                $self->PutR(IsCollectionOf => $setName, $genome, representative => $repFlag);
205                $repFlag = 0;
206            }
207        }
208    }
209    
210    
211  =head3 CreateTaxonomies  =head3 CreateTaxonomies
212    
213      $sl->CreateTaxonomies();      $sl->CreateTaxonomies();
214    
215  Generate the taxonomy hierarchy. This includes the TaxonomicGrouping,  Generate the taxonomy hierarchy. This includes the TaxonomicGrouping,
216  IsClassOf, and IsTaxonomyOf tables.  IsGroupFor, TaxonomicGroupingAlias, and IsTaxonomyOf relationships. The
217    taxonomy hierarchy is computed from the NCBI taxonomy dump.
218    
219  =cut  =cut
220    
# Line 105  Line 223 
223      my ($self) = @_;      my ($self) = @_;
224      # Get the Sapling object.      # Get the Sapling object.
225      my $sapling = $self->db();      my $sapling = $self->db();
226      # Get the source object.      # Get the name of the taxonomy dump directory.
227      my $fig = $sapling->GetSourceObject();      my $taxDir = "$FIG_Config::global/Taxonomy";
228      # Create the taxonomy hash. For each taxonomic grouping, the hash will map      # The first step is to read in all the names. We will build a hash that maps
229      # to its parent grouping.      # each taxonomy ID to a list of its names. The first scientific name encountered
230      my %taxTree;      # will be saved as the primary name. Only scientific names, synonoyms, and
231      # Get the genome list.      # equivalent names will be kept.
232      my @genomes = sort keys %{$sapling->GenomeHash()};      my (%nameLists, %primaryNames);
233      # Loop through them, processing the taxonomy of each.      my $ih = Open(undef, "<$taxDir/names.dmp");
234      for my $genome (@genomes) {      while (! eof $ih) {
235          $self->Track(Organisms => $genome, 100);          # Get the next name.
236          # Get the name of this genome. Genome names sometimes get          my ($taxID, $name, undef, $type) = GetTaxData($ih);
237          # stored incorrectly in the taxonomy.          $self->Add('taxnames-in' => 1);
238          my $genomeName = $fig->genus_species($genome);          # Is this a scientific name?
239          # Get the taxonomy list.          if ($type =~ /scientific/i) {
240          my @taxClasses = grep { $_ ne $genomeName }              # Yes. Save it if it is the first for this ID.
241              split /\s*;\s*/, $fig->taxonomy_of($genome);              if (! exists $primaryNames{$taxID}) {
242          # Loop through the taxonomy. For each class found, we connect                  $primaryNames{$taxID} = $name;
243          # it to the genome with a sequence number indicating its position              }
244          # in the genome's taxonomy, and we store its parent class name              # Add it to the name list.
245          # so we can connect the groups. As a result, the genome is in              push @{$nameLists{$taxID}}, $name;
246          # every taxonomic group it belongs to, and we have enough data              $self->Add('taxnames-scientific' => 1);
247          # to produce the taxonomy tree as well.          } elsif ($type =~ /synonym|equivalent/i) {
248          my $parent = undef;              # Here it's not scientific, but it's generally useful, so we keep it.
249          my $sequence = 0;              push @{$nameLists{$taxID}}, $name;
250          for my $taxClass (@taxClasses) {              $self->Add('taxnames-other' => 1);
             $taxTree{$taxClass} = $parent;  
             $parent = $taxClass;  
             $self->PutR(IsTaxonomyOf => $taxClass, $genome,  
                         sequence => $sequence++);  
         }  
     }  
     # Now we loop through the taxonomy hash, creating the TaxonomicGrouping  
     # and IsClassOf records.  
     for my $taxClass (sort keys %taxTree) {  
         $self->Track(TaxonomicGroupings => $taxClass, 100);  
         # Determine whether or not this is a domain.  
         my $parent = $taxTree{$taxClass};  
         my $isDomain = (defined $parent ? 0 : 1);  
         if (! $isDomain) {  
             # It isn't a domain, so link it to its parent.  
             $self->PutR(IsGroupContaining => $parent, $taxClass);  
251          }          }
252          # Create the group record.      }
253          $self->PutE(TaxonomicGrouping => $taxClass, domain => $isDomain);      # Now we read in the taxonomy nodes. For each node, we generate a TaxonomicGrouping
254        # record, and we connect it to its parent using IsGroupFor.
255        close $ih;
256        $ih = Open(undef, "<$taxDir/nodes.dmp");
257        while (! eof $ih) {
258            # Get the data for this group.
259            my ($taxID, $parent, $type, undef, undef,
260                undef,  undef,   undef, undef, undef, $hidden) = GetTaxData($ih);
261            # Determine whether or not this is a domain group.
262            my $domain = ($type eq 'superkingdom');
263            # Get the node's name.
264            my $name = $primaryNames{$taxID};
265            # It's an error if there's no name.
266            Confess("No name found for tax ID $taxID.") if ! $name;
267            # Create the taxonomy group record.
268            $self->PutE(TaxonomicGrouping => $taxID, domain => $domain, hidden => $hidden,
269                        scientific_name => $name);
270            # Create the alias records.
271            for my $alias (@{$nameLists{$taxID}}) {
272                $self->PutE(TaxonomicGroupingAlias => $taxID, alias => $alias);
273            }
274            # Connect the group to its parent.
275            $self->PutR(IsGroupFor => $parent, $taxID);
276        }
277        # Now we need to connect each genome to its taxonomic grouping.
278        # Get the genome hash. This gives us our list of genome IDs.
279        my $genomeHash = $sapling->GenomeHash();
280        # Loop through the genomes.
281        for my $genomeID (keys %$genomeHash) {
282            # Get this genome's taxonomic group.
283            my ($taxID) = split /\./, $genomeID, 2;
284            # Connect the genome and the group.
285            $self->PutR(IsTaxonomyOf => $taxID, $genomeID);
286      }      }
287  }  }
288    
# Line 158  Line 292 
292      $sl->PlaceGenome($genomeID);      $sl->PlaceGenome($genomeID);
293    
294  Generate the data for a specific genome. This method generates data for  Generate the data for a specific genome. This method generates data for
295  the Genome, IsMadeUpOf, DnaSequence and DnaSequenceBases  the Genome, IsMadeUpOf, Contig, HasSection, and DNASequence tables.
 tables.  
296    
297  =over 4  =over 4
298    
# Line 180  Line 313 
313      my $fig = $sapling->GetSourceObject();      my $fig = $sapling->GetSourceObject();
314      # We start with the genome record itself, asking the FIG object      # We start with the genome record itself, asking the FIG object
315      # for its various properties.      # for its various properties.
316        my $scientific_name = $fig->genus_species($genomeID);
317      my $complete = $fig->is_complete($genomeID);      my $complete = $fig->is_complete($genomeID);
318      my $dna_size = $fig->genome_szdna($genomeID);      my $dna_size = $fig->genome_szdna($genomeID);
     my $domain = $fig->genome_domain($genomeID);  
     my $full_name = $fig->genus_species($genomeID);  
319      my $pegs = $fig->genome_pegs($genomeID);      my $pegs = $fig->genome_pegs($genomeID);
320      my $rnas = $fig->genome_rnas($genomeID);      my $rnas = $fig->genome_rnas($genomeID);
     my $version = $fig->genome_version($genomeID) || $genomeID;  
321      # We need to compute the number of contigs from the list of contig IDs.      # We need to compute the number of contigs from the list of contig IDs.
322      my @contigIDs = $fig->contigs_of($genomeID);      my @contigIDs = $fig->contigs_of($genomeID);
323      my $contigs = scalar(@contigIDs);      my $contigs = scalar(@contigIDs);
324      # Write the genome record.      # Write the genome record.
325      $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,      $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,
326                  'dna-size' => $dna_size, domain => $domain, 'full-name' => $full_name,                  dna_size => $dna_size, scientific_name => $scientific_name,
327                  pegs => $pegs, rnas => $rnas, version => $version);                  pegs => $pegs, rnas => $rnas);
328      # Now we create the DNA sequences. These correspond to the FIG contigs.      # Now we create the Contigs. Each one needs to be split into DNA sequences.
329      for my $contigID (@contigIDs) {      for my $contigID (@contigIDs) {
330          $self->Track(Contigs => $contigID, 100);          $self->Track(Contigs => $contigID, 100);
331          # Get the contig length.          # Get the contig length.
# Line 202  Line 333 
333          # Generate the contig record. Note that the contig ID includes          # Generate the contig record. Note that the contig ID includes
334          # the genome ID as a prefix. Otherwise, it would be non-unique.          # the genome ID as a prefix. Otherwise, it would be non-unique.
335          my $realContigID = "$genomeID:$contigID";          my $realContigID = "$genomeID:$contigID";
336          $self->PutE(DnaSequence => $realContigID, length => $length);          $self->PutE(Contig => $realContigID, length => $length);
337          $self->PutR(IsMadeUpOf => $genomeID, $realContigID);          $self->PutR(IsMadeUpOf => $genomeID, $realContigID);
338          # May God have mercy, because here we yank in the DNA.          # Now we loop through the DNA chunks.
339          my $contigDNA = $fig->get_dna($genomeID, $contigID, 1, $length);          my $loc = 1;
340          $self->Add('dna-letters' => length($contigDNA));          my $ordinal = 0;
341          $self->PutE(DnaSequenceBases => $realContigID, bases => $contigDNA);          my $segmentLength = $sapling->TuningParameter('maxSequenceLength');
342            while ($loc < $length) {
343                # Get this segment's true length.
344                my $trueLength = Tracer::Min($length - $loc, $segmentLength);
345                # Compute the index of this segment's last base pair.
346                my $endPoint = $loc + $trueLength - 1;
347                # Get the DNA.
348                my $chunkDNA = $fig->get_dna($genomeID, $contigID, $loc, $endPoint);
349                # Create its sequence record.
350                my $paddedOrdinal = Tracer::Pad($ordinal, 7, 1, '0');
351                my $seqID = "$realContigID:$paddedOrdinal";
352                $self->PutE(DNASequence => $seqID, sequence => $chunkDNA);
353                $self->Add('dna-letters' => $trueLength);
354                # Connect it to the contig.
355                $self->PutR(HasSection => $realContigID, $seqID);
356                # Move to the next section of the contig.
357                $loc = $endPoint + 1;
358                $ordinal++;
359            }
360        }
361    }
362    
363    =head3 GetTaxData
364    
365        my @fields = GenomeSaplingLoader::GetTaxData($ih);
366    
367    Read a taxonomy dump record and return its fields in a list. Taxonomy
368    dump records end in a tab-bar-newline sequence, and fields are separated
369    by a tab-bar-tab sequence, a more complex arrangement than is used in
370    standard tab-delimited files.
371    
372    =over 4
373    
374    =item ih
375    
376    Open input handle for the taxonomy dump file.
377    
378    =item RETURN
379    
380    Returns a list of the fields in the record read.
381    
382    =back
383    
384    =cut
385    
386    sub GetTaxData {
387        # Get the parameters.
388        my ($ih) = @_;
389        # Temporarily change the end-of-record character.
390        local $/ = "\t|\n";
391        # Read the next record.
392        my $line = <$ih>;
393        # Chop off the end, if any.
394        if ($line =~ /(.+)\t\|\n$/) {
395            $line = $1;
396      }      }
397        # Split the line into fields.
398        my @retVal = split /\t\|\t/, $line;
399        # Return the result.
400        return @retVal;
401  }  }
402    
403    

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.3

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3