[Bio] / Sprout / GenomeSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/GenomeSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Tue Jun 30 19:53:01 2009 UTC revision 1.11, Mon Oct 11 16:22:43 2010 UTC
# Line 125  Line 125 
125          # Get the next record.          # Get the next record.
126          $self->Add("set-records" => 1);          $self->Add("set-records" => 1);
127          my ($newSetID, $genomeID, $name) = Tracer::GetLine($ih);          my ($newSetID, $genomeID, $name) = Tracer::GetLine($ih);
128            # Only accept the genome if it's one of ours.
129            if ($genomeHash->{$genomeID}) {
130          # Is this a new set?          # Is this a new set?
131          if ($newSetID != $setID) {          if ($newSetID != $setID) {
132              # Yes. Output the old set.              # Yes. Output the old set.
133              $self->OutputGenomeSet(\%names, \%setNames, \@genomes);                  $self->OutputGenomeSet(\%names, $setID, \@genomes);
134              # Clear the set data.              # Clear the set data.
135              %names = ();              %names = ();
136              @genomes = ();              @genomes = ();
# Line 146  Line 148 
148              $names{$genus}++;              $names{$genus}++;
149          }          }
150      }      }
151        }
152      # Close the input file.      # Close the input file.
153      close $ih;      close $ih;
154      # Output the last set.      # Output the last set.
155      $self->OutputGenomeSet(\%names, \%setNames, \@genomes);      $self->OutputGenomeSet(\%names, $setID, \@genomes);
156  }  }
157    
158  =head3 OutputGenomeSet  =head3 OutputGenomeSet
159    
160      $sl->OutputGenomeSet(\%names, \%setNames, \@genomes);      $sl->OutputGenomeSet(\%names, $setID, \@genomes);
161    
162  Output the data for a genome set. A name will be computed from the genus  Output the data for a genome set. The appropriate GenomeSet and IsCollectionOf
163  information and the appropriate GenomeSet and IsCollectionOf records will  records will be generated for the genomes in the set.
 be generated for the genomes in the set.  
164    
165  =over 4  =over 4
166    
# Line 167  Line 169 
169  Reference to a hash of the genus names used in the set. The hash maps each name  Reference to a hash of the genus names used in the set. The hash maps each name
170  to the number of times it appeared.  to the number of times it appeared.
171    
172  =item setNames  =item setID
173    
174  Reference to a hash of set names already used. This hash will be updated to include  The ID to use for this set.
 the set name chosen.  
175    
176  =item genomes  =item genomes
177    
# Line 182  Line 183 
183    
184  sub OutputGenomeSet {  sub OutputGenomeSet {
185      # Get the parameters.      # Get the parameters.
186      my ($self, $names, $setNames, $genomes) = @_;      my ($self, $names, $setID, $genomes) = @_;
187      # Only proceed if there is at least one genome.      # Only proceed if there is at least one genome.
188      my $count = scalar @$genomes;      my $count = scalar @$genomes;
189      if ($count) {      if ($count) {
         # First we compute the name. Sort the genus names by occurrence count.  
         my ($setName) = sort { $names->{$a} <=> $names->{$b} } keys %$names;  
         # Apply a suffix to make it unique.  
         my $i = 1;  
         $i++ while $setNames->{"$setName/$i"};  
         $setName .= "/$i";  
         # Insure we don't reuse this set name.  
         $setNames->{$setName} = 1;  
190          # Create the set record.          # Create the set record.
191          $self->PutE(GenomeSet => $setName);          $self->PutE(GenomeSet => $setID);
192          # This will be TRUE for the first genome and FALSE thereafter, insuring that          # This will be TRUE for the first genome and FALSE thereafter, insuring that
193          # the first genome is used for the representative.          # the first genome is used for the representative.
194          my $repFlag = 1;          my $repFlag = 1;
195          # Connect all the genomes to it.          # Connect all the genomes to it.
196          for my $genome (@$genomes) {          for my $genome (@$genomes) {
197              $self->PutR(IsCollectionOf => $setName, $genome, representative => $repFlag);              $self->PutR(IsCollectionOf => $setID, $genome, representative => $repFlag);
198              $repFlag = 0;              $repFlag = 0;
199          }          }
200      }      }
# Line 224  Line 217 
217      # Get the Sapling object.      # Get the Sapling object.
218      my $sapling = $self->db();      my $sapling = $self->db();
219      # Get the name of the taxonomy dump directory.      # Get the name of the taxonomy dump directory.
220      my $taxDir = "$FIG_Config::global/Taxonomy";      my $taxDir = $FIG_Config::taxData || "$FIG_Config::global/Taxonomy";
221      # The first step is to read in all the names. We will build a hash that maps      # The first step is to read in all the names. We will build a hash that maps
222      # each taxonomy ID to a list of its names. The first scientific name encountered      # each taxonomy ID to a list of its names. The first scientific name encountered
223      # will be saved as the primary name. Only scientific names, synonoyms, and      # will be saved as the primary name. Only scientific names, synonoyms, and
# Line 251  Line 244 
244          }          }
245      }      }
246      # Now we read in the taxonomy nodes. For each node, we generate a TaxonomicGrouping      # Now we read in the taxonomy nodes. For each node, we generate a TaxonomicGrouping
247      # record, and we connect it to its parent using IsGroupFor.      # record, and we connect it to its parent using IsGroupFor. We also keep the node ID
248        # for later so we know what's available.
249      close $ih;      close $ih;
250      $ih = Open(undef, "<$taxDir/nodes.dmp");      $ih = Open(undef, "<$taxDir/nodes.dmp");
251      while (! eof $ih) {      while (! eof $ih) {
252          # Get the data for this group.          # Get the data for this group.
253          my ($taxID, $parent, $type, undef, undef,          my ($taxID, $parent, $type, undef, undef,
254              undef,  undef,   undef, undef, undef, $hidden) = GetTaxData($ih);              undef,  undef,   undef, undef, undef, $hidden) = GetTaxData($ih);
255          # Determine whether or not this is a domain group.          # Determine whether or not this is a domain group. A domain group is
256          my $domain = ($type eq 'superkingdom');          # terminal when doing taxonomy searches. The NCBI indicates a top-level
257            # node by making it a child of the root node 1. We also include
258            # super-kingdoms (archaea, eukaryota, bacteria), which are below cellular
259            # organisms but are still terminal in our book.
260            my $domain = ($type eq 'superkingdom') || ($parent == 1);
261          # Get the node's name.          # Get the node's name.
262          my $name = $primaryNames{$taxID};          my $name = $primaryNames{$taxID};
263          # It's an error if there's no name.          # It's an error if there's no name.
# Line 274  Line 272 
272          # Connect the group to its parent.          # Connect the group to its parent.
273          $self->PutR(IsGroupFor => $parent, $taxID);          $self->PutR(IsGroupFor => $parent, $taxID);
274      }      }
275        # Read in the merge file. The merge file tells us which old IDs are mapped to
276        # new IDs. We need this to connect genomes with old IDs to the correct group.
277        my %merges;
278        $ih = Open(undef, "<$taxDir/merged.dmp");
279        while (! eof $ih) {
280            # Get this merge record.
281            my ($oldID, $newID) = GetTaxData($ih);
282            # Store it in the hash.
283            $merges{$oldID} = $newID;
284        }
285      # Now we need to connect each genome to its taxonomic grouping.      # Now we need to connect each genome to its taxonomic grouping.
286      # Get the genome hash. This gives us our list of genome IDs.      # Get the genome hash. This gives us our list of genome IDs.
287      my $genomeHash = $sapling->GenomeHash();      my $genomeHash = $sapling->GenomeHash();
# Line 281  Line 289 
289      for my $genomeID (keys %$genomeHash) {      for my $genomeID (keys %$genomeHash) {
290          # Get this genome's taxonomic group.          # Get this genome's taxonomic group.
291          my ($taxID) = split /\./, $genomeID, 2;          my ($taxID) = split /\./, $genomeID, 2;
292          # Connect the genome and the group.          # Check to see if we have this tax ID. If we don't, we check for a merge.
293            if (! $primaryNames{$taxID}) {
294                if ($merges{$taxID}) {
295                    $taxID = $merges{$taxID};
296                    $self->Add('merged-names' => 1);
297                    Trace("$genomeID has alternate taxonomy ID $taxID.") if T(ERDBLoadGroup => 2);
298                } else {
299                    $taxID = undef;
300                    $self->Add('missing-groups' => 1);
301                    Trace("$genomeID has no taxonomy group.") if T(ERDBLoadGroup => 1);
302                }
303            }
304            # Connect the genome and the group if the group is real.
305            if (defined $taxID) {
306          $self->PutR(IsTaxonomyOf => $taxID, $genomeID);          $self->PutR(IsTaxonomyOf => $taxID, $genomeID);
307      }      }
308  }  }
309    }
310    
311    
312  =head3 PlaceGenome  =head3 PlaceGenome
# Line 311  Line 333 
333      my $sapling = $self->db();      my $sapling = $self->db();
334      # Get the source object.      # Get the source object.
335      my $fig = $sapling->GetSourceObject();      my $fig = $sapling->GetSourceObject();
336        # Get the DNA chunk size.
337        my $segmentLength = $sapling->TuningParameter('maxSequenceLength');
338        Trace("DNA chunk size is $segmentLength.") if T(ERDBLoadGroup => 3);
339      # We start with the genome record itself, asking the FIG object      # We start with the genome record itself, asking the FIG object
340      # for its various properties.      # for its various properties.
341      my $scientific_name = $fig->genus_species($genomeID);      my $scientific_name = $fig->genus_species($genomeID);
# Line 318  Line 343 
343      my $dna_size = $fig->genome_szdna($genomeID);      my $dna_size = $fig->genome_szdna($genomeID);
344      my $pegs = $fig->genome_pegs($genomeID);      my $pegs = $fig->genome_pegs($genomeID);
345      my $rnas = $fig->genome_rnas($genomeID);      my $rnas = $fig->genome_rnas($genomeID);
346        my $domain = $fig->genome_domain($genomeID);
347        my $prokaryotic = ($domain =~ /bacter|archae/i);
348      # We need to compute the number of contigs from the list of contig IDs.      # We need to compute the number of contigs from the list of contig IDs.
349      my @contigIDs = $fig->contigs_of($genomeID);      my @contigIDs = $fig->contigs_of($genomeID);
350      my $contigs = scalar(@contigIDs);      my $contigs = scalar(@contigIDs);
351        # Compute the genetic code. Normally, it's 11, but it may be overridden
352        # by a GENETIC_CODE file.
353        my $gcFile = "$FIG_Config::organisms/$genomeID/GENETIC_CODE";
354        my $genetic_code = 11;
355        if (-f $gcFile) {
356            $genetic_code = Tracer::GetFile($gcFile);
357            chomp $genetic_code;
358        }
359      # Write the genome record.      # Write the genome record.
360      $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,      $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,
361                  dna_size => $dna_size, scientific_name => $scientific_name,                  dna_size => $dna_size, scientific_name => $scientific_name,
362                  pegs => $pegs, rnas => $rnas);                  pegs => $pegs, rnas => $rnas, prokaryotic => $prokaryotic,
363                    domain => $domain, genetic_code => $genetic_code);
364      # Now we create the Contigs. Each one needs to be split into DNA sequences.      # Now we create the Contigs. Each one needs to be split into DNA sequences.
365      for my $contigID (@contigIDs) {      for my $contigID (@contigIDs) {
366          $self->Track(Contigs => $contigID, 100);          $self->Track(Contigs => $contigID, 100);
# Line 338  Line 374 
374          # Now we loop through the DNA chunks.          # Now we loop through the DNA chunks.
375          my $loc = 1;          my $loc = 1;
376          my $ordinal = 0;          my $ordinal = 0;
377          my $segmentLength = $sapling->TuningParameter('maxSequenceLength');          while ($loc <= $length) {
         while ($loc < $length) {  
378              # Get this segment's true length.              # Get this segment's true length.
379              my $trueLength = Tracer::Min($length - $loc, $segmentLength);              my $trueLength = Tracer::Min($length + 1 - $loc, $segmentLength);
380              # Compute the index of this segment's last base pair.              # Compute the index of this segment's last base pair.
381              my $endPoint = $loc + $trueLength - 1;              my $endPoint = $loc + $trueLength - 1;
382              # Get the DNA.              # Get the DNA.

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.11

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3