[Bio] / Sprout / GenomeSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/GenomeSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.7, Wed Nov 25 22:01:03 2009 UTC revision 1.12, Thu Oct 14 17:27:37 2010 UTC
# Line 217  Line 217 
217      # Get the Sapling object.      # Get the Sapling object.
218      my $sapling = $self->db();      my $sapling = $self->db();
219      # Get the name of the taxonomy dump directory.      # Get the name of the taxonomy dump directory.
220      my $taxDir = "$FIG_Config::global/Taxonomy";      my $taxDir = "/vol/biodb/ncbi/taxonomy";
221      # The first step is to read in all the names. We will build a hash that maps      # The first step is to read in all the names. We will build a hash that maps
222      # each taxonomy ID to a list of its names. The first scientific name encountered      # each taxonomy ID to a list of its names. The first scientific name encountered
223      # will be saved as the primary name. Only scientific names, synonoyms, and      # will be saved as the primary name. Only scientific names, synonoyms, and
# Line 244  Line 244 
244          }          }
245      }      }
246      # Now we read in the taxonomy nodes. For each node, we generate a TaxonomicGrouping      # Now we read in the taxonomy nodes. For each node, we generate a TaxonomicGrouping
247      # record, and we connect it to its parent using IsGroupFor.      # record, and we connect it to its parent using IsGroupFor. We also keep the node ID
248        # for later so we know what's available.
249      close $ih;      close $ih;
250      $ih = Open(undef, "<$taxDir/nodes.dmp");      $ih = Open(undef, "<$taxDir/nodes.dmp");
251      while (! eof $ih) {      while (! eof $ih) {
# Line 271  Line 272 
272          # Connect the group to its parent.          # Connect the group to its parent.
273          $self->PutR(IsGroupFor => $parent, $taxID);          $self->PutR(IsGroupFor => $parent, $taxID);
274      }      }
275        # Read in the merge file. The merge file tells us which old IDs are mapped to
276        # new IDs. We need this to connect genomes with old IDs to the correct group.
277        my %merges;
278        $ih = Open(undef, "<$taxDir/merged.dmp");
279        while (! eof $ih) {
280            # Get this merge record.
281            my ($oldID, $newID) = GetTaxData($ih);
282            # Store it in the hash.
283            $merges{$oldID} = $newID;
284        }
285      # Now we need to connect each genome to its taxonomic grouping.      # Now we need to connect each genome to its taxonomic grouping.
286      # Get the genome hash. This gives us our list of genome IDs.      # Get the genome hash. This gives us our list of genome IDs.
287      my $genomeHash = $sapling->GenomeHash();      my $genomeHash = $sapling->GenomeHash();
# Line 278  Line 289 
289      for my $genomeID (keys %$genomeHash) {      for my $genomeID (keys %$genomeHash) {
290          # Get this genome's taxonomic group.          # Get this genome's taxonomic group.
291          my ($taxID) = split /\./, $genomeID, 2;          my ($taxID) = split /\./, $genomeID, 2;
292          # Connect the genome and the group.          # Check to see if we have this tax ID. If we don't, we check for a merge.
293            if (! $primaryNames{$taxID}) {
294                if ($merges{$taxID}) {
295                    $taxID = $merges{$taxID};
296                    $self->Add('merged-names' => 1);
297                    Trace("$genomeID has alternate taxonomy ID $taxID.") if T(ERDBLoadGroup => 2);
298                } else {
299                    $taxID = undef;
300                    $self->Add('missing-groups' => 1);
301                    Trace("$genomeID has no taxonomy group.") if T(ERDBLoadGroup => 1);
302                }
303            }
304            # Connect the genome and the group if the group is real.
305            if (defined $taxID) {
306          $self->PutR(IsTaxonomyOf => $taxID, $genomeID);          $self->PutR(IsTaxonomyOf => $taxID, $genomeID);
307      }      }
308  }  }
309    }
310    
311    
312  =head3 PlaceGenome  =head3 PlaceGenome
# Line 308  Line 333 
333      my $sapling = $self->db();      my $sapling = $self->db();
334      # Get the source object.      # Get the source object.
335      my $fig = $sapling->GetSourceObject();      my $fig = $sapling->GetSourceObject();
336        # Get the DNA chunk size.
337        my $segmentLength = $sapling->TuningParameter('maxSequenceLength');
338        Trace("DNA chunk size is $segmentLength.") if T(ERDBLoadGroup => 3);
339      # We start with the genome record itself, asking the FIG object      # We start with the genome record itself, asking the FIG object
340      # for its various properties.      # for its various properties.
341      my $scientific_name = $fig->genus_species($genomeID);      my $scientific_name = $fig->genus_species($genomeID);
# Line 320  Line 348 
348      # We need to compute the number of contigs from the list of contig IDs.      # We need to compute the number of contigs from the list of contig IDs.
349      my @contigIDs = $fig->contigs_of($genomeID);      my @contigIDs = $fig->contigs_of($genomeID);
350      my $contigs = scalar(@contigIDs);      my $contigs = scalar(@contigIDs);
351        # Compute the genetic code. Normally, it's 11, but it may be overridden
352        # by a GENETIC_CODE file.
353        my $gcFile = "$FIG_Config::organisms/$genomeID/GENETIC_CODE";
354        my $genetic_code = 11;
355        if (-f $gcFile) {
356            $genetic_code = Tracer::GetFile($gcFile);
357            chomp $genetic_code;
358        }
359      # Write the genome record.      # Write the genome record.
360      $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,      $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,
361                  dna_size => $dna_size, scientific_name => $scientific_name,                  dna_size => $dna_size, scientific_name => $scientific_name,
362                  pegs => $pegs, rnas => $rnas, prokaryotic => $prokaryotic,                  pegs => $pegs, rnas => $rnas, prokaryotic => $prokaryotic,
363                  domain => $domain);                  domain => $domain, genetic_code => $genetic_code);
364      # Now we create the Contigs. Each one needs to be split into DNA sequences.      # Now we create the Contigs. Each one needs to be split into DNA sequences.
365      for my $contigID (@contigIDs) {      for my $contigID (@contigIDs) {
366          $self->Track(Contigs => $contigID, 100);          $self->Track(Contigs => $contigID, 100);
# Line 338  Line 374 
374          # Now we loop through the DNA chunks.          # Now we loop through the DNA chunks.
375          my $loc = 1;          my $loc = 1;
376          my $ordinal = 0;          my $ordinal = 0;
377          my $segmentLength = $sapling->TuningParameter('maxSequenceLength');          while ($loc <= $length) {
         while ($loc < $length) {  
378              # Get this segment's true length.              # Get this segment's true length.
379              my $trueLength = Tracer::Min($length - $loc, $segmentLength);              my $trueLength = Tracer::Min($length + 1 - $loc, $segmentLength);
380              # Compute the index of this segment's last base pair.              # Compute the index of this segment's last base pair.
381              my $endPoint = $loc + $trueLength - 1;              my $endPoint = $loc + $trueLength - 1;
382              # Get the DNA.              # Get the DNA.

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.12

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3