[Bio] / Sprout / GenomeSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/GenomeSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Tue Jun 30 19:53:01 2009 UTC revision 1.8, Wed Dec 16 01:45:19 2009 UTC
# Line 125  Line 125 
125          # Get the next record.          # Get the next record.
126          $self->Add("set-records" => 1);          $self->Add("set-records" => 1);
127          my ($newSetID, $genomeID, $name) = Tracer::GetLine($ih);          my ($newSetID, $genomeID, $name) = Tracer::GetLine($ih);
128            # Only accept the genome if it's one of ours.
129            if ($genomeHash->{$genomeID}) {
130          # Is this a new set?          # Is this a new set?
131          if ($newSetID != $setID) {          if ($newSetID != $setID) {
132              # Yes. Output the old set.              # Yes. Output the old set.
133              $self->OutputGenomeSet(\%names, \%setNames, \@genomes);                  $self->OutputGenomeSet(\%names, $setID, \@genomes);
134              # Clear the set data.              # Clear the set data.
135              %names = ();              %names = ();
136              @genomes = ();              @genomes = ();
# Line 146  Line 148 
148              $names{$genus}++;              $names{$genus}++;
149          }          }
150      }      }
151        }
152      # Close the input file.      # Close the input file.
153      close $ih;      close $ih;
154      # Output the last set.      # Output the last set.
155      $self->OutputGenomeSet(\%names, \%setNames, \@genomes);      $self->OutputGenomeSet(\%names, $setID, \@genomes);
156  }  }
157    
158  =head3 OutputGenomeSet  =head3 OutputGenomeSet
159    
160      $sl->OutputGenomeSet(\%names, \%setNames, \@genomes);      $sl->OutputGenomeSet(\%names, $setID, \@genomes);
161    
162  Output the data for a genome set. A name will be computed from the genus  Output the data for a genome set. The appropriate GenomeSet and IsCollectionOf
163  information and the appropriate GenomeSet and IsCollectionOf records will  records will be generated for the genomes in the set.
 be generated for the genomes in the set.  
164    
165  =over 4  =over 4
166    
# Line 167  Line 169 
169  Reference to a hash of the genus names used in the set. The hash maps each name  Reference to a hash of the genus names used in the set. The hash maps each name
170  to the number of times it appeared.  to the number of times it appeared.
171    
172  =item setNames  =item setID
173    
174  Reference to a hash of set names already used. This hash will be updated to include  The ID to use for this set.
 the set name chosen.  
175    
176  =item genomes  =item genomes
177    
# Line 182  Line 183 
183    
184  sub OutputGenomeSet {  sub OutputGenomeSet {
185      # Get the parameters.      # Get the parameters.
186      my ($self, $names, $setNames, $genomes) = @_;      my ($self, $names, $setID, $genomes) = @_;
187      # Only proceed if there is at least one genome.      # Only proceed if there is at least one genome.
188      my $count = scalar @$genomes;      my $count = scalar @$genomes;
189      if ($count) {      if ($count) {
         # First we compute the name. Sort the genus names by occurrence count.  
         my ($setName) = sort { $names->{$a} <=> $names->{$b} } keys %$names;  
         # Apply a suffix to make it unique.  
         my $i = 1;  
         $i++ while $setNames->{"$setName/$i"};  
         $setName .= "/$i";  
         # Insure we don't reuse this set name.  
         $setNames->{$setName} = 1;  
190          # Create the set record.          # Create the set record.
191          $self->PutE(GenomeSet => $setName);          $self->PutE(GenomeSet => $setID);
192          # This will be TRUE for the first genome and FALSE thereafter, insuring that          # This will be TRUE for the first genome and FALSE thereafter, insuring that
193          # the first genome is used for the representative.          # the first genome is used for the representative.
194          my $repFlag = 1;          my $repFlag = 1;
195          # Connect all the genomes to it.          # Connect all the genomes to it.
196          for my $genome (@$genomes) {          for my $genome (@$genomes) {
197              $self->PutR(IsCollectionOf => $setName, $genome, representative => $repFlag);              $self->PutR(IsCollectionOf => $setID, $genome, representative => $repFlag);
198              $repFlag = 0;              $repFlag = 0;
199          }          }
200      }      }
# Line 258  Line 251 
251          # Get the data for this group.          # Get the data for this group.
252          my ($taxID, $parent, $type, undef, undef,          my ($taxID, $parent, $type, undef, undef,
253              undef,  undef,   undef, undef, undef, $hidden) = GetTaxData($ih);              undef,  undef,   undef, undef, undef, $hidden) = GetTaxData($ih);
254          # Determine whether or not this is a domain group.          # Determine whether or not this is a domain group. A domain group is
255          my $domain = ($type eq 'superkingdom');          # terminal when doing taxonomy searches. The NCBI indicates a top-level
256            # node by making it a child of the root node 1. We also include
257            # super-kingdoms (archaea, eukaryota, bacteria), which are below cellular
258            # organisms but are still terminal in our book.
259            my $domain = ($type eq 'superkingdom') || ($parent == 1);
260          # Get the node's name.          # Get the node's name.
261          my $name = $primaryNames{$taxID};          my $name = $primaryNames{$taxID};
262          # It's an error if there's no name.          # It's an error if there's no name.
# Line 311  Line 308 
308      my $sapling = $self->db();      my $sapling = $self->db();
309      # Get the source object.      # Get the source object.
310      my $fig = $sapling->GetSourceObject();      my $fig = $sapling->GetSourceObject();
311        # Get the DNA chunk size.
312        my $segmentLength = $sapling->TuningParameter('maxSequenceLength');
313        Trace("DNA chunk size is $segmentLength.") if T(ERDBLoadGroup => 3);
314      # We start with the genome record itself, asking the FIG object      # We start with the genome record itself, asking the FIG object
315      # for its various properties.      # for its various properties.
316      my $scientific_name = $fig->genus_species($genomeID);      my $scientific_name = $fig->genus_species($genomeID);
# Line 318  Line 318 
318      my $dna_size = $fig->genome_szdna($genomeID);      my $dna_size = $fig->genome_szdna($genomeID);
319      my $pegs = $fig->genome_pegs($genomeID);      my $pegs = $fig->genome_pegs($genomeID);
320      my $rnas = $fig->genome_rnas($genomeID);      my $rnas = $fig->genome_rnas($genomeID);
321        my $domain = $fig->genome_domain($genomeID);
322        my $prokaryotic = ($domain =~ /bacter|archae/i);
323      # We need to compute the number of contigs from the list of contig IDs.      # We need to compute the number of contigs from the list of contig IDs.
324      my @contigIDs = $fig->contigs_of($genomeID);      my @contigIDs = $fig->contigs_of($genomeID);
325      my $contigs = scalar(@contigIDs);      my $contigs = scalar(@contigIDs);
326        # Compute the genetic code. Normally, it's 11, but it may be overridden
327        # by a GENETIC_CODE file.
328        my $gcFile = "$FIG_Config::organisms/$genomeID/GENETIC_CODE";
329        my $genetic_code = 11;
330        if (-f $gcFile) {
331            $genetic_code = Tracer::GetFile($gcFile);
332            chomp $genetic_code;
333        }
334      # Write the genome record.      # Write the genome record.
335      $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,      $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,
336                  dna_size => $dna_size, scientific_name => $scientific_name,                  dna_size => $dna_size, scientific_name => $scientific_name,
337                  pegs => $pegs, rnas => $rnas);                  pegs => $pegs, rnas => $rnas, prokaryotic => $prokaryotic,
338                    domain => $domain, genetic_code => $genetic_code);
339      # Now we create the Contigs. Each one needs to be split into DNA sequences.      # Now we create the Contigs. Each one needs to be split into DNA sequences.
340      for my $contigID (@contigIDs) {      for my $contigID (@contigIDs) {
341          $self->Track(Contigs => $contigID, 100);          $self->Track(Contigs => $contigID, 100);
# Line 338  Line 349 
349          # Now we loop through the DNA chunks.          # Now we loop through the DNA chunks.
350          my $loc = 1;          my $loc = 1;
351          my $ordinal = 0;          my $ordinal = 0;
         my $segmentLength = $sapling->TuningParameter('maxSequenceLength');  
352          while ($loc < $length) {          while ($loc < $length) {
353              # Get this segment's true length.              # Get this segment's true length.
354              my $trueLength = Tracer::Min($length - $loc, $segmentLength);              my $trueLength = Tracer::Min($length - $loc, $segmentLength);

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.8

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3