[Bio] / Sprout / GenomeSaplingLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/GenomeSaplingLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Tue Jun 30 19:53:01 2009 UTC revision 1.7, Wed Nov 25 22:01:03 2009 UTC
# Line 125  Line 125 
125          # Get the next record.          # Get the next record.
126          $self->Add("set-records" => 1);          $self->Add("set-records" => 1);
127          my ($newSetID, $genomeID, $name) = Tracer::GetLine($ih);          my ($newSetID, $genomeID, $name) = Tracer::GetLine($ih);
128            # Only accept the genome if it's one of ours.
129            if ($genomeHash->{$genomeID}) {
130          # Is this a new set?          # Is this a new set?
131          if ($newSetID != $setID) {          if ($newSetID != $setID) {
132              # Yes. Output the old set.              # Yes. Output the old set.
133              $self->OutputGenomeSet(\%names, \%setNames, \@genomes);                  $self->OutputGenomeSet(\%names, $setID, \@genomes);
134              # Clear the set data.              # Clear the set data.
135              %names = ();              %names = ();
136              @genomes = ();              @genomes = ();
# Line 146  Line 148 
148              $names{$genus}++;              $names{$genus}++;
149          }          }
150      }      }
151        }
152      # Close the input file.      # Close the input file.
153      close $ih;      close $ih;
154      # Output the last set.      # Output the last set.
155      $self->OutputGenomeSet(\%names, \%setNames, \@genomes);      $self->OutputGenomeSet(\%names, $setID, \@genomes);
156  }  }
157    
158  =head3 OutputGenomeSet  =head3 OutputGenomeSet
159    
160      $sl->OutputGenomeSet(\%names, \%setNames, \@genomes);      $sl->OutputGenomeSet(\%names, $setID, \@genomes);
161    
162  Output the data for a genome set. A name will be computed from the genus  Output the data for a genome set. The appropriate GenomeSet and IsCollectionOf
163  information and the appropriate GenomeSet and IsCollectionOf records will  records will be generated for the genomes in the set.
 be generated for the genomes in the set.  
164    
165  =over 4  =over 4
166    
# Line 167  Line 169 
169  Reference to a hash of the genus names used in the set. The hash maps each name  Reference to a hash of the genus names used in the set. The hash maps each name
170  to the number of times it appeared.  to the number of times it appeared.
171    
172  =item setNames  =item setID
173    
174  Reference to a hash of set names already used. This hash will be updated to include  The ID to use for this set.
 the set name chosen.  
175    
176  =item genomes  =item genomes
177    
# Line 182  Line 183 
183    
184  sub OutputGenomeSet {  sub OutputGenomeSet {
185      # Get the parameters.      # Get the parameters.
186      my ($self, $names, $setNames, $genomes) = @_;      my ($self, $names, $setID, $genomes) = @_;
187      # Only proceed if there is at least one genome.      # Only proceed if there is at least one genome.
188      my $count = scalar @$genomes;      my $count = scalar @$genomes;
189      if ($count) {      if ($count) {
         # First we compute the name. Sort the genus names by occurrence count.  
         my ($setName) = sort { $names->{$a} <=> $names->{$b} } keys %$names;  
         # Apply a suffix to make it unique.  
         my $i = 1;  
         $i++ while $setNames->{"$setName/$i"};  
         $setName .= "/$i";  
         # Insure we don't reuse this set name.  
         $setNames->{$setName} = 1;  
190          # Create the set record.          # Create the set record.
191          $self->PutE(GenomeSet => $setName);          $self->PutE(GenomeSet => $setID);
192          # This will be TRUE for the first genome and FALSE thereafter, insuring that          # This will be TRUE for the first genome and FALSE thereafter, insuring that
193          # the first genome is used for the representative.          # the first genome is used for the representative.
194          my $repFlag = 1;          my $repFlag = 1;
195          # Connect all the genomes to it.          # Connect all the genomes to it.
196          for my $genome (@$genomes) {          for my $genome (@$genomes) {
197              $self->PutR(IsCollectionOf => $setName, $genome, representative => $repFlag);              $self->PutR(IsCollectionOf => $setID, $genome, representative => $repFlag);
198              $repFlag = 0;              $repFlag = 0;
199          }          }
200      }      }
# Line 258  Line 251 
251          # Get the data for this group.          # Get the data for this group.
252          my ($taxID, $parent, $type, undef, undef,          my ($taxID, $parent, $type, undef, undef,
253              undef,  undef,   undef, undef, undef, $hidden) = GetTaxData($ih);              undef,  undef,   undef, undef, undef, $hidden) = GetTaxData($ih);
254          # Determine whether or not this is a domain group.          # Determine whether or not this is a domain group. A domain group is
255          my $domain = ($type eq 'superkingdom');          # terminal when doing taxonomy searches. The NCBI indicates a top-level
256            # node by making it a child of the root node 1. We also include
257            # super-kingdoms (archaea, eukaryota, bacteria), which are below cellular
258            # organisms but are still terminal in our book.
259            my $domain = ($type eq 'superkingdom') || ($parent == 1);
260          # Get the node's name.          # Get the node's name.
261          my $name = $primaryNames{$taxID};          my $name = $primaryNames{$taxID};
262          # It's an error if there's no name.          # It's an error if there's no name.
# Line 318  Line 315 
315      my $dna_size = $fig->genome_szdna($genomeID);      my $dna_size = $fig->genome_szdna($genomeID);
316      my $pegs = $fig->genome_pegs($genomeID);      my $pegs = $fig->genome_pegs($genomeID);
317      my $rnas = $fig->genome_rnas($genomeID);      my $rnas = $fig->genome_rnas($genomeID);
318        my $domain = $fig->genome_domain($genomeID);
319        my $prokaryotic = ($domain =~ /bacter|archae/i);
320      # We need to compute the number of contigs from the list of contig IDs.      # We need to compute the number of contigs from the list of contig IDs.
321      my @contigIDs = $fig->contigs_of($genomeID);      my @contigIDs = $fig->contigs_of($genomeID);
322      my $contigs = scalar(@contigIDs);      my $contigs = scalar(@contigIDs);
323      # Write the genome record.      # Write the genome record.
324      $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,      $self->PutE(Genome => $genomeID, complete => $complete, contigs => $contigs,
325                  dna_size => $dna_size, scientific_name => $scientific_name,                  dna_size => $dna_size, scientific_name => $scientific_name,
326                  pegs => $pegs, rnas => $rnas);                  pegs => $pegs, rnas => $rnas, prokaryotic => $prokaryotic,
327                    domain => $domain);
328      # Now we create the Contigs. Each one needs to be split into DNA sequences.      # Now we create the Contigs. Each one needs to be split into DNA sequences.
329      for my $contigID (@contigIDs) {      for my $contigID (@contigIDs) {
330          $self->Track(Contigs => $contigID, 100);          $self->Track(Contigs => $contigID, 100);

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.7

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3