[Bio] / Sprout / GenomeSproutLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/GenomeSproutLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package GenomeSproutLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use ERDB;
25 :     use base 'BaseSproutLoader';
26 :    
27 :     =head1 Sprout Genome Load Group Class
28 :    
29 :     =head2 Introduction
30 :    
31 :     The Load Group includes all of the major genome-related tables.
32 :    
33 :     =head3 new
34 :    
35 :     my $sl = SproutLoader->new($erdb, $source, $options, @tables);
36 :    
37 :     Construct a new SproutLoader object.
38 :    
39 :     =over 4
40 :    
41 :     =item erdb
42 :    
43 :     [[SproutPm]] object for the database being loaded.
44 :    
45 :     =item options
46 :    
47 :     Reference to a hash of command-line options.
48 :    
49 :     =item tables
50 :    
51 :     List of tables in this load group.
52 :    
53 :     =back
54 :    
55 :     =cut
56 :    
57 :     sub new {
58 :     # Get the parameters.
59 : parrello 1.2 my ($class, $erdb, $options) = @_;
60 : parrello 1.1 # Create the table list.
61 : parrello 1.4 my @tables = sort qw(Genome HasContig Contig IsMadeUpOf Sequence Host IsPathogenicIn
62 :     IsRepresentativeOf);
63 : parrello 1.1 # Create the BaseSproutLoader object.
64 : parrello 1.2 my $retVal = BaseSproutLoader::new($class, $erdb, $options, @tables);
65 : parrello 1.1 # Return it.
66 :     return $retVal;
67 :     }
68 :    
69 :     =head2 Public Methods
70 :    
71 :     =head3 Generate
72 :    
73 :     $sl->Generate();
74 :    
75 :     Generate the data for the genome-related files.
76 :    
77 :     =cut
78 :    
79 :     sub Generate {
80 :     # Get the parameters.
81 :     my ($self) = @_;
82 :     # Get the section ID.
83 :     my $genomeID = $self->section();
84 :     # Get the sprout object.
85 :     my $sprout = $self->db();
86 :     # Get the FIG object.
87 :     my $fig = $self->source();
88 :     # Only proceed if we're not the global section.
89 :     if (! $self->global()) {
90 :     # Get the genus, species, and strain from the scientific name.
91 : parrello 1.2 my $scientificName = $fig->genus_species($genomeID);
92 :     my ($genus, $species, $extra) = split / /, $scientificName, 3;
93 : parrello 1.1 # Get the full taxonomy.
94 :     my $taxonomy = $fig->taxonomy_of($genomeID);
95 :     # Get the version. If no version is specified, we default to the genome ID by itself.
96 :     my $version = $fig->genome_version($genomeID);
97 :     if (! defined($version)) {
98 :     $version = $genomeID;
99 :     }
100 : parrello 1.4 # Get the group hash and compute our group name.
101 : parrello 1.1 my $group;
102 : parrello 1.4 my $superGroup = $sprout->SuperGroup("$genus $species");
103 :     if (! $superGroup) {
104 :     # Here we have a supporting genome.
105 :     $group = $FIG_Config::otherGroup;
106 : parrello 1.1 } else {
107 : parrello 1.4 # Now we have an NMPDR genome. Compute the group name. The group name
108 :     # is either "other XXXX" or "XXXX YYYY" where XXXX is the genus and
109 :     # YYYY is the species. The species is used if it's in the special-species
110 :     # list.
111 :     my %groupHash = $sprout->CheckGroupFile();
112 :     if (exists $groupHash{$superGroup}->{specials}->{$species}){
113 :     $group = "$genus $species";
114 :     } else {
115 :     $group = "other $genus";
116 :     }
117 : parrello 1.1 }
118 : parrello 1.2 # Get the version-numbered ID. If there is none, we just use the genome.
119 :     my $genomeVersion = $fig->genome_version($genomeID) || $genomeID;
120 : parrello 1.1 # Get the contigs.
121 :     my @contigs = $fig->all_contigs($genomeID);
122 :     Trace(scalar(@contigs) . " contigs found for $genomeID.") if T(ERDBLoadGroup => 3);
123 : parrello 1.2 # Now come some attribute-related values. We create a hash of the genome's
124 :     # attributes.
125 :     my %attributes = map { $_->[1] => $_->[2] } $fig->get_attributes($genomeID);
126 :     # The first attribute is the pathogenic host list. Note we have to get rid
127 :     # of the value "No", which we treat as not being connected to any host.
128 :     my @hosts = grep { $_ ne 'No' } split /\s*,\s*/, ($attributes{Pathogenic_In} || "");
129 :     for my $host (@hosts) {
130 :     $self->PutE(Host => $host);
131 :     $self->PutR(IsPathogenicIn => $genomeID, $host);
132 :     }
133 :     # Next is the gram stain, which must be converted to semi-boolean.
134 :     my $gram_stain = $attributes{Gram_Stain};
135 :     if ($gram_stain =~ /positive/i) {
136 :     $gram_stain = 'Y';
137 :     } elsif ($gram_stain =~ /negative/i) {
138 :     $gram_stain = 'N';
139 :     } else {
140 :     $gram_stain = '?'
141 :     }
142 :     # The temperature range needs to be split in two. The default is 0 to 100.
143 :     my ($tempRangeMin, $tempRangeMax);
144 :     my $tempRange = $attributes{Temperature_Range};
145 :     if (! defined $tempRange) {
146 :     ($tempRangeMin, $tempRangeMax) = (0,100);
147 :     } elsif ($tempRange =~ /^\d+$/) {
148 :     ($tempRangeMin, $tempRangeMax) = ($tempRange, $tempRange);
149 :     } elsif ($tempRange =~ /^(\d+)-(\d+)$/) {
150 :     ($tempRangeMin, $tempRangeMax) = ($1, $2);
151 :     } else {
152 :     ($tempRangeMin, $tempRangeMax) = (0, 100);
153 :     }
154 :     # These attributes are all simple.
155 :     my $endospore = ERDBTypeSemiBoolean::ComputeFromString($attributes{Endospores});
156 :     my $motility = ERDBTypeSemiBoolean::ComputeFromString($attributes{Motility});
157 :     my $oxygen = $attributes{Oxygen_Requirement} || "unknown";
158 :     my $optimalTempRange = $attributes{Temperature_Range} || "unknown";
159 :     my $pathogenic = ERDBTypeSemiBoolean::ComputeFromString($attributes{Pathogenic});
160 :     my $salinity = $attributes{Salinity} || "unknown";
161 :     my $habitat = $attributes{Habitat} || "unknown";
162 : parrello 1.4 # We need to find the representative genome. That's not simple, because the
163 :     # representative has to be one of ours, and the SEED rep may not be.
164 :     # To solve this problem, we read in the entire genome set file. For each
165 :     # set, we stash the first eligible genome in a hash, and when we find
166 :     # our genome, we'll get the saved representative.
167 :     my (%repHash, $repGenome);
168 :     my %eligibleGenomes = map { $_ => 1 } BaseSproutLoader::GetSectionList($sprout, $fig);
169 :     my $ih = Open(undef, "<$FIG_Config::data/Global/genome.sets");
170 :     while (! eof $ih && ! defined $repGenome) {
171 :     my ($set, $member) = Tracer::GetLine($ih);
172 :     # Only process our genomes.
173 :     if ($eligibleGenomes{$member}) {
174 :     # If this is the first eligible genome for the set, remember it.
175 :     if (! exists $repHash{$set}) {
176 :     $repHash{$set} = $member;
177 :     }
178 :     # If this is our genome, we're done.
179 :     if ($member eq $genomeID) {
180 :     $repGenome = $repHash{$set};
181 :     }
182 :     }
183 :     }
184 :     close $ih;
185 :     # If we found a representative, save it.
186 :     if (defined $repGenome) {
187 :     $self->PutR(IsRepresentativeOf => $repGenome, $genomeID);
188 :     }
189 : parrello 1.2 # Now we loop through each of the genome's contigs. While doing so, we'll
190 :     # track the GC content and DNA size.
191 :     my $gc_content = 0;
192 :     my $dnaSize = 0;
193 : parrello 1.1 for my $contigID (@contigs) {
194 :     Trace("Processing contig $contigID for $genomeID.") if T(4);
195 : parrello 1.2 $self->Track(Contigs => $contigID, 100);
196 : parrello 1.1 $self->Add(contigIn => 1);
197 :     # Create the contig ID.
198 :     my $sproutContigID = "$genomeID:$contigID";
199 : parrello 1.3 # Get the contig length.
200 :     my $contigLen = $fig->contig_ln($genomeID, $contigID);
201 : parrello 1.1 # Create the contig record and relate it to the genome.
202 : parrello 1.3 $self->PutE(Contig => $sproutContigID, length => $contigLen);
203 : parrello 1.1 $self->PutR(HasContig => $genomeID, $sproutContigID);
204 :     # Now we need to split the contig into sequences. The maximum sequence size is
205 :     # a property of the Sprout object.
206 :     my $chunkSize = $sprout->MaxSequence();
207 :     # Now we get the sequence a chunk at a time.
208 :     for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {
209 :     $self->Add(chunkIn => 1);
210 :     # Compute the endpoint of this chunk.
211 :     my $end = FIG::min($i + $chunkSize - 1, $contigLen);
212 :     # Get the actual DNA.
213 :     my $dna = $fig->get_dna($genomeID, $contigID, $i, $end);
214 : parrello 1.2 # Compute the stats.
215 :     my $chunkLen = length($dna);
216 :     my $chunkGC = length(join("", split /[^gc]+/, $dna));
217 :     $gc_content += $chunkGC;
218 :     $dnaSize += $chunkLen;
219 : parrello 1.1 # Compute the sequenceID.
220 :     my $seqID = "$sproutContigID.$i";
221 :     # Write out the data. For now, the quality vector is always "unknown".
222 :     $self->PutR(IsMadeUpOf => $sproutContigID, $seqID, len => ($end + 1 - $i),
223 :     'start-position' => $i);
224 :     $self->PutE(Sequence => $seqID, 'quality-vector' => "unknown", sequence => $dna);
225 : parrello 1.2 $self->Add('dna-letters' => $chunkLen);
226 : parrello 1.1 }
227 :     }
228 : parrello 1.2 # Finalize the GC content computation.
229 :     $gc_content = $gc_content * 100 / $dnaSize;
230 :     # Output the genome record.
231 :     $self->PutE(Genome => $genomeID, complete => $fig->is_complete($genomeID),
232 :     contigs => scalar(@contigs), dna_size => $dnaSize,
233 :     genus => $genus, pegs => $fig->genome_pegs($genomeID),
234 :     primary_group => $group, rnas => $fig->genome_rnas($genomeID),
235 :     species => $species, unique_characterization => $extra,
236 :     version => $genomeVersion, taxonomy => $taxonomy,
237 :     endospore => $endospore, gc_content => $gc_content,
238 :     gram_stain => $gram_stain, motility => $motility,
239 :     oxygen => $oxygen, optimal_temperature_range => $optimalTempRange,
240 :     pathogenic => $pathogenic, salinity => $salinity,
241 :     temperature_min => $tempRangeMin, temperature_max => $tempRangeMax,
242 :     habitat => $habitat, scientific_name => $scientificName);
243 : parrello 1.1 }
244 :     }
245 :    
246 :    
247 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3