10 |
use Sprout; |
use Sprout; |
11 |
use Stats; |
use Stats; |
12 |
use BasicLocation; |
use BasicLocation; |
13 |
|
use HTML; |
14 |
|
|
15 |
=head1 Sprout Load Methods |
=head1 Sprout Load Methods |
16 |
|
|
30 |
$stats->Accumulate($spl->LoadFeatureData()); |
$stats->Accumulate($spl->LoadFeatureData()); |
31 |
print $stats->Show(); |
print $stats->Show(); |
32 |
|
|
|
This module makes use of the internal Sprout property C<_erdb>. |
|
|
|
|
33 |
It is worth noting that the FIG object does not need to be a real one. Any object |
It is worth noting that the FIG object does not need to be a real one. Any object |
34 |
that implements the FIG methods for data retrieval could be used. So, for example, |
that implements the FIG methods for data retrieval could be used. So, for example, |
35 |
this object could be used to copy data from one Sprout database to another, or |
this object could be used to copy data from one Sprout database to another, or |
39 |
a variable called C<$fig>. This makes it fairly straightforward to determine which |
a variable called C<$fig>. This makes it fairly straightforward to determine which |
40 |
FIG methods are required to load the Sprout database. |
FIG methods are required to load the Sprout database. |
41 |
|
|
42 |
|
This object creates the load files; however, the tables are not created until it |
43 |
|
is time to actually do the load from the files into the target database. |
44 |
|
|
45 |
=cut |
=cut |
46 |
|
|
47 |
#: Constructor SproutLoad->new(); |
#: Constructor SproutLoad->new(); |
50 |
|
|
51 |
=head3 new |
=head3 new |
52 |
|
|
53 |
C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile); >> |
C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >> |
54 |
|
|
55 |
Construct a new Sprout Loader object, specifying the two participating databases and |
Construct a new Sprout Loader object, specifying the two participating databases and |
56 |
the names of the files containing the list of genomes and subsystems to use. |
the names of the files containing the list of genomes and subsystems to use. |
78 |
=item subsysFile |
=item subsysFile |
79 |
|
|
80 |
Either the name of the file containing the list of trusted subsystems or a reference |
Either the name of the file containing the list of trusted subsystems or a reference |
81 |
to a list of subsystem names. If nothing is specified, all known subsystems will be |
to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be |
82 |
considered trusted. Only subsystem data related to the trusted subsystems is loaded. |
considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR> |
83 |
|
in its data directory.) Only subsystem data related to the trusted subsystems is loaded. |
84 |
|
|
85 |
|
=item options |
86 |
|
|
87 |
|
Reference to a hash of command-line options. This module reads the C<loadOnly> and C<primaryOnly> keys. |
88 |
|
|
89 |
=back |
=back |
90 |
|
|
92 |
|
|
93 |
sub new { |
sub new { |
94 |
# Get the parameters. |
# Get the parameters. |
95 |
my ($class, $sprout, $fig, $genomeFile, $subsysFile) = @_; |
my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_; |
96 |
# Load the list of genomes into a hash. |
# Create the genome hash. |
97 |
my %genomes; |
my %genomes = (); |
98 |
|
# We only need it if load-only is NOT specified. |
99 |
|
if (! $options->{loadOnly}) { |
100 |
if (! defined($genomeFile) || $genomeFile eq '') { |
if (! defined($genomeFile) || $genomeFile eq '') { |
101 |
# Here we want all the complete genomes and an access code of 1. |
# Here we want all the complete genomes and an access code of 1. |
102 |
my @genomeList = $fig->genomes(1); |
my @genomeList = $fig->genomes(1); |
103 |
%genomes = map { $_ => 1 } @genomeList; |
%genomes = map { $_ => 1 } @genomeList; |
104 |
} elsif (ref $genomeFile eq 'HASH') { |
} else { |
105 |
|
my $type = ref $genomeFile; |
106 |
|
Trace("Genome file parameter type is \"$type\".") if T(3); |
107 |
|
if ($type eq 'HASH') { |
108 |
# Here the user specified a hash of genome IDs to access codes, which is |
# Here the user specified a hash of genome IDs to access codes, which is |
109 |
# exactly what we want. |
# exactly what we want. |
110 |
%genomes = %{$genomeFile}; |
%genomes = %{$genomeFile}; |
111 |
} elsif (ref $genomeFile eq 'SCALAR') { |
} elsif (! $type || $type eq 'SCALAR' ) { |
112 |
# The caller specified a file, so read the genomes from the file. |
# The caller specified a file, so read the genomes from the file. (Note |
113 |
|
# that ref() returns an empty string, not SCALAR, for a plain file name.) |
114 |
my @genomeList = Tracer::GetFile($genomeFile); |
my @genomeList = Tracer::GetFile($genomeFile); |
115 |
if (! @genomeList) { |
if (! @genomeList) { |
116 |
# It's an error if the genome file is empty or not found. |
# It's an error if the genome file is empty or not found. |
127 |
} |
} |
128 |
} |
} |
129 |
} else { |
} else { |
|
my $type = ref $genomeFile; |
|
130 |
Confess("Invalid genome parameter ($type) in SproutLoad constructor."); |
Confess("Invalid genome parameter ($type) in SproutLoad constructor."); |
131 |
} |
} |
132 |
|
} |
133 |
|
} |
134 |
# Load the list of trusted subsystems. |
# Load the list of trusted subsystems. |
135 |
my %subsystems = (); |
my %subsystems = (); |
136 |
|
# We only need it if load-only is NOT specified. |
137 |
|
if (! $options->{loadOnly}) { |
138 |
if (! defined $subsysFile || $subsysFile eq '') { |
if (! defined $subsysFile || $subsysFile eq '') { |
139 |
# Here we want all the subsystems. |
# Here we want all the NMPDR subsystems. First we get the whole list. |
140 |
%subsystems = map { $_ => 1 } $fig->all_subsystems(); |
my @subs = $fig->all_subsystems(); |
141 |
} elsif (ref $subsysFile eq 'ARRAY') { |
# Loop through, checking for the NMPDR file. |
142 |
|
for my $sub (@subs) { |
143 |
|
if (-e "$FIG_Config::data/Subsystems/$sub/NMPDR") { |
144 |
|
$subsystems{$sub} = 1; |
145 |
|
} |
146 |
|
} |
147 |
|
} else { |
148 |
|
my $type = ref $subsysFile; |
149 |
|
if ($type eq 'ARRAY') { |
150 |
# Here the user passed in a list of subsystems. |
# Here the user passed in a list of subsystems. |
151 |
%subsystems = map { $_ => 1 } @{$subsysFile}; |
%subsystems = map { $_ => 1 } @{$subsysFile}; |
152 |
} elsif (ref $subsysFile eq 'SCALAR') { |
} elsif (! $type || $type eq 'SCALAR') { |
153 |
# Here the list of subsystems is in a file. |
# Here the list of subsystems is in a file. |
154 |
if (! -e $subsysFile) { |
if (! -e $subsysFile) { |
155 |
# It's an error if the file does not exist. |
# It's an error if the file does not exist. |
162 |
} else { |
} else { |
163 |
Confess("Invalid subsystem parameter in SproutLoad constructor."); |
Confess("Invalid subsystem parameter in SproutLoad constructor."); |
164 |
} |
} |
165 |
|
} |
166 |
|
} |
167 |
# Get the data directory from the Sprout object. |
# Get the data directory from the Sprout object. |
168 |
my ($directory) = $sprout->LoadInfo(); |
my ($directory) = $sprout->LoadInfo(); |
169 |
# Create the Sprout load object. |
# Create the Sprout load object. |
173 |
subsystems => \%subsystems, |
subsystems => \%subsystems, |
174 |
sprout => $sprout, |
sprout => $sprout, |
175 |
loadDirectory => $directory, |
loadDirectory => $directory, |
176 |
erdb => $sprout->{_erdb}, |
erdb => $sprout, |
177 |
loaders => [] |
loaders => [], |
178 |
|
options => $options |
179 |
}; |
}; |
180 |
# Bless and return it. |
# Bless and return it. |
181 |
bless $retVal, $class; |
bless $retVal, $class; |
182 |
return $retVal; |
return $retVal; |
183 |
} |
} |
184 |
|
|
185 |
|
=head3 LoadOnly |
186 |
|
|
187 |
|
C<< my $flag = $spl->LoadOnly; >> |
188 |
|
|
189 |
|
Return TRUE if we are in load-only mode, else FALSE. |
190 |
|
|
191 |
|
=cut |
192 |
|
|
193 |
|
# Report whether this loader is running in load-only mode.
#
# Parameters:
#   $self   the loader object; its "options" entry is expected to be a
#           hash reference.
#
# Returns the value stored under the "loadOnly" key of the options hash,
# unchanged. A missing key yields undef, which callers treat as FALSE.
sub LoadOnly {
    my ($self) = @_;
    return $self->{options}->{loadOnly};
}
197 |
|
|
198 |
|
=head3 PrimaryOnly |
199 |
|
|
200 |
|
C<< my $flag = $spl->PrimaryOnly; >> |
201 |
|
|
202 |
|
Return TRUE if only the main entity is to be loaded, else FALSE. |
203 |
|
|
204 |
|
=cut |
205 |
|
|
206 |
|
# Report whether only the main entity of each load group is to be loaded.
#
# Parameters:
#   $self   the loader object; its "options" entry is expected to be a
#           hash reference.
#
# Returns the value stored under the "primaryOnly" key of the options hash,
# unchanged. A missing key yields undef, which callers treat as FALSE.
sub PrimaryOnly {
    my ($self) = @_;
    return $self->{options}->{primaryOnly};
}
210 |
|
|
211 |
=head3 LoadGenomeData |
=head3 LoadGenomeData |
212 |
|
|
213 |
C<< my $stats = $spl->LoadGenomeData(); >> |
C<< my $stats = $spl->LoadGenomeData(); >> |
235 |
|
|
236 |
=back |
=back |
237 |
|
|
|
B<TO DO> |
|
|
|
|
|
Real quality vectors instead of C<unknown> for everything. |
|
|
|
|
|
GenomeGroup relation. (The original script took group information from the C<NMPDR> file |
|
|
in each genome's main directory, but no such file exists anywhere in my version of the |
|
|
data store.) |
|
|
|
|
238 |
=cut |
=cut |
239 |
#: Return Type $%; |
#: Return Type $%; |
240 |
sub LoadGenomeData { |
sub LoadGenomeData { |
245 |
# Get the genome count. |
# Get the genome count. |
246 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
247 |
my $genomeCount = (keys %{$genomeHash}); |
my $genomeCount = (keys %{$genomeHash}); |
|
Trace("Beginning genome data load.") if T(2); |
|
248 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
249 |
my $loadGenome = $self->_TableLoader('Genome', $genomeCount); |
my $loadGenome = $self->_TableLoader('Genome'); |
250 |
my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300); |
my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly); |
251 |
my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300); |
my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly); |
252 |
my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000); |
my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly); |
253 |
my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000); |
my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly); |
254 |
|
if ($self->{options}->{loadOnly}) { |
255 |
|
Trace("Loading from existing files.") if T(2); |
256 |
|
} else { |
257 |
|
Trace("Generating genome data.") if T(2); |
258 |
# Now we loop through the genomes, generating the data for each one. |
# Now we loop through the genomes, generating the data for each one. |
259 |
for my $genomeID (sort keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
260 |
Trace("Loading data for genome $genomeID.") if T(3); |
Trace("Generating data for genome $genomeID.") if T(3); |
261 |
|
$loadGenome->Add("genomeIn"); |
262 |
# The access code comes in via the genome hash. |
# The access code comes in via the genome hash. |
263 |
my $accessCode = $genomeHash->{$genomeID}; |
my $accessCode = $genomeHash->{$genomeID}; |
264 |
# Get the genus, species, and strain from the scientific name. Note that we append |
# Get the genus, species, and strain from the scientific name. |
|
# the genome ID to the strain. In some cases this is the totality of the strain name. |
|
265 |
my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID); |
my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID); |
266 |
my $extra = join " ", @extraData, "[genomeID]"; |
my $extra = join " ", @extraData; |
267 |
# Get the full taxonomy. |
# Get the full taxonomy. |
268 |
my $taxonomy = $fig->taxonomy_of($genomeID); |
my $taxonomy = $fig->taxonomy_of($genomeID); |
269 |
# Output the genome record. |
# Output the genome record. |
273 |
my @contigs = $fig->all_contigs($genomeID); |
my @contigs = $fig->all_contigs($genomeID); |
274 |
for my $contigID (@contigs) { |
for my $contigID (@contigs) { |
275 |
Trace("Processing contig $contigID for $genomeID.") if T(4); |
Trace("Processing contig $contigID for $genomeID.") if T(4); |
276 |
|
$loadContig->Add("contigIn"); |
277 |
|
$loadSequence->Add("contigIn"); |
278 |
# Create the contig ID. |
# Create the contig ID. |
279 |
my $sproutContigID = "$genomeID:$contigID"; |
my $sproutContigID = "$genomeID:$contigID"; |
280 |
# Create the contig record and relate it to the genome. |
# Create the contig record and relate it to the genome. |
286 |
# Now we get the sequence a chunk at a time. |
# Now we get the sequence a chunk at a time. |
287 |
my $contigLen = $fig->contig_ln($genomeID, $contigID); |
my $contigLen = $fig->contig_ln($genomeID, $contigID); |
288 |
for (my $i = 1; $i <= $contigLen; $i += $chunkSize) { |
for (my $i = 1; $i <= $contigLen; $i += $chunkSize) { |
289 |
|
$loadSequence->Add("chunkIn"); |
290 |
# Compute the endpoint of this chunk. |
# Compute the endpoint of this chunk. |
291 |
my $end = FIG::min($i + $chunkSize - 1, $contigLen); |
my $end = FIG::min($i + $chunkSize - 1, $contigLen); |
292 |
# Get the actual DNA. |
# Get the actual DNA. |
299 |
} |
} |
300 |
} |
} |
301 |
} |
} |
302 |
|
} |
303 |
# Finish the loads. |
# Finish the loads. |
304 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
305 |
# Return the result. |
# Return the result. |
343 |
my $genomeCount = (keys %{$genomeFilter}); |
my $genomeCount = (keys %{$genomeFilter}); |
344 |
my $featureCount = $genomeCount * 4000; |
my $featureCount = $genomeCount * 4000; |
345 |
# Start the loads. |
# Start the loads. |
346 |
my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount); |
my $loadCoupling = $self->_TableLoader('Coupling'); |
347 |
my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000); |
my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly); |
348 |
my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000); |
my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly); |
349 |
my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000); |
my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly); |
350 |
my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000); |
my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly); |
351 |
Trace("Beginning coupling data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
352 |
|
Trace("Loading from existing files.") if T(2); |
353 |
|
} else { |
354 |
|
Trace("Generating coupling data.") if T(2); |
355 |
# Loop through the genomes found. |
# Loop through the genomes found. |
356 |
for my $genome (sort keys %{$genomeFilter}) { |
for my $genome (sort keys %{$genomeFilter}) { |
357 |
Trace("Generating coupling data for $genome.") if T(3); |
Trace("Generating coupling data for $genome.") if T(3); |
358 |
|
$loadCoupling->Add("genomeIn"); |
359 |
# Create a hash table for holding coupled pairs. We use this to prevent |
# Create a hash table for holding coupled pairs. We use this to prevent |
360 |
# duplicates. For example, if A is coupled to B, we don't want to also |
# duplicates. For example, if A is coupled to B, we don't want to also |
361 |
# assert that B is coupled to A, because we already know it. Fortunately, |
# assert that B is coupled to A, because we already know it. Fortunately, |
366 |
my @pegs = $fig->pegs_of($genome); |
my @pegs = $fig->pegs_of($genome); |
367 |
# Loop through the PEGs. |
# Loop through the PEGs. |
368 |
for my $peg1 (@pegs) { |
for my $peg1 (@pegs) { |
369 |
|
$loadCoupling->Add("pegIn"); |
370 |
Trace("Processing PEG $peg1 for $genome.") if T(4); |
Trace("Processing PEG $peg1 for $genome.") if T(4); |
371 |
# Get a list of the coupled PEGs. |
# Get a list of the coupled PEGs. |
372 |
my @couplings = $fig->coupled_to($peg1); |
my @couplings = $fig->coupled_to($peg1); |
377 |
# Compute the coupling ID. |
# Compute the coupling ID. |
378 |
my $coupleID = Sprout::CouplingID($peg1, $peg2); |
my $coupleID = Sprout::CouplingID($peg1, $peg2); |
379 |
if (! exists $dupHash{$coupleID}) { |
if (! exists $dupHash{$coupleID}) { |
380 |
|
$loadCoupling->Add("couplingIn"); |
381 |
# Here we have a new coupling to store in the load files. |
# Here we have a new coupling to store in the load files. |
382 |
Trace("Storing coupling ($coupleID) with score $score.") if T(4); |
Trace("Storing coupling ($coupleID) with score $score.") if T(4); |
383 |
# Ensure we don't do this again. |
# Ensure we don't do this again. |
393 |
my %evidenceMap = (); |
my %evidenceMap = (); |
394 |
# Process each evidence item. |
# Process each evidence item. |
395 |
for my $evidenceData (@evidence) { |
for my $evidenceData (@evidence) { |
396 |
|
$loadPCH->Add("evidenceIn"); |
397 |
my ($peg3, $peg4, $usage) = @{$evidenceData}; |
my ($peg3, $peg4, $usage) = @{$evidenceData}; |
398 |
# Only proceed if the evidence is from a Sprout |
# Only proceed if the evidence is from a Sprout |
399 |
# genome. |
# genome. |
400 |
if ($genomeFilter->{$fig->genome_of($peg3)}) { |
if ($genomeFilter->{$fig->genome_of($peg3)}) { |
401 |
|
$loadUsesAsEvidence->Add("evidenceChosen"); |
402 |
my $evidenceKey = "$coupleID $peg3 $peg4"; |
my $evidenceKey = "$coupleID $peg3 $peg4"; |
403 |
# We store this evidence in the hash if the usage |
# We store this evidence in the hash if the usage |
404 |
# is nonzero or no prior evidence has been found. This |
# is nonzero or no prior evidence has been found. This |
405 |
# ensures that if there is duplicate evidence, we |
# ensures that if there is duplicate evidence, we |
406 |
# at least keep the meaningful ones. Only evidence is |
# at least keep the meaningful ones. Only evidence in |
407 |
# the hash makes it to the output. |
# the hash makes it to the output. |
408 |
if ($usage || ! exists $evidenceMap{$evidenceKey}) { |
if ($usage || ! exists $evidenceMap{$evidenceKey}) { |
409 |
$evidenceMap{$evidenceKey} = $evidenceData; |
$evidenceMap{$evidenceKey} = $evidenceData; |
418 |
$loadIsEvidencedBy->Put($coupleID, $evidenceID); |
$loadIsEvidencedBy->Put($coupleID, $evidenceID); |
419 |
# Connect it to the features. |
# Connect it to the features. |
420 |
$loadUsesAsEvidence->Put($evidenceID, $peg3, 1); |
$loadUsesAsEvidence->Put($evidenceID, $peg3, 1); |
421 |
$loadUsesAsEvidence->Put($evidenceID, $peg4, 1); |
$loadUsesAsEvidence->Put($evidenceID, $peg4, 2); |
422 |
|
} |
423 |
} |
} |
424 |
} |
} |
425 |
} |
} |
446 |
FeatureTranslation |
FeatureTranslation |
447 |
FeatureUpstream |
FeatureUpstream |
448 |
IsLocatedIn |
IsLocatedIn |
449 |
|
HasFeature |
450 |
|
|
451 |
=over 4 |
=over 4 |
452 |
|
|
465 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
466 |
# Get the table of genome IDs. |
# Get the table of genome IDs. |
467 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
|
my $featureCount = $genomeCount * 4000; |
|
468 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
469 |
my $loadFeature = $self->_TableLoader('Feature', $featureCount); |
my $loadFeature = $self->_TableLoader('Feature'); |
470 |
my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6); |
my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly); |
471 |
my $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10); |
my $loadFeatureAlias = $self->_TableLoader('FeatureAlias'); |
472 |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount); |
my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
473 |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount); |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); |
474 |
my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount); |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); |
475 |
|
my $loadHasFeature = $self->_TableLoader('HasFeature'); |
476 |
# Get the maximum sequence size. We need this later for splitting up the |
# Get the maximum sequence size. We need this later for splitting up the |
477 |
# locations. |
# locations. |
478 |
my $chunkSize = $self->{sprout}->MaxSegment(); |
my $chunkSize = $self->{sprout}->MaxSegment(); |
479 |
Trace("Beginning feature data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
480 |
|
Trace("Loading from existing files.") if T(2); |
481 |
|
} else { |
482 |
|
Trace("Generating feature data.") if T(2); |
483 |
# Now we loop through the genomes, generating the data for each one. |
# Now we loop through the genomes, generating the data for each one. |
484 |
for my $genomeID (sort keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
485 |
Trace("Loading features for genome $genomeID.") if T(3); |
Trace("Loading features for genome $genomeID.") if T(3); |
486 |
|
$loadFeature->Add("genomeIn"); |
487 |
# Get the feature list for this genome. |
# Get the feature list for this genome. |
488 |
my $features = $fig->all_features_detailed($genomeID); |
my $features = $fig->all_features_detailed($genomeID); |
489 |
# Loop through the features. |
# Loop through the features. |
490 |
for my $featureData (@{$features}) { |
for my $featureData (@{$features}) { |
491 |
|
$loadFeature->Add("featureIn"); |
492 |
# Split the tuple. |
# Split the tuple. |
493 |
my ($featureID, $locations, $aliases, $type) = @{$featureData}; |
my ($featureID, $locations, undef, $type) = @{$featureData}; |
494 |
# Create the feature record. |
# Create the feature record. |
495 |
$loadFeature->Put($featureID, 1, $type); |
$loadFeature->Put($featureID, 1, $type); |
496 |
|
# Link it to the parent genome. |
497 |
|
$loadHasFeature->Put($genomeID, $featureID, $type); |
498 |
# Create the aliases. |
# Create the aliases. |
499 |
for my $alias (split /\s*,\s*/, $aliases) { |
for my $alias ($fig->feature_aliases($featureID)) { |
500 |
$loadFeatureAlias->Put($featureID, $alias); |
$loadFeatureAlias->Put($featureID, $alias); |
501 |
} |
} |
502 |
# Get the links. |
# Get the links. |
506 |
} |
} |
507 |
# If this is a peg, generate the translation and the upstream. |
# If this is a peg, generate the translation and the upstream. |
508 |
if ($type eq 'peg') { |
if ($type eq 'peg') { |
509 |
|
$loadFeatureTranslation->Add("pegIn"); |
510 |
my $translation = $fig->get_translation($featureID); |
my $translation = $fig->get_translation($featureID); |
511 |
if ($translation) { |
if ($translation) { |
512 |
$loadFeatureTranslation->Put($featureID, $translation); |
$loadFeatureTranslation->Put($featureID, $translation); |
522 |
# the maximum segment size. This simplifies the genes_in_region processing |
# the maximum segment size. This simplifies the genes_in_region processing |
523 |
# for Sprout. |
# for Sprout. |
524 |
my @locationList = split /\s*,\s*/, $locations; |
my @locationList = split /\s*,\s*/, $locations; |
525 |
|
# Create the location position indicator. |
526 |
|
my $i = 1; |
527 |
# Loop through the locations. |
# Loop through the locations. |
528 |
for my $location (@locationList) { |
for my $location (@locationList) { |
529 |
# Parse the location. |
# Parse the location. |
530 |
my $locObject = BasicLocation->new($location); |
my $locObject = BasicLocation->new("$genomeID:$location"); |
531 |
# Split it into a list of chunks. |
# Split it into a list of chunks. |
532 |
my @locOList = (); |
my @locOList = (); |
533 |
while (my $peeling = $locObject->Peel($chunkSize)) { |
while (my $peeling = $locObject->Peel($chunkSize)) { |
534 |
|
$loadIsLocatedIn->Add("peeling"); |
535 |
push @locOList, $peeling; |
push @locOList, $peeling; |
536 |
} |
} |
537 |
push @locOList, $locObject; |
push @locOList, $locObject; |
538 |
# Loop through the chunks, creating IsLocatedIn records. The variable |
# Loop through the chunks, creating IsLocatedIn records. The variable |
539 |
# "$i" will be used to keep the location index. |
# "$i" will be used to keep the location index. |
|
my $i = 1; |
|
540 |
for my $locChunk (@locOList) { |
for my $locChunk (@locOList) { |
541 |
$loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left, |
$loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left, |
542 |
$locChunk->Dir, $locChunk->Length, $i); |
$locChunk->Dir, $locChunk->Length, $i); |
545 |
} |
} |
546 |
} |
} |
547 |
} |
} |
548 |
|
} |
549 |
# Finish the loads. |
# Finish the loads. |
550 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
551 |
return $retVal; |
return $retVal; |
582 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
583 |
# Get the table of genome IDs. |
# Get the table of genome IDs. |
584 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
|
my $featureCount = $genomeCount * 4000; |
|
585 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
586 |
my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf', |
my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf'); |
587 |
$featureCount * $genomeCount); |
if ($self->{options}->{loadOnly}) { |
588 |
Trace("Beginning BBH load.") if T(2); |
Trace("Loading from existing files.") if T(2); |
589 |
|
} else { |
590 |
|
Trace("Generating BBH data.") if T(2); |
591 |
# Now we loop through the genomes, generating the data for each one. |
# Now we loop through the genomes, generating the data for each one. |
592 |
for my $genomeID (sort keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
593 |
|
$loadIsBidirectionalBestHitOf->Add("genomeIn"); |
594 |
Trace("Processing features for genome $genomeID.") if T(3); |
Trace("Processing features for genome $genomeID.") if T(3); |
595 |
# Get the feature list for this genome. |
# Get the feature list for this genome. |
596 |
my $features = $fig->all_features_detailed($genomeID); |
my $features = $fig->all_features_detailed($genomeID); |
613 |
} |
} |
614 |
} |
} |
615 |
} |
} |
616 |
|
} |
617 |
# Finish the loads. |
# Finish the loads. |
618 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
619 |
return $retVal; |
return $retVal; |
635 |
|
|
636 |
Subsystem |
Subsystem |
637 |
Role |
Role |
638 |
|
RoleEC |
639 |
SSCell |
SSCell |
640 |
ContainsFeature |
ContainsFeature |
641 |
IsGenomeOf |
IsGenomeOf |
643 |
OccursInSubsystem |
OccursInSubsystem |
644 |
ParticipatesIn |
ParticipatesIn |
645 |
HasSSCell |
HasSSCell |
646 |
|
ConsistsOfRoles |
647 |
|
RoleSubset |
648 |
|
HasRoleSubset |
649 |
|
ConsistsOfGenomes |
650 |
|
GenomeSubset |
651 |
|
HasGenomeSubset |
652 |
|
Catalyzes |
653 |
|
Diagram |
654 |
|
RoleOccursIn |
655 |
|
|
656 |
=over 4 |
=over 4 |
657 |
|
|
661 |
|
|
662 |
=back |
=back |
663 |
|
|
|
B<TO DO> |
|
|
|
|
|
Generate RoleName table? |
|
|
|
|
664 |
=cut |
=cut |
665 |
#: Return Type $%; |
#: Return Type $%; |
666 |
sub LoadSubsystemData { |
sub LoadSubsystemData { |
674 |
# Get the subsystem hash. This lists the subsystems we'll process. |
# Get the subsystem hash. This lists the subsystems we'll process. |
675 |
my $subsysHash = $self->{subsystems}; |
my $subsysHash = $self->{subsystems}; |
676 |
my @subsysIDs = sort keys %{$subsysHash}; |
my @subsysIDs = sort keys %{$subsysHash}; |
677 |
my $subsysCount = @subsysIDs; |
# Get the map list. |
678 |
my $genomeCount = (keys %{$genomeHash}); |
my @maps = $fig->all_maps; |
|
my $featureCount = $genomeCount * 4000; |
|
679 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
680 |
my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount); |
my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly); |
681 |
my $loadRole = $self->_TableLoader('Role', $featureCount * 6); |
my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly); |
682 |
my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount); |
my $loadSubsystem = $self->_TableLoader('Subsystem'); |
683 |
my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount); |
my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly); |
684 |
my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount); |
my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly); |
685 |
my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount); |
my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly); |
686 |
my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6); |
my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly); |
687 |
my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount); |
my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly); |
688 |
my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount); |
my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly); |
689 |
Trace("Beginning subsystem data load.") if T(2); |
my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly); |
690 |
|
my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly); |
691 |
|
my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly); |
692 |
|
my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly); |
693 |
|
my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly); |
694 |
|
my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly); |
695 |
|
my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly); |
696 |
|
my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly); |
697 |
|
my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly); |
698 |
|
my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly); |
699 |
|
if ($self->{options}->{loadOnly}) { |
700 |
|
Trace("Loading from existing files.") if T(2); |
701 |
|
} else { |
702 |
|
Trace("Generating subsystem data.") if T(2); |
703 |
|
# This hash will contain the role for each EC. When we're done, this |
704 |
|
# information will be used to generate the Catalyzes table. |
705 |
|
my %ecToRoles = (); |
706 |
# Loop through the subsystems. Our first task will be to create the |
# Loop through the subsystems. Our first task will be to create the |
707 |
# roles. We do this by looping through the subsystems and creating a |
# roles. We do this by looping through the subsystems and creating a |
708 |
# role hash. The hash tracks each role ID so that we don't create |
# role hash. The hash tracks each role ID so that we don't create |
709 |
# duplicates. As we move along, we'll connect the roles and subsystems. |
# duplicates. As we move along, we'll connect the roles and subsystems |
710 |
|
# and memorize the reactions. |
711 |
|
my ($genomeID, $roleID); |
712 |
my %roleData = (); |
my %roleData = (); |
713 |
for my $subsysID (@subsysIDs) { |
for my $subsysID (@subsysIDs) { |
714 |
|
# Get the subsystem object. |
715 |
|
my $sub = $fig->get_subsystem($subsysID); |
716 |
|
# Only proceed if the subsystem has a spreadsheet. |
717 |
|
if (! $sub->{empty_ss}) { |
718 |
Trace("Creating subsystem $subsysID.") if T(3); |
Trace("Creating subsystem $subsysID.") if T(3); |
719 |
|
$loadSubsystem->Add("subsystemIn"); |
720 |
# Create the subsystem record. |
# Create the subsystem record. |
721 |
$loadSubsystem->Put($subsysID); |
my $curator = $sub->get_curator(); |
722 |
# Get the subsystem's roles. |
my $notes = $sub->get_notes(); |
723 |
my @roles = $fig->subsys_to_roles($subsysID); |
$loadSubsystem->Put($subsysID, $curator, $notes); |
724 |
# Connect the roles to the subsystem. If a role is new, we create |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
725 |
# a role record for it. |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
726 |
for my $roleID (@roles) { |
# Connect to this role. |
727 |
$loadOccursInSubsystem->Put($roleID, $subsysID); |
$loadOccursInSubsystem->Add("roleIn"); |
728 |
|
$loadOccursInSubsystem->Put($roleID, $subsysID, $col); |
729 |
|
# If it's a new role, add it to the role table. |
730 |
if (! exists $roleData{$roleID}) { |
if (! exists $roleData{$roleID}) { |
731 |
$loadRole->Put($roleID); |
# Get the role's abbreviation. |
732 |
|
my $abbr = $sub->get_role_abbr($col); |
733 |
|
# Add the role. |
734 |
|
$loadRole->Put($roleID, $abbr); |
735 |
$roleData{$roleID} = 1; |
$roleData{$roleID} = 1; |
736 |
|
# Check for an EC number. |
737 |
|
if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) { |
738 |
|
my $ec = $1; |
739 |
|
$loadRoleEC->Put($roleID, $ec); |
740 |
|
$ecToRoles{$ec} = $roleID; |
741 |
} |
} |
742 |
} |
} |
743 |
# Now all roles for this subsystem have been filled in. We create the |
} |
744 |
# spreadsheet by matches roles to genomes. To do this, we need to |
# Now we create the spreadsheet for the subsystem by matching roles to |
745 |
# get the genomes on the sheet. |
# genomes. Each genome is a row and each role is a column. We may need |
746 |
|
# to actually create the roles as we find them. |
747 |
Trace("Creating subsystem $subsysID spreadsheet.") if T(3); |
Trace("Creating subsystem $subsysID spreadsheet.") if T(3); |
748 |
my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)}; |
for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) { |
749 |
for my $genomeID (@genomes) { |
# Only proceed if this is one of our genomes. |
|
# Only process this genome if it's one of ours. |
|
750 |
if (exists $genomeHash->{$genomeID}) { |
if (exists $genomeHash->{$genomeID}) { |
751 |
# Connect the genome to the subsystem. |
# Count the PEGs and cells found for verification purposes. |
752 |
$loadParticipatesIn->Put($genomeID, $subsysID); |
my $pegCount = 0; |
753 |
|
my $cellCount = 0; |
754 |
|
# Create a list for the PEGs we find. This list will be used |
755 |
|
# to generate cluster numbers. |
756 |
|
my @pegsFound = (); |
757 |
|
# Create a hash that maps spreadsheet IDs to PEGs. We will |
758 |
|
# use this to generate the ContainsFeature data after we have |
759 |
|
# the cluster numbers. |
760 |
|
my %cellPegs = (); |
761 |
|
# Get the genome's variant code for this subsystem. |
762 |
|
my $variantCode = $sub->get_variant_code($row); |
763 |
# Loop through the subsystem's roles. We use an index because it is |
# Loop through the subsystem's roles. We use an index because it is |
764 |
# part of the spreadsheet cell ID. |
# part of the spreadsheet cell ID. |
765 |
for (my $i = 0; $i <= $#roles; $i++) { |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
|
my $role = $roles[$i]; |
|
766 |
# Get the features in the spreadsheet cell for this genome and role. |
# Get the features in the spreadsheet cell for this genome and role. |
767 |
my @pegs = $fig->pegs_in_subsystem_coll($subsysID, $genomeID, $i); |
my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col); |
768 |
# Only proceed if features exist. |
# Only proceed if features exist. |
769 |
if (@pegs > 0) { |
if (@pegs > 0) { |
770 |
# Create the spreadsheet cell. |
# Create the spreadsheet cell. |
771 |
my $cellID = "$subsysID:$genomeID:$i"; |
$cellCount++; |
772 |
|
my $cellID = "$subsysID:$genomeID:$col"; |
773 |
$loadSSCell->Put($cellID); |
$loadSSCell->Put($cellID); |
774 |
$loadIsGenomeOf->Put($genomeID, $cellID); |
$loadIsGenomeOf->Put($genomeID, $cellID); |
775 |
$loadIsRoleOf->Put($role, $cellID); |
$loadIsRoleOf->Put($roleID, $cellID); |
776 |
$loadHasSSCell->Put($subsysID, $cellID); |
$loadHasSSCell->Put($subsysID, $cellID); |
777 |
# Attach the features to it. |
# Remember its features. |
778 |
for my $pegID (@pegs) { |
push @pegsFound, @pegs; |
779 |
$loadContainsFeature->Put($cellID, $pegID); |
$cellPegs{$cellID} = \@pegs; |
780 |
|
$pegCount += @pegs; |
781 |
|
} |
782 |
|
} |
783 |
|
# If we found some cells for this genome, we need to compute clusters and |
784 |
|
# denote it participates in the subsystem. |
785 |
|
if ($pegCount > 0) { |
786 |
|
Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3); |
787 |
|
$loadParticipatesIn->Put($genomeID, $subsysID, $variantCode); |
788 |
|
# Partition the PEGs found into clusters. |
789 |
|
my @clusters = $fig->compute_clusters(\@pegsFound, $sub); |
790 |
|
# Create a hash mapping PEG IDs to cluster numbers. |
791 |
|
# We default to -1 for all of them. |
792 |
|
my %clusterOf = map { $_ => -1 } @pegsFound; |
793 |
|
for (my $i = 0; $i <= $#clusters; $i++) { |
794 |
|
my $subList = $clusters[$i]; |
795 |
|
for my $peg (@{$subList}) { |
796 |
|
$clusterOf{$peg} = $i; |
797 |
|
} |
798 |
|
} |
799 |
|
# Create the ContainsFeature data. |
800 |
|
for my $cellID (keys %cellPegs) { |
801 |
|
my $cellList = $cellPegs{$cellID}; |
802 |
|
for my $cellPeg (@$cellList) { |
803 |
|
$loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg}); |
804 |
} |
} |
805 |
} |
} |
806 |
} |
} |
807 |
} |
} |
808 |
} |
} |
809 |
|
# Now we need to generate the subsets. The subset names must be concatenated to |
810 |
|
# the subsystem name to make them unique keys. There are two types of subsets: |
811 |
|
# genome subsets and role subsets. We do the role subsets first. |
812 |
|
my @subsetNames = $sub->get_subset_names(); |
813 |
|
for my $subsetID (@subsetNames) { |
814 |
|
# Create the subset record. |
815 |
|
my $actualID = "$subsysID:$subsetID"; |
816 |
|
$loadRoleSubset->Put($actualID); |
817 |
|
# Connect the subset to the subsystem. |
818 |
|
$loadHasRoleSubset->Put($subsysID, $actualID); |
819 |
|
# Connect the subset to its roles. |
820 |
|
my @roles = $sub->get_subsetC_roles($subsetID); |
821 |
|
for my $roleID (@roles) { |
822 |
|
$loadConsistsOfRoles->Put($actualID, $roleID); |
823 |
} |
} |
|
# Finish the load. |
|
|
my $retVal = $self->_FinishAll(); |
|
|
return $retVal; |
|
824 |
} |
} |
825 |
|
# Next the genome subsets. |
826 |
=head3 LoadDiagramData |
@subsetNames = $sub->get_subset_namesR(); |
827 |
|
for my $subsetID (@subsetNames) { |
828 |
C<< my $stats = $spl->LoadDiagramData(); >> |
# Create the subset record. |
829 |
|
my $actualID = "$subsysID:$subsetID"; |
830 |
Load the diagram data from FIG into Sprout. |
$loadGenomeSubset->Put($actualID); |
831 |
|
# Connect the subset to the subsystem. |
832 |
Diagrams are used to organize functional roles. The diagram shows the |
$loadHasGenomeSubset->Put($subsysID, $actualID); |
833 |
connections between chemicals that interact with a subsystem. |
# Connect the subset to its genomes. |
834 |
|
my @genomes = $sub->get_subsetR($subsetID); |
835 |
The following relations are loaded by this method. |
for my $genomeID (@genomes) { |
836 |
|
$loadConsistsOfGenomes->Put($actualID, $genomeID); |
837 |
Diagram |
} |
838 |
RoleOccursIn |
} |
839 |
|
} |
840 |
=over 4 |
# Now we loop through the diagrams. We need to create the diagram records |
841 |
|
# and link each diagram to its roles. Note that only roles which occur |
842 |
=item RETURNS |
# in subsystems (and therefore appear in the %ecToRoles hash) are |
843 |
|
# included. |
844 |
Returns a statistics object for the loads. |
for my $map (@maps) { |
|
|
|
|
=back |
|
|
|
|
|
=cut |
|
|
#: Return Type $%; |
|
|
sub LoadDiagramData { |
|
|
# Get this object instance. |
|
|
my ($self) = @_; |
|
|
# Get the FIG object. |
|
|
my $fig = $self->{fig}; |
|
|
# Get the map list. |
|
|
my @maps = $fig->all_maps; |
|
|
my $mapCount = @maps; |
|
|
my $genomeCount = (keys %{$self->{genomes}}); |
|
|
my $featureCount = $genomeCount * 4000; |
|
|
# Create load objects for each of the tables we're loading. |
|
|
my $loadDiagram = $self->_TableLoader('Diagram', $mapCount); |
|
|
my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6); |
|
|
Trace("Beginning diagram data load.") if T(2); |
|
|
# Loop through the diagrams. |
|
|
for my $map ($fig->all_maps) { |
|
845 |
Trace("Loading diagram $map.") if T(3); |
Trace("Loading diagram $map.") if T(3); |
846 |
# Get the diagram's descriptive name. |
# Get the diagram's descriptive name. |
847 |
my $name = $fig->map_name($map); |
my $name = $fig->map_name($map); |
850 |
# A hash is used to prevent duplicates. |
# A hash is used to prevent duplicates. |
851 |
my %roleHash = (); |
my %roleHash = (); |
852 |
for my $role ($fig->map_to_ecs($map)) { |
for my $role ($fig->map_to_ecs($map)) { |
853 |
if (! $roleHash{$role}) { |
if (exists $ecToRoles{$role} && ! $roleHash{$role}) { |
854 |
$loadRoleOccursIn->Put($role, $map); |
$loadRoleOccursIn->Put($ecToRoles{$role}, $map); |
855 |
$roleHash{$role} = 1; |
$roleHash{$role} = 1; |
856 |
} |
} |
857 |
} |
} |
858 |
} |
} |
859 |
|
# Before we leave, we must create the Catalyzes table. We start with the reactions, |
860 |
|
# then use the "ecToRoles" table to convert EC numbers to role IDs. |
861 |
|
my @reactions = $fig->all_reactions(); |
862 |
|
for my $reactionID (@reactions) { |
863 |
|
# Get this reaction's list of roles. The results will be EC numbers. |
864 |
|
my @roles = $fig->catalyzed_by($reactionID); |
865 |
|
# Loop through the roles, creating catalyzation records. |
866 |
|
for my $thisRole (@roles) { |
867 |
|
if (exists $ecToRoles{$thisRole}) { |
868 |
|
$loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID); |
869 |
|
} |
870 |
|
} |
871 |
|
} |
872 |
|
} |
873 |
|
} |
874 |
# Finish the load. |
# Finish the load. |
875 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
876 |
return $retVal; |
return $retVal; |
912 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
913 |
# Get the genome hash. |
# Get the genome hash. |
914 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
915 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
916 |
my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500); |
my $loadProperty = $self->_TableLoader('Property'); |
917 |
my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500); |
my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly); |
918 |
Trace("Beginning property data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
919 |
|
Trace("Loading from existing files.") if T(2); |
920 |
|
} else { |
921 |
|
Trace("Generating property data.") if T(2); |
922 |
# Create a hash for storing property IDs. |
# Create a hash for storing property IDs. |
923 |
my %propertyKeys = (); |
my %propertyKeys = (); |
924 |
my $nextID = 1; |
my $nextID = 1; |
925 |
# Loop through the genomes. |
# Loop through the genomes. |
926 |
for my $genomeID (keys %{$genomeHash}) { |
for my $genomeID (keys %{$genomeHash}) { |
927 |
|
$loadProperty->Add("genomeIn"); |
928 |
|
Trace("Generating properties for $genomeID.") if T(3); |
929 |
# Get the genome's features. The feature ID is the first field in the |
# Get the genome's features. The feature ID is the first field in the |
930 |
# tuples returned by "all_features_detailed". We use "all_features_detailed" |
# tuples returned by "all_features_detailed". We use "all_features_detailed" |
931 |
# rather than "all_features" because we want all features regardless of type. |
# rather than "all_features" because we want all features regardless of type. |
932 |
my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)}; |
my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)}; |
933 |
|
my $featureCount = 0; |
934 |
|
my $propertyCount = 0; |
935 |
# Loop through the features, creating HasProperty records. |
# Loop through the features, creating HasProperty records. |
936 |
for my $fid (@features) { |
for my $fid (@features) { |
937 |
# Get all attributes for this feature. We do this one feature at a time |
# Get all attributes for this feature. We do this one feature at a time |
938 |
# to insure we do not get any genome attributes. |
# to insure we do not get any genome attributes. |
939 |
my @attributeList = $fig->get_attributes($fid, '', '', ''); |
my @attributeList = $fig->get_attributes($fid, '', '', ''); |
940 |
|
if (scalar @attributeList) { |
941 |
|
$featureCount++; |
942 |
|
} |
943 |
# Loop through the attributes. |
# Loop through the attributes. |
944 |
for my $tuple (@attributeList) { |
for my $tuple (@attributeList) { |
945 |
|
$propertyCount++; |
946 |
# Get this attribute value's data. Note that we throw away the FID, |
# Get this attribute value's data. Note that we throw away the FID, |
947 |
# since it will always be the same as the value if "$fid". |
# since it will always be the same as the value if "$fid". |
948 |
my (undef, $key, $value, $url) = @{$tuple}; |
my (undef, $key, $value, $url) = @{$tuple}; |
964 |
$loadHasProperty->Put($fid, $propertyID, $url); |
$loadHasProperty->Put($fid, $propertyID, $url); |
965 |
} |
} |
966 |
} |
} |
967 |
|
# Update the statistics. |
968 |
|
Trace("$propertyCount attributes processed for $featureCount features.") if T(3); |
969 |
|
$loadHasProperty->Add("featuresIn", $featureCount); |
970 |
|
$loadHasProperty->Add("propertiesIn", $propertyCount); |
971 |
|
} |
972 |
} |
} |
973 |
# Finish the load. |
# Finish the load. |
974 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
1009 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
1010 |
# Get the genome hash. |
# Get the genome hash. |
1011 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
1012 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1013 |
my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000); |
my $loadAnnotation = $self->_TableLoader('Annotation'); |
1014 |
my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000); |
my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly); |
1015 |
my $loadSproutUser = $self->_TableLoader('SproutUser', 100); |
my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly); |
1016 |
my $loadUserAccess = $self->_TableLoader('UserAccess', 1000); |
my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly); |
1017 |
my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000); |
my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly); |
1018 |
Trace("Beginning annotation data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
1019 |
|
Trace("Loading from existing files.") if T(2); |
1020 |
|
} else { |
1021 |
|
Trace("Generating annotation data.") if T(2); |
1022 |
# Create a hash of user names. We'll use this to prevent us from generating duplicate |
# Create a hash of user names. We'll use this to prevent us from generating duplicate |
1023 |
# user records. |
# user records. |
1024 |
my %users = ( FIG => 1, master => 1 ); |
my %users = ( FIG => 1, master => 1 ); |
1030 |
# Get the current time. |
# Get the current time. |
1031 |
my $time = time(); |
my $time = time(); |
1032 |
# Loop through the genomes. |
# Loop through the genomes. |
1033 |
for my $genomeID (%{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
1034 |
Trace("Processing $genomeID.") if T(3); |
Trace("Processing $genomeID.") if T(3); |
|
# Get the genome's PEGs. |
|
|
my @pegs = $fig->pegs_of($genomeID); |
|
|
for my $peg (@pegs) { |
|
|
Trace("Processing $peg.") if T(4); |
|
1035 |
# Create a hash of timestamps. We use this to prevent duplicate time stamps |
# Create a hash of timestamps. We use this to prevent duplicate time stamps |
1036 |
# from showing up for a single PEG's annotations. |
# from showing up for a single PEG's annotations. |
1037 |
my %seenTimestamps = (); |
my %seenTimestamps = (); |
1038 |
# Check for a functional assignment. |
# Get the genome's annotations. |
1039 |
my $func = $fig->function_of($peg); |
my @annotations = $fig->read_all_annotations($genomeID); |
1040 |
if ($func) { |
Trace("Processing annotations.") if T(2); |
1041 |
# If this is NOT a hypothetical assignment, we create an |
for my $tuple (@annotations) { |
1042 |
# assignment annotation for it. |
# Get the annotation tuple. |
1043 |
if (! FIG::hypo($peg)) { |
my ($peg, $timestamp, $user, $text) = @{$tuple}; |
|
# Note that we double the slashes so that what goes into the database is |
|
|
# a new-line escape sequence rather than an actual new-line. |
|
|
$loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func"); |
|
|
$loadIsTargetOfAnnotation->Put($peg, "$peg:$time"); |
|
|
$loadMadeAnnotation->Put("FIG", "$peg:$time"); |
|
|
# Denote we've seen this timestamp. |
|
|
$seenTimestamps{$time} = 1; |
|
|
} |
|
|
# Now loop through the real annotations. |
|
|
for my $tuple ($fig->feature_annotations($peg, "raw")) { |
|
|
my ($fid, $timestamp, $user, $text) = $tuple; |
|
1044 |
# Here we fix up the annotation text. "\r" is removed, |
# Here we fix up the annotation text. "\r" is removed, |
1045 |
# and "\t" and "\n" are escaped. Note we use the "s" |
# and "\t" and "\n" are escaped. Note we use the "s" |
1046 |
# modifier so that new-lines inside the text do not |
# modifier so that new-lines inside the text do not |
1052 |
$text =~ s/Set master function/Set FIG function/s; |
$text =~ s/Set master function/Set FIG function/s; |
1053 |
# Insure the time stamp is valid. |
# Insure the time stamp is valid. |
1054 |
if ($timestamp =~ /^\d+$/) { |
if ($timestamp =~ /^\d+$/) { |
1055 |
# Here it's a number. We need to insure it's unique. |
# Here it's a number. We need to insure the one we use to form |
1056 |
while ($seenTimestamps{$timestamp}) { |
# the key is unique. |
1057 |
$timestamp++; |
my $keyStamp = $timestamp; |
1058 |
|
while ($seenTimestamps{"$peg:$keyStamp"}) { |
1059 |
|
$keyStamp++; |
1060 |
} |
} |
1061 |
$seenTimestamps{$timestamp} = 1; |
my $annotationID = "$peg:$keyStamp"; |
1062 |
my $annotationID = "$peg:$timestamp"; |
$seenTimestamps{$annotationID} = 1; |
1063 |
# Insure the user exists. |
# Insure the user exists. |
1064 |
if (! $users{$user}) { |
if (! $users{$user}) { |
1065 |
$loadSproutUser->Put($user, "SEED user"); |
$loadSproutUser->Put($user, "SEED user"); |
1067 |
$users{$user} = 1; |
$users{$user} = 1; |
1068 |
} |
} |
1069 |
# Generate the annotation. |
# Generate the annotation. |
1070 |
$loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text"); |
$loadAnnotation->Put($annotationID, $timestamp, $text); |
1071 |
$loadIsTargetOfAnnotation->Put($peg, $annotationID); |
$loadIsTargetOfAnnotation->Put($peg, $annotationID); |
1072 |
$loadMadeAnnotation->Put($user, $annotationID); |
$loadMadeAnnotation->Put($user, $annotationID); |
1073 |
} else { |
} else { |
1077 |
} |
} |
1078 |
} |
} |
1079 |
} |
} |
1080 |
|
# Finish the load. |
1081 |
|
my $retVal = $self->_FinishAll(); |
1082 |
|
return $retVal; |
1083 |
|
} |
1084 |
|
|
1085 |
|
=head3 LoadSourceData |
1086 |
|
|
1087 |
|
C<< my $stats = $spl->LoadSourceData(); >> |
1088 |
|
|
1089 |
|
Load the source data from FIG into Sprout. |
1090 |
|
|
1091 |
|
Source data links genomes to information about the organizations that |
1092 |
|
mapped it. |
1093 |
|
|
1094 |
|
The following relations are loaded by this method. |
1095 |
|
|
1096 |
|
ComesFrom |
1097 |
|
Source |
1098 |
|
SourceURL |
1099 |
|
|
1100 |
|
There is no direct support for source attribution in FIG, so we access the SEED |
1101 |
|
files directly. |
1102 |
|
|
1103 |
|
=over 4 |
1104 |
|
|
1105 |
|
=item RETURNS |
1106 |
|
|
1107 |
|
Returns a statistics object for the loads. |
1108 |
|
|
1109 |
|
=back |
1110 |
|
|
1111 |
|
=cut |
1112 |
|
#: Return Type $%; |
1113 |
|
sub LoadSourceData {
    # Load the ComesFrom, Source, and SourceURL relations. The data comes from
    # the first line of each genome's PROJECT file in the SEED organism
    # directory; that line is split on tabs into a source ID, a description,
    # and a URL. Returns the statistics object produced by "_FinishAll".
    #
    # Get this object instance.
    my ($self) = @_;
    # Get the genome hash. Its keys are the IDs of the genomes to process.
    my $genomeHash = $self->{genomes};
    # Create load objects for each of the tables we're loading.
    my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);
    my $loadSource = $self->_TableLoader('Source');
    my $loadSourceURL = $self->_TableLoader('SourceURL');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating source data.") if T(2);
        # Create hashes to collect the Source information. The URL hash
        # insures each source gets at most one SourceURL record; the
        # description hash is written out after all genomes are processed.
        my %sourceURL = ();
        my %sourceDesc = ();
        # Loop through the genomes.
        for my $genomeID (sort keys %{$genomeHash}) {
            Trace("Processing $genomeID.") if T(3);
            # Open the project file. A genome whose project file cannot be
            # opened is silently skipped. We use a lexical handle so that it
            # is only closed when the open actually succeeded.
            my $projectFile = "$FIG_Config::organisms/$genomeID/PROJECT";
            if (open(my $projectHandle, '<', $projectFile)) {
                # Only the first line of the file is used.
                my $line = <$projectHandle>;
                close $projectHandle;
                if (defined $line) {
                    chomp $line;
                    my ($sourceID, $desc, $url) = split(/\t/, $line);
                    # Connect the genome to its source.
                    $loadComesFrom->Put($genomeID, $sourceID);
                    # Record the URL the first time we see one for this source.
                    if ($url && ! exists $sourceURL{$sourceID}) {
                        $loadSourceURL->Put($sourceID, $url);
                        $sourceURL{$sourceID} = 1;
                    }
                    # Remember the description. A real description always wins;
                    # otherwise the source ID is used as a placeholder unless
                    # we already have something for this source.
                    if ($desc) {
                        $sourceDesc{$sourceID} = $desc;
                    } elsif (! exists $sourceDesc{$sourceID}) {
                        $sourceDesc{$sourceID} = $sourceID;
                    }
                }
            }
        }
        # Write the source descriptions.
        for my $sourceID (keys %sourceDesc) {
            $loadSource->Put($sourceID, $sourceDesc{$sourceID});
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1162 |
|
|
1163 |
|
=head3 LoadExternalData |
1164 |
|
|
1165 |
|
C<< my $stats = $spl->LoadExternalData(); >> |
1166 |
|
|
1167 |
|
Load the external data from FIG into Sprout. |
1168 |
|
|
1169 |
|
External data contains information about external feature IDs. |
1170 |
|
|
1171 |
|
The following relations are loaded by this method. |
1172 |
|
|
1173 |
|
ExternalAliasFunc |
1174 |
|
ExternalAliasOrg |
1175 |
|
|
1176 |
|
The support for external IDs in FIG is hidden beneath layers of other data, so |
1177 |
|
we access the SEED files directly to create these tables. This is also one of |
1178 |
|
the few load methods that does not proceed genome by genome. |
1179 |
|
|
1180 |
|
=over 4 |
1181 |
|
|
1182 |
|
=item RETURNS |
1183 |
|
|
1184 |
|
Returns a statistics object for the loads. |
1185 |
|
|
1186 |
|
=back |
1187 |
|
|
1188 |
|
=cut |
1189 |
|
#: Return Type $%; |
1190 |
|
sub LoadExternalData {
    # Load the ExternalAliasOrg and ExternalAliasFunc relations from the SEED
    # global files "ext_org.table" and "ext_func.table". Both files are
    # tab-delimited, one record per line. Returns the statistics object
    # produced by "_FinishAll".
    #
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    # Convert the genome hash. We'll get the genus and species for each genome and make
    # it the key.
    # NOTE(review): this hash is not referenced again in this method; it is kept
    # so the FIG calls are unchanged -- confirm whether it can be removed.
    my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
    # Create load objects for each of the tables we're loading.
    my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
    my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating external data.") if T(2);
        # We loop through the files one at a time. First, the organism file.
        Open(\*ORGS, "<$FIG_Config::global/ext_org.table");
        my $orgLine;
        while (defined($orgLine = <ORGS>)) {
            # Clean the input line.
            chomp $orgLine;
            # Only proceed if the line is non-blank. A blank line would
            # otherwise produce a record with no ID and no name.
            if ($orgLine) {
                # Parse the organism name.
                my ($protID, $name) = split /\s*\t\s*/, $orgLine;
                $loadExternalAliasOrg->Put($protID, $name);
            }
        }
        close ORGS;
        # Now the function file.
        my $funcLine;
        Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");
        while (defined($funcLine = <FUNCS>)) {
            # Clean the line ending.
            chomp $funcLine;
            # Only proceed if the line is non-blank.
            if ($funcLine) {
                # Split it into fields.
                my @funcFields = split /\s*\t\s*/, $funcLine;
                # If there's an EC number, append it to the description.
                if ($#funcFields >= 2 && $funcFields[2] =~ /^(EC .*\S)/) {
                    $funcFields[1] .= " $1";
                }
                # Output the function line. Only the first two fields (the ID
                # and the possibly-extended description) are stored.
                $loadExternalAliasFunc->Put(@funcFields[0,1]);
            }
        }
        # Release the function file, just as we did for the organism file.
        close FUNCS;
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1241 |
|
|
1242 |
|
|
1243 |
|
=head3 LoadReactionData |
1244 |
|
|
1245 |
|
C<< my $stats = $spl->LoadReactionData(); >> |
1246 |
|
|
1247 |
|
Load the reaction data from FIG into Sprout. |
1248 |
|
|
1249 |
|
Reaction data connects reactions to the compounds that participate in them. |
1250 |
|
|
1251 |
|
The following relations are loaded by this method. |
1252 |
|
|
1253 |
|
Reaction |
1254 |
|
ReactionURL |
1255 |
|
Compound |
1256 |
|
CompoundName |
1257 |
|
CompoundCAS |
1258 |
|
IsAComponentOf |
1259 |
|
|
1260 |
|
This method proceeds reaction by reaction rather than genome by genome. |
1261 |
|
|
1262 |
|
=over 4 |
1263 |
|
|
1264 |
|
=item RETURNS |
1265 |
|
|
1266 |
|
Returns a statistics object for the loads. |
1267 |
|
|
1268 |
|
=back |
1269 |
|
|
1270 |
|
=cut |
1271 |
|
#: Return Type $%; |
1272 |
|
sub LoadReactionData {
    # Load the Reaction, ReactionURL, Compound, CompoundName, CompoundCAS, and
    # IsAComponentOf relations. The compounds are generated first, then the
    # reactions along with the links to their substrates and products.
    # Returns the statistics object produced by "_FinishAll".
    #
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Create load objects for each of the tables we're loading.
    my $loadReaction = $self->_TableLoader('Reaction');
    my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);
    my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);
    my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);
    my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);
    my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating reaction data.") if T(2);
        # First we create the compounds.
        my @compounds = $fig->all_compounds();
        for my $cid (@compounds) {
            # Check for names.
            my @names = $fig->names_of_compound($cid);
            # Each name will be given a priority number, starting with 1.
            my $prio = 1;
            for my $name (@names) {
                $loadCompoundName->Put($cid, $name, $prio++);
            }
            # Create the main compound record. Note that the first name
            # becomes the label. A compound with no names is labelled with
            # its own ID.
            my $label = (@names > 0 ? $names[0] : $cid);
            $loadCompound->Put($cid, $label);
            # Check for a CAS ID.
            my $cas = $fig->cas($cid);
            if ($cas) {
                $loadCompoundCAS->Put($cid, $cas);
            }
        }
        # All the compounds are set up, so we need to loop through the reactions next. First,
        # we initialize the discriminator index. This is a single integer used to insure
        # duplicate elements in a reaction are not accidentally collapsed. It is never
        # reset, so it is unique across all the reactions.
        my $discrim = 0;
        my @reactions = $fig->all_reactions();
        for my $reactionID (@reactions) {
            # Create the reaction record.
            $loadReaction->Put($reactionID, $fig->reversible($reactionID));
            # Compute the reaction's URL.
            my $url = HTML::reaction_link($reactionID);
            # Put it in the ReactionURL table.
            $loadReactionURL->Put($reactionID, $url);
            # Now we need all of the reaction's compounds. We get these in two phases,
            # substrates first and then products.
            for my $product (0, 1) {
                # Get the compounds of the current type for the current reaction. FIG will
                # give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not
                # have location data in SEED, so it defaults to the empty string.
                my @components = $fig->reaction2comp($reactionID, $product);
                for my $compData (@components) {
                    # Extract the compound data from the current tuple.
                    my ($cid, $stoich, $main) = @{$compData};
                    # Link the compound to the reaction.
                    $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main,
                                             $product, $stoich);
                }
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1341 |
|
|
1342 |
|
=head3 LoadGroupData |
1343 |
|
|
1344 |
|
C<< my $stats = $spl->LoadGroupData(); >> |
1345 |
|
|
1346 |
|
Load the genome Groups into Sprout. |
1347 |
|
|
1348 |
|
The following relations are loaded by this method. |
1349 |
|
|
1350 |
|
GenomeGroups |
1351 |
|
|
1352 |
|
There is no direct support for genome groups in FIG, so we access the SEED |
1353 |
|
files directly. |
1354 |
|
|
1355 |
|
=over 4 |
1356 |
|
|
1357 |
|
=item RETURNS |
1358 |
|
|
1359 |
|
Returns a statistics object for the loads. |
1360 |
|
|
1361 |
|
=back |
1362 |
|
|
1363 |
|
=cut |
1364 |
|
#: Return Type $%; |
1365 |
|
sub LoadGroupData { |
1366 |
|
# Get this object instance. |
1367 |
|
my ($self) = @_; |
1368 |
|
# Get the FIG object. |
1369 |
|
my $fig = $self->{fig}; |
1370 |
|
# Get the genome hash. |
1371 |
|
my $genomeHash = $self->{genomes}; |
1372 |
|
# Create a load object for the table we're loading. |
1373 |
|
my $loadGenomeGroups = $self->_TableLoader('GenomeGroups'); |
1374 |
|
if ($self->{options}->{loadOnly}) { |
1375 |
|
Trace("Loading from existing files.") if T(2); |
1376 |
|
} else { |
1377 |
|
Trace("Generating group data.") if T(2); |
1378 |
|
# Loop through the genomes. |
1379 |
|
my $line; |
1380 |
|
for my $genomeID (keys %{$genomeHash}) { |
1381 |
|
Trace("Processing $genomeID.") if T(3); |
1382 |
|
# Open the NMPDR group file for this genome. |
1383 |
|
if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") && |
1384 |
|
defined($line = <TMP>)) { |
1385 |
|
# Clean the line ending. |
1386 |
|
chomp $line; |
1387 |
|
# Add the group to the table. Note that there can only be one group |
1388 |
|
# per genome. |
1389 |
|
$loadGenomeGroups->Put($genomeID, $line); |
1390 |
|
} |
1391 |
|
close TMP; |
1392 |
|
} |
1393 |
} |
} |
1394 |
# Finish the load. |
# Finish the load. |
1395 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
1412 |
|
|
1413 |
Name of the table (relation) being loaded. |
Name of the table (relation) being loaded. |
1414 |
|
|
1415 |
=item rowCount (optional) |
=item ignore |
1416 |
|
|
1417 |
Estimated maximum number of rows in the table. |
TRUE if the table should be ignored entirely, else FALSE. |
1418 |
|
|
1419 |
=item RETURN |
=item RETURN |
1420 |
|
|
1426 |
|
|
1427 |
sub _TableLoader { |
sub _TableLoader { |
1428 |
# Get the parameters. |
# Get the parameters. |
1429 |
my ($self, $tableName, $rowCount) = @_; |
my ($self, $tableName, $ignore) = @_; |
1430 |
# Create the load object. |
# Create the load object. |
1431 |
my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount); |
my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly, |
1432 |
|
$ignore); |
1433 |
# Cache it in the loader list. |
# Cache it in the loader list. |
1434 |
push @{$self->{loaders}}, $retVal; |
push @{$self->{loaders}}, $retVal; |
1435 |
# Return it to the caller. |
# Return it to the caller. |
1466 |
# Loop through the list, finishing the loads. Note that if the finish fails, we die |
# Loop through the list, finishing the loads. Note that if the finish fails, we die |
1467 |
# ignominiously. At some future point, we want to make the loads restartable. |
# ignominiously. At some future point, we want to make the loads restartable. |
1468 |
while (my $loader = pop @{$loadList}) { |
while (my $loader = pop @{$loadList}) { |
1469 |
|
# Get the relation name. |
1470 |
|
my $relName = $loader->RelName; |
1471 |
|
# Check the ignore flag. |
1472 |
|
if ($loader->Ignore) { |
1473 |
|
Trace("Relation $relName not loaded.") if T(2); |
1474 |
|
} else { |
1475 |
|
# Here we really need to finish. |
1476 |
|
Trace("Finishing $relName.") if T(2); |
1477 |
my $stats = $loader->Finish(); |
my $stats = $loader->Finish(); |
1478 |
|
if ($self->{options}->{dbLoad}) { |
1479 |
|
# Here we want to use the load file just created to load the database. |
1480 |
|
Trace("Loading relation $relName.") if T(2); |
1481 |
|
my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]); |
1482 |
|
# Accumulate the statistics from the DB load. |
1483 |
|
$stats->Accumulate($newStats); |
1484 |
|
} |
1485 |
$retVal->Accumulate($stats); |
$retVal->Accumulate($stats); |
|
my $relName = $loader->RelName; |
|
1486 |
Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); |
Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); |
1487 |
} |
} |
1488 |
|
} |
1489 |
# Return the load statistics. |
# Return the load statistics. |
1490 |
return $retVal; |
return $retVal; |
1491 |
} |
} |