7 |
use PageBuilder; |
use PageBuilder; |
8 |
use ERDBLoad; |
use ERDBLoad; |
9 |
use FIG; |
use FIG; |
10 |
|
use FIGRules; |
11 |
use Sprout; |
use Sprout; |
12 |
use Stats; |
use Stats; |
13 |
use BasicLocation; |
use BasicLocation; |
14 |
use HTML; |
use HTML; |
15 |
|
use AliasAnalysis; |
16 |
|
use BioWords; |
17 |
|
|
18 |
=head1 Sprout Load Methods |
=head1 Sprout Load Methods |
19 |
|
|
53 |
|
|
54 |
=head3 new |
=head3 new |
55 |
|
|
56 |
C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >> |
my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); |
57 |
|
|
58 |
Construct a new Sprout Loader object, specifying the two participating databases and |
Construct a new Sprout Loader object, specifying the two participating databases and |
59 |
the name of the files containing the list of genomes and subsystems to use. |
the name of the files containing the list of genomes and subsystems to use. |
104 |
# Here we want all the complete genomes and an access code of 1. |
# Here we want all the complete genomes and an access code of 1. |
105 |
my @genomeList = $fig->genomes(1); |
my @genomeList = $fig->genomes(1); |
106 |
%genomes = map { $_ => 1 } @genomeList; |
%genomes = map { $_ => 1 } @genomeList; |
107 |
|
Trace(scalar(keys %genomes) . " genomes found.") if T(3); |
108 |
} else { |
} else { |
109 |
my $type = ref $genomeFile; |
my $type = ref $genomeFile; |
110 |
Trace("Genome file parameter type is \"$type\".") if T(3); |
Trace("Genome file parameter type is \"$type\".") if T(3); |
171 |
for my $subsystem (keys %subsystems) { |
for my $subsystem (keys %subsystems) { |
172 |
my $name = $subsystem; |
my $name = $subsystem; |
173 |
$name =~ s/_/ /g; |
$name =~ s/_/ /g; |
|
my $classes = $fig->subsystem_classification($subsystem); |
|
|
$name .= " " . join(" ", @{$classes}); |
|
174 |
$subsystems{$subsystem} = $name; |
$subsystems{$subsystem} = $name; |
175 |
} |
} |
176 |
} |
} |
177 |
|
# Get the list of NMPDR-oriented attribute keys. |
178 |
|
my @propKeys = $fig->get_group_keys("NMPDR"); |
179 |
# Get the data directory from the Sprout object. |
# Get the data directory from the Sprout object. |
180 |
my ($directory) = $sprout->LoadInfo(); |
my ($directory) = $sprout->LoadInfo(); |
181 |
# Create the Sprout load object. |
# Create the Sprout load object. |
187 |
loadDirectory => $directory, |
loadDirectory => $directory, |
188 |
erdb => $sprout, |
erdb => $sprout, |
189 |
loaders => [], |
loaders => [], |
190 |
options => $options |
options => $options, |
191 |
|
propKeys => \@propKeys, |
192 |
}; |
}; |
193 |
# Bless and return it. |
# Bless and return it. |
194 |
bless $retVal, $class; |
bless $retVal, $class; |
197 |
|
|
198 |
=head3 LoadOnly |
=head3 LoadOnly |
199 |
|
|
200 |
C<< my $flag = $spl->LoadOnly; >> |
my $flag = $spl->LoadOnly; |
201 |
|
|
202 |
Return TRUE if we are in load-only mode, else FALSE. |
Return TRUE if we are in load-only mode, else FALSE. |
203 |
|
|
208 |
return $self->{options}->{loadOnly}; |
return $self->{options}->{loadOnly}; |
209 |
} |
} |
210 |
|
|
|
=head3 PrimaryOnly |
|
|
|
|
|
C<< my $flag = $spl->PrimaryOnly; >> |
|
|
|
|
|
Return TRUE if only the main entity is to be loaded, else FALSE. |
|
|
|
|
|
=cut |
|
|
|
|
|
sub PrimaryOnly { |
|
|
my ($self) = @_; |
|
|
return $self->{options}->{primaryOnly}; |
|
|
} |
|
211 |
|
|
212 |
=head3 LoadGenomeData |
=head3 LoadGenomeData |
213 |
|
|
214 |
C<< my $stats = $spl->LoadGenomeData(); >> |
my $stats = $spl->LoadGenomeData(); |
215 |
|
|
216 |
Load the Genome, Contig, and Sequence data from FIG into Sprout. |
Load the Genome, Contig, and Sequence data from FIG into Sprout. |
217 |
|
|
248 |
my $genomeCount = (keys %{$genomeHash}); |
my $genomeCount = (keys %{$genomeHash}); |
249 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
250 |
my $loadGenome = $self->_TableLoader('Genome'); |
my $loadGenome = $self->_TableLoader('Genome'); |
251 |
my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly); |
my $loadHasContig = $self->_TableLoader('HasContig'); |
252 |
my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly); |
my $loadContig = $self->_TableLoader('Contig'); |
253 |
my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly); |
my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf'); |
254 |
my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly); |
my $loadSequence = $self->_TableLoader('Sequence'); |
255 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
256 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
257 |
} else { |
} else { |
258 |
Trace("Generating genome data.") if T(2); |
Trace("Generating genome data.") if T(2); |
259 |
|
# Get the full info for the FIG genomes. |
260 |
|
my %genomeInfo = map { $_->[0] => { gname => $_->[1], szdna => $_->[2], maindomain => $_->[3], |
261 |
|
pegs => $_->[4], rnas => $_->[5], complete => $_->[6] } } @{$fig->genome_info()}; |
262 |
# Now we loop through the genomes, generating the data for each one. |
# Now we loop through the genomes, generating the data for each one. |
263 |
for my $genomeID (sort keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
264 |
Trace("Generating data for genome $genomeID.") if T(3); |
Trace("Generating data for genome $genomeID.") if T(3); |
288 |
$group = $FIG_Config::otherGroup; |
$group = $FIG_Config::otherGroup; |
289 |
} |
} |
290 |
close TMP; |
close TMP; |
291 |
|
# Get the contigs. |
292 |
|
my @contigs = $fig->all_contigs($genomeID); |
293 |
|
# Get this genome's info array. |
294 |
|
my $info = $genomeInfo{$genomeID}; |
295 |
# Output the genome record. |
# Output the genome record. |
296 |
$loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), |
$loadGenome->Put($genomeID, $accessCode, $info->{complete}, scalar(@contigs), |
297 |
$dnaSize, $genus, $group, $species, $extra, $version, $taxonomy); |
$dnaSize, $genus, $info->{pegs}, $group, $info->{rnas}, $species, $extra, $version, $taxonomy); |
298 |
# Now we loop through each of the genome's contigs. |
# Now we loop through each of the genome's contigs. |
|
my @contigs = $fig->all_contigs($genomeID); |
|
299 |
for my $contigID (@contigs) { |
for my $contigID (@contigs) { |
300 |
Trace("Processing contig $contigID for $genomeID.") if T(4); |
Trace("Processing contig $contigID for $genomeID.") if T(4); |
301 |
$loadContig->Add("contigIn"); |
$loadContig->Add("contigIn"); |
331 |
return $retVal; |
return $retVal; |
332 |
} |
} |
333 |
|
|
|
=head3 LoadCouplingData |
|
|
|
|
|
C<< my $stats = $spl->LoadCouplingData(); >> |
|
|
|
|
|
Load the coupling and evidence data from FIG into Sprout. |
|
|
|
|
|
The coupling data specifies which genome features are functionally coupled. The |
|
|
evidence data explains why the coupling is functional. |
|
|
|
|
|
The following relations are loaded by this method. |
|
|
|
|
|
Coupling |
|
|
IsEvidencedBy |
|
|
PCH |
|
|
ParticipatesInCoupling |
|
|
UsesAsEvidence |
|
|
|
|
|
=over 4 |
|
|
|
|
|
=item RETURNS |
|
|
|
|
|
Returns a statistics object for the loads. |
|
|
|
|
|
=back |
|
|
|
|
|
=cut |
|
|
#: Return Type $%; |
|
|
sub LoadCouplingData { |
|
|
# Get this object instance. |
|
|
my ($self) = @_; |
|
|
# Get the FIG object. |
|
|
my $fig = $self->{fig}; |
|
|
# Get the genome hash. |
|
|
my $genomeFilter = $self->{genomes}; |
|
|
# Set up an ID counter for the PCHs. |
|
|
my $pchID = 0; |
|
|
# Start the loads. |
|
|
my $loadCoupling = $self->_TableLoader('Coupling'); |
|
|
my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly); |
|
|
my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly); |
|
|
my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly); |
|
|
my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly); |
|
|
if ($self->{options}->{loadOnly}) { |
|
|
Trace("Loading from existing files.") if T(2); |
|
|
} else { |
|
|
Trace("Generating coupling data.") if T(2); |
|
|
# Loop through the genomes found. |
|
|
for my $genome (sort keys %{$genomeFilter}) { |
|
|
Trace("Generating coupling data for $genome.") if T(3); |
|
|
$loadCoupling->Add("genomeIn"); |
|
|
# Create a hash table for holding coupled pairs. We use this to prevent |
|
|
# duplicates. For example, if A is coupled to B, we don't want to also |
|
|
# assert that B is coupled to A, because we already know it. Fortunately, |
|
|
# all couplings occur within a genome, so we can keep the hash table |
|
|
# size reasonably small. |
|
|
my %dupHash = (); |
|
|
# Get all of the genome's PEGs. |
|
|
my @pegs = $fig->pegs_of($genome); |
|
|
# Loop through the PEGs. |
|
|
for my $peg1 (@pegs) { |
|
|
$loadCoupling->Add("pegIn"); |
|
|
Trace("Processing PEG $peg1 for $genome.") if T(4); |
|
|
# Get a list of the coupled PEGs. |
|
|
my @couplings = $fig->coupled_to($peg1); |
|
|
# For each coupled PEG, we need to verify that a coupling already |
|
|
# exists. If not, we have to create one. |
|
|
for my $coupleData (@couplings) { |
|
|
my ($peg2, $score) = @{$coupleData}; |
|
|
# Compute the coupling ID. |
|
|
my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2); |
|
|
if (! exists $dupHash{$coupleID}) { |
|
|
$loadCoupling->Add("couplingIn"); |
|
|
# Here we have a new coupling to store in the load files. |
|
|
Trace("Storing coupling ($coupleID) with score $score.") if T(4); |
|
|
# Ensure we don't do this again. |
|
|
$dupHash{$coupleID} = $score; |
|
|
# Write the coupling record. |
|
|
$loadCoupling->Put($coupleID, $score); |
|
|
# Connect it to the coupled PEGs. |
|
|
$loadParticipatesInCoupling->Put($peg1, $coupleID, 1); |
|
|
$loadParticipatesInCoupling->Put($peg2, $coupleID, 2); |
|
|
# Get the evidence for this coupling. |
|
|
my @evidence = $fig->coupling_evidence($peg1, $peg2); |
|
|
# Organize the evidence into a hash table. |
|
|
my %evidenceMap = (); |
|
|
# Process each evidence item. |
|
|
for my $evidenceData (@evidence) { |
|
|
$loadPCH->Add("evidenceIn"); |
|
|
my ($peg3, $peg4, $usage) = @{$evidenceData}; |
|
|
# Only proceed if the evidence is from a Sprout |
|
|
# genome. |
|
|
if ($genomeFilter->{$fig->genome_of($peg3)}) { |
|
|
$loadUsesAsEvidence->Add("evidenceChosen"); |
|
|
my $evidenceKey = "$coupleID $peg3 $peg4"; |
|
|
# We store this evidence in the hash if the usage |
|
|
# is nonzero or no prior evidence has been found. This |
|
|
# insures that if there is duplicate evidence, we |
|
|
# at least keep the meaningful ones. Only evidence in |
|
|
# the hash makes it to the output. |
|
|
if ($usage || ! exists $evidenceMap{$evidenceKey}) { |
|
|
$evidenceMap{$evidenceKey} = $evidenceData; |
|
|
} |
|
|
} |
|
|
} |
|
|
for my $evidenceID (keys %evidenceMap) { |
|
|
# Get the ID for this evidence. |
|
|
$pchID++; |
|
|
# Create the evidence record. |
|
|
my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}}; |
|
|
$loadPCH->Put($pchID, $usage); |
|
|
# Connect it to the coupling. |
|
|
$loadIsEvidencedBy->Put($coupleID, $pchID); |
|
|
# Connect it to the features. |
|
|
$loadUsesAsEvidence->Put($pchID, $peg3, 1); |
|
|
$loadUsesAsEvidence->Put($pchID, $peg4, 2); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
# All done. Finish the load. |
|
|
my $retVal = $self->_FinishAll(); |
|
|
return $retVal; |
|
|
} |
|
|
|
|
334 |
=head3 LoadFeatureData |
=head3 LoadFeatureData |
335 |
|
|
336 |
C<< my $stats = $spl->LoadFeatureData(); >> |
my $stats = $spl->LoadFeatureData(); |
337 |
|
|
338 |
Load the feature data from FIG into Sprout. |
Load the feature data from FIG into Sprout. |
339 |
|
|
343 |
|
|
344 |
Feature |
Feature |
345 |
FeatureAlias |
FeatureAlias |
346 |
|
IsAliasOf |
347 |
FeatureLink |
FeatureLink |
348 |
FeatureTranslation |
FeatureTranslation |
349 |
FeatureUpstream |
FeatureUpstream |
353 |
FeatureEssential |
FeatureEssential |
354 |
FeatureVirulent |
FeatureVirulent |
355 |
FeatureIEDB |
FeatureIEDB |
356 |
|
CDD |
357 |
|
IsPresentOnProteinOf |
358 |
|
CellLocation |
359 |
|
IsPossiblePlaceFor |
360 |
|
ExternalDatabase |
361 |
|
IsAlsoFoundIn |
362 |
|
Keyword |
363 |
|
|
364 |
=over 4 |
=over 4 |
365 |
|
|
381 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
382 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
383 |
my $loadFeature = $self->_TableLoader('Feature'); |
my $loadFeature = $self->_TableLoader('Feature'); |
384 |
my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly); |
my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn'); |
385 |
my $loadFeatureAlias = $self->_TableLoader('FeatureAlias'); |
my $loadFeatureAlias = $self->_TableLoader('FeatureAlias'); |
386 |
|
my $loadIsAliasOf = $self->_TableLoader('IsAliasOf'); |
387 |
my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
388 |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); |
389 |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); |
390 |
my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly); |
my $loadHasFeature = $self->_TableLoader('HasFeature'); |
391 |
my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly); |
my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem'); |
392 |
my $loadFeatureEssential = $self->_TableLoader('FeatureEssential'); |
my $loadFeatureEssential = $self->_TableLoader('FeatureEssential'); |
393 |
my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent'); |
my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent'); |
394 |
my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB'); |
my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB'); |
395 |
|
my $loadCDD = $self->_TableLoader('CDD'); |
396 |
|
my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf'); |
397 |
|
my $loadCellLocation = $self->_TableLoader('CellLocation'); |
398 |
|
my $loadIsPossiblePlaceFor = $self->_TableLoader('IsPossiblePlaceFor'); |
399 |
|
my $loadIsAlsoFoundIn = $self->_TableLoader('IsAlsoFoundIn'); |
400 |
|
my $loadExternalDatabase = $self->_TableLoader('ExternalDatabase'); |
401 |
|
my $loadKeyword = $self->_TableLoader('Keyword'); |
402 |
# Get the subsystem hash. |
# Get the subsystem hash. |
403 |
my $subHash = $self->{subsystems}; |
my $subHash = $self->{subsystems}; |
404 |
|
# Get the property keys. |
405 |
|
my $propKeys = $self->{propKeys}; |
406 |
|
# Create a hashes to hold CDD, Cell Location (PSORT), External Database, and alias values. |
407 |
|
my %CDD = (); |
408 |
|
my %alias = (); |
409 |
|
my %cellLocation = (); |
410 |
|
my %xdb = (); |
411 |
|
# Create the bio-words object. |
412 |
|
my $biowords = BioWords->new(cache => 0); |
413 |
|
# One of the things we have to do here is build the keyword table, and the keyword |
414 |
|
# table needs to contain the originating text and feature count for each stem. Unfortunately, |
415 |
|
# the number of distinct keywords is so large it causes PERL to hang if we try to |
416 |
|
# keep them in memory. As a result, we need to track them using disk files. |
417 |
|
# Our approach will be to use two sequential files. One will contain stems and phonexes. |
418 |
|
# Each time a stem occurs in a feature, a record will be written to that file. The stem |
419 |
|
# file can then be sorted and collated to determine the number of features for each |
420 |
|
# stem. A separate file will contain keywords and stems. This last file |
421 |
|
# will be subjected to a sort unique on stem/keyword. The file is then merged |
422 |
|
# with the stem file to create the keyword table relation (keyword, stem, phonex, count). |
423 |
|
my $stemFileName = "$FIG_Config::temp/stems$$.tbl"; |
424 |
|
my $keyFileName = "$FIG_Config::temp/keys$$.tbl"; |
425 |
|
my $stemh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -k1,1 >$stemFileName"); |
426 |
|
my $keyh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -u -k1,1 -k2,2 >$keyFileName"); |
427 |
# Get the maximum sequence size. We need this later for splitting up the |
# Get the maximum sequence size. We need this later for splitting up the |
428 |
# locations. |
# locations. |
429 |
my $chunkSize = $self->{sprout}->MaxSegment(); |
my $chunkSize = $self->{sprout}->MaxSegment(); |
432 |
} else { |
} else { |
433 |
Trace("Generating feature data.") if T(2); |
Trace("Generating feature data.") if T(2); |
434 |
# Now we loop through the genomes, generating the data for each one. |
# Now we loop through the genomes, generating the data for each one. |
435 |
for my $genomeID (sort keys %{$genomeHash}) { |
my @allGenomes = sort keys %{$genomeHash}; |
436 |
|
Trace(scalar(@allGenomes) . " genomes found in list.") if T(3); |
437 |
|
for my $genomeID (@allGenomes) { |
438 |
Trace("Loading features for genome $genomeID.") if T(3); |
Trace("Loading features for genome $genomeID.") if T(3); |
439 |
$loadFeature->Add("genomeIn"); |
$loadFeature->Add("genomeIn"); |
440 |
# Get the feature list for this genome. |
# Get the feature list for this genome. |
445 |
my @fids = map { $_->[0] } @featureTuples; |
my @fids = map { $_->[0] } @featureTuples; |
446 |
Trace("$count features found for genome $genomeID.") if T(3); |
Trace("$count features found for genome $genomeID.") if T(3); |
447 |
# Get the attributes for this genome and put them in a hash by feature ID. |
# Get the attributes for this genome and put them in a hash by feature ID. |
448 |
my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids); |
my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys); |
449 |
|
Trace("Looping through features for $genomeID.") if T(3); |
450 |
# Set up for our duplicate-feature check. |
# Set up for our duplicate-feature check. |
451 |
my $oldFeatureID = ""; |
my $oldFeatureID = ""; |
452 |
# Loop through the features. |
# Loop through the features. |
471 |
$fig->taxonomy_of($genomeID)); |
$fig->taxonomy_of($genomeID)); |
472 |
# Create the aliases. |
# Create the aliases. |
473 |
for my $alias ($fig->feature_aliases($featureID)) { |
for my $alias ($fig->feature_aliases($featureID)) { |
474 |
$loadFeatureAlias->Put($featureID, $alias); |
#Connect this alias to this feature. |
475 |
|
$loadIsAliasOf->Put($alias, $featureID); |
476 |
push @keywords, $alias; |
push @keywords, $alias; |
477 |
|
# If this is a locus tag, also add its natural form as a keyword. |
478 |
|
my $naturalName = AliasAnalysis::Type(LocusTag => $alias); |
479 |
|
if ($naturalName) { |
480 |
|
push @keywords, $naturalName; |
481 |
|
} |
482 |
|
# If this is the first time for the specified alias, create its |
483 |
|
# alias record. |
484 |
|
if (! exists $alias{$alias}) { |
485 |
|
$loadFeatureAlias->Put($alias); |
486 |
|
$alias{$alias} = 1; |
487 |
|
} |
488 |
|
} |
489 |
|
# Add the corresponding IDs. We ask for 2-tuples of the form (id, database). |
490 |
|
my @corresponders = $fig->get_corresponding_ids($featureID, 1); |
491 |
|
for my $tuple (@corresponders) { |
492 |
|
my ($id, $xdb) = @{$tuple}; |
493 |
|
# Ignore SEED: that's us. |
494 |
|
if ($xdb ne 'SEED') { |
495 |
|
# Connect this ID to the feature. |
496 |
|
$loadIsAlsoFoundIn->Put($featureID, $xdb, $id); |
497 |
|
# Add it as a keyword. |
498 |
|
push @keywords, $id; |
499 |
|
# If this is a new database, create a record for it. |
500 |
|
if (! exists $xdb{$xdb}) { |
501 |
|
$xdb{$xdb} = 1; |
502 |
|
$loadExternalDatabase->Put($xdb); |
503 |
|
} |
504 |
|
} |
505 |
} |
} |
506 |
Trace("Assignment for $featureID is: $assignment") if T(4); |
Trace("Assignment for $featureID is: $assignment") if T(4); |
507 |
# Break the assignment into words and shove it onto the |
# Break the assignment into words and shove it onto the |
529 |
} |
} |
530 |
# Now we need to find the subsystems this feature participates in. |
# Now we need to find the subsystems this feature participates in. |
531 |
# We also add the subsystems to the keyword list. Before we do that, |
# We also add the subsystems to the keyword list. Before we do that, |
532 |
# we must convert underscores to spaces and tack on the classifications. |
# we must convert underscores to spaces. |
533 |
my @subsystems = $fig->peg_to_subsystems($featureID); |
my @subsystems = $fig->peg_to_subsystems($featureID); |
534 |
for my $subsystem (@subsystems) { |
for my $subsystem (@subsystems) { |
535 |
# Only proceed if we like this subsystem. |
# Only proceed if we like this subsystem. |
578 |
push @keywords, 'iedb'; |
push @keywords, 'iedb'; |
579 |
$loadFeature->Add('iedb'); |
$loadFeature->Add('iedb'); |
580 |
} |
} |
581 |
# Now we need to bust up hyphenated words in the keyword |
# Now we have some other attributes we need to process. To get |
582 |
# list. We keep them separate and put them at the end so |
# through them, we convert the attribute list for this feature |
583 |
# the original word order is available. |
# into a two-layer hash: key => subkey => value. |
584 |
my $keywordString = ""; |
my %attributeHash = (); |
585 |
my $bustedString = ""; |
for my $attrRow (@{$attributes->{$featureID}}) { |
586 |
for my $keyword (@keywords) { |
my (undef, $key, @values) = @{$attrRow}; |
587 |
if (length $keyword >= 3) { |
my ($realKey, $subKey); |
588 |
$keywordString .= " $keyword"; |
if ($key =~ /^([^:]+)::(.+)/) { |
589 |
if ($keyword =~ /-/) { |
($realKey, $subKey) = ($1, $2); |
590 |
my @words = split /-/, $keyword; |
} else { |
591 |
$bustedString .= join(" ", "", @words); |
($realKey, $subKey) = ($key, ""); |
592 |
} |
} |
593 |
|
if (exists $attributeHash{$1}) { |
594 |
|
$attributeHash{$1}->{$2} = \@values; |
595 |
|
} else { |
596 |
|
$attributeHash{$1} = {$2 => \@values}; |
597 |
|
} |
598 |
|
} |
599 |
|
# First we handle CDD. This is a bit complicated, because |
600 |
|
# there are multiple CDDs per protein. |
601 |
|
if (exists $attributeHash{CDD}) { |
602 |
|
# Get the hash of CDD IDs to scores for this feature. We |
603 |
|
# already know it exists because of the above IF. |
604 |
|
my $cddHash = $attributeHash{CDD}; |
605 |
|
my @cddData = sort keys %{$cddHash}; |
606 |
|
for my $cdd (@cddData) { |
607 |
|
# Extract the score for this CDD and decode it. |
608 |
|
my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]); |
609 |
|
my $realScore = FIGRules::DecodeScore($codeScore); |
610 |
|
# We can't afford to crash because of a bad attribute |
611 |
|
# value, hence the IF below. |
612 |
|
if (! defined($realScore)) { |
613 |
|
# Bad score, so count it. |
614 |
|
$loadFeature->Add('badCDDscore'); |
615 |
|
Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(3); |
616 |
|
} else { |
617 |
|
# Create the connection. |
618 |
|
$loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore); |
619 |
|
# If this CDD does not yet exist, create its record. |
620 |
|
if (! exists $CDD{$cdd}) { |
621 |
|
$CDD{$cdd} = 1; |
622 |
|
$loadCDD->Put($cdd); |
623 |
|
} |
624 |
|
} |
625 |
|
} |
626 |
|
} |
627 |
|
# Next we do PSORT cell locations. here the confidence value |
628 |
|
# could have the value "unknown", which we translate to -1. |
629 |
|
if (exists $attributeHash{PSORT}) { |
630 |
|
# This will be a hash of cell locations to confidence |
631 |
|
# factors. |
632 |
|
my $psortHash = $attributeHash{PSORT}; |
633 |
|
for my $psort (keys %{$psortHash}) { |
634 |
|
# Get the confidence, and convert it to a number if necessary. |
635 |
|
my $confidence = $psortHash->{$psort}; |
636 |
|
if ($confidence eq 'unknown') { |
637 |
|
$confidence = -1; |
638 |
|
} |
639 |
|
$loadIsPossiblePlaceFor->Put($psort, $featureID, $confidence); |
640 |
|
# If this cell location does not yet exist, create its record. |
641 |
|
if (! exists $cellLocation{$psort}) { |
642 |
|
$cellLocation{$psort} = 1; |
643 |
|
$loadCellLocation->Put($psort); |
644 |
|
} |
645 |
|
# If this is a significant location, add it as a keyword. |
646 |
|
if ($confidence > 2.5) { |
647 |
|
push @keywords, $psort; |
648 |
|
} |
649 |
|
} |
650 |
|
} |
651 |
|
# Phobius data is next. This consists of the signal peptide location and |
652 |
|
# the transmembrane locations. |
653 |
|
my $signalList = ""; |
654 |
|
my $transList = ""; |
655 |
|
if (exists $attributeHash{Phobius}) { |
656 |
|
# This will be a hash of two keys (transmembrane and signal) to |
657 |
|
# location strings. If there's no value, we stuff in an empty string. |
658 |
|
$signalList = ($attributeHash{Phobius}->{signal} || ""); |
659 |
|
$transList = ($attributeHash{Phobius}->{transmembrane} || ""); |
660 |
|
} |
661 |
|
# Here are some more numbers: isoelectric point, molecular weight, and |
662 |
|
# the similar-to-human flag. |
663 |
|
my $isoelectric = 0; |
664 |
|
if (exists $attributeHash{isoelectric_point}) { |
665 |
|
$isoelectric = $attributeHash{isoelectric_point}->{""}; |
666 |
|
} |
667 |
|
my $similarToHuman = 0; |
668 |
|
if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""} eq 'yes') { |
669 |
|
$similarToHuman = 1; |
670 |
|
} |
671 |
|
my $molecularWeight = 0; |
672 |
|
if (exists $attributeHash{molecular_weight}) { |
673 |
|
$molecularWeight = $attributeHash{molecular_weight}->{""}; |
674 |
|
} |
675 |
|
# Create the keyword string. |
676 |
|
my $keywordString = join(" ", @keywords); |
677 |
|
Trace("Real keyword string for $featureID: $keywordString.") if T(4); |
678 |
|
# Get rid of annoying punctuation. |
679 |
|
$keywordString =~ s/[();@#\/]/ /g; |
680 |
|
# Get the list of keywords in the keyword string. |
681 |
|
my @realKeywords = grep { $biowords->IsWord($_) } $biowords->Split($keywordString); |
682 |
|
# We need to do two things here: create the keyword string for the feature table |
683 |
|
# and write records to the keyword and stem files. The stuff we write to |
684 |
|
# the files will be taken from the following two hashes. The stuff used |
685 |
|
# to create the keyword string will be taken from the list. |
686 |
|
my (%keys, %stems, @realStems); |
687 |
|
for my $keyword (@realKeywords) { |
688 |
|
# Compute the stem and phonex for this keyword. |
689 |
|
my ($stem, $phonex) = $biowords->StemLookup($keyword); |
690 |
|
# Only proceed if a stem comes back. If no stem came back, it's a |
691 |
|
# stop word and we throw it away. |
692 |
|
if ($stem) { |
693 |
|
$keys{$keyword} = $stem; |
694 |
|
$stems{$stem} = $phonex; |
695 |
|
push @realStems, $stem; |
696 |
} |
} |
697 |
} |
} |
698 |
$keywordString .= $bustedString; |
# Now create the keyword string. |
699 |
# Get rid of annoying punctuation. |
my $cleanWords = join(" ", @realStems); |
|
$keywordString =~ s/[();]//g; |
|
|
# Clean the keyword list. |
|
|
my $cleanWords = $sprout->CleanKeywords($keywordString); |
|
700 |
Trace("Keyword string for $featureID: $cleanWords") if T(4); |
Trace("Keyword string for $featureID: $cleanWords") if T(4); |
701 |
# Create the feature record. |
# Write the stem and keyword records. |
702 |
$loadFeature->Put($featureID, 1, $user, $quality, $type, $assignment, $cleanWords); |
for my $stem (keys %stems) { |
703 |
|
Tracer::PutLine($stemh, [$stem, $stems{$stem}]); |
704 |
|
} |
705 |
|
for my $key (keys %keys) { |
706 |
|
# The stem goes first in this file, because we want to sort |
707 |
|
# by stem and then keyword. |
708 |
|
Tracer::PutLine($keyh, [$keys{$key}, $key]); |
709 |
|
} |
710 |
|
# Now we need to process the feature's locations. First, we split them up. |
711 |
|
my @locationList = split /\s*,\s*/, $locations; |
712 |
|
# Next, we convert them to Sprout location objects. |
713 |
|
my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList; |
714 |
|
# Assemble them into a sprout location string for later. |
715 |
|
my $locationString = join(", ", map { $_->String } @locObjectList); |
716 |
|
# We'll store the sequence length in here. |
717 |
|
my $sequenceLength = 0; |
718 |
# This part is the roughest. We need to relate the features to contig |
# This part is the roughest. We need to relate the features to contig |
719 |
# locations, and the locations must be split so that none of them exceed |
# locations, and the locations must be split so that none of them exceed |
720 |
# the maximum segment size. This simplifies the genes_in_region processing |
# the maximum segment size. This simplifies the genes_in_region processing |
721 |
# for Sprout. |
# for Sprout. To start, we create the location position indicator. |
|
my @locationList = split /\s*,\s*/, $locations; |
|
|
# Create the location position indicator. |
|
722 |
my $i = 1; |
my $i = 1; |
723 |
# Loop through the locations. |
# Loop through the locations. |
724 |
for my $location (@locationList) { |
for my $locObject (@locObjectList) { |
725 |
# Parse the location. |
# Record the length. |
726 |
my $locObject = BasicLocation->new("$genomeID:$location"); |
$sequenceLength += $locObject->Length; |
727 |
# Split it into a list of chunks. |
# Split this location into a list of chunks. |
728 |
my @locOList = (); |
my @locOList = (); |
729 |
while (my $peeling = $locObject->Peel($chunkSize)) { |
while (my $peeling = $locObject->Peel($chunkSize)) { |
730 |
$loadIsLocatedIn->Add("peeling"); |
$loadIsLocatedIn->Add("peeling"); |
739 |
$i++; |
$i++; |
740 |
} |
} |
741 |
} |
} |
742 |
|
# Now we get some ancillary flags. |
743 |
|
my $locked = $fig->is_locked_fid($featureID); |
744 |
|
my $in_genbank = $fig->peg_in_gendb($featureID); |
745 |
|
# Create the feature record. |
746 |
|
$loadFeature->Put($featureID, 1, $user, $quality, $type, $in_genbank, $isoelectric, $locked, $molecularWeight, |
747 |
|
$sequenceLength, $signalList, $similarToHuman, $assignment, $cleanWords, $locationString, |
748 |
|
$transList); |
749 |
|
} |
750 |
|
} |
751 |
|
Trace("Genome $genomeID processed.") if T(3); |
752 |
|
} |
753 |
|
} |
754 |
|
Trace("Sorting keywords.") if T(2); |
755 |
|
# Now we need to load the keyword table from the key and stem files. |
756 |
|
close $keyh; |
757 |
|
close $stemh; |
758 |
|
Trace("Loading keywords.") if T(2); |
759 |
|
$keyh = Open(undef, "<$keyFileName"); |
760 |
|
$stemh = Open(undef, "<$stemFileName"); |
761 |
|
# We'll count the keywords in here, for tracing purposes. |
762 |
|
my $count = 0; |
763 |
|
# These variables track the current stem's data. When an incoming |
764 |
|
# keyword's stem changes, these will be recomputed. |
765 |
|
my ($currentStem, $currentPhonex, $currentCount); |
766 |
|
# Prime the loop by reading the first stem in the stem file. |
767 |
|
my ($nextStem, $nextPhonex) = Tracer::GetLine($stemh); |
768 |
|
# Loop through the keyword file. |
769 |
|
while (! eof $keyh) { |
770 |
|
# Read this keyword. |
771 |
|
my ($thisStem, $thisKey) = Tracer::GetLine($keyh); |
772 |
|
# Check to see if it's the new stem yet. |
773 |
|
if ($thisStem ne $currentStem) { |
774 |
|
# Yes. It's a terrible error if it's not also the next stem. |
775 |
|
if ($thisStem ne $nextStem) { |
776 |
|
Confess("Error in stem file. Expected \"$nextStem\", but found \"$thisStem\"."); |
777 |
|
} else { |
778 |
|
# Here we're okay. |
779 |
|
($currentStem, $currentPhonex) = ($nextStem, $nextPhonex); |
780 |
|
# Count the number of features for this stem. |
781 |
|
$currentCount = 0; |
782 |
|
while ($nextStem eq $thisStem) { |
783 |
|
($nextStem, $nextPhonex) = Tracer::GetLine($stemh); |
784 |
|
$currentCount++; |
785 |
|
} |
786 |
} |
} |
787 |
} |
} |
788 |
|
# Now $currentStem is the same as $thisStem, and the other $current-vars |
789 |
|
# contain the stem's data (phonex and count). |
790 |
|
$loadKeyword->Put($thisKey, $currentCount, $currentPhonex, $currentStem); |
791 |
|
if (++$count % 1000 == 0 && T(3)) { |
792 |
|
Trace("$count keywords loaded."); |
793 |
} |
} |
794 |
} |
} |
795 |
|
Trace("$count keywords loaded into keyword table.") if T(2); |
796 |
# Finish the loads. |
# Finish the loads. |
797 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
798 |
return $retVal; |
return $retVal; |
800 |
|
|
801 |
=head3 LoadSubsystemData |
=head3 LoadSubsystemData |
802 |
|
|
803 |
C<< my $stats = $spl->LoadSubsystemData(); >> |
my $stats = $spl->LoadSubsystemData(); |
804 |
|
|
805 |
Load the subsystem data from FIG into Sprout. |
Load the subsystem data from FIG into Sprout. |
806 |
|
|
816 |
SubsystemClass |
SubsystemClass |
817 |
Role |
Role |
818 |
RoleEC |
RoleEC |
819 |
|
IsIdentifiedByEC |
820 |
SSCell |
SSCell |
821 |
ContainsFeature |
ContainsFeature |
822 |
IsGenomeOf |
IsGenomeOf |
830 |
ConsistsOfGenomes |
ConsistsOfGenomes |
831 |
GenomeSubset |
GenomeSubset |
832 |
HasGenomeSubset |
HasGenomeSubset |
|
Catalyzes |
|
833 |
Diagram |
Diagram |
834 |
RoleOccursIn |
RoleOccursIn |
835 |
|
SubsystemHopeNotes |
836 |
|
|
837 |
=over 4 |
=over 4 |
838 |
|
|
858 |
# Get the map list. |
# Get the map list. |
859 |
my @maps = $fig->all_maps; |
my @maps = $fig->all_maps; |
860 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
861 |
my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly); |
my $loadDiagram = $self->_TableLoader('Diagram'); |
862 |
my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly); |
my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn'); |
863 |
my $loadSubsystem = $self->_TableLoader('Subsystem'); |
my $loadSubsystem = $self->_TableLoader('Subsystem'); |
864 |
my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly); |
my $loadRole = $self->_TableLoader('Role'); |
865 |
my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly); |
my $loadRoleEC = $self->_TableLoader('RoleEC'); |
866 |
my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly); |
my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC'); |
867 |
my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly); |
my $loadCatalyzes = $self->_TableLoader('Catalyzes'); |
868 |
my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly); |
my $loadSSCell = $self->_TableLoader('SSCell'); |
869 |
my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly); |
my $loadContainsFeature = $self->_TableLoader('ContainsFeature'); |
870 |
my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly); |
my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf'); |
871 |
my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly); |
my $loadIsRoleOf = $self->_TableLoader('IsRoleOf'); |
872 |
my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly); |
my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem'); |
873 |
my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly); |
my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn'); |
874 |
my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly); |
my $loadHasSSCell = $self->_TableLoader('HasSSCell'); |
875 |
my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly); |
my $loadRoleSubset = $self->_TableLoader('RoleSubset'); |
876 |
my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly); |
my $loadGenomeSubset = $self->_TableLoader('GenomeSubset'); |
877 |
my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly); |
my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles'); |
878 |
my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly); |
my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes'); |
879 |
my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly); |
my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset'); |
880 |
my $loadSubsystemClass = $self->_TableLoader('SubsystemClass', $self->PrimaryOnly); |
my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset'); |
881 |
|
my $loadSubsystemClass = $self->_TableLoader('SubsystemClass'); |
882 |
|
my $loadSubsystemHopeNotes = $self->_TableLoader('SubsystemHopeNotes'); |
883 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
884 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
885 |
} else { |
} else { |
886 |
Trace("Generating subsystem data.") if T(2); |
Trace("Generating subsystem data.") if T(2); |
887 |
# This hash will contain the role for each EC. When we're done, this |
# This hash will contain the roles for each EC. When we're done, this |
888 |
# information will be used to generate the Catalyzes table. |
# information will be used to generate the Catalyzes table. |
889 |
my %ecToRoles = (); |
my %ecToRoles = (); |
890 |
# Loop through the subsystems. Our first task will be to create the |
# Loop through the subsystems. Our first task will be to create the |
904 |
# Create the subsystem record. |
# Create the subsystem record. |
905 |
my $curator = $sub->get_curator(); |
my $curator = $sub->get_curator(); |
906 |
my $notes = $sub->get_notes(); |
my $notes = $sub->get_notes(); |
907 |
$loadSubsystem->Put($subsysID, $curator, $notes); |
my $version = $sub->get_version(); |
908 |
|
my $description = $sub->get_description(); |
909 |
|
$loadSubsystem->Put($subsysID, $curator, $version, $description, $notes); |
910 |
|
# Add the hope notes. |
911 |
|
my $hopeNotes = $sub->get_hope_curation_notes(); |
912 |
|
if ($hopeNotes) { |
913 |
|
$loadSubsystemHopeNotes->Put($sub, $hopeNotes); |
914 |
|
} |
915 |
# Now for the classification string. This comes back as a list |
# Now for the classification string. This comes back as a list |
916 |
# reference and we convert it to a space-delimited string. |
# reference and we convert it to a space-delimited string. |
917 |
my $classList = $fig->subsystem_classification($subsysID); |
my $classList = $fig->subsystem_classification($subsysID); |
919 |
$loadSubsystemClass->Put($subsysID, $classString); |
$loadSubsystemClass->Put($subsysID, $classString); |
920 |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
921 |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
922 |
|
# Get the role's abbreviation. |
923 |
|
my $abbr = $sub->get_role_abbr($col); |
924 |
|
# Get its essentiality. |
925 |
|
my $aux = $fig->is_aux_role_in_subsystem($subsysID, $roleID); |
926 |
|
# Get its reaction note. |
927 |
|
my $hope_note = $sub->get_hope_reaction_notes($roleID) || ""; |
928 |
# Connect to this role. |
# Connect to this role. |
929 |
$loadOccursInSubsystem->Add("roleIn"); |
$loadOccursInSubsystem->Add("roleIn"); |
930 |
$loadOccursInSubsystem->Put($roleID, $subsysID, $col); |
$loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $aux, $col, $hope_note); |
931 |
# If it's a new role, add it to the role table. |
# If it's a new role, add it to the role table. |
932 |
if (! exists $roleData{$roleID}) { |
if (! exists $roleData{$roleID}) { |
933 |
# Get the role's abbreviation. |
# Get the role's abbreviation. |
|
my $abbr = $sub->get_role_abbr($col); |
|
934 |
# Add the role. |
# Add the role. |
935 |
$loadRole->Put($roleID, $abbr); |
$loadRole->Put($roleID); |
936 |
$roleData{$roleID} = 1; |
$roleData{$roleID} = 1; |
937 |
# Check for an EC number. |
# Check for an EC number. |
938 |
if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) { |
if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) { |
939 |
my $ec = $1; |
my $ec = $1; |
940 |
$loadRoleEC->Put($roleID, $ec); |
$loadIsIdentifiedByEC->Put($roleID, $ec); |
941 |
$ecToRoles{$ec} = $roleID; |
# Check to see if this is our first encounter with this EC. |
942 |
|
if (exists $ecToRoles{$ec}) { |
943 |
|
# No, so just add this role to the EC list. |
944 |
|
push @{$ecToRoles{$ec}}, $roleID; |
945 |
|
} else { |
946 |
|
# Output this EC. |
947 |
|
$loadRoleEC->Put($ec); |
948 |
|
# Create its role list. |
949 |
|
$ecToRoles{$ec} = [$roleID]; |
950 |
|
} |
951 |
} |
} |
952 |
} |
} |
953 |
} |
} |
1060 |
# Now we need to link all the map's roles to it. |
# Now we need to link all the map's roles to it. |
1061 |
# A hash is used to prevent duplicates. |
# A hash is used to prevent duplicates. |
1062 |
my %roleHash = (); |
my %roleHash = (); |
1063 |
for my $role ($fig->map_to_ecs($map)) { |
for my $ec ($fig->map_to_ecs($map)) { |
1064 |
if (exists $ecToRoles{$role} && ! $roleHash{$role}) { |
if (exists $ecToRoles{$ec}) { |
1065 |
$loadRoleOccursIn->Put($ecToRoles{$role}, $map); |
for my $role (@{$ecToRoles{$ec}}) { |
1066 |
|
if (! $roleHash{$role}) { |
1067 |
|
$loadRoleOccursIn->Put($role, $map); |
1068 |
$roleHash{$role} = 1; |
$roleHash{$role} = 1; |
1069 |
} |
} |
1070 |
} |
} |
1071 |
} |
} |
|
# Before we leave, we must create the Catalyzes table. We start with the reactions, |
|
|
# then use the "ecToRoles" table to convert EC numbers to role IDs. |
|
|
my @reactions = $fig->all_reactions(); |
|
|
for my $reactionID (@reactions) { |
|
|
# Get this reaction's list of roles. The results will be EC numbers. |
|
|
my @roles = $fig->catalyzed_by($reactionID); |
|
|
# Loop through the roles, creating catalyzation records. |
|
|
for my $thisRole (@roles) { |
|
|
if (exists $ecToRoles{$thisRole}) { |
|
|
$loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID); |
|
|
} |
|
1072 |
} |
} |
1073 |
} |
} |
1074 |
} |
} |
1079 |
|
|
1080 |
=head3 LoadPropertyData |
=head3 LoadPropertyData |
1081 |
|
|
1082 |
C<< my $stats = $spl->LoadPropertyData(); >> |
my $stats = $spl->LoadPropertyData(); |
1083 |
|
|
1084 |
Load the attribute data from FIG into Sprout. |
Load the attribute data from FIG into Sprout. |
1085 |
|
|
1115 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
1116 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1117 |
my $loadProperty = $self->_TableLoader('Property'); |
my $loadProperty = $self->_TableLoader('Property'); |
1118 |
my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly); |
my $loadHasProperty = $self->_TableLoader('HasProperty'); |
1119 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
1120 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
1121 |
} else { |
} else { |
1124 |
my %propertyKeys = (); |
my %propertyKeys = (); |
1125 |
my $nextID = 1; |
my $nextID = 1; |
1126 |
# Get the attributes we intend to store in the property table. |
# Get the attributes we intend to store in the property table. |
1127 |
my @propKeys = $fig->get_group_keys("NMPDR"); |
my $propKeys = $self->{propKeys}; |
1128 |
# Loop through the genomes. |
# Loop through the genomes. |
1129 |
for my $genomeID (sort keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
1130 |
$loadProperty->Add("genomeIn"); |
$loadProperty->Add("genomeIn"); |
1132 |
# Initialize a counter. |
# Initialize a counter. |
1133 |
my $propertyCount = 0; |
my $propertyCount = 0; |
1134 |
# Get the properties for this genome's features. |
# Get the properties for this genome's features. |
1135 |
my @attributes = $fig->get_attributes("fig|$genomeID%", \@propKeys); |
my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys); |
1136 |
Trace("Property list built for $genomeID.") if T(3); |
Trace("Property list built for $genomeID.") if T(3); |
1137 |
# Loop through the results, creating HasProperty records. |
# Loop through the results, creating HasProperty records. |
1138 |
for my $attributeData (@attributes) { |
for my $attributeData (@attributes) { |
1167 |
|
|
1168 |
=head3 LoadAnnotationData |
=head3 LoadAnnotationData |
1169 |
|
|
1170 |
C<< my $stats = $spl->LoadAnnotationData(); >> |
my $stats = $spl->LoadAnnotationData(); |
1171 |
|
|
1172 |
Load the annotation data from FIG into Sprout. |
Load the annotation data from FIG into Sprout. |
1173 |
|
|
1201 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
1202 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1203 |
my $loadAnnotation = $self->_TableLoader('Annotation'); |
my $loadAnnotation = $self->_TableLoader('Annotation'); |
1204 |
my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly); |
my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation'); |
1205 |
my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly); |
my $loadSproutUser = $self->_TableLoader('SproutUser'); |
1206 |
my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly); |
my $loadUserAccess = $self->_TableLoader('UserAccess'); |
1207 |
my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly); |
my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation'); |
1208 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
1209 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
1210 |
} else { |
} else { |
1274 |
|
|
1275 |
=head3 LoadSourceData |
=head3 LoadSourceData |
1276 |
|
|
1277 |
C<< my $stats = $spl->LoadSourceData(); >> |
my $stats = $spl->LoadSourceData(); |
1278 |
|
|
1279 |
Load the source data from FIG into Sprout. |
Load the source data from FIG into Sprout. |
1280 |
|
|
1308 |
# Get the genome hash. |
# Get the genome hash. |
1309 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
1310 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1311 |
my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly); |
my $loadComesFrom = $self->_TableLoader('ComesFrom'); |
1312 |
my $loadSource = $self->_TableLoader('Source'); |
my $loadSource = $self->_TableLoader('Source'); |
1313 |
my $loadSourceURL = $self->_TableLoader('SourceURL'); |
my $loadSourceURL = $self->_TableLoader('SourceURL'); |
1314 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
1352 |
|
|
1353 |
=head3 LoadExternalData |
=head3 LoadExternalData |
1354 |
|
|
1355 |
C<< my $stats = $spl->LoadExternalData(); >> |
my $stats = $spl->LoadExternalData(); |
1356 |
|
|
1357 |
Load the external data from FIG into Sprout. |
Load the external data from FIG into Sprout. |
1358 |
|
|
1432 |
|
|
1433 |
=head3 LoadReactionData |
=head3 LoadReactionData |
1434 |
|
|
1435 |
C<< my $stats = $spl->LoadReactionData(); >> |
my $stats = $spl->LoadReactionData(); |
1436 |
|
|
1437 |
Load the reaction data from FIG into Sprout. |
Load the reaction data from FIG into Sprout. |
1438 |
|
|
1445 |
Compound |
Compound |
1446 |
CompoundName |
CompoundName |
1447 |
CompoundCAS |
CompoundCAS |
1448 |
|
IsIdentifiedByCAS |
1449 |
|
HasCompoundName |
1450 |
IsAComponentOf |
IsAComponentOf |
1451 |
|
Scenario |
1452 |
|
Catalyzes |
1453 |
|
HasScenario |
1454 |
|
IsInputFor |
1455 |
|
IsOutputOf |
1456 |
|
ExcludesReaction |
1457 |
|
IncludesReaction |
1458 |
|
IsOnDiagram |
1459 |
|
IncludesReaction |
1460 |
|
|
1461 |
This method proceeds reaction by reaction rather than genome by genome. |
This method proceeds reaction by reaction rather than genome by genome. |
1462 |
|
|
1477 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
1478 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1479 |
my $loadReaction = $self->_TableLoader('Reaction'); |
my $loadReaction = $self->_TableLoader('Reaction'); |
1480 |
my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly); |
my $loadReactionURL = $self->_TableLoader('ReactionURL'); |
1481 |
my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly); |
my $loadCompound = $self->_TableLoader('Compound'); |
1482 |
my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly); |
my $loadCompoundName = $self->_TableLoader('CompoundName'); |
1483 |
my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly); |
my $loadCompoundCAS = $self->_TableLoader('CompoundCAS'); |
1484 |
my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly); |
my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf'); |
1485 |
|
my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS'); |
1486 |
|
my $loadHasCompoundName = $self->_TableLoader('HasCompoundName'); |
1487 |
|
my $loadScenario = $self->_TableLoader('Scenario'); |
1488 |
|
my $loadHasScenario = $self->_TableLoader('HasScenario'); |
1489 |
|
my $loadIsInputFor = $self->_TableLoader('IsInputFor'); |
1490 |
|
my $loadIsOutputOf = $self->_TableLoader('IsOutputOf'); |
1491 |
|
my $loadIsOnDiagram = $self->_TableLoader('IsOnDiagram'); |
1492 |
|
my $loadIncludesReaction = $self->_TableLoader('IncludesReaction'); |
1493 |
|
my $loadExcludesReaction = $self->_TableLoader('ExcludesReaction'); |
1494 |
|
my $loadCatalyzes = $self->_TableLoader('Catalyzes'); |
1495 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
1496 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
1497 |
} else { |
} else { |
1498 |
Trace("Generating annotation data.") if T(2); |
Trace("Generating reaction data.") if T(2); |
1499 |
|
# We need some hashes to prevent duplicates. |
1500 |
|
my %compoundNames = (); |
1501 |
|
my %compoundCASes = (); |
1502 |
# First we create the compounds. |
# First we create the compounds. |
1503 |
my @compounds = $fig->all_compounds(); |
my %compounds = map { $_ => 1 } $fig->all_compounds(); |
1504 |
for my $cid (@compounds) { |
for my $cid (keys %compounds) { |
1505 |
# Check for names. |
# Check for names. |
1506 |
my @names = $fig->names_of_compound($cid); |
my @names = $fig->names_of_compound($cid); |
1507 |
# Each name will be given a priority number, starting with 1. |
# Each name will be given a priority number, starting with 1. |
1508 |
my $prio = 1; |
my $prio = 1; |
1509 |
for my $name (@names) { |
for my $name (@names) { |
1510 |
$loadCompoundName->Put($cid, $name, $prio++); |
if (! exists $compoundNames{$name}) { |
1511 |
|
$loadCompoundName->Put($name); |
1512 |
|
$compoundNames{$name} = 1; |
1513 |
|
} |
1514 |
|
$loadHasCompoundName->Put($cid, $name, $prio++); |
1515 |
} |
} |
1516 |
# Create the main compound record. Note that the first name |
# Create the main compound record. Note that the first name |
1517 |
# becomes the label. |
# becomes the label. |
1520 |
# Check for a CAS ID. |
# Check for a CAS ID. |
1521 |
my $cas = $fig->cas($cid); |
my $cas = $fig->cas($cid); |
1522 |
if ($cas) { |
if ($cas) { |
1523 |
$loadCompoundCAS->Put($cid, $cas); |
$loadIsIdentifiedByCAS->Put($cid, $cas); |
1524 |
|
if (! exists $compoundCASes{$cas}) { |
1525 |
|
$loadCompoundCAS->Put($cas); |
1526 |
|
$compoundCASes{$cas} = 1; |
1527 |
|
} |
1528 |
} |
} |
1529 |
} |
} |
1530 |
# All the compounds are set up, so we need to loop through the reactions next. First, |
# All the compounds are set up, so we need to loop through the reactions next. First, |
1531 |
# we initialize the discriminator index. This is a single integer used to insure |
# we initialize the discriminator index. This is a single integer used to insure |
1532 |
# duplicate elements in a reaction are not accidentally collapsed. |
# duplicate elements in a reaction are not accidentally collapsed. |
1533 |
my $discrim = 0; |
my $discrim = 0; |
1534 |
my @reactions = $fig->all_reactions(); |
my %reactions = map { $_ => 1 } $fig->all_reactions(); |
1535 |
for my $reactionID (@reactions) { |
for my $reactionID (keys %reactions) { |
1536 |
# Create the reaction record. |
# Create the reaction record. |
1537 |
$loadReaction->Put($reactionID, $fig->reversible($reactionID)); |
$loadReaction->Put($reactionID, $fig->reversible($reactionID)); |
1538 |
# Compute the reaction's URL. |
# Compute the reaction's URL. |
1555 |
} |
} |
1556 |
} |
} |
1557 |
} |
} |
1558 |
|
# Now we run through the subsystems and roles, generating the scenarios |
1559 |
|
# and connecting the reactions. We'll need some hashes to prevent |
1560 |
|
# duplicates and a counter for compound group keys. |
1561 |
|
my %roles = (); |
1562 |
|
my %scenarios = (); |
1563 |
|
my @subsystems = $fig->all_subsystems(); |
1564 |
|
for my $subName (@subsystems) { |
1565 |
|
my $sub = $fig->get_subsystem($subName); |
1566 |
|
Trace("Processing $subName reactions.") if T(3); |
1567 |
|
# Get the subsystem's reactions. |
1568 |
|
my %reactions = $sub->get_hope_reactions(); |
1569 |
|
# Loop through the roles, connecting them to the reactions. |
1570 |
|
for my $role (keys %reactions) { |
1571 |
|
# Only process this role if it is new. |
1572 |
|
if (! $roles{$role}) { |
1573 |
|
$roles{$role} = 1; |
1574 |
|
my @reactions = @{$reactions{$role}}; |
1575 |
|
for my $reaction (@reactions) { |
1576 |
|
$loadCatalyzes->Put($role, $reaction); |
1577 |
|
} |
1578 |
|
} |
1579 |
|
} |
1580 |
|
Trace("Processing $subName scenarios.") if T(3); |
1581 |
|
# Get the subsystem's scenarios. |
1582 |
|
my @scenarioNames = $sub->get_hope_scenario_names(); |
1583 |
|
# Loop through the scenarios, creating scenario data. |
1584 |
|
for my $scenarioName (@scenarioNames) { |
1585 |
|
# Link this scenario to this subsystem. |
1586 |
|
$loadHasScenario->Put($subName, $scenarioName); |
1587 |
|
# If this scenario is new, we need to create it. |
1588 |
|
if (! $scenarios{$scenarioName}) { |
1589 |
|
Trace("Creating scenario $scenarioName.") if T(3); |
1590 |
|
$scenarios{$scenarioName} = 1; |
1591 |
|
# Create the scenario itself. |
1592 |
|
$loadScenario->Put($scenarioName); |
1593 |
|
# Attach the input compounds. |
1594 |
|
for my $input ($sub->get_hope_input_compounds($scenarioName)) { |
1595 |
|
$loadIsInputFor->Put($input, $scenarioName); |
1596 |
|
} |
1597 |
|
# Now we need to set up the output compounds. They come in two |
1598 |
|
# groups, which we mark 0 and 1. |
1599 |
|
my $outputGroup = 0; |
1600 |
|
# Set up the output compounds. |
1601 |
|
for my $outputGroup ($sub->get_hope_output_compounds($scenarioName)) { |
1602 |
|
# Attach the compounds. |
1603 |
|
for my $compound (@$outputGroup) { |
1604 |
|
$loadIsOutputOf->Put($scenarioName, $compound, $outputGroup); |
1605 |
|
} |
1606 |
|
} |
1607 |
|
# Create the reaction lists. |
1608 |
|
my @addReactions = $sub->get_hope_additional_reactions($scenarioName); |
1609 |
|
for my $reaction (@addReactions) { |
1610 |
|
$loadIncludesReaction->Put($scenarioName, $reaction); |
1611 |
|
} |
1612 |
|
my @notReactions = $sub->get_hope_ignore_reactions($scenarioName); |
1613 |
|
for my $reaction (@notReactions) { |
1614 |
|
$loadExcludesReaction->Put($scenarioName, $reaction); |
1615 |
|
} |
1616 |
|
# Link the maps. |
1617 |
|
my @maps = $sub->get_hope_map_ids($scenarioName); |
1618 |
|
for my $map (@maps) { |
1619 |
|
$loadIsOnDiagram->Put($scenarioName, "map$map"); |
1620 |
|
} |
1621 |
|
} |
1622 |
} |
} |
|
# Finish the load. |
|
|
my $retVal = $self->_FinishAll(); |
|
|
return $retVal; |
|
1623 |
} |
} |
|
|
|
|
=head3 LoadGroupData |
|
|
|
|
|
C<< my $stats = $spl->LoadGroupData(); >> |
|
|
|
|
|
Load the genome Groups into Sprout. |
|
|
|
|
|
The following relations are loaded by this method. |
|
|
|
|
|
GenomeGroups |
|
|
|
|
|
Currently, we do not use groups. We used to use them for NMPDR groups, |
|
|
butThere is no direct support for genome groups in FIG, so we access the SEED |
|
|
files directly. |
|
|
|
|
|
=over 4 |
|
|
|
|
|
=item RETURNS |
|
|
|
|
|
Returns a statistics object for the loads. |
|
|
|
|
|
=back |
|
|
|
|
|
=cut |
|
|
#: Return Type $%; |
|
|
sub LoadGroupData { |
|
|
# Get this object instance. |
|
|
my ($self) = @_; |
|
|
# Get the FIG object. |
|
|
my $fig = $self->{fig}; |
|
|
# Get the genome hash. |
|
|
my $genomeHash = $self->{genomes}; |
|
|
# Create a load object for the table we're loading. |
|
|
my $loadGenomeGroups = $self->_TableLoader('GenomeGroups'); |
|
|
if ($self->{options}->{loadOnly}) { |
|
|
Trace("Loading from existing files.") if T(2); |
|
|
} else { |
|
|
Trace("Generating group data.") if T(2); |
|
|
# Currently there are no groups. |
|
1624 |
} |
} |
1625 |
# Finish the load. |
# Finish the load. |
1626 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
1629 |
|
|
1630 |
=head3 LoadSynonymData |
=head3 LoadSynonymData |
1631 |
|
|
1632 |
C<< my $stats = $spl->LoadSynonymData(); >> |
my $stats = $spl->LoadSynonymData(); |
1633 |
|
|
1634 |
Load the synonym groups into Sprout. |
Load the synonym groups into Sprout. |
1635 |
|
|
1668 |
Trace("Generating synonym group data.") if T(2); |
Trace("Generating synonym group data.") if T(2); |
1669 |
# Get the database handle. |
# Get the database handle. |
1670 |
my $dbh = $fig->db_handle(); |
my $dbh = $fig->db_handle(); |
1671 |
# Ask for the synonyms. |
# Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias. |
1672 |
my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to"); |
my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to"); |
1673 |
my $result = $sth->execute(); |
my $result = $sth->execute(); |
1674 |
if (! defined($result)) { |
if (! defined($result)) { |
1675 |
Confess("Database error in Synonym load: " . $sth->errstr()); |
Confess("Database error in Synonym load: " . $sth->errstr()); |
1676 |
} else { |
} else { |
1677 |
|
Trace("Processing synonym results.") if T(2); |
1678 |
# Remember the current synonym. |
# Remember the current synonym. |
1679 |
my $current_syn = ""; |
my $current_syn = ""; |
1680 |
# Count the features. |
# Count the features. |
1681 |
my $featureCount = 0; |
my $featureCount = 0; |
1682 |
|
my $entryCount = 0; |
1683 |
# Loop through the synonym/peg pairs. |
# Loop through the synonym/peg pairs. |
1684 |
while (my @row = $sth->fetchrow()) { |
while (my @row = $sth->fetchrow()) { |
1685 |
# Get the synonym ID and feature ID. |
# Get the synonym group ID and feature ID. |
1686 |
my ($syn_id, $peg) = @row; |
my ($syn_id, $peg) = @row; |
1687 |
|
# Count this row. |
1688 |
|
$entryCount++; |
1689 |
|
if ($entryCount % 1000 == 0) { |
1690 |
|
Trace("$entryCount rows processed.") if T(3); |
1691 |
|
} |
1692 |
# Insure it's for one of our genomes. |
# Insure it's for one of our genomes. |
1693 |
my $genomeID = FIG::genome_of($peg); |
my $genomeID = FIG::genome_of($peg); |
1694 |
if (exists $genomeHash->{$genomeID}) { |
if (exists $genomeHash->{$genomeID}) { |
1707 |
} |
} |
1708 |
} |
} |
1709 |
} |
} |
1710 |
|
Trace("$entryCount rows produced $featureCount features.") if T(2); |
1711 |
} |
} |
1712 |
} |
} |
1713 |
# Finish the load. |
# Finish the load. |
1717 |
|
|
1718 |
=head3 LoadFamilyData |
=head3 LoadFamilyData |
1719 |
|
|
1720 |
C<< my $stats = $spl->LoadFamilyData(); >> |
my $stats = $spl->LoadFamilyData(); |
1721 |
|
|
1722 |
Load the protein families into Sprout. |
Load the protein families into Sprout. |
1723 |
|
|
1785 |
|
|
1786 |
=head3 LoadDrugData |
=head3 LoadDrugData |
1787 |
|
|
1788 |
C<< my $stats = $spl->LoadDrugData(); >> |
my $stats = $spl->LoadDrugData(); |
1789 |
|
|
1790 |
Load the drug target data into Sprout. |
Load the drug target data into Sprout. |
1791 |
|
|
1919 |
# Decode the score. |
# Decode the score. |
1920 |
my $realScore = FIGRules::DecodeScore($score); |
my $realScore = FIGRules::DecodeScore($score); |
1921 |
# Connect the PDB to the feature. |
# Connect the PDB to the feature. |
1922 |
$loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end); |
$loadIsProteinForFeature->Put($pdbID, $pdbData->[0], $start, $realScore, $end); |
1923 |
} |
} |
1924 |
} |
} |
1925 |
} |
} |
1984 |
|
|
1985 |
=head3 SpecialAttribute |
=head3 SpecialAttribute |
1986 |
|
|
1987 |
C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >> |
my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); |
1988 |
|
|
1989 |
Look for special attributes of a given type. A special attribute is found by comparing one of |
Look for special attributes of a given type. A special attribute is found by comparing one of |
1990 |
the columns of the incoming attribute list to a search pattern. If a match is found, then |
the columns of the incoming attribute list to a search pattern. If a match is found, then |
2074 |
|
|
2075 |
Name of the table (relation) being loaded. |
Name of the table (relation) being loaded. |
2076 |
|
|
|
=item ignore |
|
|
|
|
|
TRUE if the table should be ignored entirely, else FALSE. |
|
|
|
|
2077 |
=item RETURN |
=item RETURN |
2078 |
|
|
2079 |
Returns an ERDBLoad object for loading the specified table. |
Returns an ERDBLoad object for loading the specified table. |
2084 |
|
|
2085 |
sub _TableLoader { |
sub _TableLoader { |
2086 |
# Get the parameters. |
# Get the parameters. |
2087 |
my ($self, $tableName, $ignore) = @_; |
my ($self, $tableName) = @_; |
2088 |
# Create the load object. |
# Create the load object. |
2089 |
my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly, |
my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly); |
|
$ignore); |
|
2090 |
# Cache it in the loader list. |
# Cache it in the loader list. |
2091 |
push @{$self->{loaders}}, $retVal; |
push @{$self->{loaders}}, $retVal; |
2092 |
# Return it to the caller. |
# Return it to the caller. |
2160 |
|
|
2161 |
=head3 GetGenomeAttributes |
=head3 GetGenomeAttributes |
2162 |
|
|
2163 |
C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >> |
my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys); |
2164 |
|
|
2165 |
Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related |
Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related |
2166 |
attributes for all the features of a genome in a single call, then organizes them into |
attributes for all the features of a genome in a single call, then organizes them into |
2180 |
|
|
2181 |
Reference to a list of the feature IDs whose attributes are to be kept. |
Reference to a list of the feature IDs whose attributes are to be kept. |
2182 |
|
|
2183 |
|
=item propKeys |
2184 |
|
|
2185 |
|
A list of the keys to retrieve. |
2186 |
|
|
2187 |
=item RETURN |
=item RETURN |
2188 |
|
|
2189 |
Returns a reference to a hash. The key of the hash is the feature ID. The value is the |
Returns a reference to a hash. The key of the hash is the feature ID. The value is the |
2196 |
|
|
2197 |
sub GetGenomeAttributes { |
sub GetGenomeAttributes { |
2198 |
# Get the parameters. |
# Get the parameters. |
2199 |
my ($fig, $genomeID, $fids) = @_; |
my ($fig, $genomeID, $fids, $propKeys) = @_; |
2200 |
# Declare the return variable. |
# Declare the return variable. |
2201 |
my $retVal = {}; |
my $retVal = {}; |
|
# Get a list of the attributes we care about. |
|
|
my @propKeys = $fig->get_group_keys("NMPDR"); |
|
|
# Get the attributes. |
|
|
my @aList = $fig->get_attributes("fig|$genomeID%", \@propKeys); |
|
2202 |
# Initialize the hash. This not only enables us to easily determine which FIDs to |
# Initialize the hash. This not only enables us to easily determine which FIDs to |
2203 |
# keep, it insures that the caller sees a list reference for every known fid, |
# keep, it insures that the caller sees a list reference for every known fid, |
2204 |
# simplifying the logic. |
# simplifying the logic. |
2205 |
for my $fid (@{$fids}) { |
for my $fid (@{$fids}) { |
2206 |
$retVal->{$fid} = []; |
$retVal->{$fid} = []; |
2207 |
} |
} |
2208 |
# Populate the hash. |
# Get the attributes. If ev_code_cron is running, we may get a timeout error, so |
2209 |
|
# an eval is used. |
2210 |
|
my @aList = (); |
2211 |
|
eval { |
2212 |
|
@aList = $fig->get_attributes("fig|$genomeID%", $propKeys); |
2213 |
|
Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(3); |
2214 |
|
}; |
2215 |
|
# Check for a problem. |
2216 |
|
if ($@) { |
2217 |
|
Trace("Retrying attributes for $genomeID due to error: $@") if T(1); |
2218 |
|
# Our fallback plan is to process the attributes in blocks of 100. This is much slower, |
2219 |
|
# but allows us to continue processing. |
2220 |
|
my $nFids = scalar @{$fids}; |
2221 |
|
for (my $i = 0; $i < $nFids; $i += 100) { |
2222 |
|
# Determine the index of the last feature ID we'll be specifying on this pass. |
2223 |
|
# Normally it's $i + 99, but if we're close to the end it may be less. |
2224 |
|
my $end = ($i + 100 > $nFids ? $nFids - 1 : $i + 99); |
2225 |
|
# Get a slice of the fid list. |
2226 |
|
my @slice = @{$fids}[$i .. $end]; |
2227 |
|
# Get the relevant attributes. |
2228 |
|
Trace("Retrieving attributes for fids $i to $end.") if T(3); |
2229 |
|
my @aShort = $fig->get_attributes(\@slice, $propKeys); |
2230 |
|
Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(3); |
2231 |
|
push @aList, @aShort; |
2232 |
|
} |
2233 |
|
} |
2234 |
|
# Now we should have all the interesting attributes in @aList. Populate the hash with |
2235 |
|
# them. |
2236 |
for my $aListEntry (@aList) { |
for my $aListEntry (@aList) { |
2237 |
my $fid = $aListEntry->[0]; |
my $fid = $aListEntry->[0]; |
2238 |
if (exists $retVal->{$fid}) { |
if (exists $retVal->{$fid}) { |
2243 |
return $retVal; |
return $retVal; |
2244 |
} |
} |
2245 |
|
|
2246 |
|
|
2247 |
1; |
1; |