30 |
$stats->Accumulate($spl->LoadFeatureData()); |
$stats->Accumulate($spl->LoadFeatureData()); |
31 |
print $stats->Show(); |
print $stats->Show(); |
32 |
|
|
|
This module makes use of the internal Sprout property C<_erdb>. |
|
|
|
|
33 |
It is worth noting that the FIG object does not need to be a real one. Any object |
It is worth noting that the FIG object does not need to be a real one. Any object |
34 |
that implements the FIG methods for data retrieval could be used. So, for example, |
that implements the FIG methods for data retrieval could be used. So, for example, |
35 |
this object could be used to copy data from one Sprout database to another, or |
this object could be used to copy data from one Sprout database to another, or |
78 |
=item subsysFile |
=item subsysFile |
79 |
|
|
80 |
Either the name of the file containing the list of trusted subsystems or a reference |
Either the name of the file containing the list of trusted subsystems or a reference |
81 |
to a list of subsystem names. If nothing is specified, all known subsystems will be |
to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be |
82 |
considered trusted. Only subsystem data related to the trusted subsystems is loaded. |
considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR> |
83 |
|
in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded. |
84 |
|
|
85 |
=item options |
=item options |
86 |
|
|
93 |
sub new { |
sub new { |
94 |
# Get the parameters. |
# Get the parameters. |
95 |
my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_; |
my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_; |
96 |
# Load the list of genomes into a hash. |
# Create the genome hash. |
97 |
my %genomes; |
my %genomes = (); |
98 |
|
# We only need it if load-only is NOT specified. |
99 |
|
if (! $options->{loadOnly}) { |
100 |
if (! defined($genomeFile) || $genomeFile eq '') { |
if (! defined($genomeFile) || $genomeFile eq '') { |
101 |
# Here we want all the complete genomes and an access code of 1. |
# Here we want all the complete genomes and an access code of 1. |
102 |
my @genomeList = $fig->genomes(1); |
my @genomeList = $fig->genomes(1); |
120 |
# an omitted access code can be defaulted to 1. |
# an omitted access code can be defaulted to 1. |
121 |
for my $genomeLine (@genomeList) { |
for my $genomeLine (@genomeList) { |
122 |
my ($genomeID, $accessCode) = split("\t", $genomeLine); |
my ($genomeID, $accessCode) = split("\t", $genomeLine); |
123 |
if (undef $accessCode) { |
if (! defined($accessCode)) { |
124 |
$accessCode = 1; |
$accessCode = 1; |
125 |
} |
} |
126 |
$genomes{$genomeID} = $accessCode; |
$genomes{$genomeID} = $accessCode; |
130 |
Confess("Invalid genome parameter ($type) in SproutLoad constructor."); |
Confess("Invalid genome parameter ($type) in SproutLoad constructor."); |
131 |
} |
} |
132 |
} |
} |
133 |
|
} |
134 |
# Load the list of trusted subsystems. |
# Load the list of trusted subsystems. |
135 |
my %subsystems = (); |
my %subsystems = (); |
136 |
|
# We only need it if load-only is NOT specified. |
137 |
|
if (! $options->{loadOnly}) { |
138 |
if (! defined $subsysFile || $subsysFile eq '') { |
if (! defined $subsysFile || $subsysFile eq '') { |
139 |
# Here we want all the subsystems. |
# Here we want all the usable subsystems. First we get the whole list. |
140 |
%subsystems = map { $_ => 1 } $fig->all_subsystems(); |
my @subs = $fig->all_subsystems(); |
141 |
|
# Loop through, checking for the NMPDR file. |
142 |
|
for my $sub (@subs) { |
143 |
|
if ($fig->nmpdr_subsystem($sub)) { |
144 |
|
$subsystems{$sub} = 1; |
145 |
|
} |
146 |
|
} |
147 |
} else { |
} else { |
148 |
my $type = ref $subsysFile; |
my $type = ref $subsysFile; |
149 |
if ($type eq 'ARRAY') { |
if ($type eq 'ARRAY') { |
163 |
Confess("Invalid subsystem parameter in SproutLoad constructor."); |
Confess("Invalid subsystem parameter in SproutLoad constructor."); |
164 |
} |
} |
165 |
} |
} |
166 |
|
# Go through the subsys hash again, creating the keyword list for each subsystem. |
167 |
|
for my $subsystem (keys %subsystems) { |
168 |
|
my $name = $subsystem; |
169 |
|
$name =~ s/_/ /g; |
170 |
|
my $classes = $fig->subsystem_classification($subsystem); |
171 |
|
$name .= " " . join(" ", @{$classes}); |
172 |
|
$subsystems{$subsystem} = $name; |
173 |
|
} |
174 |
|
} |
175 |
# Get the data directory from the Sprout object. |
# Get the data directory from the Sprout object. |
176 |
my ($directory) = $sprout->LoadInfo(); |
my ($directory) = $sprout->LoadInfo(); |
177 |
# Create the Sprout load object. |
# Create the Sprout load object. |
181 |
subsystems => \%subsystems, |
subsystems => \%subsystems, |
182 |
sprout => $sprout, |
sprout => $sprout, |
183 |
loadDirectory => $directory, |
loadDirectory => $directory, |
184 |
erdb => $sprout->{_erdb}, |
erdb => $sprout, |
185 |
loaders => [], |
loaders => [], |
186 |
options => $options |
options => $options |
187 |
}; |
}; |
190 |
return $retVal; |
return $retVal; |
191 |
} |
} |
192 |
|
|
193 |
|
=head3 LoadOnly |
194 |
|
|
195 |
|
C<< my $flag = $spl->LoadOnly; >> |
196 |
|
|
197 |
|
Return TRUE if we are in load-only mode, else FALSE. |
198 |
|
|
199 |
|
=cut |
200 |
|
|
201 |
|
sub LoadOnly { |
202 |
|
my ($self) = @_; |
203 |
|
return $self->{options}->{loadOnly}; |
204 |
|
} |
205 |
|
|
206 |
|
=head3 PrimaryOnly |
207 |
|
|
208 |
|
C<< my $flag = $spl->PrimaryOnly; >> |
209 |
|
|
210 |
|
Return TRUE if only the main entity is to be loaded, else FALSE. |
211 |
|
|
212 |
|
=cut |
213 |
|
|
214 |
|
sub PrimaryOnly { |
215 |
|
my ($self) = @_; |
216 |
|
return $self->{options}->{primaryOnly}; |
217 |
|
} |
218 |
|
|
219 |
=head3 LoadGenomeData |
=head3 LoadGenomeData |
220 |
|
|
221 |
C<< my $stats = $spl->LoadGenomeData(); >> |
C<< my $stats = $spl->LoadGenomeData(); >> |
243 |
|
|
244 |
=back |
=back |
245 |
|
|
|
B<TO DO> |
|
|
|
|
|
Real quality vectors instead of C<unknown> for everything. |
|
|
|
|
|
GenomeGroup relation. (The original script took group information from the C<NMPDR> file |
|
|
in each genome's main directory, but no such file exists anywhere in my version of the |
|
|
data store.) |
|
|
|
|
246 |
=cut |
=cut |
247 |
#: Return Type $%; |
#: Return Type $%; |
248 |
sub LoadGenomeData { |
sub LoadGenomeData { |
253 |
# Get the genome count. |
# Get the genome count. |
254 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
255 |
my $genomeCount = (keys %{$genomeHash}); |
my $genomeCount = (keys %{$genomeHash}); |
|
Trace("Beginning genome data load.") if T(2); |
|
256 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
257 |
my $loadGenome = $self->_TableLoader('Genome', $genomeCount); |
my $loadGenome = $self->_TableLoader('Genome'); |
258 |
my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300); |
my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly); |
259 |
my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300); |
my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly); |
260 |
my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000); |
my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly); |
261 |
my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000); |
my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly); |
262 |
|
if ($self->{options}->{loadOnly}) { |
263 |
|
Trace("Loading from existing files.") if T(2); |
264 |
|
} else { |
265 |
|
Trace("Generating genome data.") if T(2); |
266 |
# Now we loop through the genomes, generating the data for each one. |
# Now we loop through the genomes, generating the data for each one. |
267 |
for my $genomeID (sort keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
268 |
Trace("Loading data for genome $genomeID.") if T(3); |
Trace("Generating data for genome $genomeID.") if T(3); |
269 |
$loadGenome->Add("genomeIn"); |
$loadGenome->Add("genomeIn"); |
270 |
# The access code comes in via the genome hash. |
# The access code comes in via the genome hash. |
271 |
my $accessCode = $genomeHash->{$genomeID}; |
my $accessCode = $genomeHash->{$genomeID}; |
272 |
# Get the genus, species, and strain from the scientific name. Note that we append |
# Get the genus, species, and strain from the scientific name. |
|
# the genome ID to the strain. In some cases this is the totality of the strain name. |
|
273 |
my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID); |
my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID); |
274 |
my $extra = join " ", @extraData, "[$genomeID]"; |
my $extra = join " ", @extraData; |
275 |
# Get the full taxonomy. |
# Get the full taxonomy. |
276 |
my $taxonomy = $fig->taxonomy_of($genomeID); |
my $taxonomy = $fig->taxonomy_of($genomeID); |
277 |
|
# Get the version. If no version is specified, we default to the genome ID by itself. |
278 |
|
my $version = $fig->genome_version($genomeID); |
279 |
|
if (! defined($version)) { |
280 |
|
$version = $genomeID; |
281 |
|
} |
282 |
|
# Get the DNA size. |
283 |
|
my $dnaSize = $fig->genome_szdna($genomeID); |
284 |
|
# Open the NMPDR group file for this genome. |
285 |
|
my $group; |
286 |
|
if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") && |
287 |
|
defined($group = <TMP>)) { |
288 |
|
# Clean the line ending. |
289 |
|
chomp $group; |
290 |
|
} else { |
291 |
|
# No group, so use the default. |
292 |
|
$group = $FIG_Config::otherGroup; |
293 |
|
} |
294 |
|
close TMP; |
295 |
# Output the genome record. |
# Output the genome record. |
296 |
$loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus, |
$loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), |
297 |
$species, $extra, $taxonomy); |
$dnaSize, $genus, $group, $species, $extra, $version, $taxonomy); |
298 |
# Now we loop through each of the genome's contigs. |
# Now we loop through each of the genome's contigs. |
299 |
my @contigs = $fig->all_contigs($genomeID); |
my @contigs = $fig->all_contigs($genomeID); |
300 |
for my $contigID (@contigs) { |
for my $contigID (@contigs) { |
325 |
} |
} |
326 |
} |
} |
327 |
} |
} |
328 |
|
} |
329 |
# Finish the loads. |
# Finish the loads. |
330 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
331 |
# Return the result. |
# Return the result. |
366 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
367 |
# Get the genome hash. |
# Get the genome hash. |
368 |
my $genomeFilter = $self->{genomes}; |
my $genomeFilter = $self->{genomes}; |
369 |
my $genomeCount = (keys %{$genomeFilter}); |
# Set up an ID counter for the PCHs. |
370 |
my $featureCount = $genomeCount * 4000; |
my $pchID = 0; |
371 |
# Start the loads. |
# Start the loads. |
372 |
my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount); |
my $loadCoupling = $self->_TableLoader('Coupling'); |
373 |
my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000); |
my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly); |
374 |
my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000); |
my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly); |
375 |
my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000); |
my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly); |
376 |
my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000); |
my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly); |
377 |
Trace("Beginning coupling data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
378 |
|
Trace("Loading from existing files.") if T(2); |
379 |
|
} else { |
380 |
|
Trace("Generating coupling data.") if T(2); |
381 |
# Loop through the genomes found. |
# Loop through the genomes found. |
382 |
for my $genome (sort keys %{$genomeFilter}) { |
for my $genome (sort keys %{$genomeFilter}) { |
383 |
Trace("Generating coupling data for $genome.") if T(3); |
Trace("Generating coupling data for $genome.") if T(3); |
401 |
for my $coupleData (@couplings) { |
for my $coupleData (@couplings) { |
402 |
my ($peg2, $score) = @{$coupleData}; |
my ($peg2, $score) = @{$coupleData}; |
403 |
# Compute the coupling ID. |
# Compute the coupling ID. |
404 |
my $coupleID = Sprout::CouplingID($peg1, $peg2); |
my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2); |
405 |
if (! exists $dupHash{$coupleID}) { |
if (! exists $dupHash{$coupleID}) { |
406 |
$loadCoupling->Add("couplingIn"); |
$loadCoupling->Add("couplingIn"); |
407 |
# Here we have a new coupling to store in the load files. |
# Here we have a new coupling to store in the load files. |
437 |
} |
} |
438 |
} |
} |
439 |
for my $evidenceID (keys %evidenceMap) { |
for my $evidenceID (keys %evidenceMap) { |
440 |
|
# Get the ID for this evidence. |
441 |
|
$pchID++; |
442 |
# Create the evidence record. |
# Create the evidence record. |
443 |
my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}}; |
my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}}; |
444 |
$loadPCH->Put($evidenceID, $usage); |
$loadPCH->Put($pchID, $usage); |
445 |
# Connect it to the coupling. |
# Connect it to the coupling. |
446 |
$loadIsEvidencedBy->Put($coupleID, $evidenceID); |
$loadIsEvidencedBy->Put($coupleID, $pchID); |
447 |
# Connect it to the features. |
# Connect it to the features. |
448 |
$loadUsesAsEvidence->Put($evidenceID, $peg3, 1); |
$loadUsesAsEvidence->Put($pchID, $peg3, 1); |
449 |
$loadUsesAsEvidence->Put($evidenceID, $peg4, 2); |
$loadUsesAsEvidence->Put($pchID, $peg4, 2); |
450 |
|
} |
451 |
} |
} |
452 |
} |
} |
453 |
} |
} |
474 |
FeatureTranslation |
FeatureTranslation |
475 |
FeatureUpstream |
FeatureUpstream |
476 |
IsLocatedIn |
IsLocatedIn |
477 |
|
HasFeature |
478 |
|
HasRoleInSubsystem |
479 |
|
FeatureEssential |
480 |
|
FeatureVirulent |
481 |
|
FeatureIEDB |
482 |
|
|
483 |
=over 4 |
=over 4 |
484 |
|
|
493 |
sub LoadFeatureData { |
sub LoadFeatureData { |
494 |
# Get this object instance. |
# Get this object instance. |
495 |
my ($self) = @_; |
my ($self) = @_; |
496 |
# Get the FIG object. |
# Get the FIG and Sprout objects. |
497 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
498 |
# Find out if this is a limited run. |
my $sprout = $self->{sprout}; |
|
my $limited = $self->{options}->{limitedFeatures}; |
|
499 |
# Get the table of genome IDs. |
# Get the table of genome IDs. |
500 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
|
my $featureCount = $genomeCount * 4000; |
|
501 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
502 |
my $loadFeature = $self->_TableLoader('Feature', $featureCount); |
my $loadFeature = $self->_TableLoader('Feature'); |
503 |
my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount); |
my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly); |
504 |
my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6); |
my $loadFeatureAlias = $self->_TableLoader('FeatureAlias'); |
505 |
my ($loadFeatureLink, $loadFeatureTranslation, $loadFeatureUpstream); |
my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
506 |
if (! $limited) { |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); |
507 |
$loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10); |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); |
508 |
$loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount); |
my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly); |
509 |
$loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount); |
my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly); |
510 |
} |
my $loadFeatureEssential = $self->_TableLoader('FeatureEssential'); |
511 |
|
my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent'); |
512 |
|
my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB'); |
513 |
|
# Get the subsystem hash. |
514 |
|
my $subHash = $self->{subsystems}; |
515 |
# Get the maximum sequence size. We need this later for splitting up the |
# Get the maximum sequence size. We need this later for splitting up the |
516 |
# locations. |
# locations. |
517 |
my $chunkSize = $self->{sprout}->MaxSegment(); |
my $chunkSize = $self->{sprout}->MaxSegment(); |
518 |
Trace("Beginning feature data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
519 |
|
Trace("Loading from existing files.") if T(2); |
520 |
|
} else { |
521 |
|
Trace("Generating feature data.") if T(2); |
522 |
# Now we loop through the genomes, generating the data for each one. |
# Now we loop through the genomes, generating the data for each one. |
523 |
for my $genomeID (sort keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
524 |
Trace("Loading features for genome $genomeID.") if T(3); |
Trace("Loading features for genome $genomeID.") if T(3); |
525 |
$loadFeature->Add("genomeIn"); |
$loadFeature->Add("genomeIn"); |
526 |
# Get the feature list for this genome. |
# Get the feature list for this genome. |
527 |
my $features = $fig->all_features_detailed($genomeID); |
my $features = $fig->all_features_detailed_fast($genomeID); |
528 |
|
# Sort and count the list. |
529 |
|
my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features}; |
530 |
|
my $count = scalar @featureTuples; |
531 |
|
my @fids = map { $_->[0] } @featureTuples; |
532 |
|
Trace("$count features found for genome $genomeID.") if T(3); |
533 |
|
# Get the attributes for this genome and put them in a hash by feature ID. |
534 |
|
my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids); |
535 |
|
# Set up for our duplicate-feature check. |
536 |
|
my $oldFeatureID = ""; |
537 |
# Loop through the features. |
# Loop through the features. |
538 |
for my $featureData (@{$features}) { |
for my $featureTuple (@featureTuples) { |
|
$loadFeature->Add("featureIn"); |
|
539 |
# Split the tuple. |
# Split the tuple. |
540 |
my ($featureID, $locations, undef, $type) = @{$featureData}; |
my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple}; |
541 |
# Create the feature record. |
# Check for duplicates. |
542 |
$loadFeature->Put($featureID, 1, $type); |
if ($featureID eq $oldFeatureID) { |
543 |
|
Trace("Duplicate feature $featureID found.") if T(1); |
544 |
|
} else { |
545 |
|
$oldFeatureID = $featureID; |
546 |
|
# Count this feature. |
547 |
|
$loadFeature->Add("featureIn"); |
548 |
|
# Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the |
549 |
|
# Sprout database requires a single character. |
550 |
|
if (! defined($quality) || $quality eq "") { |
551 |
|
$quality = " "; |
552 |
|
} |
553 |
|
# Begin building the keywords. We start with the genome ID, the |
554 |
|
# feature ID, the taxonomy, and the organism name. |
555 |
|
my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID), |
556 |
|
$fig->taxonomy_of($genomeID)); |
557 |
# Create the aliases. |
# Create the aliases. |
558 |
for my $alias ($fig->feature_aliases($featureID)) { |
for my $alias ($fig->feature_aliases($featureID)) { |
559 |
$loadFeatureAlias->Put($featureID, $alias); |
$loadFeatureAlias->Put($featureID, $alias); |
560 |
|
push @keywords, $alias; |
561 |
} |
} |
562 |
# The next stuff is for a full load only. |
Trace("Assignment for $featureID is: $assignment") if T(4); |
563 |
if (! $limited) { |
# Break the assignment into words and shove it onto the |
564 |
|
# keyword list. |
565 |
|
push @keywords, split(/\s+/, $assignment); |
566 |
|
# Link this feature to the parent genome. |
567 |
|
$loadHasFeature->Put($genomeID, $featureID, $type); |
568 |
# Get the links. |
# Get the links. |
569 |
my @links = $fig->fid_links($featureID); |
my @links = $fig->fid_links($featureID); |
570 |
for my $link (@links) { |
for my $link (@links) { |
583 |
$loadFeatureUpstream->Put($featureID, $upstream); |
$loadFeatureUpstream->Put($featureID, $upstream); |
584 |
} |
} |
585 |
} |
} |
586 |
} |
# Now we need to find the subsystems this feature participates in. |
587 |
|
# We also add the subsystems to the keyword list. Before we do that, |
588 |
|
# we must convert underscores to spaces and tack on the classifications. |
589 |
|
my @subsystems = $fig->peg_to_subsystems($featureID); |
590 |
|
for my $subsystem (@subsystems) { |
591 |
|
# Only proceed if we like this subsystem. |
592 |
|
if (exists $subHash->{$subsystem}) { |
593 |
|
# Store the has-role link. |
594 |
|
$loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type); |
595 |
|
# Save the subsystem's keyword data. |
596 |
|
my $subKeywords = $subHash->{$subsystem}; |
597 |
|
push @keywords, split /\s+/, $subKeywords; |
598 |
|
# Now we need to get this feature's role in the subsystem. |
599 |
|
my $subObject = $fig->get_subsystem($subsystem); |
600 |
|
my @roleColumns = $subObject->get_peg_roles($featureID); |
601 |
|
my @allRoles = $subObject->get_roles(); |
602 |
|
for my $col (@roleColumns) { |
603 |
|
my $role = $allRoles[$col]; |
604 |
|
push @keywords, split /\s+/, $role; |
605 |
|
push @keywords, $subObject->get_role_abbr($col); |
606 |
|
} |
607 |
|
} |
608 |
|
} |
609 |
|
# There are three special attributes computed from property |
610 |
|
# data that we build next. If the special attribute is non-empty, |
611 |
|
# its name will be added to the keyword list. First, we get all |
612 |
|
# the attributes for this feature. They will come back as |
613 |
|
# 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead: |
614 |
|
# [name, value, value with URL]. (We don't need the PEG, since |
615 |
|
# we already know it.) |
616 |
|
my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] } |
617 |
|
@{$attributes->{$featureID}}; |
618 |
|
# Now we process each of the special attributes. |
619 |
|
if (SpecialAttribute($featureID, \@attributes, |
620 |
|
1, [0,2], '^(essential|potential_essential)$', |
621 |
|
$loadFeatureEssential)) { |
622 |
|
push @keywords, 'essential'; |
623 |
|
$loadFeature->Add('essential'); |
624 |
|
} |
625 |
|
if (SpecialAttribute($featureID, \@attributes, |
626 |
|
0, [2], '^virulen', |
627 |
|
$loadFeatureVirulent)) { |
628 |
|
push @keywords, 'virulent'; |
629 |
|
$loadFeature->Add('virulent'); |
630 |
|
} |
631 |
|
if (SpecialAttribute($featureID, \@attributes, |
632 |
|
0, [0,2], '^iedb_', |
633 |
|
$loadFeatureIEDB)) { |
634 |
|
push @keywords, 'iedb'; |
635 |
|
$loadFeature->Add('iedb'); |
636 |
|
} |
637 |
|
# Now we need to bust up hyphenated words in the keyword |
638 |
|
# list. We keep them separate and put them at the end so |
639 |
|
# the original word order is available. |
640 |
|
my $keywordString = ""; |
641 |
|
my $bustedString = ""; |
642 |
|
for my $keyword (@keywords) { |
643 |
|
if (length $keyword >= 3) { |
644 |
|
$keywordString .= " $keyword"; |
645 |
|
if ($keyword =~ /-/) { |
646 |
|
my @words = split /-/, $keyword; |
647 |
|
$bustedString .= join(" ", "", @words); |
648 |
|
} |
649 |
|
} |
650 |
|
} |
651 |
|
$keywordString .= $bustedString; |
652 |
|
# Get rid of annoying punctuation. |
653 |
|
$keywordString =~ s/[();]//g; |
654 |
|
# Clean the keyword list. |
655 |
|
my $cleanWords = $sprout->CleanKeywords($keywordString); |
656 |
|
Trace("Keyword string for $featureID: $cleanWords") if T(4); |
657 |
|
# Create the feature record. |
658 |
|
$loadFeature->Put($featureID, 1, $user, $quality, $type, $assignment, $cleanWords); |
659 |
# This part is the roughest. We need to relate the features to contig |
# This part is the roughest. We need to relate the features to contig |
660 |
# locations, and the locations must be split so that none of them exceed |
# locations, and the locations must be split so that none of them exceed |
661 |
# the maximum segment size. This simplifies the genes_in_region processing |
# the maximum segment size. This simplifies the genes_in_region processing |
684 |
} |
} |
685 |
} |
} |
686 |
} |
} |
|
# Finish the loads. |
|
|
my $retVal = $self->_FinishAll(); |
|
|
return $retVal; |
|
|
} |
|
|
|
|
|
=head3 LoadBBHData |
|
|
|
|
|
C<< my $stats = $spl->LoadBBHData(); >> |
|
|
|
|
|
Load the bidirectional best hit data from FIG into Sprout. |
|
|
|
|
|
Sprout does not store information on similarities. Instead, it has only the |
|
|
bi-directional best hits. Even so, the BBH table is one of the largest in |
|
|
the database. |
|
|
|
|
|
The following relations are loaded by this method. |
|
|
|
|
|
IsBidirectionalBestHitOf |
|
|
|
|
|
=over 4 |
|
|
|
|
|
=item RETURNS |
|
|
|
|
|
Returns a statistics object for the loads. |
|
|
|
|
|
=back |
|
|
|
|
|
=cut |
|
|
#: Return Type $%; |
|
|
sub LoadBBHData { |
|
|
# Get this object instance. |
|
|
my ($self) = @_; |
|
|
# Get the FIG object. |
|
|
my $fig = $self->{fig}; |
|
|
# Get the table of genome IDs. |
|
|
my $genomeHash = $self->{genomes}; |
|
|
my $genomeCount = (keys %{$genomeHash}); |
|
|
my $featureCount = $genomeCount * 4000; |
|
|
# Create load objects for each of the tables we're loading. |
|
|
my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf', |
|
|
$featureCount * $genomeCount); |
|
|
Trace("Beginning BBH load.") if T(2); |
|
|
# Now we loop through the genomes, generating the data for each one. |
|
|
for my $genomeID (sort keys %{$genomeHash}) { |
|
|
$loadIsBidirectionalBestHitOf->Add("genomeIn"); |
|
|
Trace("Processing features for genome $genomeID.") if T(3); |
|
|
# Get the feature list for this genome. |
|
|
my $features = $fig->all_features_detailed($genomeID); |
|
|
# Loop through the features. |
|
|
for my $featureData (@{$features}) { |
|
|
# Split the tuple. |
|
|
my ($featureID, $locations, $aliases, $type) = @{$featureData}; |
|
|
# Get the bi-directional best hits. |
|
|
my @bbhList = $fig->bbhs($featureID); |
|
|
for my $bbhEntry (@bbhList) { |
|
|
# Get the target feature ID and the score. |
|
|
my ($targetID, $score) = @{$bbhEntry}; |
|
|
# Check the target feature's genome. |
|
|
my $targetGenomeID = $fig->genome_of($targetID); |
|
|
# Only proceed if it's one of our genomes. |
|
|
if ($genomeHash->{$targetGenomeID}) { |
|
|
$loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID, |
|
|
$score); |
|
|
} |
|
|
} |
|
687 |
} |
} |
688 |
} |
} |
689 |
# Finish the loads. |
# Finish the loads. |
706 |
The following relations are loaded by this method. |
The following relations are loaded by this method. |
707 |
|
|
708 |
Subsystem |
Subsystem |
709 |
|
SubsystemClass |
710 |
Role |
Role |
711 |
|
RoleEC |
712 |
SSCell |
SSCell |
713 |
ContainsFeature |
ContainsFeature |
714 |
IsGenomeOf |
IsGenomeOf |
716 |
OccursInSubsystem |
OccursInSubsystem |
717 |
ParticipatesIn |
ParticipatesIn |
718 |
HasSSCell |
HasSSCell |
|
Catalyzes |
|
|
Reaction |
|
719 |
ConsistsOfRoles |
ConsistsOfRoles |
720 |
RoleSubset |
RoleSubset |
721 |
HasRoleSubset |
HasRoleSubset |
722 |
ConsistsOfGenomes |
ConsistsOfGenomes |
723 |
GenomeSubset |
GenomeSubset |
724 |
HasGenomeSubset |
HasGenomeSubset |
725 |
|
Catalyzes |
726 |
|
Diagram |
727 |
|
RoleOccursIn |
728 |
|
|
729 |
=over 4 |
=over 4 |
730 |
|
|
734 |
|
|
735 |
=back |
=back |
736 |
|
|
|
B<TO DO> |
|
|
|
|
|
Generate RoleName table? |
|
|
|
|
737 |
=cut |
=cut |
738 |
#: Return Type $%; |
#: Return Type $%; |
739 |
sub LoadSubsystemData { |
sub LoadSubsystemData { |
747 |
# Get the subsystem hash. This lists the subsystems we'll process. |
# Get the subsystem hash. This lists the subsystems we'll process. |
748 |
my $subsysHash = $self->{subsystems}; |
my $subsysHash = $self->{subsystems}; |
749 |
my @subsysIDs = sort keys %{$subsysHash}; |
my @subsysIDs = sort keys %{$subsysHash}; |
750 |
my $subsysCount = @subsysIDs; |
# Get the map list. |
751 |
my $genomeCount = (keys %{$genomeHash}); |
my @maps = $fig->all_maps; |
|
my $featureCount = $genomeCount * 4000; |
|
752 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
753 |
my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount); |
my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly); |
754 |
my $loadRole = $self->_TableLoader('Role', $featureCount * 6); |
my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly); |
755 |
my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount); |
my $loadSubsystem = $self->_TableLoader('Subsystem'); |
756 |
my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount); |
my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly); |
757 |
my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount); |
my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly); |
758 |
my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount); |
my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly); |
759 |
my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6); |
my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly); |
760 |
my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount); |
my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly); |
761 |
my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount); |
my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly); |
762 |
my $loadReaction = $self->_TableLoader('Reaction', $featureCount * $genomeCount); |
my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly); |
763 |
my $loadCatalyzes = $self->_TableLoader('Catalyzes', $featureCount * $genomeCount); |
my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly); |
764 |
my $loadRoleSubset = $self->_TableLoader('RoleSubset', $subsysCount * 50); |
my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly); |
765 |
my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $subsysCount * 50); |
my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly); |
766 |
my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $featureCount * $genomeCount); |
my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly); |
767 |
my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $featureCount * $genomeCount); |
my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly); |
768 |
my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $subsysCount * 50); |
my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly); |
769 |
my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $subsysCount * 50); |
my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly); |
770 |
Trace("Beginning subsystem data load.") if T(2); |
my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly); |
771 |
# The reaction hash will contain a list of reactions for each role. When we're done, |
my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly); |
772 |
# a complicated sort and merge will be used to generate the Reaction and Catalyzes |
my $loadSubsystemClass = $self->_TableLoader('SubsystemClass', $self->PrimaryOnly); |
773 |
# tables. |
if ($self->{options}->{loadOnly}) { |
774 |
my %reactionsToRoles = (); |
Trace("Loading from existing files.") if T(2); |
775 |
|
} else { |
776 |
|
Trace("Generating subsystem data.") if T(2); |
777 |
|
# This hash will contain the role for each EC. When we're done, this |
778 |
|
# information will be used to generate the Catalyzes table. |
779 |
|
my %ecToRoles = (); |
780 |
# Loop through the subsystems. Our first task will be to create the |
# Loop through the subsystems. Our first task will be to create the |
781 |
# roles. We do this by looping through the subsystems and creating a |
# roles. We do this by looping through the subsystems and creating a |
782 |
# role hash. The hash tracks each role ID so that we don't create |
# role hash. The hash tracks each role ID so that we don't create |
785 |
my ($genomeID, $roleID); |
my ($genomeID, $roleID); |
786 |
my %roleData = (); |
my %roleData = (); |
787 |
for my $subsysID (@subsysIDs) { |
for my $subsysID (@subsysIDs) { |
|
Trace("Creating subsystem $subsysID.") if T(3); |
|
|
$loadSubsystem->Add("subsystemIn"); |
|
788 |
# Get the subsystem object. |
# Get the subsystem object. |
789 |
my $sub = $fig->get_subsystem($subsysID); |
my $sub = $fig->get_subsystem($subsysID); |
790 |
# Get its reaction hash. |
# Only proceed if the subsystem has a spreadsheet. |
791 |
my $reactionHash = $sub->get_reactions(); |
if (defined($sub) && ! $sub->{empty_ss}) { |
792 |
|
Trace("Creating subsystem $subsysID.") if T(3); |
793 |
|
$loadSubsystem->Add("subsystemIn"); |
794 |
# Create the subsystem record. |
# Create the subsystem record. |
795 |
my $curator = $sub->get_curator(); |
my $curator = $sub->get_curator(); |
796 |
my $notes = $sub->get_notes(); |
my $notes = $sub->get_notes(); |
797 |
$loadSubsystem->Put($subsysID, $curator, $notes); |
$loadSubsystem->Put($subsysID, $curator, $notes); |
798 |
|
# Now for the classification string. This comes back as a list |
799 |
|
# reference and we convert it to a space-delimited string. |
800 |
|
my $classList = $fig->subsystem_classification($subsysID); |
801 |
|
my $classString = join($FIG_Config::splitter, grep { $_ } @$classList); |
802 |
|
$loadSubsystemClass->Put($subsysID, $classString); |
803 |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
804 |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
805 |
# Connect to this role. |
# Connect to this role. |
812 |
# Add the role. |
# Add the role. |
813 |
$loadRole->Put($roleID, $abbr); |
$loadRole->Put($roleID, $abbr); |
814 |
$roleData{$roleID} = 1; |
$roleData{$roleID} = 1; |
815 |
# Add the role's reactions. |
# Check for an EC number. |
816 |
my $reactions = $reactionHash->{$roleID}; |
if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) { |
817 |
for my $reactionID (@{$reactions}) { |
my $ec = $1; |
818 |
if (! exists $reactionsToRoles{$reactionID}) { |
$loadRoleEC->Put($roleID, $ec); |
819 |
# Here the reaction is brand-new, so we create its reaction |
$ecToRoles{$ec} = $roleID; |
|
# record. |
|
|
$loadReaction->Put($reactionID, $fig->reversible($reactionID)); |
|
|
# We also create a blank list for it in the reaction hash. |
|
|
$reactionsToRoles{$reactionID} = []; |
|
|
} |
|
|
# Add the role to the reaction's role list. |
|
|
push @{$reactionsToRoles{$reactionID}}, $roleID; |
|
820 |
} |
} |
821 |
} |
} |
822 |
} |
} |
843 |
# part of the spreadsheet cell ID. |
# part of the spreadsheet cell ID. |
844 |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
845 |
# Get the features in the spreadsheet cell for this genome and role. |
# Get the features in the spreadsheet cell for this genome and role. |
846 |
my @pegs = $sub->get_pegs_from_cell($row, $col); |
my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col); |
847 |
# Only proceed if features exist. |
# Only proceed if features exist. |
848 |
if (@pegs > 0) { |
if (@pegs > 0) { |
849 |
# Create the spreadsheet cell. |
# Create the spreadsheet cell. |
864 |
if ($pegCount > 0) { |
if ($pegCount > 0) { |
865 |
Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3); |
Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3); |
866 |
$loadParticipatesIn->Put($genomeID, $subsysID, $variantCode); |
$loadParticipatesIn->Put($genomeID, $subsysID, $variantCode); |
|
# Partition the PEGs found into clusters. |
|
|
my @clusters = $fig->compute_clusters(\@pegsFound, $sub); |
|
867 |
# Create a hash mapping PEG IDs to cluster numbers. |
# Create a hash mapping PEG IDs to cluster numbers. |
868 |
# We default to -1 for all of them. |
# We default to -1 for all of them. |
869 |
my %clusterOf = map { $_ => -1 } @pegsFound; |
my %clusterOf = map { $_ => -1 } @pegsFound; |
870 |
|
# Partition the PEGs found into clusters. |
871 |
|
my @clusters = $fig->compute_clusters([keys %clusterOf], $sub); |
872 |
for (my $i = 0; $i <= $#clusters; $i++) { |
for (my $i = 0; $i <= $#clusters; $i++) { |
873 |
my $subList = $clusters[$i]; |
my $subList = $clusters[$i]; |
874 |
for my $peg (@{$subList}) { |
for my $peg (@{$subList}) { |
896 |
# Connect the subset to the subsystem. |
# Connect the subset to the subsystem. |
897 |
$loadHasRoleSubset->Put($subsysID, $actualID); |
$loadHasRoleSubset->Put($subsysID, $actualID); |
898 |
# Connect the subset to its roles. |
# Connect the subset to its roles. |
899 |
my @roles = $sub->get_subset($subsetID); |
my @roles = $sub->get_subsetC_roles($subsetID); |
900 |
for my $roleID (@roles) { |
for my $roleID (@roles) { |
901 |
$loadConsistsOfRoles->Put($actualID, $roleID); |
$loadConsistsOfRoles->Put($actualID, $roleID); |
902 |
} |
} |
916 |
} |
} |
917 |
} |
} |
918 |
} |
} |
|
# Before we leave, we must create the Catalyzes table. The data is all stored in |
|
|
# "reactionToRoles" hash. |
|
|
for my $reactionID (keys %reactionsToRoles) { |
|
|
# Get this reaction's list of roles. We sort it so we can merge out duplicates. |
|
|
my @roles = sort @{$reactionsToRoles{$reactionID}}; |
|
|
my $lastRole = ""; |
|
|
# Loop through the roles, creating catalyzation records. |
|
|
for my $thisRole (@roles) { |
|
|
if ($thisRole ne $lastRole) { |
|
|
$loadCatalyzes->Put($thisRole, $reactionID); |
|
|
} |
|
|
} |
|
|
} |
|
|
# Finish the load. |
|
|
my $retVal = $self->_FinishAll(); |
|
|
return $retVal; |
|
919 |
} |
} |
920 |
|
# Now we loop through the diagrams. We need to create the diagram records |
921 |
=head3 LoadDiagramData |
# and link each diagram to its roles. Note that only roles which occur |
922 |
|
# in subsystems (and therefore appear in the %ecToRoles hash) are |
923 |
C<< my $stats = $spl->LoadDiagramData(); >> |
# included. |
924 |
|
for my $map (@maps) { |
|
Load the diagram data from FIG into Sprout. |
|
|
|
|
|
Diagrams are used to organize functional roles. The diagram shows the |
|
|
connections between chemicals that interact with a subsystem. |
|
|
|
|
|
The following relations are loaded by this method. |
|
|
|
|
|
Diagram |
|
|
RoleOccursIn |
|
|
|
|
|
=over 4 |
|
|
|
|
|
=item RETURNS |
|
|
|
|
|
Returns a statistics object for the loads. |
|
|
|
|
|
=back |
|
|
|
|
|
=cut |
|
|
#: Return Type $%; |
|
|
sub LoadDiagramData { |
|
|
# Get this object instance. |
|
|
my ($self) = @_; |
|
|
# Get the FIG object. |
|
|
my $fig = $self->{fig}; |
|
|
# Get the map list. |
|
|
my @maps = $fig->all_maps; |
|
|
my $mapCount = @maps; |
|
|
my $genomeCount = (keys %{$self->{genomes}}); |
|
|
my $featureCount = $genomeCount * 4000; |
|
|
# Create load objects for each of the tables we're loading. |
|
|
my $loadDiagram = $self->_TableLoader('Diagram', $mapCount); |
|
|
my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6); |
|
|
Trace("Beginning diagram data load.") if T(2); |
|
|
# Loop through the diagrams. |
|
|
for my $map ($fig->all_maps) { |
|
925 |
Trace("Loading diagram $map.") if T(3); |
Trace("Loading diagram $map.") if T(3); |
926 |
# Get the diagram's descriptive name. |
# Get the diagram's descriptive name. |
927 |
my $name = $fig->map_name($map); |
my $name = $fig->map_name($map); |
930 |
# A hash is used to prevent duplicates. |
# A hash is used to prevent duplicates. |
931 |
my %roleHash = (); |
my %roleHash = (); |
932 |
for my $role ($fig->map_to_ecs($map)) { |
for my $role ($fig->map_to_ecs($map)) { |
933 |
if (! $roleHash{$role}) { |
if (exists $ecToRoles{$role} && ! $roleHash{$role}) { |
934 |
$loadRoleOccursIn->Put($role, $map); |
$loadRoleOccursIn->Put($ecToRoles{$role}, $map); |
935 |
$roleHash{$role} = 1; |
$roleHash{$role} = 1; |
936 |
} |
} |
937 |
} |
} |
938 |
} |
} |
939 |
|
# Before we leave, we must create the Catalyzes table. We start with the reactions, |
940 |
|
# then use the "ecToRoles" table to convert EC numbers to role IDs. |
941 |
|
my @reactions = $fig->all_reactions(); |
942 |
|
for my $reactionID (@reactions) { |
943 |
|
# Get this reaction's list of roles. The results will be EC numbers. |
944 |
|
my @roles = $fig->catalyzed_by($reactionID); |
945 |
|
# Loop through the roles, creating catalyzation records. |
946 |
|
for my $thisRole (@roles) { |
947 |
|
if (exists $ecToRoles{$thisRole}) { |
948 |
|
$loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID); |
949 |
|
} |
950 |
|
} |
951 |
|
} |
952 |
|
} |
953 |
# Finish the load. |
# Finish the load. |
954 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
955 |
return $retVal; |
return $retVal; |
991 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
992 |
# Get the genome hash. |
# Get the genome hash. |
993 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
994 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
995 |
my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500); |
my $loadProperty = $self->_TableLoader('Property'); |
996 |
my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500); |
my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly); |
997 |
Trace("Beginning property data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
998 |
|
Trace("Loading from existing files.") if T(2); |
999 |
|
} else { |
1000 |
|
Trace("Generating property data.") if T(2); |
1001 |
# Create a hash for storing property IDs. |
# Create a hash for storing property IDs. |
1002 |
my %propertyKeys = (); |
my %propertyKeys = (); |
1003 |
my $nextID = 1; |
my $nextID = 1; |
1004 |
|
# Get the attributes we intend to store in the property table. |
1005 |
|
my @propKeys = $fig->get_group_keys("NMPDR"); |
1006 |
# Loop through the genomes. |
# Loop through the genomes. |
1007 |
for my $genomeID (keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
1008 |
$loadProperty->Add("genomeIn"); |
$loadProperty->Add("genomeIn"); |
1009 |
# Get the genome's features. The feature ID is the first field in the |
Trace("Generating properties for $genomeID.") if T(3); |
1010 |
# tuples returned by "all_features_detailed". We use "all_features_detailed" |
# Initialize a counter. |
1011 |
# rather than "all_features" because we want all features regardless of type. |
my $propertyCount = 0; |
1012 |
my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)}; |
# Get the properties for this genome's features. |
1013 |
# Loop through the features, creating HasProperty records. |
my @attributes = $fig->get_attributes("fig|$genomeID%", \@propKeys); |
1014 |
for my $fid (@features) { |
Trace("Property list built for $genomeID.") if T(3); |
1015 |
$loadProperty->Add("featureIn"); |
# Loop through the results, creating HasProperty records. |
1016 |
# Get all attributes for this feature. We do this one feature at a time |
for my $attributeData (@attributes) { |
1017 |
# to insure we do not get any genome attributes. |
# Pull apart the attribute tuple. |
1018 |
my @attributeList = $fig->get_attributes($fid, '', '', ''); |
my ($fid, $key, $value, $url) = @{$attributeData}; |
|
# Loop through the attributes. |
|
|
for my $tuple (@attributeList) { |
|
|
# Get this attribute value's data. Note that we throw away the FID, |
|
|
# since it will always be the same as the value if "$fid". |
|
|
my (undef, $key, $value, $url) = @{$tuple}; |
|
1019 |
# Concatenate the key and value and check the "propertyKeys" hash to |
# Concatenate the key and value and check the "propertyKeys" hash to |
1020 |
# see if we already have an ID for it. We use a tab for the separator |
# see if we already have an ID for it. We use a tab for the separator |
1021 |
# character. |
# character. |
1033 |
# Create the HasProperty entry for this feature/property association. |
# Create the HasProperty entry for this feature/property association. |
1034 |
$loadHasProperty->Put($fid, $propertyID, $url); |
$loadHasProperty->Put($fid, $propertyID, $url); |
1035 |
} |
} |
1036 |
|
# Update the statistics. |
1037 |
|
Trace("$propertyCount attributes processed.") if T(3); |
1038 |
|
$loadHasProperty->Add("propertiesIn", $propertyCount); |
1039 |
} |
} |
1040 |
} |
} |
1041 |
# Finish the load. |
# Finish the load. |
1077 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
1078 |
# Get the genome hash. |
# Get the genome hash. |
1079 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
1080 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1081 |
my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000); |
my $loadAnnotation = $self->_TableLoader('Annotation'); |
1082 |
my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000); |
my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly); |
1083 |
my $loadSproutUser = $self->_TableLoader('SproutUser', 100); |
my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly); |
1084 |
my $loadUserAccess = $self->_TableLoader('UserAccess', 1000); |
my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly); |
1085 |
my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000); |
my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly); |
1086 |
Trace("Beginning annotation data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
1087 |
|
Trace("Loading from existing files.") if T(2); |
1088 |
|
} else { |
1089 |
|
Trace("Generating annotation data.") if T(2); |
1090 |
# Create a hash of user names. We'll use this to prevent us from generating duplicate |
# Create a hash of user names. We'll use this to prevent us from generating duplicate |
1091 |
# user records. |
# user records. |
1092 |
my %users = ( FIG => 1, master => 1 ); |
my %users = ( FIG => 1, master => 1 ); |
1100 |
# Loop through the genomes. |
# Loop through the genomes. |
1101 |
for my $genomeID (sort keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
1102 |
Trace("Processing $genomeID.") if T(3); |
Trace("Processing $genomeID.") if T(3); |
|
# Get the genome's PEGs. |
|
|
my @pegs = $fig->pegs_of($genomeID); |
|
|
for my $peg (@pegs) { |
|
|
Trace("Processing $peg.") if T(4); |
|
1103 |
# Create a hash of timestamps. We use this to prevent duplicate time stamps |
# Create a hash of timestamps. We use this to prevent duplicate time stamps |
1104 |
# from showing up for a single PEG's annotations. |
# from showing up for a single PEG's annotations. |
1105 |
my %seenTimestamps = (); |
my %seenTimestamps = (); |
1106 |
# Check for a functional assignment. |
# Get the genome's annotations. |
1107 |
my $func = $fig->function_of($peg); |
my @annotations = $fig->read_all_annotations($genomeID); |
1108 |
if ($func) { |
Trace("Processing annotations.") if T(2); |
1109 |
# If this is NOT a hypothetical assignment, we create an |
for my $tuple (@annotations) { |
1110 |
# assignment annotation for it. |
# Get the annotation tuple. |
1111 |
if (! FIG::hypo($peg)) { |
my ($peg, $timestamp, $user, $text) = @{$tuple}; |
|
# Note that we double the slashes so that what goes into the database is |
|
|
# a new-line escape sequence rather than an actual new-line. |
|
|
$loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func"); |
|
|
$loadIsTargetOfAnnotation->Put($peg, "$peg:$time"); |
|
|
$loadMadeAnnotation->Put("FIG", "$peg:$time"); |
|
|
# Denote we've seen this timestamp. |
|
|
$seenTimestamps{$time} = 1; |
|
|
} |
|
|
} |
|
|
# Now loop through the real annotations. |
|
|
for my $tuple ($fig->feature_annotations($peg, "raw")) { |
|
|
my ($fid, $timestamp, $user, $text) = @{$tuple}; |
|
1112 |
# Here we fix up the annotation text. "\r" is removed, |
# Here we fix up the annotation text. "\r" is removed, |
1113 |
# and "\t" and "\n" are escaped. Note we use the "s" |
# and "\t" and "\n" are escaped. Note we use the "gs" |
1114 |
# modifier so that new-lines inside the text do not |
# modifier so that new-lines inside the text do not |
1115 |
# stop the substitution search. |
# stop the substitution search. |
1116 |
$text =~ s/\r//gs; |
$text =~ s/\r//gs; |
1123 |
# Here it's a number. We need to insure the one we use to form |
# Here it's a number. We need to insure the one we use to form |
1124 |
# the key is unique. |
# the key is unique. |
1125 |
my $keyStamp = $timestamp; |
my $keyStamp = $timestamp; |
1126 |
while ($seenTimestamps{$keyStamp}) { |
while ($seenTimestamps{"$peg:$keyStamp"}) { |
1127 |
$keyStamp++; |
$keyStamp++; |
1128 |
} |
} |
|
$seenTimestamps{$keyStamp} = 1; |
|
1129 |
my $annotationID = "$peg:$keyStamp"; |
my $annotationID = "$peg:$keyStamp"; |
1130 |
|
$seenTimestamps{$annotationID} = 1; |
1131 |
# Insure the user exists. |
# Insure the user exists. |
1132 |
if (! $users{$user}) { |
if (! $users{$user}) { |
1133 |
$loadSproutUser->Put($user, "SEED user"); |
$loadSproutUser->Put($user, "SEED user"); |
1185 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
1186 |
# Get the genome hash. |
# Get the genome hash. |
1187 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
1188 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1189 |
my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4); |
my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly); |
1190 |
my $loadSource = $self->_TableLoader('Source', $genomeCount * 4); |
my $loadSource = $self->_TableLoader('Source'); |
1191 |
my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8); |
my $loadSourceURL = $self->_TableLoader('SourceURL'); |
1192 |
Trace("Beginning source data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
1193 |
|
Trace("Loading from existing files.") if T(2); |
1194 |
|
} else { |
1195 |
|
Trace("Generating annotation data.") if T(2); |
1196 |
# Create hashes to collect the Source information. |
# Create hashes to collect the Source information. |
1197 |
my %sourceURL = (); |
my %sourceURL = (); |
1198 |
my %sourceDesc = (); |
my %sourceDesc = (); |
1222 |
for my $sourceID (keys %sourceDesc) { |
for my $sourceID (keys %sourceDesc) { |
1223 |
$loadSource->Put($sourceID, $sourceDesc{$sourceID}); |
$loadSource->Put($sourceID, $sourceDesc{$sourceID}); |
1224 |
} |
} |
1225 |
|
} |
1226 |
# Finish the load. |
# Finish the load. |
1227 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
1228 |
return $retVal; |
return $retVal; |
1262 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
1263 |
# Get the genome hash. |
# Get the genome hash. |
1264 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
1265 |
# Convert the genome hash. We'll get the genus and species for each genome and make |
# Convert the genome hash. We'll get the genus and species for each genome and make |
1266 |
# it the key. |
# it the key. |
1267 |
my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash}); |
my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash}); |
1268 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1269 |
my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000); |
my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc'); |
1270 |
my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000); |
my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg'); |
1271 |
Trace("Beginning external data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
1272 |
|
Trace("Loading from existing files.") if T(2); |
1273 |
|
} else { |
1274 |
|
Trace("Generating external data.") if T(2); |
1275 |
# We loop through the files one at a time. First, the organism file. |
# We loop through the files one at a time. First, the organism file. |
1276 |
Open(\*ORGS, "<$FIG_Config::global/ext_org.table"); |
Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |"); |
1277 |
my $orgLine; |
my $orgLine; |
1278 |
while (defined($orgLine = <ORGS>)) { |
while (defined($orgLine = <ORGS>)) { |
1279 |
# Clean the input line. |
# Clean the input line. |
1285 |
close ORGS; |
close ORGS; |
1286 |
# Now the function file. |
# Now the function file. |
1287 |
my $funcLine; |
my $funcLine; |
1288 |
Open(\*FUNCS, "<$FIG_Config::global/ext_func.table"); |
Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |"); |
1289 |
while (defined($funcLine = <FUNCS>)) { |
while (defined($funcLine = <FUNCS>)) { |
1290 |
# Clean the line ending. |
# Clean the line ending. |
1291 |
chomp $funcLine; |
chomp $funcLine; |
1301 |
$loadExternalAliasFunc->Put(@funcFields[0,1]); |
$loadExternalAliasFunc->Put(@funcFields[0,1]); |
1302 |
} |
} |
1303 |
} |
} |
1304 |
|
} |
1305 |
# Finish the load. |
# Finish the load. |
1306 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
1307 |
return $retVal; |
return $retVal; |
1318 |
|
|
1319 |
The following relations are loaded by this method. |
The following relations are loaded by this method. |
1320 |
|
|
1321 |
|
Reaction |
1322 |
ReactionURL |
ReactionURL |
1323 |
Compound |
Compound |
1324 |
CompoundName |
CompoundName |
1342 |
my ($self) = @_; |
my ($self) = @_; |
1343 |
# Get the FIG object. |
# Get the FIG object. |
1344 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
|
# Get the genome hash. |
|
|
my $genomeHash = $self->{genomes}; |
|
|
my $genomeCount = (keys %{$genomeHash}); |
|
1345 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1346 |
my $loadReactionURL = $self->_TableLoader('ReactionURL', $genomeCount * 4000); |
my $loadReaction = $self->_TableLoader('Reaction'); |
1347 |
my $loadCompound = $self->_TableLoader('Compound', $genomeCount * 4000); |
my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly); |
1348 |
my $loadCompoundName = $self->_TableLoader('CompoundName', $genomeCount * 8000); |
my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly); |
1349 |
my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $genomeCount * 4000); |
my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly); |
1350 |
my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $genomeCount * 12000); |
my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly); |
1351 |
Trace("Beginning reaction/compound data load.") if T(2); |
my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly); |
1352 |
# Create a hash to remember the compounds we've generated in the compound table. |
if ($self->{options}->{loadOnly}) { |
1353 |
my %compoundHash = (); |
Trace("Loading from existing files.") if T(2); |
1354 |
# Loop through the reactions. |
} else { |
1355 |
|
Trace("Generating annotation data.") if T(2); |
1356 |
|
# First we create the compounds. |
1357 |
|
my @compounds = $fig->all_compounds(); |
1358 |
|
for my $cid (@compounds) { |
1359 |
|
# Check for names. |
1360 |
|
my @names = $fig->names_of_compound($cid); |
1361 |
|
# Each name will be given a priority number, starting with 1. |
1362 |
|
my $prio = 1; |
1363 |
|
for my $name (@names) { |
1364 |
|
$loadCompoundName->Put($cid, $name, $prio++); |
1365 |
|
} |
1366 |
|
# Create the main compound record. Note that the first name |
1367 |
|
# becomes the label. |
1368 |
|
my $label = (@names > 0 ? $names[0] : $cid); |
1369 |
|
$loadCompound->Put($cid, $label); |
1370 |
|
# Check for a CAS ID. |
1371 |
|
my $cas = $fig->cas($cid); |
1372 |
|
if ($cas) { |
1373 |
|
$loadCompoundCAS->Put($cid, $cas); |
1374 |
|
} |
1375 |
|
} |
1376 |
|
# All the compounds are set up, so we need to loop through the reactions next. First, |
1377 |
|
# we initialize the discriminator index. This is a single integer used to insure |
1378 |
|
# duplicate elements in a reaction are not accidentally collapsed. |
1379 |
|
my $discrim = 0; |
1380 |
my @reactions = $fig->all_reactions(); |
my @reactions = $fig->all_reactions(); |
1381 |
for my $reactionID (@reactions) { |
for my $reactionID (@reactions) { |
1382 |
|
# Create the reaction record. |
1383 |
|
$loadReaction->Put($reactionID, $fig->reversible($reactionID)); |
1384 |
# Compute the reaction's URL. |
# Compute the reaction's URL. |
1385 |
my $url = HTML::reaction_link($reactionID); |
my $url = HTML::reaction_link($reactionID); |
1386 |
# Put it in the ReactionURL table. |
# Put it in the ReactionURL table. |
1389 |
# substrates first and then products. |
# substrates first and then products. |
1390 |
for my $product (0, 1) { |
for my $product (0, 1) { |
1391 |
# Get the compounds of the current type for the current reaction. FIG will |
# Get the compounds of the current type for the current reaction. FIG will |
1392 |
# give us 3-tuples: [ID, Stoichometry, main-flag]. At this time we do not |
# give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not |
1393 |
# have location data in SEED, so it defaults to the empty string. |
# have location data in SEED, so it defaults to the empty string. |
1394 |
my @compounds = $fig->reaction2comp($reactionID, $product); |
my @compounds = $fig->reaction2comp($reactionID, $product); |
1395 |
for my $compData (@compounds) { |
for my $compData (@compounds) { |
1396 |
# Extract the compound data from the current tuple. |
# Extract the compound data from the current tuple. |
1397 |
my ($cid, $stoich, $main) = @{$compData}; |
my ($cid, $stoich, $main) = @{$compData}; |
1398 |
# Link the compound to the reaction. |
# Link the compound to the reaction. |
1399 |
$loadIsAComponentOf->Put($cid, $reactionID, "", $main, $product, $stoich); |
$loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main, |
1400 |
# If this is a new compound, we need to create its table entries. |
$product, $stoich); |
|
if (! exists $compoundHash{$cid}) { |
|
|
$compoundHash{$cid} = 1; |
|
|
# Create the main compound record and denote we've done it. |
|
|
$loadCompound->Put($cid); |
|
|
# Check for a CAS ID. |
|
|
my $cas = $fig->cas($cid); |
|
|
if ($cas) { |
|
|
$loadCompoundCAS->Put($cid, $cas); |
|
|
} |
|
|
# Check for names. |
|
|
my @names = $fig->names_of_compound($cid); |
|
|
# Each name will be given a priority number, starting with 1. |
|
|
my $prio = 0; |
|
|
for my $name (@names) { |
|
|
$loadCompoundName->Put($cid, $name, $prio++); |
|
|
} |
|
1401 |
} |
} |
1402 |
} |
} |
1403 |
} |
} |
1417 |
|
|
1418 |
GenomeGroups |
GenomeGroups |
1419 |
|
|
1420 |
There is no direct support for genome groups in FIG, so we access the SEED |
Currently, we do not use groups. We used to use them for NMPDR groups, |
1421 |
|
butThere is no direct support for genome groups in FIG, so we access the SEED |
1422 |
files directly. |
files directly. |
1423 |
|
|
1424 |
=over 4 |
=over 4 |
1438 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
1439 |
# Get the genome hash. |
# Get the genome hash. |
1440 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
1441 |
# Create a load object for the table we're loading. |
# Create a load object for the table we're loading. |
1442 |
my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4); |
my $loadGenomeGroups = $self->_TableLoader('GenomeGroups'); |
1443 |
Trace("Beginning group data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
1444 |
|
Trace("Loading from existing files.") if T(2); |
1445 |
|
} else { |
1446 |
|
Trace("Generating group data.") if T(2); |
1447 |
|
# Currently there are no groups. |
1448 |
|
} |
1449 |
|
# Finish the load. |
1450 |
|
my $retVal = $self->_FinishAll(); |
1451 |
|
return $retVal; |
1452 |
|
} |
1453 |
|
|
1454 |
|
=head3 LoadSynonymData |
1455 |
|
|
1456 |
|
C<< my $stats = $spl->LoadSynonymData(); >> |
1457 |
|
|
1458 |
|
Load the synonym groups into Sprout. |
1459 |
|
|
1460 |
|
The following relations are loaded by this method. |
1461 |
|
|
1462 |
|
SynonymGroup |
1463 |
|
IsSynonymGroupFor |
1464 |
|
|
1465 |
|
The source information for these relations is taken from the C<maps_to_id> method |
1466 |
|
of the B<FIG> object. Unfortunately, to make this work, we need to use direct |
1467 |
|
SQL against the FIG database. |
1468 |
|
|
1469 |
|
=over 4 |
1470 |
|
|
1471 |
|
=item RETURNS |
1472 |
|
|
1473 |
|
Returns a statistics object for the loads. |
1474 |
|
|
1475 |
|
=back |
1476 |
|
|
1477 |
|
=cut |
1478 |
|
#: Return Type $%; |
1479 |
|
sub LoadSynonymData { |
1480 |
|
# Get this object instance. |
1481 |
|
my ($self) = @_; |
1482 |
|
# Get the FIG object. |
1483 |
|
my $fig = $self->{fig}; |
1484 |
|
# Get the genome hash. |
1485 |
|
my $genomeHash = $self->{genomes}; |
1486 |
|
# Create a load object for the table we're loading. |
1487 |
|
my $loadSynonymGroup = $self->_TableLoader('SynonymGroup'); |
1488 |
|
my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor'); |
1489 |
|
if ($self->{options}->{loadOnly}) { |
1490 |
|
Trace("Loading from existing files.") if T(2); |
1491 |
|
} else { |
1492 |
|
Trace("Generating synonym group data.") if T(2); |
1493 |
|
# Get the database handle. |
1494 |
|
my $dbh = $fig->db_handle(); |
1495 |
|
# Ask for the synonyms. |
1496 |
|
my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to"); |
1497 |
|
my $result = $sth->execute(); |
1498 |
|
if (! defined($result)) { |
1499 |
|
Confess("Database error in Synonym load: " . $sth->errstr()); |
1500 |
|
} else { |
1501 |
|
# Remember the current synonym. |
1502 |
|
my $current_syn = ""; |
1503 |
|
# Count the features. |
1504 |
|
my $featureCount = 0; |
1505 |
|
# Loop through the synonym/peg pairs. |
1506 |
|
while (my @row = $sth->fetchrow()) { |
1507 |
|
# Get the synonym ID and feature ID. |
1508 |
|
my ($syn_id, $peg) = @row; |
1509 |
|
# Insure it's for one of our genomes. |
1510 |
|
my $genomeID = FIG::genome_of($peg); |
1511 |
|
if (exists $genomeHash->{$genomeID}) { |
1512 |
|
# Verify the synonym. |
1513 |
|
if ($syn_id ne $current_syn) { |
1514 |
|
# It's new, so put it in the group table. |
1515 |
|
$loadSynonymGroup->Put($syn_id); |
1516 |
|
$current_syn = $syn_id; |
1517 |
|
} |
1518 |
|
# Connect the synonym to the peg. |
1519 |
|
$loadIsSynonymGroupFor->Put($syn_id, $peg); |
1520 |
|
# Count this feature. |
1521 |
|
$featureCount++; |
1522 |
|
if ($featureCount % 1000 == 0) { |
1523 |
|
Trace("$featureCount features processed.") if T(3); |
1524 |
|
} |
1525 |
|
} |
1526 |
|
} |
1527 |
|
} |
1528 |
|
} |
1529 |
|
# Finish the load. |
1530 |
|
my $retVal = $self->_FinishAll(); |
1531 |
|
return $retVal; |
1532 |
|
} |
1533 |
|
|
1534 |
|
=head3 LoadFamilyData |
1535 |
|
|
1536 |
|
C<< my $stats = $spl->LoadFamilyData(); >> |
1537 |
|
|
1538 |
|
Load the protein families into Sprout. |
1539 |
|
|
1540 |
|
The following relations are loaded by this method. |
1541 |
|
|
1542 |
|
Family |
1543 |
|
IsFamilyForFeature |
1544 |
|
|
1545 |
|
The source information for these relations is taken from the C<families_for_protein>, |
1546 |
|
C<family_function>, and C<sz_family> methods of the B<FIG> object. |
1547 |
|
|
1548 |
|
=over 4 |
1549 |
|
|
1550 |
|
=item RETURNS |
1551 |
|
|
1552 |
|
Returns a statistics object for the loads. |
1553 |
|
|
1554 |
|
=back |
1555 |
|
|
1556 |
|
=cut |
1557 |
|
#: Return Type $%; |
1558 |
|
sub LoadFamilyData { |
1559 |
|
# Get this object instance. |
1560 |
|
my ($self) = @_; |
1561 |
|
# Get the FIG object. |
1562 |
|
my $fig = $self->{fig}; |
1563 |
|
# Get the genome hash. |
1564 |
|
my $genomeHash = $self->{genomes}; |
1565 |
|
# Create load objects for the tables we're loading. |
1566 |
|
my $loadFamily = $self->_TableLoader('Family'); |
1567 |
|
my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature'); |
1568 |
|
if ($self->{options}->{loadOnly}) { |
1569 |
|
Trace("Loading from existing files.") if T(2); |
1570 |
|
} else { |
1571 |
|
Trace("Generating family data.") if T(2); |
1572 |
|
# Create a hash for the family IDs. |
1573 |
|
my %familyHash = (); |
1574 |
# Loop through the genomes. |
# Loop through the genomes. |
1575 |
my $line; |
for my $genomeID (sort keys %{$genomeHash}) { |
1576 |
for my $genomeID (keys %{$genomeHash}) { |
Trace("Processing features for $genomeID.") if T(2); |
1577 |
Trace("Processing $genomeID.") if T(3); |
# Loop through this genome's PEGs. |
1578 |
# Open the NMPDR group file for this genome. |
for my $fid ($fig->all_features($genomeID, "peg")) { |
1579 |
if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") && |
$loadIsFamilyForFeature->Add("features", 1); |
1580 |
defined($line = <TMP>)) { |
# Get this feature's families. |
1581 |
# Clean the line ending. |
my @families = $fig->families_for_protein($fid); |
1582 |
chomp $line; |
# Loop through the families, connecting them to the feature. |
1583 |
# Add the group to the table. Note that there can only be one group |
for my $family (@families) { |
1584 |
# per genome. |
$loadIsFamilyForFeature->Put($family, $fid); |
1585 |
$loadGenomeGroups->Put($genomeID, $line); |
# If this is a new family, create a record for it. |
1586 |
|
if (! exists $familyHash{$family}) { |
1587 |
|
$familyHash{$family} = 1; |
1588 |
|
$loadFamily->Add("families", 1); |
1589 |
|
my $size = $fig->sz_family($family); |
1590 |
|
my $func = $fig->family_function($family); |
1591 |
|
$loadFamily->Put($family, $size, $func); |
1592 |
|
} |
1593 |
|
} |
1594 |
|
} |
1595 |
} |
} |
|
close TMP; |
|
1596 |
} |
} |
1597 |
# Finish the load. |
# Finish the load. |
1598 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
1599 |
return $retVal; |
return $retVal; |
1600 |
} |
} |
1601 |
|
|
1602 |
|
=head3 LoadDrugData |
1603 |
|
|
1604 |
|
C<< my $stats = $spl->LoadDrugData(); >> |
1605 |
|
|
1606 |
|
Load the drug target data into Sprout. |
1607 |
|
|
1608 |
|
The following relations are loaded by this method. |
1609 |
|
|
1610 |
|
PDB |
1611 |
|
DocksWith |
1612 |
|
IsProteinForFeature |
1613 |
|
Ligand |
1614 |
|
|
1615 |
|
The source information for these relations is taken from attributes. The |
1616 |
|
C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>. |
1617 |
|
The C<zinc_name> attribute describes the ligands. The C<docking_results> |
1618 |
|
attribute contains the information for the B<DocksWith> relationship. It is |
1619 |
|
expected that additional attributes and tables will be added in the future. |
1620 |
|
|
1621 |
|
=over 4 |
1622 |
|
|
1623 |
|
=item RETURNS |
1624 |
|
|
1625 |
|
Returns a statistics object for the loads. |
1626 |
|
|
1627 |
|
=back |
1628 |
|
|
1629 |
|
=cut |
1630 |
|
#: Return Type $%; |
1631 |
|
sub LoadDrugData { |
1632 |
|
# Get this object instance. |
1633 |
|
my ($self) = @_; |
1634 |
|
# Get the FIG object. |
1635 |
|
my $fig = $self->{fig}; |
1636 |
|
# Get the genome hash. |
1637 |
|
my $genomeHash = $self->{genomes}; |
1638 |
|
# Create load objects for the tables we're loading. |
1639 |
|
my $loadPDB = $self->_TableLoader('PDB'); |
1640 |
|
my $loadLigand = $self->_TableLoader('Ligand'); |
1641 |
|
my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature'); |
1642 |
|
my $loadDocksWith = $self->_TableLoader('DocksWith'); |
1643 |
|
if ($self->{options}->{loadOnly}) { |
1644 |
|
Trace("Loading from existing files.") if T(2); |
1645 |
|
} else { |
1646 |
|
Trace("Generating drug target data.") if T(2); |
1647 |
|
# First comes the "DocksWith" relationship. This will give us a list of PDBs. |
1648 |
|
# We can also encounter PDBs when we process "IsProteinForFeature". To manage |
1649 |
|
# this process, PDB information is collected in a hash table and then |
1650 |
|
# unspooled after both relationships are created. |
1651 |
|
my %pdbHash = (); |
1652 |
|
Trace("Generating docking data.") if T(2); |
1653 |
|
# Get all the docking data. This may cause problems if there are too many PDBs, |
1654 |
|
# at which point we'll need another algorithm. The indicator that this is |
1655 |
|
# happening will be a timeout error in the next statement. |
1656 |
|
my @dockData = $fig->query_attributes('$key = ? AND $value < ?', |
1657 |
|
['docking_results', $FIG_Config::dockLimit]); |
1658 |
|
Trace(scalar(@dockData) . " rows of docking data found.") if T(3); |
1659 |
|
for my $dockData (@dockData) { |
1660 |
|
# Get the docking data components. |
1661 |
|
my ($pdbID, $docking_key, @valueData) = @{$dockData}; |
1662 |
|
# Fix the PDB ID. It's supposed to be lower-case, but this does not always happen. |
1663 |
|
$pdbID = lc $pdbID; |
1664 |
|
# Strip off the object type. |
1665 |
|
$pdbID =~ s/pdb://; |
1666 |
|
# Extract the ZINC ID from the docking key. Note that there are two possible |
1667 |
|
# formats. |
1668 |
|
my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/; |
1669 |
|
if (! $zinc_id) { |
1670 |
|
Trace("Invalid docking result key $docking_key for $pdbID.") if T(0); |
1671 |
|
$loadDocksWith->Add("errors"); |
1672 |
|
} else { |
1673 |
|
# Get the pieces of the value and parse the energy. |
1674 |
|
# Note that we don't care about the rank, since |
1675 |
|
# we can sort on the energy level itself in our database. |
1676 |
|
my ($energy, $tool, $type) = @valueData; |
1677 |
|
my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy; |
1678 |
|
# Ignore predicted results. |
1679 |
|
if ($type ne "Predicted") { |
1680 |
|
# Count this docking result. |
1681 |
|
if (! exists $pdbHash{$pdbID}) { |
1682 |
|
$pdbHash{$pdbID} = 1; |
1683 |
|
} else { |
1684 |
|
$pdbHash{$pdbID}++; |
1685 |
|
} |
1686 |
|
# Write the result to the output. |
1687 |
|
$loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool, |
1688 |
|
$total, $vanderwaals); |
1689 |
|
} |
1690 |
|
} |
1691 |
|
} |
1692 |
|
Trace("Connecting features.") if T(2); |
1693 |
|
# Loop through the genomes. |
1694 |
|
for my $genome (sort keys %{$genomeHash}) { |
1695 |
|
Trace("Generating PDBs for $genome.") if T(3); |
1696 |
|
# Get all of the PDBs that BLAST against this genome's features. |
1697 |
|
my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%'); |
1698 |
|
for my $pdbData (@attributeData) { |
1699 |
|
# The PDB ID is coded as a subkey. |
1700 |
|
if ($pdbData->[1] !~ /PDB::(.+)/i) { |
1701 |
|
Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0); |
1702 |
|
$loadPDB->Add("errors"); |
1703 |
|
} else { |
1704 |
|
my $pdbID = $1; |
1705 |
|
# Insure the PDB is in the hash. |
1706 |
|
if (! exists $pdbHash{$pdbID}) { |
1707 |
|
$pdbHash{$pdbID} = 0; |
1708 |
|
} |
1709 |
|
# The score and locations are coded in the attribute value. |
1710 |
|
if ($pdbData->[2] !~ /^([^;]+)(.*)$/) { |
1711 |
|
Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0); |
1712 |
|
$loadIsProteinForFeature->Add("errors"); |
1713 |
|
} else { |
1714 |
|
my ($score, $locData) = ($1,$2); |
1715 |
|
# The location data may not be present, so we have to start with some |
1716 |
|
# defaults and then check. |
1717 |
|
my ($start, $end) = (1, 0); |
1718 |
|
if ($locData) { |
1719 |
|
$locData =~ /(\d+)-(\d+)/; |
1720 |
|
$start = $1; |
1721 |
|
$end = $2; |
1722 |
|
} |
1723 |
|
# If we still don't have the end location, compute it from |
1724 |
|
# the feature length. |
1725 |
|
if (! $end) { |
1726 |
|
# Most features have one location, but we do a list iteration |
1727 |
|
# just in case. |
1728 |
|
my @locations = $fig->feature_location($pdbData->[0]); |
1729 |
|
$end = 0; |
1730 |
|
for my $loc (@locations) { |
1731 |
|
my $locObject = BasicLocation->new($loc); |
1732 |
|
$end += $locObject->Length; |
1733 |
|
} |
1734 |
|
} |
1735 |
|
# Decode the score. |
1736 |
|
my $realScore = FIGRules::DecodeScore($score); |
1737 |
|
# Connect the PDB to the feature. |
1738 |
|
$loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end); |
1739 |
|
} |
1740 |
|
} |
1741 |
|
} |
1742 |
|
} |
1743 |
|
# We've got all our PDBs now, so we unspool them from the hash. |
1744 |
|
Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2); |
1745 |
|
my $count = 0; |
1746 |
|
for my $pdbID (sort keys %pdbHash) { |
1747 |
|
$loadPDB->Put($pdbID, $pdbHash{$pdbID}); |
1748 |
|
$count++; |
1749 |
|
Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0); |
1750 |
|
} |
1751 |
|
# Finally we create the ligand table. This information can be found in the |
1752 |
|
# zinc_name attribute. |
1753 |
|
Trace("Loading ligands.") if T(2); |
1754 |
|
# The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates. |
1755 |
|
my $last_zinc_id = ""; |
1756 |
|
my $zinc_id = ""; |
1757 |
|
my $done = 0; |
1758 |
|
while (! $done) { |
1759 |
|
# Get the next 10000 ligands. We insist that the object ID is greater than |
1760 |
|
# the last ID we processed. |
1761 |
|
Trace("Loading batch starting with ZINC:$zinc_id.") if T(3); |
1762 |
|
my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000', |
1763 |
|
["ZINC:$zinc_id", "zinc_name"]); |
1764 |
|
Trace(scalar(@attributeData) . " attribute rows returned.") if T(3); |
1765 |
|
if (! @attributeData) { |
1766 |
|
# Here there are no attributes left, so we quit the loop. |
1767 |
|
$done = 1; |
1768 |
|
} else { |
1769 |
|
# Process the attribute data we've received. |
1770 |
|
for my $zinc_data (@attributeData) { |
1771 |
|
# The ZINC ID is found in the first return column, prefixed with the word ZINC. |
1772 |
|
if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) { |
1773 |
|
$zinc_id = $1; |
1774 |
|
# Check for a duplicate. |
1775 |
|
if ($zinc_id eq $last_zinc_id) { |
1776 |
|
$loadLigand->Add("duplicate"); |
1777 |
|
} else { |
1778 |
|
# Here it's safe to output the ligand. The ligand name is the attribute value |
1779 |
|
# (third column in the row). |
1780 |
|
$loadLigand->Put($zinc_id, $zinc_data->[2]); |
1781 |
|
# Insure we don't try to add this ID again. |
1782 |
|
$last_zinc_id = $zinc_id; |
1783 |
|
} |
1784 |
|
} else { |
1785 |
|
Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0); |
1786 |
|
$loadLigand->Add("errors"); |
1787 |
|
} |
1788 |
|
} |
1789 |
|
} |
1790 |
|
} |
1791 |
|
Trace("Ligands loaded.") if T(2); |
1792 |
|
} |
1793 |
|
# Finish the load. |
1794 |
|
my $retVal = $self->_FinishAll(); |
1795 |
|
return $retVal; |
1796 |
|
} |
1797 |
|
|
1798 |
|
|
1799 |
=head2 Internal Utility Methods |
=head2 Internal Utility Methods |
1800 |
|
|
1801 |
|
=head3 SpecialAttribute |
1802 |
|
|
1803 |
|
C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >> |
1804 |
|
|
1805 |
|
Look for special attributes of a given type. A special attribute is found by comparing one of |
1806 |
|
the columns of the incoming attribute list to a search pattern. If a match is found, then |
1807 |
|
a set of columns is put into an output table connected to the specified ID. |
1808 |
|
|
1809 |
|
For example, when processing features, the attribute list we look at has three columns: attribute |
1810 |
|
name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name |
1811 |
|
begins with C<iedb_>. The call signature is therefore |
1812 |
|
|
1813 |
|
my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB); |
1814 |
|
|
1815 |
|
The pattern is matched against column 0, and if we have a match, then column 2's value is put |
1816 |
|
to the output along with the specified feature ID. |
1817 |
|
|
1818 |
|
=over 4 |
1819 |
|
|
1820 |
|
=item id |
1821 |
|
|
1822 |
|
ID of the object whose special attributes are being loaded. This forms the first column of the |
1823 |
|
output. |
1824 |
|
|
1825 |
|
=item attributes |
1826 |
|
|
1827 |
|
Reference to a list of tuples. |
1828 |
|
|
1829 |
|
=item idxMatch |
1830 |
|
|
1831 |
|
Index in each tuple of the column to be matched against the pattern. If the match is |
1832 |
|
successful, an output record will be generated. |
1833 |
|
|
1834 |
|
=item idxValues |
1835 |
|
|
1836 |
|
Reference to a list containing the indexes in each tuple of the columns to be put as |
1837 |
|
the second column of the output. |
1838 |
|
|
1839 |
|
=item pattern |
1840 |
|
|
1841 |
|
Pattern to be matched against the specified column. The match will be case-insensitive. |
1842 |
|
|
1843 |
|
=item loader |
1844 |
|
|
1845 |
|
An object to which each output record will be put. Usually this is an B<ERDBLoad> object, |
1846 |
|
but technically it could be anything with a C<Put> method. |
1847 |
|
|
1848 |
|
=item RETURN |
1849 |
|
|
1850 |
|
Returns a count of the matches found. |
1851 |
|
|
1852 |
|
=item |
1853 |
|
|
1854 |
|
=back |
1855 |
|
|
1856 |
|
=cut |
1857 |
|
|
1858 |
|
sub SpecialAttribute { |
1859 |
|
# Get the parameters. |
1860 |
|
my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_; |
1861 |
|
# Declare the return variable. |
1862 |
|
my $retVal = 0; |
1863 |
|
# Loop through the attribute rows. |
1864 |
|
for my $row (@{$attributes}) { |
1865 |
|
# Check for a match. |
1866 |
|
if ($row->[$idxMatch] =~ m/$pattern/i) { |
1867 |
|
# We have a match, so output a row. This is a bit tricky, since we may |
1868 |
|
# be putting out multiple columns of data from the input. |
1869 |
|
my $value = join(" ", map { $row->[$_] } @{$idxValues}); |
1870 |
|
$loader->Put($id, $value); |
1871 |
|
$retVal++; |
1872 |
|
} |
1873 |
|
} |
1874 |
|
Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".") if T(4) && $retVal; |
1875 |
|
# Return the number of matches. |
1876 |
|
return $retVal; |
1877 |
|
} |
1878 |
|
|
1879 |
=head3 TableLoader |
=head3 TableLoader |
1880 |
|
|
1881 |
Create an ERDBLoad object for the specified table. The object is also added to |
Create an ERDBLoad object for the specified table. The object is also added to |
1890 |
|
|
1891 |
Name of the table (relation) being loaded. |
Name of the table (relation) being loaded. |
1892 |
|
|
1893 |
=item rowCount (optional) |
=item ignore |
1894 |
|
|
1895 |
Estimated maximum number of rows in the table. |
TRUE if the table should be ignored entirely, else FALSE. |
1896 |
|
|
1897 |
=item RETURN |
=item RETURN |
1898 |
|
|
1904 |
|
|
1905 |
sub _TableLoader { |
sub _TableLoader { |
1906 |
# Get the parameters. |
# Get the parameters. |
1907 |
my ($self, $tableName, $rowCount) = @_; |
my ($self, $tableName, $ignore) = @_; |
1908 |
# Create the load object. |
# Create the load object. |
1909 |
my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount); |
my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly, |
1910 |
|
$ignore); |
1911 |
# Cache it in the loader list. |
# Cache it in the loader list. |
1912 |
push @{$self->{loaders}}, $retVal; |
push @{$self->{loaders}}, $retVal; |
1913 |
# Return it to the caller. |
# Return it to the caller. |
1941 |
my $retVal = Stats->new(); |
my $retVal = Stats->new(); |
1942 |
# Get the loader list. |
# Get the loader list. |
1943 |
my $loadList = $self->{loaders}; |
my $loadList = $self->{loaders}; |
1944 |
|
# Create a hash to hold the statistics objects, keyed on relation name. |
1945 |
|
my %loaderHash = (); |
1946 |
# Loop through the list, finishing the loads. Note that if the finish fails, we die |
# Loop through the list, finishing the loads. Note that if the finish fails, we die |
1947 |
# ignominiously. At some future point, we want to make the loads restartable. |
# ignominiously. At some future point, we want to make the loads more restartable. |
1948 |
while (my $loader = pop @{$loadList}) { |
while (my $loader = pop @{$loadList}) { |
1949 |
|
# Get the relation name. |
1950 |
|
my $relName = $loader->RelName; |
1951 |
|
# Check the ignore flag. |
1952 |
|
if ($loader->Ignore) { |
1953 |
|
Trace("Relation $relName not loaded.") if T(2); |
1954 |
|
} else { |
1955 |
|
# Here we really need to finish. |
1956 |
|
Trace("Finishing $relName.") if T(2); |
1957 |
my $stats = $loader->Finish(); |
my $stats = $loader->Finish(); |
1958 |
|
$loaderHash{$relName} = $stats; |
1959 |
|
} |
1960 |
|
} |
1961 |
|
# Now we loop through again, actually loading the tables. We want to finish before |
1962 |
|
# loading so that if something goes wrong at this point, all the load files are usable |
1963 |
|
# and we don't have to redo all that work. |
1964 |
|
for my $relName (sort keys %loaderHash) { |
1965 |
|
# Get the statistics for this relation. |
1966 |
|
my $stats = $loaderHash{$relName}; |
1967 |
|
# Check for a database load. |
1968 |
|
if ($self->{options}->{dbLoad}) { |
1969 |
|
# Here we want to use the load file just created to load the database. |
1970 |
|
Trace("Loading relation $relName.") if T(2); |
1971 |
|
my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]); |
1972 |
|
# Accumulate the statistics from the DB load. |
1973 |
|
$stats->Accumulate($newStats); |
1974 |
|
} |
1975 |
$retVal->Accumulate($stats); |
$retVal->Accumulate($stats); |
|
my $relName = $loader->RelName; |
|
1976 |
Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); |
Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); |
1977 |
} |
} |
1978 |
# Return the load statistics. |
# Return the load statistics. |
1979 |
return $retVal; |
return $retVal; |
1980 |
} |
} |
1981 |
|
|
1982 |
|
=head3 GetGenomeAttributes |
1983 |
|
|
1984 |
|
C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >> |
1985 |
|
|
1986 |
|
Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related |
1987 |
|
attributes for all the features of a genome in a single call, then organizes them into |
1988 |
|
a hash. |
1989 |
|
|
1990 |
|
=over 4 |
1991 |
|
|
1992 |
|
=item fig |
1993 |
|
|
1994 |
|
FIG-like object for accessing attributes. |
1995 |
|
|
1996 |
|
=item genomeID |
1997 |
|
|
1998 |
|
ID of the genome who's attributes are desired. |
1999 |
|
|
2000 |
|
=item fids |
2001 |
|
|
2002 |
|
Reference to a list of the feature IDs whose attributes are to be kept. |
2003 |
|
|
2004 |
|
=item RETURN |
2005 |
|
|
2006 |
|
Returns a reference to a hash. The key of the hash is the feature ID. The value is the |
2007 |
|
reference to a list of the feature's attribute tuples. Each tuple contains the feature ID, |
2008 |
|
the attribute key, and one or more attribute values. |
2009 |
|
|
2010 |
|
=back |
2011 |
|
|
2012 |
|
=cut |
2013 |
|
|
2014 |
|
sub GetGenomeAttributes { |
2015 |
|
# Get the parameters. |
2016 |
|
my ($fig, $genomeID, $fids) = @_; |
2017 |
|
# Declare the return variable. |
2018 |
|
my $retVal = {}; |
2019 |
|
# Get a list of the attributes we care about. |
2020 |
|
my @propKeys = $fig->get_group_keys("NMPDR"); |
2021 |
|
# Get the attributes. |
2022 |
|
my @aList = $fig->get_attributes("fig|$genomeID%", \@propKeys); |
2023 |
|
# Initialize the hash. This not only enables us to easily determine which FIDs to |
2024 |
|
# keep, it insures that the caller sees a list reference for every known fid, |
2025 |
|
# simplifying the logic. |
2026 |
|
for my $fid (@{$fids}) { |
2027 |
|
$retVal->{$fid} = []; |
2028 |
|
} |
2029 |
|
# Populate the hash. |
2030 |
|
for my $aListEntry (@aList) { |
2031 |
|
my $fid = $aListEntry->[0]; |
2032 |
|
if (exists $retVal->{$fid}) { |
2033 |
|
push @{$retVal->{$fid}}, $aListEntry; |
2034 |
|
} |
2035 |
|
} |
2036 |
|
# Return the result. |
2037 |
|
return $retVal; |
2038 |
|
} |
2039 |
|
|
2040 |
1; |
1; |