10 |
use Sprout; |
use Sprout; |
11 |
use Stats; |
use Stats; |
12 |
use BasicLocation; |
use BasicLocation; |
13 |
|
use HTML; |
14 |
|
|
15 |
=head1 Sprout Load Methods |
=head1 Sprout Load Methods |
16 |
|
|
52 |
|
|
53 |
=head3 new |
=head3 new |
54 |
|
|
55 |
C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile); >> |
C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >> |
56 |
|
|
57 |
Construct a new Sprout Loader object, specifying the two participating databases and |
Construct a new Sprout Loader object, specifying the two participating databases and |
58 |
the names of the files containing the lists of genomes and subsystems to use. |
the names of the files containing the lists of genomes and subsystems to use. |
83 |
to a list of subsystem names. If nothing is specified, all known subsystems will be |
to a list of subsystem names. If nothing is specified, all known subsystems will be |
84 |
considered trusted. Only subsystem data related to the trusted subsystems is loaded. |
considered trusted. Only subsystem data related to the trusted subsystems is loaded. |
85 |
|
|
86 |
|
=item options |
87 |
|
|
88 |
|
Reference to a hash of command-line options. |
89 |
|
|
90 |
=back |
=back |
91 |
|
|
92 |
=cut |
=cut |
93 |
|
|
94 |
sub new { |
sub new { |
95 |
# Get the parameters. |
# Get the parameters. |
96 |
my ($class, $sprout, $fig, $genomeFile, $subsysFile) = @_; |
my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_; |
97 |
# Load the list of genomes into a hash. |
# Load the list of genomes into a hash. |
98 |
my %genomes; |
my %genomes; |
99 |
if (! defined($genomeFile) || $genomeFile eq '') { |
if (! defined($genomeFile) || $genomeFile eq '') { |
163 |
sprout => $sprout, |
sprout => $sprout, |
164 |
loadDirectory => $directory, |
loadDirectory => $directory, |
165 |
erdb => $sprout->{_erdb}, |
erdb => $sprout->{_erdb}, |
166 |
loaders => [] |
loaders => [], |
167 |
|
options => $options |
168 |
}; |
}; |
169 |
# Bless and return it. |
# Bless and return it. |
170 |
bless $retVal, $class; |
bless $retVal, $class; |
171 |
return $retVal; |
return $retVal; |
172 |
} |
} |
173 |
|
|
174 |
|
=head3 LoadOnly |
175 |
|
|
176 |
|
C<< my $flag = $spl->LoadOnly; >> |
177 |
|
|
178 |
|
Return TRUE if we are in load-only mode, else FALSE. |
179 |
|
|
180 |
|
=cut |
181 |
|
|
182 |
|
sub LoadOnly {
    # Report whether this loader is running in load-only mode.
    #
    # Takes the loader object and returns the value of the "loadOnly" entry
    # in its options hash, or a false value if no options are stored on the
    # object.
    my ($self) = @_;
    # Copy the options reference into a lexical first. Dereferencing
    # $self->{options}->{loadOnly} directly would autovivify an empty
    # options hash on the object whenever no options were supplied to
    # the constructor, and a read-only accessor should not modify it.
    my $options = $self->{options};
    return 0 if ! defined $options;
    return $options->{loadOnly};
}
186 |
|
|
187 |
=head3 LoadGenomeData |
=head3 LoadGenomeData |
188 |
|
|
189 |
C<< my $stats = $spl->LoadGenomeData(); >> |
C<< my $stats = $spl->LoadGenomeData(); >> |
229 |
# Get the genome count. |
# Get the genome count. |
230 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
231 |
my $genomeCount = (keys %{$genomeHash}); |
my $genomeCount = (keys %{$genomeHash}); |
|
Trace("Beginning genome data load.") if T(2); |
|
232 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
233 |
my $loadGenome = $self->_TableLoader('Genome', $genomeCount); |
my $loadGenome = $self->_TableLoader('Genome'); |
234 |
my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300); |
my $loadHasContig = $self->_TableLoader('HasContig'); |
235 |
my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300); |
my $loadContig = $self->_TableLoader('Contig'); |
236 |
my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000); |
my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf'); |
237 |
my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000); |
my $loadSequence = $self->_TableLoader('Sequence'); |
238 |
|
if ($self->{options}->{loadOnly}) { |
239 |
|
Trace("Loading from existing files.") if T(2); |
240 |
|
} else { |
241 |
|
Trace("Generating genome data.") if T(2); |
242 |
# Now we loop through the genomes, generating the data for each one. |
# Now we loop through the genomes, generating the data for each one. |
243 |
for my $genomeID (sort keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
244 |
Trace("Loading data for genome $genomeID.") if T(3); |
Trace("Generating data for genome $genomeID.") if T(3); |
245 |
$loadGenome->Add("genomeIn"); |
$loadGenome->Add("genomeIn"); |
246 |
# The access code comes in via the genome hash. |
# The access code comes in via the genome hash. |
247 |
my $accessCode = $genomeHash->{$genomeID}; |
my $accessCode = $genomeHash->{$genomeID}; |
284 |
} |
} |
285 |
} |
} |
286 |
} |
} |
287 |
|
} |
288 |
# Finish the loads. |
# Finish the loads. |
289 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
290 |
# Return the result. |
# Return the result. |
328 |
my $genomeCount = (keys %{$genomeFilter}); |
my $genomeCount = (keys %{$genomeFilter}); |
329 |
my $featureCount = $genomeCount * 4000; |
my $featureCount = $genomeCount * 4000; |
330 |
# Start the loads. |
# Start the loads. |
331 |
my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount); |
my $loadCoupling = $self->_TableLoader('Coupling'); |
332 |
my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000); |
my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy'); |
333 |
my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000); |
my $loadPCH = $self->_TableLoader('PCH'); |
334 |
my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000); |
my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling'); |
335 |
my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000); |
my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence'); |
336 |
Trace("Beginning coupling data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
337 |
|
Trace("Loading from existing files.") if T(2); |
338 |
|
} else { |
339 |
|
Trace("Generating coupling data.") if T(2); |
340 |
# Loop through the genomes found. |
# Loop through the genomes found. |
341 |
for my $genome (sort keys %{$genomeFilter}) { |
for my $genome (sort keys %{$genomeFilter}) { |
342 |
Trace("Generating coupling data for $genome.") if T(3); |
Trace("Generating coupling data for $genome.") if T(3); |
388 |
# We store this evidence in the hash if the usage |
# We store this evidence in the hash if the usage |
389 |
# is nonzero or no prior evidence has been found. This |
# is nonzero or no prior evidence has been found. This |
390 |
# ensures that if there is duplicate evidence, we |
# ensures that if there is duplicate evidence, we |
391 |
# at least keep the meaningful ones. Only evidence is |
# at least keep the meaningful ones. Only evidence in |
392 |
# the hash makes it to the output. |
# the hash makes it to the output. |
393 |
if ($usage || ! exists $evidenceMap{$evidenceKey}) { |
if ($usage || ! exists $evidenceMap{$evidenceKey}) { |
394 |
$evidenceMap{$evidenceKey} = $evidenceData; |
$evidenceMap{$evidenceKey} = $evidenceData; |
403 |
$loadIsEvidencedBy->Put($coupleID, $evidenceID); |
$loadIsEvidencedBy->Put($coupleID, $evidenceID); |
404 |
# Connect it to the features. |
# Connect it to the features. |
405 |
$loadUsesAsEvidence->Put($evidenceID, $peg3, 1); |
$loadUsesAsEvidence->Put($evidenceID, $peg3, 1); |
406 |
$loadUsesAsEvidence->Put($evidenceID, $peg4, 1); |
$loadUsesAsEvidence->Put($evidenceID, $peg4, 2); |
407 |
|
} |
408 |
} |
} |
409 |
} |
} |
410 |
} |
} |
449 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
450 |
# Get the table of genome IDs. |
# Get the table of genome IDs. |
451 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
|
my $featureCount = $genomeCount * 4000; |
|
452 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
453 |
my $loadFeature = $self->_TableLoader('Feature', $featureCount); |
my $loadFeature = $self->_TableLoader('Feature'); |
454 |
my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6); |
my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn'); |
455 |
my $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10); |
my $loadFeatureAlias = $self->_TableLoader('FeatureAlias'); |
456 |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount); |
my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
457 |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount); |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); |
458 |
my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount); |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); |
459 |
# Get the maximum sequence size. We need this later for splitting up the |
# Get the maximum sequence size. We need this later for splitting up the |
460 |
# locations. |
# locations. |
461 |
my $chunkSize = $self->{sprout}->MaxSegment(); |
my $chunkSize = $self->{sprout}->MaxSegment(); |
462 |
Trace("Beginning feature data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
463 |
|
Trace("Loading from existing files.") if T(2); |
464 |
|
} else { |
465 |
|
Trace("Generating feature data.") if T(2); |
466 |
# Now we loop through the genomes, generating the data for each one. |
# Now we loop through the genomes, generating the data for each one. |
467 |
for my $genomeID (sort keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
468 |
Trace("Loading features for genome $genomeID.") if T(3); |
Trace("Loading features for genome $genomeID.") if T(3); |
473 |
for my $featureData (@{$features}) { |
for my $featureData (@{$features}) { |
474 |
$loadFeature->Add("featureIn"); |
$loadFeature->Add("featureIn"); |
475 |
# Split the tuple. |
# Split the tuple. |
476 |
my ($featureID, $locations, $aliases, $type) = @{$featureData}; |
my ($featureID, $locations, undef, $type) = @{$featureData}; |
477 |
# Create the feature record. |
# Create the feature record. |
478 |
$loadFeature->Put($featureID, 1, $type); |
$loadFeature->Put($featureID, 1, $type); |
479 |
# Create the aliases. |
# Create the aliases. |
480 |
for my $alias (split /\s*,\s*/, $aliases) { |
for my $alias ($fig->feature_aliases($featureID)) { |
481 |
$loadFeatureAlias->Put($featureID, $alias); |
$loadFeatureAlias->Put($featureID, $alias); |
482 |
} |
} |
483 |
# Get the links. |
# Get the links. |
503 |
# the maximum segment size. This simplifies the genes_in_region processing |
# the maximum segment size. This simplifies the genes_in_region processing |
504 |
# for Sprout. |
# for Sprout. |
505 |
my @locationList = split /\s*,\s*/, $locations; |
my @locationList = split /\s*,\s*/, $locations; |
506 |
|
# Create the location position indicator. |
507 |
|
my $i = 1; |
508 |
# Loop through the locations. |
# Loop through the locations. |
509 |
for my $location (@locationList) { |
for my $location (@locationList) { |
510 |
# Parse the location. |
# Parse the location. |
511 |
my $locObject = BasicLocation->new($location); |
my $locObject = BasicLocation->new("$genomeID:$location"); |
512 |
# Split it into a list of chunks. |
# Split it into a list of chunks. |
513 |
my @locOList = (); |
my @locOList = (); |
514 |
while (my $peeling = $locObject->Peel($chunkSize)) { |
while (my $peeling = $locObject->Peel($chunkSize)) { |
518 |
push @locOList, $locObject; |
push @locOList, $locObject; |
519 |
# Loop through the chunks, creating IsLocatedIn records. The variable |
# Loop through the chunks, creating IsLocatedIn records. The variable |
520 |
# "$i" will be used to keep the location index. |
# "$i" will be used to keep the location index. |
|
my $i = 1; |
|
521 |
for my $locChunk (@locOList) { |
for my $locChunk (@locOList) { |
522 |
$loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left, |
$loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left, |
523 |
$locChunk->Dir, $locChunk->Length, $i); |
$locChunk->Dir, $locChunk->Length, $i); |
526 |
} |
} |
527 |
} |
} |
528 |
} |
} |
529 |
|
} |
530 |
# Finish the loads. |
# Finish the loads. |
531 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
532 |
return $retVal; |
return $retVal; |
563 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
564 |
# Get the table of genome IDs. |
# Get the table of genome IDs. |
565 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
|
my $featureCount = $genomeCount * 4000; |
|
566 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
567 |
my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf', |
my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf'); |
568 |
$featureCount * $genomeCount); |
if ($self->{options}->{loadOnly}) { |
569 |
Trace("Beginning BBH load.") if T(2); |
Trace("Loading from existing files.") if T(2); |
570 |
|
} else { |
571 |
|
Trace("Generating BBH data.") if T(2); |
572 |
# Now we loop through the genomes, generating the data for each one. |
# Now we loop through the genomes, generating the data for each one. |
573 |
for my $genomeID (sort keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
574 |
$loadIsBidirectionalBestHitOf->Add("genomeIn"); |
$loadIsBidirectionalBestHitOf->Add("genomeIn"); |
594 |
} |
} |
595 |
} |
} |
596 |
} |
} |
597 |
|
} |
598 |
# Finish the loads. |
# Finish the loads. |
599 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
600 |
return $retVal; |
return $retVal; |
616 |
|
|
617 |
Subsystem |
Subsystem |
618 |
Role |
Role |
619 |
|
RoleEC |
620 |
SSCell |
SSCell |
621 |
ContainsFeature |
ContainsFeature |
622 |
IsGenomeOf |
IsGenomeOf |
624 |
OccursInSubsystem |
OccursInSubsystem |
625 |
ParticipatesIn |
ParticipatesIn |
626 |
HasSSCell |
HasSSCell |
627 |
|
ConsistsOfRoles |
628 |
|
RoleSubset |
629 |
|
HasRoleSubset |
630 |
|
ConsistsOfGenomes |
631 |
|
GenomeSubset |
632 |
|
HasGenomeSubset |
633 |
|
Catalyzes |
634 |
|
Diagram |
635 |
|
RoleOccursIn |
636 |
|
|
637 |
=over 4 |
=over 4 |
638 |
|
|
642 |
|
|
643 |
=back |
=back |
644 |
|
|
|
B<TO DO> |
|
|
|
|
|
Generate RoleName table? |
|
|
|
|
645 |
=cut |
=cut |
646 |
#: Return Type $%; |
#: Return Type $%; |
647 |
sub LoadSubsystemData { |
sub LoadSubsystemData { |
655 |
# Get the subsystem hash. This lists the subsystems we'll process. |
# Get the subsystem hash. This lists the subsystems we'll process. |
656 |
my $subsysHash = $self->{subsystems}; |
my $subsysHash = $self->{subsystems}; |
657 |
my @subsysIDs = sort keys %{$subsysHash}; |
my @subsysIDs = sort keys %{$subsysHash}; |
658 |
my $subsysCount = @subsysIDs; |
# Get the map list. |
659 |
my $genomeCount = (keys %{$genomeHash}); |
my @maps = $fig->all_maps; |
|
my $featureCount = $genomeCount * 4000; |
|
660 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
661 |
my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount); |
my $loadDiagram = $self->_TableLoader('Diagram'); |
662 |
my $loadRole = $self->_TableLoader('Role', $featureCount * 6); |
my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn'); |
663 |
my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount); |
my $loadSubsystem = $self->_TableLoader('Subsystem'); |
664 |
my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount); |
my $loadRole = $self->_TableLoader('Role'); |
665 |
my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount); |
my $loadRoleEC = $self->_TableLoader('RoleEC'); |
666 |
my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount); |
my $loadCatalyzes = $self->_TableLoader('Catalyzes'); |
667 |
my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6); |
my $loadSSCell = $self->_TableLoader('SSCell'); |
668 |
my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount); |
my $loadContainsFeature = $self->_TableLoader('ContainsFeature'); |
669 |
my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount); |
my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf'); |
670 |
Trace("Beginning subsystem data load.") if T(2); |
my $loadIsRoleOf = $self->_TableLoader('IsRoleOf'); |
671 |
|
my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem'); |
672 |
|
my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn'); |
673 |
|
my $loadHasSSCell = $self->_TableLoader('HasSSCell'); |
674 |
|
my $loadRoleSubset = $self->_TableLoader('RoleSubset'); |
675 |
|
my $loadGenomeSubset = $self->_TableLoader('GenomeSubset'); |
676 |
|
my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles'); |
677 |
|
my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes'); |
678 |
|
my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset'); |
679 |
|
my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset'); |
680 |
|
if ($self->{options}->{loadOnly}) { |
681 |
|
Trace("Loading from existing files.") if T(2); |
682 |
|
} else { |
683 |
|
Trace("Generating subsystem data.") if T(2); |
684 |
|
# This hash will contain the role for each EC. When we're done, this |
685 |
|
# information will be used to generate the Catalyzes table. |
686 |
|
my %ecToRoles = (); |
687 |
# Loop through the subsystems. Our first task will be to create the |
# Loop through the subsystems. Our first task will be to create the |
688 |
# roles. We do this by looping through the subsystems and creating a |
# roles. We do this by looping through the subsystems and creating a |
689 |
# role hash. The hash tracks each role ID so that we don't create |
# role hash. The hash tracks each role ID so that we don't create |
690 |
# duplicates. As we move along, we'll connect the roles and subsystems. |
# duplicates. As we move along, we'll connect the roles and subsystems |
691 |
|
# and memorize the reactions. |
692 |
|
my ($genomeID, $roleID); |
693 |
my %roleData = (); |
my %roleData = (); |
694 |
for my $subsysID (@subsysIDs) { |
for my $subsysID (@subsysIDs) { |
695 |
Trace("Creating subsystem $subsysID.") if T(3); |
Trace("Creating subsystem $subsysID.") if T(3); |
696 |
$loadSubsystem->Add("subsystemIn"); |
$loadSubsystem->Add("subsystemIn"); |
697 |
|
# Get the subsystem object. |
698 |
|
my $sub = $fig->get_subsystem($subsysID); |
699 |
# Create the subsystem record. |
# Create the subsystem record. |
700 |
$loadSubsystem->Put($subsysID); |
my $curator = $sub->get_curator(); |
701 |
# Get the subsystem's roles. |
my $notes = $sub->get_notes(); |
702 |
my @roles = $fig->subsystem_to_roles($subsysID); |
$loadSubsystem->Put($subsysID, $curator, $notes); |
703 |
# Connect the roles to the subsystem. If a role is new, we create |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
704 |
# a role record for it. |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
705 |
for my $roleID (@roles) { |
# Connect to this role. |
706 |
$loadOccursInSubsystem->Add("roleIn"); |
$loadOccursInSubsystem->Add("roleIn"); |
707 |
$loadOccursInSubsystem->Put($roleID, $subsysID); |
$loadOccursInSubsystem->Put($roleID, $subsysID, $col); |
708 |
|
# If it's a new role, add it to the role table. |
709 |
if (! exists $roleData{$roleID}) { |
if (! exists $roleData{$roleID}) { |
710 |
$loadRole->Put($roleID); |
# Get the role's abbreviation. |
711 |
|
my $abbr = $sub->get_role_abbr($col); |
712 |
|
# Add the role. |
713 |
|
$loadRole->Put($roleID, $abbr); |
714 |
$roleData{$roleID} = 1; |
$roleData{$roleID} = 1; |
715 |
|
# Check for an EC number. |
716 |
|
if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) { |
717 |
|
my $ec = $1; |
718 |
|
$loadRoleEC->Put($roleID, $ec); |
719 |
|
$ecToRoles{$ec} = $roleID; |
720 |
|
} |
721 |
} |
} |
722 |
} |
} |
723 |
# Now all roles for this subsystem have been filled in. We create the |
# Now we create the spreadsheet for the subsystem by matching roles to |
724 |
# spreadsheet by matches roles to genomes. To do this, we need to |
# genomes. Each genome is a row and each role is a column. We may need |
725 |
# get the genomes on the sheet. |
# to actually create the roles as we find them. |
726 |
Trace("Creating subsystem $subsysID spreadsheet.") if T(3); |
Trace("Creating subsystem $subsysID spreadsheet.") if T(3); |
727 |
my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)}; |
for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) { |
728 |
for my $genomeID (@genomes) { |
# Only proceed if this is one of our genomes. |
|
# Only process this genome if it's one of ours. |
|
729 |
if (exists $genomeHash->{$genomeID}) { |
if (exists $genomeHash->{$genomeID}) { |
730 |
# Connect the genome to the subsystem. |
# Count the PEGs and cells found for verification purposes. |
731 |
$loadParticipatesIn->Put($genomeID, $subsysID); |
my $pegCount = 0; |
732 |
|
my $cellCount = 0; |
733 |
|
# Create a list for the PEGs we find. This list will be used |
734 |
|
# to generate cluster numbers. |
735 |
|
my @pegsFound = (); |
736 |
|
# Create a hash that maps spreadsheet IDs to PEGs. We will |
737 |
|
# use this to generate the ContainsFeature data after we have |
738 |
|
# the cluster numbers. |
739 |
|
my %cellPegs = (); |
740 |
|
# Get the genome's variant code for this subsystem. |
741 |
|
my $variantCode = $sub->get_variant_code($row); |
742 |
# Loop through the subsystem's roles. We use an index because it is |
# Loop through the subsystem's roles. We use an index because it is |
743 |
# part of the spreadsheet cell ID. |
# part of the spreadsheet cell ID. |
744 |
for (my $i = 0; $i <= $#roles; $i++) { |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
|
my $role = $roles[$i]; |
|
745 |
# Get the features in the spreadsheet cell for this genome and role. |
# Get the features in the spreadsheet cell for this genome and role. |
746 |
my @pegs = $fig->pegs_in_subsystem_cell($subsysID, $genomeID, $i); |
my @pegs = $sub->get_pegs_from_cell($row, $col); |
747 |
# Only proceed if features exist. |
# Only proceed if features exist. |
748 |
if (@pegs > 0) { |
if (@pegs > 0) { |
749 |
# Create the spreadsheet cell. |
# Create the spreadsheet cell. |
750 |
my $cellID = "$subsysID:$genomeID:$i"; |
$cellCount++; |
751 |
|
my $cellID = "$subsysID:$genomeID:$col"; |
752 |
$loadSSCell->Put($cellID); |
$loadSSCell->Put($cellID); |
753 |
$loadIsGenomeOf->Put($genomeID, $cellID); |
$loadIsGenomeOf->Put($genomeID, $cellID); |
754 |
$loadIsRoleOf->Put($role, $cellID); |
$loadIsRoleOf->Put($roleID, $cellID); |
755 |
$loadHasSSCell->Put($subsysID, $cellID); |
$loadHasSSCell->Put($subsysID, $cellID); |
756 |
# Attach the features to it. |
# Remember its features. |
757 |
for my $pegID (@pegs) { |
push @pegsFound, @pegs; |
758 |
$loadContainsFeature->Put($cellID, $pegID); |
$cellPegs{$cellID} = \@pegs; |
759 |
|
$pegCount += @pegs; |
760 |
|
} |
761 |
|
} |
762 |
|
# If we found some cells for this genome, we need to compute clusters and |
763 |
|
# denote it participates in the subsystem. |
764 |
|
if ($pegCount > 0) { |
765 |
|
Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3); |
766 |
|
$loadParticipatesIn->Put($genomeID, $subsysID, $variantCode); |
767 |
|
# Partition the PEGs found into clusters. |
768 |
|
my @clusters = $fig->compute_clusters(\@pegsFound, $sub); |
769 |
|
# Create a hash mapping PEG IDs to cluster numbers. |
770 |
|
# We default to -1 for all of them. |
771 |
|
my %clusterOf = map { $_ => -1 } @pegsFound; |
772 |
|
for (my $i = 0; $i <= $#clusters; $i++) { |
773 |
|
my $subList = $clusters[$i]; |
774 |
|
for my $peg (@{$subList}) { |
775 |
|
$clusterOf{$peg} = $i; |
776 |
|
} |
777 |
|
} |
778 |
|
# Create the ContainsFeature data. |
779 |
|
for my $cellID (keys %cellPegs) { |
780 |
|
my $cellList = $cellPegs{$cellID}; |
781 |
|
for my $cellPeg (@$cellList) { |
782 |
|
$loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg}); |
783 |
} |
} |
784 |
} |
} |
785 |
} |
} |
786 |
} |
} |
787 |
} |
} |
788 |
|
# Now we need to generate the subsets. The subset names must be concatenated to |
789 |
|
# the subsystem name to make them unique keys. There are two types of subsets: |
790 |
|
# genome subsets and role subsets. We do the role subsets first. |
791 |
|
my @subsetNames = $sub->get_subset_names(); |
792 |
|
for my $subsetID (@subsetNames) { |
793 |
|
# Create the subset record. |
794 |
|
my $actualID = "$subsysID:$subsetID"; |
795 |
|
$loadRoleSubset->Put($actualID); |
796 |
|
# Connect the subset to the subsystem. |
797 |
|
$loadHasRoleSubset->Put($subsysID, $actualID); |
798 |
|
# Connect the subset to its roles. |
799 |
|
my @roles = $sub->get_subset($subsetID); |
800 |
|
for my $roleID (@roles) { |
801 |
|
$loadConsistsOfRoles->Put($actualID, $roleID); |
802 |
} |
} |
|
# Finish the load. |
|
|
my $retVal = $self->_FinishAll(); |
|
|
return $retVal; |
|
803 |
} |
} |
804 |
|
# Next the genome subsets. |
805 |
=head3 LoadDiagramData |
@subsetNames = $sub->get_subset_namesR(); |
806 |
|
for my $subsetID (@subsetNames) { |
807 |
C<< my $stats = $spl->LoadDiagramData(); >> |
# Create the subset record. |
808 |
|
my $actualID = "$subsysID:$subsetID"; |
809 |
Load the diagram data from FIG into Sprout. |
$loadGenomeSubset->Put($actualID); |
810 |
|
# Connect the subset to the subsystem. |
811 |
Diagrams are used to organize functional roles. The diagram shows the |
$loadHasGenomeSubset->Put($subsysID, $actualID); |
812 |
connections between chemicals that interact with a subsystem. |
# Connect the subset to its genomes. |
813 |
|
my @genomes = $sub->get_subsetR($subsetID); |
814 |
The following relations are loaded by this method. |
for my $genomeID (@genomes) { |
815 |
|
$loadConsistsOfGenomes->Put($actualID, $genomeID); |
816 |
Diagram |
} |
817 |
RoleOccursIn |
} |
818 |
|
} |
819 |
=over 4 |
# Now we loop through the diagrams. We need to create the diagram records |
820 |
|
# and link each diagram to its roles. Note that only roles which occur |
821 |
=item RETURNS |
# in subsystems (and therefore appear in the %ecToRoles hash) are |
822 |
|
# included. |
823 |
Returns a statistics object for the loads. |
for my $map (@maps) { |
|
|
|
|
=back |
|
|
|
|
|
=cut |
|
|
#: Return Type $%; |
|
|
sub LoadDiagramData { |
|
|
# Get this object instance. |
|
|
my ($self) = @_; |
|
|
# Get the FIG object. |
|
|
my $fig = $self->{fig}; |
|
|
# Get the map list. |
|
|
my @maps = $fig->all_maps; |
|
|
my $mapCount = @maps; |
|
|
my $genomeCount = (keys %{$self->{genomes}}); |
|
|
my $featureCount = $genomeCount * 4000; |
|
|
# Create load objects for each of the tables we're loading. |
|
|
my $loadDiagram = $self->_TableLoader('Diagram', $mapCount); |
|
|
my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6); |
|
|
Trace("Beginning diagram data load.") if T(2); |
|
|
# Loop through the diagrams. |
|
|
for my $map ($fig->all_maps) { |
|
824 |
Trace("Loading diagram $map.") if T(3); |
Trace("Loading diagram $map.") if T(3); |
825 |
# Get the diagram's descriptive name. |
# Get the diagram's descriptive name. |
826 |
my $name = $fig->map_name($map); |
my $name = $fig->map_name($map); |
829 |
# A hash is used to prevent duplicates. |
# A hash is used to prevent duplicates. |
830 |
my %roleHash = (); |
my %roleHash = (); |
831 |
for my $role ($fig->map_to_ecs($map)) { |
for my $role ($fig->map_to_ecs($map)) { |
832 |
if (! $roleHash{$role}) { |
if (exists $ecToRoles{$role} && ! $roleHash{$role}) { |
833 |
$loadRoleOccursIn->Put($role, $map); |
$loadRoleOccursIn->Put($ecToRoles{$role}, $map); |
834 |
$roleHash{$role} = 1; |
$roleHash{$role} = 1; |
835 |
} |
} |
836 |
} |
} |
837 |
} |
} |
838 |
|
# Before we leave, we must create the Catalyzes table. We start with the reactions, |
839 |
|
# then use the "ecToRoles" table to convert EC numbers to role IDs. |
840 |
|
my @reactions = $fig->all_reactions(); |
841 |
|
for my $reactionID (@reactions) { |
842 |
|
# Get this reaction's list of roles. The results will be EC numbers. |
843 |
|
my @roles = $fig->catalyzed_by($reactionID); |
844 |
|
# Loop through the roles, creating catalyzation records. |
845 |
|
for my $thisRole (@roles) { |
846 |
|
if (exists $ecToRoles{$thisRole}) { |
847 |
|
$loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID); |
848 |
|
} |
849 |
|
} |
850 |
|
} |
851 |
|
} |
852 |
# Finish the load. |
# Finish the load. |
853 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
854 |
return $retVal; |
return $retVal; |
890 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
891 |
# Get the genome hash. |
# Get the genome hash. |
892 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
893 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
894 |
my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500); |
my $loadProperty = $self->_TableLoader('Property'); |
895 |
my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500); |
my $loadHasProperty = $self->_TableLoader('HasProperty'); |
896 |
Trace("Beginning property data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
897 |
|
Trace("Loading from existing files.") if T(2); |
898 |
|
} else { |
899 |
|
Trace("Generating property data.") if T(2); |
900 |
# Create a hash for storing property IDs. |
# Create a hash for storing property IDs. |
901 |
my %propertyKeys = (); |
my %propertyKeys = (); |
902 |
my $nextID = 1; |
my $nextID = 1; |
903 |
# Loop through the genomes. |
# Loop through the genomes. |
904 |
for my $genomeID (keys %{$genomeHash}) { |
for my $genomeID (keys %{$genomeHash}) { |
905 |
$loadProperty->Add("genomeIn"); |
$loadProperty->Add("genomeIn"); |
906 |
|
Trace("Generating properties for $genomeID.") if T(3); |
907 |
# Get the genome's features. The feature ID is the first field in the |
# Get the genome's features. The feature ID is the first field in the |
908 |
# tuples returned by "all_features_detailed". We use "all_features_detailed" |
# tuples returned by "all_features_detailed". We use "all_features_detailed" |
909 |
# rather than "all_features" because we want all features regardless of type. |
# rather than "all_features" because we want all features regardless of type. |
910 |
my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)}; |
my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)}; |
911 |
|
my $featureCount = 0; |
912 |
|
my $propertyCount = 0; |
913 |
# Loop through the features, creating HasProperty records. |
# Loop through the features, creating HasProperty records. |
914 |
for my $fid (@features) { |
for my $fid (@features) { |
|
$loadProperty->Add("featureIn"); |
|
915 |
# Get all attributes for this feature. We do this one feature at a time |
# Get all attributes for this feature. We do this one feature at a time |
916 |
# to ensure we do not get any genome attributes. |
# to ensure we do not get any genome attributes. |
917 |
my @attributeList = $fig->get_attributes($fid, '', '', ''); |
my @attributeList = $fig->get_attributes($fid, '', '', ''); |
918 |
|
if (scalar @attributeList) { |
919 |
|
$featureCount++; |
920 |
|
} |
921 |
# Loop through the attributes. |
# Loop through the attributes. |
922 |
for my $tuple (@attributeList) { |
for my $tuple (@attributeList) { |
923 |
|
$propertyCount++; |
924 |
# Get this attribute value's data. Note that we throw away the FID, |
# Get this attribute value's data. Note that we throw away the FID, |
925 |
# since it will always be the same as the value if "$fid". |
# since it will always be the same as the value if "$fid". |
926 |
my (undef, $key, $value, $url) = @{$tuple}; |
my (undef, $key, $value, $url) = @{$tuple}; |
942 |
$loadHasProperty->Put($fid, $propertyID, $url); |
$loadHasProperty->Put($fid, $propertyID, $url); |
943 |
} |
} |
944 |
} |
} |
945 |
|
# Update the statistics. |
946 |
|
Trace("$propertyCount attributes processed for $featureCount features.") if T(3); |
947 |
|
$loadHasProperty->Add("featuresIn", $featureCount); |
948 |
|
$loadHasProperty->Add("propertiesIn", $propertyCount); |
949 |
|
} |
950 |
} |
} |
951 |
# Finish the load. |
# Finish the load. |
952 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
987 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
988 |
# Get the genome hash. |
# Get the genome hash. |
989 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
990 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
991 |
my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000); |
my $loadAnnotation = $self->_TableLoader('Annotation'); |
992 |
my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000); |
my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation'); |
993 |
my $loadSproutUser = $self->_TableLoader('SproutUser', 100); |
my $loadSproutUser = $self->_TableLoader('SproutUser'); |
994 |
my $loadUserAccess = $self->_TableLoader('UserAccess', 1000); |
my $loadUserAccess = $self->_TableLoader('UserAccess'); |
995 |
my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000); |
my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation'); |
996 |
Trace("Beginning annotation data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
997 |
|
Trace("Loading from existing files.") if T(2); |
998 |
|
} else { |
999 |
|
Trace("Generating annotation data.") if T(2); |
1000 |
# Create a hash of user names. We'll use this to prevent us from generating duplicate |
# Create a hash of user names. We'll use this to prevent us from generating duplicate |
1001 |
# user records. |
# user records. |
1002 |
my %users = ( FIG => 1, master => 1 ); |
my %users = ( FIG => 1, master => 1 ); |
1017 |
# Create a hash of timestamps. We use this to prevent duplicate time stamps |
# Create a hash of timestamps. We use this to prevent duplicate time stamps |
1018 |
# from showing up for a single PEG's annotations. |
# from showing up for a single PEG's annotations. |
1019 |
my %seenTimestamps = (); |
my %seenTimestamps = (); |
1020 |
# Check for a functional assignment. |
# Loop through the annotations. |
|
my $func = $fig->function_of($peg); |
|
|
if ($func) { |
|
|
# If this is NOT a hypothetical assignment, we create an |
|
|
# assignment annotation for it. |
|
|
if (! FIG::hypo($peg)) { |
|
|
# Note that we double the slashes so that what goes into the database is |
|
|
# a new-line escape sequence rather than an actual new-line. |
|
|
$loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func"); |
|
|
$loadIsTargetOfAnnotation->Put($peg, "$peg:$time"); |
|
|
$loadMadeAnnotation->Put("FIG", "$peg:$time"); |
|
|
# Denote we've seen this timestamp. |
|
|
$seenTimestamps{$time} = 1; |
|
|
} |
|
|
# Now loop through the real annotations. |
|
1021 |
for my $tuple ($fig->feature_annotations($peg, "raw")) { |
for my $tuple ($fig->feature_annotations($peg, "raw")) { |
1022 |
my ($fid, $timestamp, $user, $text) = @{$tuple}; |
my ($fid, $timestamp, $user, $text) = @{$tuple}; |
1023 |
# Here we fix up the annotation text. "\r" is removed, |
# Here we fix up the annotation text. "\r" is removed, |
1031 |
$text =~ s/Set master function/Set FIG function/s; |
$text =~ s/Set master function/Set FIG function/s; |
1032 |
# Insure the time stamp is valid. |
# Insure the time stamp is valid. |
1033 |
if ($timestamp =~ /^\d+$/) { |
if ($timestamp =~ /^\d+$/) { |
1034 |
# Here it's a number. We need to insure it's unique. |
# Here it's a number. We need to insure the one we use to form |
1035 |
while ($seenTimestamps{$timestamp}) { |
# the key is unique. |
1036 |
$timestamp++; |
my $keyStamp = $timestamp; |
1037 |
|
while ($seenTimestamps{$keyStamp}) { |
1038 |
|
$keyStamp++; |
1039 |
} |
} |
1040 |
$seenTimestamps{$timestamp} = 1; |
$seenTimestamps{$keyStamp} = 1; |
1041 |
my $annotationID = "$peg:$timestamp"; |
my $annotationID = "$peg:$keyStamp"; |
1042 |
# Insure the user exists. |
# Insure the user exists. |
1043 |
if (! $users{$user}) { |
if (! $users{$user}) { |
1044 |
$loadSproutUser->Put($user, "SEED user"); |
$loadSproutUser->Put($user, "SEED user"); |
1046 |
$users{$user} = 1; |
$users{$user} = 1; |
1047 |
} |
} |
1048 |
# Generate the annotation. |
# Generate the annotation. |
1049 |
$loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text"); |
$loadAnnotation->Put($annotationID, $timestamp, $text); |
1050 |
$loadIsTargetOfAnnotation->Put($peg, $annotationID); |
$loadIsTargetOfAnnotation->Put($peg, $annotationID); |
1051 |
$loadMadeAnnotation->Put($user, $annotationID); |
$loadMadeAnnotation->Put($user, $annotationID); |
1052 |
} else { |
} else { |
1097 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
1098 |
# Get the genome hash. |
# Get the genome hash. |
1099 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
1100 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1101 |
my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4); |
my $loadComesFrom = $self->_TableLoader('ComesFrom'); |
1102 |
my $loadSource = $self->_TableLoader('Source', $genomeCount * 4); |
my $loadSource = $self->_TableLoader('Source'); |
1103 |
my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8); |
my $loadSourceURL = $self->_TableLoader('SourceURL'); |
1104 |
Trace("Beginning source data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
1105 |
|
Trace("Loading from existing files.") if T(2); |
1106 |
|
} else { |
1107 |
|
Trace("Generating annotation data.") if T(2); |
1108 |
# Create hashes to collect the Source information. |
# Create hashes to collect the Source information. |
1109 |
my %sourceURL = (); |
my %sourceURL = (); |
1110 |
my %sourceDesc = (); |
my %sourceDesc = (); |
1118 |
chomp $line; |
chomp $line; |
1119 |
my($sourceID, $desc, $url) = split(/\t/,$line); |
my($sourceID, $desc, $url) = split(/\t/,$line); |
1120 |
$loadComesFrom->Put($genomeID, $sourceID); |
$loadComesFrom->Put($genomeID, $sourceID); |
1121 |
if ($url && ! exists $sourceURL{$genomeID}) { |
if ($url && ! exists $sourceURL{$sourceID}) { |
1122 |
$loadSourceURL->Put($sourceID, $url); |
$loadSourceURL->Put($sourceID, $url); |
1123 |
$sourceURL{$sourceID} = 1; |
$sourceURL{$sourceID} = 1; |
1124 |
} |
} |
1125 |
if ($desc && ! exists $sourceDesc{$sourceID}) { |
if ($desc) { |
1126 |
$loadSource->Put($sourceID, $desc); |
$sourceDesc{$sourceID} = $desc; |
1127 |
$sourceDesc{$sourceID} = 1; |
} elsif (! exists $sourceDesc{$sourceID}) { |
1128 |
|
$sourceDesc{$sourceID} = $sourceID; |
1129 |
} |
} |
1130 |
} |
} |
1131 |
close TMP; |
close TMP; |
1132 |
} |
} |
1133 |
|
# Write the source descriptions. |
1134 |
|
for my $sourceID (keys %sourceDesc) { |
1135 |
|
$loadSource->Put($sourceID, $sourceDesc{$sourceID}); |
1136 |
|
} |
1137 |
|
} |
1138 |
# Finish the load. |
# Finish the load. |
1139 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
1140 |
return $retVal; |
return $retVal; |
1174 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
1175 |
# Get the genome hash. |
# Get the genome hash. |
1176 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
1177 |
# Convert the genome hash. We'll get the genus and species for each genome and make |
# Convert the genome hash. We'll get the genus and species for each genome and make |
1178 |
# it the key. |
# it the key. |
1179 |
my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash}); |
my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash}); |
1180 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1181 |
my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000); |
my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc'); |
1182 |
my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000); |
my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg'); |
1183 |
Trace("Beginning external data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
1184 |
|
Trace("Loading from existing files.") if T(2); |
1185 |
|
} else { |
1186 |
|
Trace("Generating external data.") if T(2); |
1187 |
# We loop through the files one at a time. First, the organism file. |
# We loop through the files one at a time. First, the organism file. |
1188 |
Open(\*ORGS, "<$FIG_Config::global/ext_org.table"); |
Open(\*ORGS, "<$FIG_Config::global/ext_org.table"); |
1189 |
my $orgLine; |
my $orgLine; |
1213 |
$loadExternalAliasFunc->Put(@funcFields[0,1]); |
$loadExternalAliasFunc->Put(@funcFields[0,1]); |
1214 |
} |
} |
1215 |
} |
} |
1216 |
|
} |
1217 |
|
# Finish the load. |
1218 |
|
my $retVal = $self->_FinishAll(); |
1219 |
|
return $retVal; |
1220 |
|
} |
1221 |
|
|
1222 |
|
|
1223 |
|
=head3 LoadReactionData |
1224 |
|
|
1225 |
|
C<< my $stats = $spl->LoadReactionData(); >> |
1226 |
|
|
1227 |
|
Load the reaction data from FIG into Sprout. |
1228 |
|
|
1229 |
|
Reaction data connects reactions to the compounds that participate in them. |
1230 |
|
|
1231 |
|
The following relations are loaded by this method. |
1232 |
|
|
1233 |
|
Reaction |
1234 |
|
ReactionURL |
1235 |
|
Compound |
1236 |
|
CompoundName |
1237 |
|
CompoundCAS |
1238 |
|
IsAComponentOf |
1239 |
|
|
1240 |
|
This method proceeds reaction by reaction rather than genome by genome. |
1241 |
|
|
1242 |
|
=over 4 |
1243 |
|
|
1244 |
|
=item RETURNS |
1245 |
|
|
1246 |
|
Returns a statistics object for the loads. |
1247 |
|
|
1248 |
|
=back |
1249 |
|
|
1250 |
|
=cut |
1251 |
|
#: Return Type $%; |
1252 |
|
sub LoadReactionData { |
1253 |
|
# Get this object instance. |
1254 |
|
my ($self) = @_; |
1255 |
|
# Get the FIG object. |
1256 |
|
my $fig = $self->{fig}; |
1257 |
|
# Create load objects for each of the tables we're loading. |
1258 |
|
my $loadReaction = $self->_TableLoader('Reaction'); |
1259 |
|
my $loadReactionURL = $self->_TableLoader('ReactionURL'); |
1260 |
|
my $loadCompound = $self->_TableLoader('Compound'); |
1261 |
|
my $loadCompoundName = $self->_TableLoader('CompoundName'); |
1262 |
|
my $loadCompoundCAS = $self->_TableLoader('CompoundCAS'); |
1263 |
|
my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf'); |
1264 |
|
if ($self->{options}->{loadOnly}) { |
1265 |
|
Trace("Loading from existing files.") if T(2); |
1266 |
|
} else { |
1267 |
|
Trace("Generating annotation data.") if T(2); |
1268 |
|
# First we create the compounds. |
1269 |
|
my @compounds = $fig->all_compounds(); |
1270 |
|
for my $cid (@compounds) { |
1271 |
|
# Check for names. |
1272 |
|
my @names = $fig->names_of_compound($cid); |
1273 |
|
# Each name will be given a priority number, starting with 1. |
1274 |
|
my $prio = 1; |
1275 |
|
for my $name (@names) { |
1276 |
|
$loadCompoundName->Put($cid, $name, $prio++); |
1277 |
|
} |
1278 |
|
# Create the main compound record. Note that the first name |
1279 |
|
# becomes the label. |
1280 |
|
my $label = (@names > 0 ? $names[0] : $cid); |
1281 |
|
$loadCompound->Put($cid, $label); |
1282 |
|
# Check for a CAS ID. |
1283 |
|
my $cas = $fig->cas($cid); |
1284 |
|
if ($cas) { |
1285 |
|
$loadCompoundCAS->Put($cid, $cas); |
1286 |
|
} |
1287 |
|
} |
1288 |
|
# All the compounds are set up, so we need to loop through the reactions next. First, |
1289 |
|
# we initialize the discriminator index. This is a single integer used to insure |
1290 |
|
# duplicate elements in a reaction are not accidentally collapsed. |
1291 |
|
my $discrim = 0; |
1292 |
|
my @reactions = $fig->all_reactions(); |
1293 |
|
for my $reactionID (@reactions) { |
1294 |
|
# Create the reaction record. |
1295 |
|
$loadReaction->Put($reactionID, $fig->reversible($reactionID)); |
1296 |
|
# Compute the reaction's URL. |
1297 |
|
my $url = HTML::reaction_link($reactionID); |
1298 |
|
# Put it in the ReactionURL table. |
1299 |
|
$loadReactionURL->Put($reactionID, $url); |
1300 |
|
# Now we need all of the reaction's compounds. We get these in two phases, |
1301 |
|
# substrates first and then products. |
1302 |
|
for my $product (0, 1) { |
1303 |
|
# Get the compounds of the current type for the current reaction. FIG will |
1304 |
|
# give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not |
1305 |
|
# have location data in SEED, so it defaults to the empty string. |
1306 |
|
my @compounds = $fig->reaction2comp($reactionID, $product); |
1307 |
|
for my $compData (@compounds) { |
1308 |
|
# Extract the compound data from the current tuple. |
1309 |
|
my ($cid, $stoich, $main) = @{$compData}; |
1310 |
|
# Link the compound to the reaction. |
1311 |
|
$loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main, |
1312 |
|
$product, $stoich); |
1313 |
|
} |
1314 |
|
} |
1315 |
|
} |
1316 |
|
} |
1317 |
# Finish the load. |
# Finish the load. |
1318 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
1319 |
return $retVal; |
return $retVal; |
1349 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
1350 |
# Get the genome hash. |
# Get the genome hash. |
1351 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
|
my $genomeCount = (keys %{$genomeHash}); |
|
1352 |
# Create a load object for the table we're loading. |
# Create a load object for the table we're loading. |
1353 |
my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4); |
my $loadGenomeGroups = $self->_TableLoader('GenomeGroups'); |
1354 |
Trace("Beginning group data load.") if T(2); |
if ($self->{options}->{loadOnly}) { |
1355 |
|
Trace("Loading from existing files.") if T(2); |
1356 |
|
} else { |
1357 |
|
Trace("Generating group data.") if T(2); |
1358 |
# Loop through the genomes. |
# Loop through the genomes. |
1359 |
my $line; |
my $line; |
1360 |
for my $genomeID (keys %{$genomeHash}) { |
for my $genomeID (keys %{$genomeHash}) { |
1370 |
} |
} |
1371 |
close TMP; |
close TMP; |
1372 |
} |
} |
1373 |
|
} |
1374 |
# Finish the load. |
# Finish the load. |
1375 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
1376 |
return $retVal; |
return $retVal; |
1392 |
|
|
1393 |
Name of the table (relation) being loaded. |
Name of the table (relation) being loaded. |
1394 |
|
|
|
=item rowCount (optional) |
|
|
|
|
|
Estimated maximum number of rows in the table. |
|
|
|
|
1395 |
=item RETURN |
=item RETURN |
1396 |
|
|
1397 |
Returns an ERDBLoad object for loading the specified table. |
Returns an ERDBLoad object for loading the specified table. |
1402 |
|
|
1403 |
sub _TableLoader { |
sub _TableLoader { |
1404 |
# Get the parameters. |
# Get the parameters. |
1405 |
my ($self, $tableName, $rowCount) = @_; |
my ($self, $tableName, $loadOnly) = @_; |
1406 |
# Create the load object. |
# Create the load object. |
1407 |
my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount); |
my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly); |
1408 |
# Cache it in the loader list. |
# Cache it in the loader list. |
1409 |
push @{$self->{loaders}}, $retVal; |
push @{$self->{loaders}}, $retVal; |
1410 |
# Return it to the caller. |
# Return it to the caller. |
1441 |
# Loop through the list, finishing the loads. Note that if the finish fails, we die |
# Loop through the list, finishing the loads. Note that if the finish fails, we die |
1442 |
# ignominiously. At some future point, we want to make the loads restartable. |
# ignominiously. At some future point, we want to make the loads restartable. |
1443 |
while (my $loader = pop @{$loadList}) { |
while (my $loader = pop @{$loadList}) { |
1444 |
|
# Trace the fact that we're cleaning up. |
1445 |
|
my $relName = $loader->RelName; |
1446 |
|
Trace("Finishing $relName.") if T(2); |
1447 |
my $stats = $loader->Finish(); |
my $stats = $loader->Finish(); |
1448 |
|
if ($self->{options}->{dbLoad}) { |
1449 |
|
# Here we want to use the load file just created to load the database. |
1450 |
|
Trace("Loading relation $relName.") if T(2); |
1451 |
|
my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]); |
1452 |
|
# Accumulate the statistics from the DB load. |
1453 |
|
$stats->Accumulate($newStats); |
1454 |
|
} |
1455 |
$retVal->Accumulate($stats); |
$retVal->Accumulate($stats); |
|
my $relName = $loader->RelName; |
|
1456 |
Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); |
Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); |
1457 |
} |
} |
1458 |
# Return the load statistics. |
# Return the load statistics. |