30 |
$stats->Accumulate($spl->LoadFeatureData()); |
$stats->Accumulate($spl->LoadFeatureData()); |
31 |
print $stats->Show(); |
print $stats->Show(); |
32 |
|
|
|
This module makes use of the internal Sprout property C<_erdb>. |
|
|
|
|
33 |
It is worth noting that the FIG object does not need to be a real one. Any object |
It is worth noting that the FIG object does not need to be a real one. Any object |
34 |
that implements the FIG methods for data retrieval could be used. So, for example, |
that implements the FIG methods for data retrieval could be used. So, for example, |
35 |
this object could be used to copy data from one Sprout database to another, or |
this object could be used to copy data from one Sprout database to another, or |
78 |
=item subsysFile |
=item subsysFile |
79 |
|
|
80 |
Either the name of the file containing the list of trusted subsystems or a reference |
Either the name of the file containing the list of trusted subsystems or a reference |
81 |
to a list of subsystem names. If nothing is specified, all known subsystems will be |
to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be |
82 |
considered trusted. Only subsystem data related to the trusted subsystems is loaded. |
considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR> |
83 |
|
in its data directory.) Only subsystem data related to the trusted subsystems is loaded. |
84 |
|
|
85 |
=item options |
=item options |
86 |
|
|
93 |
sub new { |
sub new { |
94 |
# Get the parameters. |
# Get the parameters. |
95 |
my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_; |
my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_; |
96 |
# Load the list of genomes into a hash. |
# Create the genome hash. |
97 |
my %genomes; |
my %genomes = (); |
98 |
|
# We only need it if load-only is NOT specified. |
99 |
|
if (! $options->{loadOnly}) { |
100 |
if (! defined($genomeFile) || $genomeFile eq '') { |
if (! defined($genomeFile) || $genomeFile eq '') { |
101 |
# Here we want all the complete genomes and an access code of 1. |
# Here we want all the complete genomes and an access code of 1. |
102 |
my @genomeList = $fig->genomes(1); |
my @genomeList = $fig->genomes(1); |
130 |
Confess("Invalid genome parameter ($type) in SproutLoad constructor."); |
Confess("Invalid genome parameter ($type) in SproutLoad constructor."); |
131 |
} |
} |
132 |
} |
} |
133 |
|
} |
134 |
# Load the list of trusted subsystems. |
# Load the list of trusted subsystems. |
135 |
my %subsystems = (); |
my %subsystems = (); |
136 |
|
# We only need it if load-only is NOT specified. |
137 |
|
if (! $options->{loadOnly}) { |
138 |
if (! defined $subsysFile || $subsysFile eq '') { |
if (! defined $subsysFile || $subsysFile eq '') { |
139 |
# Here we want all the subsystems. |
# Here we want all the NMPDR subsystems. First we get the whole list. |
140 |
%subsystems = map { $_ => 1 } $fig->all_subsystems(); |
my @subs = $fig->all_subsystems(); |
141 |
|
# Loop through, checking for the NMPDR file. |
142 |
|
for my $sub (@subs) { |
143 |
|
if (-e "$FIG_Config::data/Subsystems/$sub/NMPDR") { |
144 |
|
$subsystems{$sub} = 1; |
145 |
|
} |
146 |
|
} |
147 |
} else { |
} else { |
148 |
my $type = ref $subsysFile; |
my $type = ref $subsysFile; |
149 |
if ($type eq 'ARRAY') { |
if ($type eq 'ARRAY') { |
163 |
Confess("Invalid subsystem parameter in SproutLoad constructor."); |
Confess("Invalid subsystem parameter in SproutLoad constructor."); |
164 |
} |
} |
165 |
} |
} |
166 |
|
} |
167 |
# Get the data directory from the Sprout object. |
# Get the data directory from the Sprout object. |
168 |
my ($directory) = $sprout->LoadInfo(); |
my ($directory) = $sprout->LoadInfo(); |
169 |
# Create the Sprout load object. |
# Create the Sprout load object. |
173 |
subsystems => \%subsystems, |
subsystems => \%subsystems, |
174 |
sprout => $sprout, |
sprout => $sprout, |
175 |
loadDirectory => $directory, |
loadDirectory => $directory, |
176 |
erdb => $sprout->{_erdb}, |
erdb => $sprout, |
177 |
loaders => [], |
loaders => [], |
178 |
options => $options |
options => $options |
179 |
}; |
}; |
195 |
return $self->{options}->{loadOnly}; |
return $self->{options}->{loadOnly}; |
196 |
} |
} |
197 |
|
|
198 |
|
=head3 PrimaryOnly |
199 |
|
|
200 |
|
C<< my $flag = $spl->PrimaryOnly; >> |
201 |
|
|
202 |
|
Return TRUE if only the main entity is to be loaded, else FALSE. |
203 |
|
|
204 |
|
=cut |
205 |
|
|
206 |
|
# PrimaryOnly
#
# Accessor for the "primaryOnly" load option.
#
# Call as:  my $flag = $spl->PrimaryOnly;
#
# Returns the raw value stored under the primaryOnly key of this object's
# options hash: TRUE if only the main entity is to be loaded, else FALSE.
# The value is handed back exactly as stored (it is not normalized to 0/1),
# and is undef when the option was never supplied.
sub PrimaryOnly {
    # Get this object instance.
    my ($self) = @_;
    # Pull the flag out of the options hash given to the constructor.
    my $options = $self->{options};
    return $options->{primaryOnly};
}
210 |
|
|
211 |
=head3 LoadGenomeData |
=head3 LoadGenomeData |
212 |
|
|
213 |
C<< my $stats = $spl->LoadGenomeData(); >> |
C<< my $stats = $spl->LoadGenomeData(); >> |
235 |
|
|
236 |
=back |
=back |
237 |
|
|
|
B<TO DO> |
|
|
|
|
|
Real quality vectors instead of C<unknown> for everything. |
|
|
|
|
|
GenomeGroup relation. (The original script took group information from the C<NMPDR> file |
|
|
in each genome's main directory, but no such file exists anywhere in my version of the |
|
|
data store.) |
|
|
|
|
238 |
=cut |
=cut |
239 |
#: Return Type $%; |
#: Return Type $%; |
240 |
sub LoadGenomeData { |
sub LoadGenomeData { |
247 |
my $genomeCount = (keys %{$genomeHash}); |
my $genomeCount = (keys %{$genomeHash}); |
248 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
249 |
my $loadGenome = $self->_TableLoader('Genome'); |
my $loadGenome = $self->_TableLoader('Genome'); |
250 |
my $loadHasContig = $self->_TableLoader('HasContig'); |
my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly); |
251 |
my $loadContig = $self->_TableLoader('Contig'); |
my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly); |
252 |
my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf'); |
my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly); |
253 |
my $loadSequence = $self->_TableLoader('Sequence'); |
my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly); |
254 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
255 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
256 |
} else { |
} else { |
261 |
$loadGenome->Add("genomeIn"); |
$loadGenome->Add("genomeIn"); |
262 |
# The access code comes in via the genome hash. |
# The access code comes in via the genome hash. |
263 |
my $accessCode = $genomeHash->{$genomeID}; |
my $accessCode = $genomeHash->{$genomeID}; |
264 |
# Get the genus, species, and strain from the scientific name. Note that we append |
# Get the genus, species, and strain from the scientific name. |
|
# the genome ID to the strain. In some cases this is the totality of the strain name. |
|
265 |
my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID); |
my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID); |
266 |
my $extra = join " ", @extraData, "[$genomeID]"; |
my $extra = join " ", @extraData; |
267 |
# Get the full taxonomy. |
# Get the full taxonomy. |
268 |
my $taxonomy = $fig->taxonomy_of($genomeID); |
my $taxonomy = $fig->taxonomy_of($genomeID); |
269 |
# Output the genome record. |
# Output the genome record. |
344 |
my $featureCount = $genomeCount * 4000; |
my $featureCount = $genomeCount * 4000; |
345 |
# Start the loads. |
# Start the loads. |
346 |
my $loadCoupling = $self->_TableLoader('Coupling'); |
my $loadCoupling = $self->_TableLoader('Coupling'); |
347 |
my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy'); |
my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly); |
348 |
my $loadPCH = $self->_TableLoader('PCH'); |
my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly); |
349 |
my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling'); |
my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly); |
350 |
my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence'); |
my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly); |
351 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
352 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
353 |
} else { |
} else { |
375 |
for my $coupleData (@couplings) { |
for my $coupleData (@couplings) { |
376 |
my ($peg2, $score) = @{$coupleData}; |
my ($peg2, $score) = @{$coupleData}; |
377 |
# Compute the coupling ID. |
# Compute the coupling ID. |
378 |
my $coupleID = Sprout::CouplingID($peg1, $peg2); |
my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2); |
379 |
if (! exists $dupHash{$coupleID}) { |
if (! exists $dupHash{$coupleID}) { |
380 |
$loadCoupling->Add("couplingIn"); |
$loadCoupling->Add("couplingIn"); |
381 |
# Here we have a new coupling to store in the load files. |
# Here we have a new coupling to store in the load files. |
446 |
FeatureTranslation |
FeatureTranslation |
447 |
FeatureUpstream |
FeatureUpstream |
448 |
IsLocatedIn |
IsLocatedIn |
449 |
|
HasFeature |
450 |
|
|
451 |
=over 4 |
=over 4 |
452 |
|
|
467 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
468 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
469 |
my $loadFeature = $self->_TableLoader('Feature'); |
my $loadFeature = $self->_TableLoader('Feature'); |
470 |
my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn'); |
my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly); |
471 |
my $loadFeatureAlias = $self->_TableLoader('FeatureAlias'); |
my $loadFeatureAlias = $self->_TableLoader('FeatureAlias'); |
472 |
my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
473 |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); |
474 |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); |
475 |
|
my $loadHasFeature = $self->_TableLoader('HasFeature'); |
476 |
# Get the maximum sequence size. We need this later for splitting up the |
# Get the maximum sequence size. We need this later for splitting up the |
477 |
# locations. |
# locations. |
478 |
my $chunkSize = $self->{sprout}->MaxSegment(); |
my $chunkSize = $self->{sprout}->MaxSegment(); |
493 |
my ($featureID, $locations, undef, $type) = @{$featureData}; |
my ($featureID, $locations, undef, $type) = @{$featureData}; |
494 |
# Create the feature record. |
# Create the feature record. |
495 |
$loadFeature->Put($featureID, 1, $type); |
$loadFeature->Put($featureID, 1, $type); |
496 |
|
# Link it to the parent genome. |
497 |
|
$loadHasFeature->Put($genomeID, $featureID, $type); |
498 |
# Create the aliases. |
# Create the aliases. |
499 |
for my $alias ($fig->feature_aliases($featureID)) { |
for my $alias ($fig->feature_aliases($featureID)) { |
500 |
$loadFeatureAlias->Put($featureID, $alias); |
$loadFeatureAlias->Put($featureID, $alias); |
634 |
The following relations are loaded by this method. |
The following relations are loaded by this method. |
635 |
|
|
636 |
Subsystem |
Subsystem |
637 |
|
SubsystemClass |
638 |
Role |
Role |
639 |
RoleEC |
RoleEC |
640 |
SSCell |
SSCell |
678 |
# Get the map list. |
# Get the map list. |
679 |
my @maps = $fig->all_maps; |
my @maps = $fig->all_maps; |
680 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
681 |
my $loadDiagram = $self->_TableLoader('Diagram'); |
my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly); |
682 |
my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn'); |
my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly); |
683 |
my $loadSubsystem = $self->_TableLoader('Subsystem'); |
my $loadSubsystem = $self->_TableLoader('Subsystem'); |
684 |
my $loadRole = $self->_TableLoader('Role'); |
my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly); |
685 |
my $loadRoleEC = $self->_TableLoader('RoleEC'); |
my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly); |
686 |
my $loadCatalyzes = $self->_TableLoader('Catalyzes'); |
my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly); |
687 |
my $loadSSCell = $self->_TableLoader('SSCell'); |
my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly); |
688 |
my $loadContainsFeature = $self->_TableLoader('ContainsFeature'); |
my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly); |
689 |
my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf'); |
my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly); |
690 |
my $loadIsRoleOf = $self->_TableLoader('IsRoleOf'); |
my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly); |
691 |
my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem'); |
my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly); |
692 |
my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn'); |
my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly); |
693 |
my $loadHasSSCell = $self->_TableLoader('HasSSCell'); |
my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly); |
694 |
my $loadRoleSubset = $self->_TableLoader('RoleSubset'); |
my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly); |
695 |
my $loadGenomeSubset = $self->_TableLoader('GenomeSubset'); |
my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly); |
696 |
my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles'); |
my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly); |
697 |
my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes'); |
my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly); |
698 |
my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset'); |
my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly); |
699 |
my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset'); |
my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly); |
700 |
|
my $loadSubsystemClass = $self->_TableLoader('SubsystemClass', $self->PrimaryOnly); |
701 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
702 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
703 |
} else { |
} else { |
713 |
my ($genomeID, $roleID); |
my ($genomeID, $roleID); |
714 |
my %roleData = (); |
my %roleData = (); |
715 |
for my $subsysID (@subsysIDs) { |
for my $subsysID (@subsysIDs) { |
|
Trace("Creating subsystem $subsysID.") if T(3); |
|
|
$loadSubsystem->Add("subsystemIn"); |
|
716 |
# Get the subsystem object. |
# Get the subsystem object. |
717 |
my $sub = $fig->get_subsystem($subsysID); |
my $sub = $fig->get_subsystem($subsysID); |
718 |
|
# Only proceed if the subsystem has a spreadsheet. |
719 |
|
if (! $sub->{empty_ss}) { |
720 |
|
Trace("Creating subsystem $subsysID.") if T(3); |
721 |
|
$loadSubsystem->Add("subsystemIn"); |
722 |
# Create the subsystem record. |
# Create the subsystem record. |
723 |
my $curator = $sub->get_curator(); |
my $curator = $sub->get_curator(); |
724 |
my $notes = $sub->get_notes(); |
my $notes = $sub->get_notes(); |
725 |
$loadSubsystem->Put($subsysID, $curator, $notes); |
$loadSubsystem->Put($subsysID, $curator, $notes); |
726 |
|
my $class = $fig->subsystem_classification($subsysID); |
727 |
|
if ($class) { |
728 |
|
$loadSubsystemClass->Put($subsysID, $class); |
729 |
|
} |
730 |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
731 |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
732 |
# Connect to this role. |
# Connect to this role. |
770 |
# part of the spreadsheet cell ID. |
# part of the spreadsheet cell ID. |
771 |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
772 |
# Get the features in the spreadsheet cell for this genome and role. |
# Get the features in the spreadsheet cell for this genome and role. |
773 |
my @pegs = $sub->get_pegs_from_cell($row, $col); |
my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col); |
774 |
# Only proceed if features exist. |
# Only proceed if features exist. |
775 |
if (@pegs > 0) { |
if (@pegs > 0) { |
776 |
# Create the spreadsheet cell. |
# Create the spreadsheet cell. |
791 |
if ($pegCount > 0) { |
if ($pegCount > 0) { |
792 |
Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3); |
Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3); |
793 |
$loadParticipatesIn->Put($genomeID, $subsysID, $variantCode); |
$loadParticipatesIn->Put($genomeID, $subsysID, $variantCode); |
|
# Partition the PEGs found into clusters. |
|
|
my @clusters = $fig->compute_clusters(\@pegsFound, $sub); |
|
794 |
# Create a hash mapping PEG IDs to cluster numbers. |
# Create a hash mapping PEG IDs to cluster numbers. |
795 |
# We default to -1 for all of them. |
# We default to -1 for all of them. |
796 |
my %clusterOf = map { $_ => -1 } @pegsFound; |
my %clusterOf = map { $_ => -1 } @pegsFound; |
797 |
|
# Partition the PEGs found into clusters. |
798 |
|
my @clusters = $fig->compute_clusters([keys %clusterOf], $sub); |
799 |
for (my $i = 0; $i <= $#clusters; $i++) { |
for (my $i = 0; $i <= $#clusters; $i++) { |
800 |
my $subList = $clusters[$i]; |
my $subList = $clusters[$i]; |
801 |
for my $peg (@{$subList}) { |
for my $peg (@{$subList}) { |
823 |
# Connect the subset to the subsystem. |
# Connect the subset to the subsystem. |
824 |
$loadHasRoleSubset->Put($subsysID, $actualID); |
$loadHasRoleSubset->Put($subsysID, $actualID); |
825 |
# Connect the subset to its roles. |
# Connect the subset to its roles. |
826 |
my @roles = $sub->get_subset($subsetID); |
my @roles = $sub->get_subsetC_roles($subsetID); |
827 |
for my $roleID (@roles) { |
for my $roleID (@roles) { |
828 |
$loadConsistsOfRoles->Put($actualID, $roleID); |
$loadConsistsOfRoles->Put($actualID, $roleID); |
829 |
} |
} |
876 |
} |
} |
877 |
} |
} |
878 |
} |
} |
879 |
|
} |
880 |
# Finish the load. |
# Finish the load. |
881 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
882 |
return $retVal; |
return $retVal; |
920 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
921 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
922 |
my $loadProperty = $self->_TableLoader('Property'); |
my $loadProperty = $self->_TableLoader('Property'); |
923 |
my $loadHasProperty = $self->_TableLoader('HasProperty'); |
my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly); |
924 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
925 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
926 |
} else { |
} else { |
931 |
# Loop through the genomes. |
# Loop through the genomes. |
932 |
for my $genomeID (keys %{$genomeHash}) { |
for my $genomeID (keys %{$genomeHash}) { |
933 |
$loadProperty->Add("genomeIn"); |
$loadProperty->Add("genomeIn"); |
934 |
|
Trace("Generating properties for $genomeID.") if T(3); |
935 |
# Get the genome's features. The feature ID is the first field in the |
# Get the genome's features. The feature ID is the first field in the |
936 |
# tuples returned by "all_features_detailed". We use "all_features_detailed" |
# tuples returned by "all_features_detailed". We use "all_features_detailed" |
937 |
# rather than "all_features" because we want all features regardless of type. |
# rather than "all_features" because we want all features regardless of type. |
938 |
my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)}; |
my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)}; |
939 |
|
my $featureCount = 0; |
940 |
|
my $propertyCount = 0; |
941 |
# Loop through the features, creating HasProperty records. |
# Loop through the features, creating HasProperty records. |
942 |
for my $fid (@features) { |
for my $fid (@features) { |
|
$loadProperty->Add("featureIn"); |
|
943 |
# Get all attributes for this feature. We do this one feature at a time |
# Get all attributes for this feature. We do this one feature at a time |
944 |
# to insure we do not get any genome attributes. |
# to insure we do not get any genome attributes. |
945 |
my @attributeList = $fig->get_attributes($fid, '', '', ''); |
my @attributeList = $fig->get_attributes($fid, '', '', ''); |
946 |
|
if (scalar @attributeList) { |
947 |
|
$featureCount++; |
948 |
|
} |
949 |
# Loop through the attributes. |
# Loop through the attributes. |
950 |
for my $tuple (@attributeList) { |
for my $tuple (@attributeList) { |
951 |
|
$propertyCount++; |
952 |
# Get this attribute value's data. Note that we throw away the FID, |
# Get this attribute value's data. Note that we throw away the FID, |
953 |
# since it will always be the same as the value if "$fid". |
# since it will always be the same as the value if "$fid". |
954 |
my (undef, $key, $value, $url) = @{$tuple}; |
my (undef, $key, $value, $url) = @{$tuple}; |
970 |
$loadHasProperty->Put($fid, $propertyID, $url); |
$loadHasProperty->Put($fid, $propertyID, $url); |
971 |
} |
} |
972 |
} |
} |
973 |
|
# Update the statistics. |
974 |
|
Trace("$propertyCount attributes processed for $featureCount features.") if T(3); |
975 |
|
$loadHasProperty->Add("featuresIn", $featureCount); |
976 |
|
$loadHasProperty->Add("propertiesIn", $propertyCount); |
977 |
} |
} |
978 |
} |
} |
979 |
# Finish the load. |
# Finish the load. |
1017 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
1018 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1019 |
my $loadAnnotation = $self->_TableLoader('Annotation'); |
my $loadAnnotation = $self->_TableLoader('Annotation'); |
1020 |
my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation'); |
my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly); |
1021 |
my $loadSproutUser = $self->_TableLoader('SproutUser'); |
my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly); |
1022 |
my $loadUserAccess = $self->_TableLoader('UserAccess'); |
my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly); |
1023 |
my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation'); |
my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly); |
1024 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
1025 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
1026 |
} else { |
} else { |
1038 |
# Loop through the genomes. |
# Loop through the genomes. |
1039 |
for my $genomeID (sort keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
1040 |
Trace("Processing $genomeID.") if T(3); |
Trace("Processing $genomeID.") if T(3); |
|
# Get the genome's PEGs. |
|
|
my @pegs = $fig->pegs_of($genomeID); |
|
|
for my $peg (@pegs) { |
|
|
Trace("Processing $peg.") if T(4); |
|
1041 |
# Create a hash of timestamps. We use this to prevent duplicate time stamps |
# Create a hash of timestamps. We use this to prevent duplicate time stamps |
1042 |
# from showing up for a single PEG's annotations. |
# from showing up for a single PEG's annotations. |
1043 |
my %seenTimestamps = (); |
my %seenTimestamps = (); |
1044 |
# Loop through the annotations. |
# Get the genome's annotations. |
1045 |
for my $tuple ($fig->feature_annotations($peg, "raw")) { |
my @annotations = $fig->read_all_annotations($genomeID); |
1046 |
my ($fid, $timestamp, $user, $text) = @{$tuple}; |
Trace("Processing annotations.") if T(2); |
1047 |
|
for my $tuple (@annotations) { |
1048 |
|
# Get the annotation tuple. |
1049 |
|
my ($peg, $timestamp, $user, $text) = @{$tuple}; |
1050 |
# Here we fix up the annotation text. "\r" is removed, |
# Here we fix up the annotation text. "\r" is removed, |
1051 |
# and "\t" and "\n" are escaped. Note we use the "s" |
# and "\t" and "\n" are escaped. Note we use the "gs" |
1052 |
# modifier so that new-lines inside the text do not |
# modifier so that new-lines inside the text do not |
1053 |
# stop the substitution search. |
# stop the substitution search. |
1054 |
$text =~ s/\r//gs; |
$text =~ s/\r//gs; |
1061 |
# Here it's a number. We need to insure the one we use to form |
# Here it's a number. We need to insure the one we use to form |
1062 |
# the key is unique. |
# the key is unique. |
1063 |
my $keyStamp = $timestamp; |
my $keyStamp = $timestamp; |
1064 |
while ($seenTimestamps{$keyStamp}) { |
while ($seenTimestamps{"$peg:$keyStamp"}) { |
1065 |
$keyStamp++; |
$keyStamp++; |
1066 |
} |
} |
|
$seenTimestamps{$keyStamp} = 1; |
|
1067 |
my $annotationID = "$peg:$keyStamp"; |
my $annotationID = "$peg:$keyStamp"; |
1068 |
|
$seenTimestamps{$annotationID} = 1; |
1069 |
# Insure the user exists. |
# Insure the user exists. |
1070 |
if (! $users{$user}) { |
if (! $users{$user}) { |
1071 |
$loadSproutUser->Put($user, "SEED user"); |
$loadSproutUser->Put($user, "SEED user"); |
1083 |
} |
} |
1084 |
} |
} |
1085 |
} |
} |
|
} |
|
1086 |
# Finish the load. |
# Finish the load. |
1087 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
1088 |
return $retVal; |
return $retVal; |
1124 |
# Get the genome hash. |
# Get the genome hash. |
1125 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
1126 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1127 |
my $loadComesFrom = $self->_TableLoader('ComesFrom'); |
my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly); |
1128 |
my $loadSource = $self->_TableLoader('Source'); |
my $loadSource = $self->_TableLoader('Source'); |
1129 |
my $loadSourceURL = $self->_TableLoader('SourceURL'); |
my $loadSourceURL = $self->_TableLoader('SourceURL'); |
1130 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
1282 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
1283 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
1284 |
my $loadReaction = $self->_TableLoader('Reaction'); |
my $loadReaction = $self->_TableLoader('Reaction'); |
1285 |
my $loadReactionURL = $self->_TableLoader('ReactionURL'); |
my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly); |
1286 |
my $loadCompound = $self->_TableLoader('Compound'); |
my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly); |
1287 |
my $loadCompoundName = $self->_TableLoader('CompoundName'); |
my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly); |
1288 |
my $loadCompoundCAS = $self->_TableLoader('CompoundCAS'); |
my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly); |
1289 |
my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf'); |
my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly); |
1290 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
1291 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
1292 |
} else { |
} else { |
1402 |
return $retVal; |
return $retVal; |
1403 |
} |
} |
1404 |
|
|
1405 |
|
=head3 LoadSynonymData |
1406 |
|
|
1407 |
|
C<< my $stats = $spl->LoadSynonymData(); >> |
1408 |
|
|
1409 |
|
Load the synonym groups into Sprout. |
1410 |
|
|
1411 |
|
The following relations are loaded by this method. |
1412 |
|
|
1413 |
|
SynonymGroup |
1414 |
|
IsSynonymGroupFor |
1415 |
|
|
1416 |
|
The source information for these relations is taken from the C<maps_to_id> method |
1417 |
|
of the B<FIG> object. The process starts from the features, so it is possible |
1418 |
|
that there will be duplicates in the SynonymGroup load file, since the relationship |
1419 |
|
is one-to-many toward the features. The automatic sort on primary entity relations |
1420 |
|
will fix this for us. |
1421 |
|
|
1422 |
|
=over 4 |
1423 |
|
|
1424 |
|
=item RETURNS |
1425 |
|
|
1426 |
|
Returns a statistics object for the loads. |
1427 |
|
|
1428 |
|
=back |
1429 |
|
|
1430 |
|
=cut |
1431 |
|
#: Return Type $%; |
1432 |
|
# LoadSynonymData
#
# Generate the load data for the SynonymGroup and IsSynonymGroupFor relations.
#
# Call as:  my $stats = $spl->LoadSynonymData();
#
# For every genome in this object's genome hash, each feature ID is passed to
# the FIG object's maps_to_id method. When the ID that comes back differs from
# the feature ID, it is treated as a synonym group: a SynonymGroup record is
# written for it, along with an IsSynonymGroupFor record linking the group to
# the feature. Because the scan is driven by features, the same group may be
# written to the SynonymGroup loader more than once; no de-duplication is
# done here.
#
# If the loadOnly option is set, no data is generated and only the finishing
# step runs.
#
# Returns whatever _FinishAll returns (the statistics object for the loads).
#: Return Type $%;
sub LoadSynonymData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object and the hash of genomes being loaded.
    my $fig = $self->{fig};
    my $genomeHash = $self->{genomes};
    # Create the load objects for the two tables we're loading.
    my $loadSynonymGroup = $self->_TableLoader('SynonymGroup');
    my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating synonym group data.") if T(2);
        # Loop through the genomes in sorted order.
        for my $genomeID (sort keys %{$genomeHash}) {
            Trace("Processing $genomeID.") if T(3);
            # Get all of the features for this genome. Only the first field of
            # each tuple from all_features_detailed (the feature ID) is kept.
            my $featureData = $fig->all_features_detailed($genomeID);
            my @fids = map { $_->[0] } @{$featureData};
            Trace(scalar(@fids) . " features found for genome $genomeID.") if T(3);
            # Loop through the feature IDs.
            for my $fid (@fids) {
                # Get the group for this feature.
                my $synonym = $fig->maps_to_id($fid);
                # Only proceed if the synonym is a real group, that is, it
                # differs from the feature ID itself. The defined() check is
                # defensive: a substitute FIG object might return undef for an
                # unmapped feature, and that must not be stored as a group.
                if (defined $synonym && $synonym ne $fid) {
                    $loadSynonymGroup->Put($synonym);
                    $loadIsSynonymGroupFor->Put($synonym, $fid);
                }
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1470 |
|
|
1471 |
|
|
1472 |
=head2 Internal Utility Methods |
=head2 Internal Utility Methods |
1473 |
|
|
1474 |
=head3 TableLoader |
=head3 TableLoader |
1485 |
|
|
1486 |
Name of the table (relation) being loaded. |
Name of the table (relation) being loaded. |
1487 |
|
|
1488 |
|
=item ignore |
1489 |
|
|
1490 |
|
TRUE if the table should be ignored entirely, else FALSE. |
1491 |
|
|
1492 |
=item RETURN |
=item RETURN |
1493 |
|
|
1494 |
Returns an ERDBLoad object for loading the specified table. |
Returns an ERDBLoad object for loading the specified table. |
1499 |
|
|
1500 |
sub _TableLoader { |
sub _TableLoader { |
1501 |
# Get the parameters. |
# Get the parameters. |
1502 |
my ($self, $tableName, $loadOnly) = @_; |
my ($self, $tableName, $ignore) = @_; |
1503 |
# Create the load object. |
# Create the load object. |
1504 |
my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly); |
my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly, |
1505 |
|
$ignore); |
1506 |
# Cache it in the loader list. |
# Cache it in the loader list. |
1507 |
push @{$self->{loaders}}, $retVal; |
push @{$self->{loaders}}, $retVal; |
1508 |
# Return it to the caller. |
# Return it to the caller. |
1536 |
my $retVal = Stats->new(); |
my $retVal = Stats->new(); |
1537 |
# Get the loader list. |
# Get the loader list. |
1538 |
my $loadList = $self->{loaders}; |
my $loadList = $self->{loaders}; |
1539 |
|
# Create a hash to hold the statistics objects, keyed on relation name. |
1540 |
|
my %loaderHash = (); |
1541 |
# Loop through the list, finishing the loads. Note that if the finish fails, we die |
# Loop through the list, finishing the loads. Note that if the finish fails, we die |
1542 |
# ignominiously. At some future point, we want to make the loads restartable. |
# ignominiously. At some future point, we want to make the loads more restartable. |
1543 |
while (my $loader = pop @{$loadList}) { |
while (my $loader = pop @{$loadList}) { |
1544 |
# Trace the fact that we're cleaning up. |
# Get the relation name. |
1545 |
my $relName = $loader->RelName; |
my $relName = $loader->RelName; |
1546 |
|
# Check the ignore flag. |
1547 |
|
if ($loader->Ignore) { |
1548 |
|
Trace("Relation $relName not loaded.") if T(2); |
1549 |
|
} else { |
1550 |
|
# Here we really need to finish. |
1551 |
Trace("Finishing $relName.") if T(2); |
Trace("Finishing $relName.") if T(2); |
1552 |
my $stats = $loader->Finish(); |
my $stats = $loader->Finish(); |
1553 |
|
$loaderHash{$relName} = $stats; |
1554 |
|
} |
1555 |
|
} |
1556 |
|
# Now we loop through again, actually loading the tables. We want to finish before |
1557 |
|
# loading so that if something goes wrong at this point, all the load files are usable |
1558 |
|
# and we don't have to redo all that work. |
1559 |
|
for my $relName (sort keys %loaderHash) { |
1560 |
|
# Get the statistics for this relation. |
1561 |
|
my $stats = $loaderHash{$relName}; |
1562 |
|
# Check for a database load. |
1563 |
if ($self->{options}->{dbLoad}) { |
if ($self->{options}->{dbLoad}) { |
1564 |
# Here we want to use the load file just created to load the database. |
# Here we want to use the load file just created to load the database. |
1565 |
Trace("Loading relation $relName.") if T(2); |
Trace("Loading relation $relName.") if T(2); |
1570 |
$retVal->Accumulate($stats); |
$retVal->Accumulate($stats); |
1571 |
Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); |
Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); |
1572 |
} |
} |
1573 |
|
} |
1574 |
# Return the load statistics. |
# Return the load statistics. |
1575 |
return $retVal; |
return $retVal; |
1576 |
} |
} |