120 |
# an omitted access code can be defaulted to 1. |
# an omitted access code can be defaulted to 1. |
121 |
for my $genomeLine (@genomeList) { |
for my $genomeLine (@genomeList) { |
122 |
my ($genomeID, $accessCode) = split("\t", $genomeLine); |
my ($genomeID, $accessCode) = split("\t", $genomeLine); |
123 |
if (undef $accessCode) { |
if (! defined($accessCode)) { |
124 |
$accessCode = 1; |
$accessCode = 1; |
125 |
} |
} |
126 |
$genomes{$genomeID} = $accessCode; |
$genomes{$genomeID} = $accessCode; |
136 |
# We only need it if load-only is NOT specified. |
# We only need it if load-only is NOT specified. |
137 |
if (! $options->{loadOnly}) { |
if (! $options->{loadOnly}) { |
138 |
if (! defined $subsysFile || $subsysFile eq '') { |
if (! defined $subsysFile || $subsysFile eq '') { |
139 |
# Here we want all the NMPDR subsystems. First we get the whole list. |
# Here we want all the usable subsystems. First we get the whole list. |
140 |
my @subs = $fig->all_subsystems(); |
my @subs = $fig->all_subsystems(); |
141 |
# Loop through, checking for the NMPDR file. |
# Loop through, checking for usability. |
142 |
for my $sub (@subs) { |
for my $sub (@subs) { |
143 |
if (-e "$FIG_Config::data/Subsystems/$sub/NMPDR") { |
if ($fig->usable_subsystem($sub)) { |
144 |
$subsystems{$sub} = 1; |
$subsystems{$sub} = 1; |
145 |
} |
} |
146 |
} |
} |
163 |
Confess("Invalid subsystem parameter in SproutLoad constructor."); |
Confess("Invalid subsystem parameter in SproutLoad constructor."); |
164 |
} |
} |
165 |
} |
} |
166 |
|
# Go through the subsys hash again, creating the keyword list for each subsystem. |
167 |
|
for my $subsystem (keys %subsystems) { |
168 |
|
my $name = $subsystem; |
169 |
|
$name =~ s/_/ /g; |
170 |
|
my $classes = $fig->subsystem_classification($subsystem); |
171 |
|
my @classList = map { " $_" } @{$classes}; |
172 |
|
$name .= join("", @classList); |
173 |
|
$subsystems{$subsystem} = $name; |
174 |
|
} |
175 |
} |
} |
176 |
# Get the data directory from the Sprout object. |
# Get the data directory from the Sprout object. |
177 |
my ($directory) = $sprout->LoadInfo(); |
my ($directory) = $sprout->LoadInfo(); |
275 |
my $extra = join " ", @extraData; |
my $extra = join " ", @extraData; |
276 |
# Get the full taxonomy. |
# Get the full taxonomy. |
277 |
my $taxonomy = $fig->taxonomy_of($genomeID); |
my $taxonomy = $fig->taxonomy_of($genomeID); |
278 |
|
# Open the NMPDR group file for this genome. |
279 |
|
my $group; |
280 |
|
if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") && |
281 |
|
defined($group = <TMP>)) { |
282 |
|
# Clean the line ending. |
283 |
|
chomp $group; |
284 |
|
} else { |
285 |
|
# No group, so use the default. |
286 |
|
$group = $FIG_Config::otherGroup; |
287 |
|
} |
288 |
|
close TMP; |
289 |
# Output the genome record. |
# Output the genome record. |
290 |
$loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus, |
$loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus, |
291 |
$species, $extra, $taxonomy); |
$group, $species, $extra, $taxonomy); |
292 |
# Now we loop through each of the genome's contigs. |
# Now we loop through each of the genome's contigs. |
293 |
my @contigs = $fig->all_contigs($genomeID); |
my @contigs = $fig->all_contigs($genomeID); |
294 |
for my $contigID (@contigs) { |
for my $contigID (@contigs) { |
360 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
361 |
# Get the genome hash. |
# Get the genome hash. |
362 |
my $genomeFilter = $self->{genomes}; |
my $genomeFilter = $self->{genomes}; |
363 |
my $genomeCount = (keys %{$genomeFilter}); |
# Set up an ID counter for the PCHs. |
364 |
my $featureCount = $genomeCount * 4000; |
my $pchID = 0; |
365 |
# Start the loads. |
# Start the loads. |
366 |
my $loadCoupling = $self->_TableLoader('Coupling'); |
my $loadCoupling = $self->_TableLoader('Coupling'); |
367 |
my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly); |
my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly); |
395 |
for my $coupleData (@couplings) { |
for my $coupleData (@couplings) { |
396 |
my ($peg2, $score) = @{$coupleData}; |
my ($peg2, $score) = @{$coupleData}; |
397 |
# Compute the coupling ID. |
# Compute the coupling ID. |
398 |
my $coupleID = Sprout::CouplingID($peg1, $peg2); |
my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2); |
399 |
if (! exists $dupHash{$coupleID}) { |
if (! exists $dupHash{$coupleID}) { |
400 |
$loadCoupling->Add("couplingIn"); |
$loadCoupling->Add("couplingIn"); |
401 |
# Here we have a new coupling to store in the load files. |
# Here we have a new coupling to store in the load files. |
431 |
} |
} |
432 |
} |
} |
433 |
for my $evidenceID (keys %evidenceMap) { |
for my $evidenceID (keys %evidenceMap) { |
434 |
|
# Get the ID for this evidence. |
435 |
|
$pchID++; |
436 |
# Create the evidence record. |
# Create the evidence record. |
437 |
my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}}; |
my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}}; |
438 |
$loadPCH->Put($evidenceID, $usage); |
$loadPCH->Put($pchID, $usage); |
439 |
# Connect it to the coupling. |
# Connect it to the coupling. |
440 |
$loadIsEvidencedBy->Put($coupleID, $evidenceID); |
$loadIsEvidencedBy->Put($coupleID, $pchID); |
441 |
# Connect it to the features. |
# Connect it to the features. |
442 |
$loadUsesAsEvidence->Put($evidenceID, $peg3, 1); |
$loadUsesAsEvidence->Put($pchID, $peg3, 1); |
443 |
$loadUsesAsEvidence->Put($evidenceID, $peg4, 2); |
$loadUsesAsEvidence->Put($pchID, $peg4, 2); |
444 |
} |
} |
445 |
} |
} |
446 |
} |
} |
469 |
FeatureUpstream |
FeatureUpstream |
470 |
IsLocatedIn |
IsLocatedIn |
471 |
HasFeature |
HasFeature |
472 |
|
HasRoleInSubsystem |
473 |
|
|
474 |
=over 4 |
=over 4 |
475 |
|
|
484 |
sub LoadFeatureData { |
sub LoadFeatureData { |
485 |
# Get this object instance. |
# Get this object instance. |
486 |
my ($self) = @_; |
my ($self) = @_; |
487 |
# Get the FIG object. |
# Get the FIG and Sprout objects. |
488 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
489 |
|
my $sprout = $self->{sprout}; |
490 |
# Get the table of genome IDs. |
# Get the table of genome IDs. |
491 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
492 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
496 |
my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
497 |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); |
498 |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); |
499 |
my $loadHasFeature = $self->_TableLoader('HasFeature'); |
my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly); |
500 |
|
my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly); |
501 |
|
# Get the subsystem hash. |
502 |
|
my $subHash = $self->{subsystems}; |
503 |
# Get the maximum sequence size. We need this later for splitting up the |
# Get the maximum sequence size. We need this later for splitting up the |
504 |
# locations. |
# locations. |
505 |
my $chunkSize = $self->{sprout}->MaxSegment(); |
my $chunkSize = $self->{sprout}->MaxSegment(); |
513 |
$loadFeature->Add("genomeIn"); |
$loadFeature->Add("genomeIn"); |
514 |
# Get the feature list for this genome. |
# Get the feature list for this genome. |
515 |
my $features = $fig->all_features_detailed($genomeID); |
my $features = $fig->all_features_detailed($genomeID); |
516 |
|
# Sort and count the list. |
517 |
|
my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features}; |
518 |
|
my $count = scalar @featureTuples; |
519 |
|
Trace("$count features found for genome $genomeID.") if T(3); |
520 |
|
# Set up for our duplicate-feature check. |
521 |
|
my $oldFeatureID = ""; |
522 |
# Loop through the features. |
# Loop through the features. |
523 |
for my $featureData (@{$features}) { |
for my $featureTuple (@featureTuples) { |
|
$loadFeature->Add("featureIn"); |
|
524 |
# Split the tuple. |
# Split the tuple. |
525 |
my ($featureID, $locations, undef, $type) = @{$featureData}; |
my ($featureID, $locations, undef, $type) = @{$featureTuple}; |
526 |
# Create the feature record. |
# Check for duplicates. |
527 |
$loadFeature->Put($featureID, 1, $type); |
if ($featureID eq $oldFeatureID) { |
528 |
# Link it to the parent genome. |
Trace("Duplicate feature $featureID found.") if T(1); |
529 |
|
} else { |
530 |
|
$oldFeatureID = $featureID; |
531 |
|
# Count this feature. |
532 |
|
$loadFeature->Add("featureIn"); |
533 |
|
# Get the functional assignment. |
534 |
|
my $assignment = $fig->function_of($featureID); |
535 |
|
# Begin building the keywords. |
536 |
|
my $keywords = "$assignment $genomeID"; |
537 |
|
# Link this feature to the parent genome. |
538 |
$loadHasFeature->Put($genomeID, $featureID, $type); |
$loadHasFeature->Put($genomeID, $featureID, $type); |
539 |
# Create the aliases. |
# Create the aliases. |
540 |
for my $alias ($fig->feature_aliases($featureID)) { |
for my $alias ($fig->feature_aliases($featureID)) { |
541 |
$loadFeatureAlias->Put($featureID, $alias); |
$loadFeatureAlias->Put($featureID, $alias); |
542 |
|
$keywords .= " $alias"; |
543 |
} |
} |
544 |
# Get the links. |
# Get the links. |
545 |
my @links = $fig->fid_links($featureID); |
my @links = $fig->fid_links($featureID); |
559 |
$loadFeatureUpstream->Put($featureID, $upstream); |
$loadFeatureUpstream->Put($featureID, $upstream); |
560 |
} |
} |
561 |
} |
} |
562 |
|
# Now we need to find the subsystems this feature participates in. |
563 |
|
# We also add the subsystems to the keyword list. Before we do that, |
564 |
|
# we must convert underscores to spaces and tack on the classifications. |
565 |
|
my @subsystems = $fig->peg_to_subsystems($featureID); |
566 |
|
for my $subsystem (@subsystems) { |
567 |
|
# Only proceed if we like this subsystem. |
568 |
|
if (exists $subHash->{$subsystem}) { |
569 |
|
# Store the has-role link. |
570 |
|
$loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type); |
571 |
|
# Save the subsystem's keyword data. |
572 |
|
my $subKeywords = $subHash->{$subsystem}; |
573 |
|
$keywords .= " $subKeywords"; |
574 |
|
} |
575 |
|
} |
576 |
|
# The final task is to add virulence and essentiality attributes. |
577 |
|
if ($fig->virulent($featureID)) { |
578 |
|
$keywords .= " virulent"; |
579 |
|
} |
580 |
|
if ($fig->essential($featureID)) { |
581 |
|
$keywords .= " essential"; |
582 |
|
} |
583 |
|
# Clean the keyword list. |
584 |
|
my $cleanWords = $sprout->CleanKeywords($keywords); |
585 |
|
# Create the feature record. |
586 |
|
$loadFeature->Put($featureID, 1, $type, $assignment, $cleanWords); |
587 |
# This part is the roughest. We need to relate the features to contig |
# This part is the roughest. We need to relate the features to contig |
588 |
# locations, and the locations must be split so that none of them exceed |
# locations, and the locations must be split so that none of them exceed |
589 |
# the maximum segment size. This simplifies the genes_in_region processing |
# the maximum segment size. This simplifies the genes_in_region processing |
613 |
} |
} |
614 |
} |
} |
615 |
} |
} |
616 |
|
} |
617 |
# Finish the loads. |
# Finish the loads. |
618 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
619 |
return $retVal; |
return $retVal; |
662 |
Trace("Processing features for genome $genomeID.") if T(3); |
Trace("Processing features for genome $genomeID.") if T(3); |
663 |
# Get the feature list for this genome. |
# Get the feature list for this genome. |
664 |
my $features = $fig->all_features_detailed($genomeID); |
my $features = $fig->all_features_detailed($genomeID); |
665 |
|
# Count the BBHs we find. |
666 |
|
my $bbhCount = 0; |
667 |
# Loop through the features. |
# Loop through the features. |
668 |
for my $featureData (@{$features}) { |
for my $featureData (@{$features}) { |
669 |
# Split the tuple. |
# Split the tuple. |
679 |
if ($genomeHash->{$targetGenomeID}) { |
if ($genomeHash->{$targetGenomeID}) { |
680 |
$loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID, |
$loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID, |
681 |
$score); |
$score); |
682 |
|
$bbhCount++; |
683 |
} |
} |
684 |
} |
} |
685 |
} |
} |
686 |
|
Trace("$bbhCount BBHs found for $genomeID.") if T(3); |
687 |
} |
} |
688 |
} |
} |
689 |
# Finish the loads. |
# Finish the loads. |
706 |
The following relations are loaded by this method. |
The following relations are loaded by this method. |
707 |
|
|
708 |
Subsystem |
Subsystem |
709 |
|
SubsystemClass |
710 |
Role |
Role |
711 |
RoleEC |
RoleEC |
712 |
SSCell |
SSCell |
769 |
my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly); |
my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly); |
770 |
my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly); |
my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly); |
771 |
my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly); |
my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly); |
772 |
|
my $loadSubsystemClass = $self->_TableLoader('SubsystemClass', $self->PrimaryOnly); |
773 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
774 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
775 |
} else { |
} else { |
795 |
my $curator = $sub->get_curator(); |
my $curator = $sub->get_curator(); |
796 |
my $notes = $sub->get_notes(); |
my $notes = $sub->get_notes(); |
797 |
$loadSubsystem->Put($subsysID, $curator, $notes); |
$loadSubsystem->Put($subsysID, $curator, $notes); |
798 |
|
# Now for the classification string. This comes back as a list |
799 |
|
# reference and we convert it to a space-delimited string. |
800 |
|
my $classList = $fig->subsystem_classification($subsysID); |
801 |
|
my $classString = join(" ", grep { $_ } @$classList); |
802 |
|
$loadSubsystemClass->Put($subsysID, $classString); |
803 |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
804 |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
805 |
# Connect to this role. |
# Connect to this role. |
864 |
if ($pegCount > 0) { |
if ($pegCount > 0) { |
865 |
Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3); |
Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3); |
866 |
$loadParticipatesIn->Put($genomeID, $subsysID, $variantCode); |
$loadParticipatesIn->Put($genomeID, $subsysID, $variantCode); |
|
# Partition the PEGs found into clusters. |
|
|
my @clusters = $fig->compute_clusters(\@pegsFound, $sub); |
|
867 |
# Create a hash mapping PEG IDs to cluster numbers. |
# Create a hash mapping PEG IDs to cluster numbers. |
868 |
# We default to -1 for all of them. |
# We default to -1 for all of them. |
869 |
my %clusterOf = map { $_ => -1 } @pegsFound; |
my %clusterOf = map { $_ => -1 } @pegsFound; |
870 |
|
# Partition the PEGs found into clusters. |
871 |
|
my @clusters = $fig->compute_clusters([keys %clusterOf], $sub); |
872 |
for (my $i = 0; $i <= $#clusters; $i++) { |
for (my $i = 0; $i <= $#clusters; $i++) { |
873 |
my $subList = $clusters[$i]; |
my $subList = $clusters[$i]; |
874 |
for my $peg (@{$subList}) { |
for my $peg (@{$subList}) { |
916 |
} |
} |
917 |
} |
} |
918 |
} |
} |
919 |
|
} |
920 |
# Now we loop through the diagrams. We need to create the diagram records |
# Now we loop through the diagrams. We need to create the diagram records |
921 |
# and link each diagram to its roles. Note that only roles which occur |
# and link each diagram to its roles. Note that only roles which occur |
922 |
# in subsystems (and therefore appear in the %ecToRoles hash) are |
# in subsystems (and therefore appear in the %ecToRoles hash) are |
950 |
} |
} |
951 |
} |
} |
952 |
} |
} |
|
} |
|
953 |
# Finish the load. |
# Finish the load. |
954 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
955 |
return $retVal; |
return $retVal; |
1002 |
my %propertyKeys = (); |
my %propertyKeys = (); |
1003 |
my $nextID = 1; |
my $nextID = 1; |
1004 |
# Loop through the genomes. |
# Loop through the genomes. |
1005 |
for my $genomeID (keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
1006 |
$loadProperty->Add("genomeIn"); |
$loadProperty->Add("genomeIn"); |
1007 |
Trace("Generating properties for $genomeID.") if T(3); |
Trace("Generating properties for $genomeID.") if T(3); |
1008 |
# Get the genome's features. The feature ID is the first field in the |
# Get the genome's features. The feature ID is the first field in the |
1016 |
# Get all attributes for this feature. We do this one feature at a time |
# Get all attributes for this feature. We do this one feature at a time |
1017 |
# to insure we do not get any genome attributes. |
# to insure we do not get any genome attributes. |
1018 |
my @attributeList = $fig->get_attributes($fid, '', '', ''); |
my @attributeList = $fig->get_attributes($fid, '', '', ''); |
1019 |
|
# Add essentiality and virulence attributes. |
1020 |
|
if ($fig->essential($fid)) { |
1021 |
|
push @attributeList, [$fid, 'essential', 1, '']; |
1022 |
|
} |
1023 |
|
if ($fig->virulent($fid)) { |
1024 |
|
push @attributeList, [$fid, 'virulent', 1, '']; |
1025 |
|
} |
1026 |
if (scalar @attributeList) { |
if (scalar @attributeList) { |
1027 |
$featureCount++; |
$featureCount++; |
1028 |
} |
} |
1128 |
# Get the annotation tuple. |
# Get the annotation tuple. |
1129 |
my ($peg, $timestamp, $user, $text) = @{$tuple}; |
my ($peg, $timestamp, $user, $text) = @{$tuple}; |
1130 |
# Here we fix up the annotation text. "\r" is removed, |
# Here we fix up the annotation text. "\r" is removed, |
1131 |
# and "\t" and "\n" are escaped. Note we use the "s" |
# and "\t" and "\n" are escaped. Note we use the "gs" |
1132 |
# modifier so that new-lines inside the text do not |
# modifier so that new-lines inside the text do not |
1133 |
# stop the substitution search. |
# stop the substitution search. |
1134 |
$text =~ s/\r//gs; |
$text =~ s/\r//gs; |
1291 |
} else { |
} else { |
1292 |
Trace("Generating external data.") if T(2); |
Trace("Generating external data.") if T(2); |
1293 |
# We loop through the files one at a time. First, the organism file. |
# We loop through the files one at a time. First, the organism file. |
1294 |
Open(\*ORGS, "<$FIG_Config::global/ext_org.table"); |
Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |"); |
1295 |
my $orgLine; |
my $orgLine; |
1296 |
while (defined($orgLine = <ORGS>)) { |
while (defined($orgLine = <ORGS>)) { |
1297 |
# Clean the input line. |
# Clean the input line. |
1303 |
close ORGS; |
close ORGS; |
1304 |
# Now the function file. |
# Now the function file. |
1305 |
my $funcLine; |
my $funcLine; |
1306 |
Open(\*FUNCS, "<$FIG_Config::global/ext_func.table"); |
Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |"); |
1307 |
while (defined($funcLine = <FUNCS>)) { |
while (defined($funcLine = <FUNCS>)) { |
1308 |
# Clean the line ending. |
# Clean the line ending. |
1309 |
chomp $funcLine; |
chomp $funcLine; |
1435 |
|
|
1436 |
GenomeGroups |
GenomeGroups |
1437 |
|
|
1438 |
There is no direct support for genome groups in FIG, so we access the SEED |
Currently, we do not use groups. We used to use them for NMPDR groups, |
1439 |
|
but there is no direct support for genome groups in FIG, so we access the SEED |
1440 |
files directly. |
files directly. |
1441 |
|
|
1442 |
=over 4 |
=over 4 |
1462 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
1463 |
} else { |
} else { |
1464 |
Trace("Generating group data.") if T(2); |
Trace("Generating group data.") if T(2); |
1465 |
|
# Currently there are no groups. |
1466 |
|
} |
1467 |
|
# Finish the load. |
1468 |
|
my $retVal = $self->_FinishAll(); |
1469 |
|
return $retVal; |
1470 |
|
} |
1471 |
|
|
1472 |
|
=head3 LoadSynonymData |
1473 |
|
|
1474 |
|
C<< my $stats = $spl->LoadSynonymData(); >> |
1475 |
|
|
1476 |
|
Load the synonym groups into Sprout. |
1477 |
|
|
1478 |
|
The following relations are loaded by this method. |
1479 |
|
|
1480 |
|
SynonymGroup |
1481 |
|
IsSynonymGroupFor |
1482 |
|
|
1483 |
|
The source information for these relations is taken from the C<maps_to_id> method |
1484 |
|
of the B<FIG> object. Unfortunately, to make this work, we need to use direct |
1485 |
|
SQL against the FIG database. |
1486 |
|
|
1487 |
|
=over 4 |
1488 |
|
|
1489 |
|
=item RETURNS |
1490 |
|
|
1491 |
|
Returns a statistics object for the loads. |
1492 |
|
|
1493 |
|
=back |
1494 |
|
|
1495 |
|
=cut |
1496 |
|
#: Return Type $%; |
1497 |
|
sub LoadSynonymData { |
1498 |
|
# Get this object instance. |
1499 |
|
my ($self) = @_; |
1500 |
|
# Get the FIG object. |
1501 |
|
my $fig = $self->{fig}; |
1502 |
|
# Get the genome hash. |
1503 |
|
my $genomeHash = $self->{genomes}; |
1504 |
|
# Create a load object for the table we're loading. |
1505 |
|
my $loadSynonymGroup = $self->_TableLoader('SynonymGroup'); |
1506 |
|
my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor'); |
1507 |
|
if ($self->{options}->{loadOnly}) { |
1508 |
|
Trace("Loading from existing files.") if T(2); |
1509 |
|
} else { |
1510 |
|
Trace("Generating synonym group data.") if T(2); |
1511 |
|
# Get the database handle. |
1512 |
|
my $dbh = $fig->db_handle(); |
1513 |
|
# Ask for the synonyms. |
1514 |
|
my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to"); |
1515 |
|
my $result = $sth->execute(); |
1516 |
|
if (! defined($result)) { |
1517 |
|
Confess("Database error in Synonym load: " . $sth->errstr()); |
1518 |
|
} else { |
1519 |
|
# Remember the current synonym. |
1520 |
|
my $current_syn = ""; |
1521 |
|
# Count the features. |
1522 |
|
my $featureCount = 0; |
1523 |
|
# Loop through the synonym/peg pairs. |
1524 |
|
while (my @row = $sth->fetchrow()) { |
1525 |
|
# Get the synonym ID and feature ID. |
1526 |
|
my ($syn_id, $peg) = @row; |
1527 |
|
# Insure it's for one of our genomes. |
1528 |
|
my $genomeID = FIG::genome_of($peg); |
1529 |
|
if (exists $genomeHash->{$genomeID}) { |
1530 |
|
# Verify the synonym. |
1531 |
|
if ($syn_id ne $current_syn) { |
1532 |
|
# It's new, so put it in the group table. |
1533 |
|
$loadSynonymGroup->Put($syn_id); |
1534 |
|
$current_syn = $syn_id; |
1535 |
|
} |
1536 |
|
# Connect the synonym to the peg. |
1537 |
|
$loadIsSynonymGroupFor->Put($syn_id, $peg); |
1538 |
|
# Count this feature. |
1539 |
|
$featureCount++; |
1540 |
|
if ($featureCount % 1000 == 0) { |
1541 |
|
Trace("$featureCount features processed.") if T(3); |
1542 |
|
} |
1543 |
|
} |
1544 |
|
} |
1545 |
|
} |
1546 |
|
} |
1547 |
|
# Finish the load. |
1548 |
|
my $retVal = $self->_FinishAll(); |
1549 |
|
return $retVal; |
1550 |
|
} |
1551 |
|
|
1552 |
|
=head3 LoadFamilyData |
1553 |
|
|
1554 |
|
C<< my $stats = $spl->LoadFamilyData(); >> |
1555 |
|
|
1556 |
|
Load the protein families into Sprout. |
1557 |
|
|
1558 |
|
The following relations are loaded by this method. |
1559 |
|
|
1560 |
|
Family |
1561 |
|
IsFamilyForFeature |
1562 |
|
|
1563 |
|
The source information for these relations is taken from the C<families_for_protein>, |
1564 |
|
C<family_function>, and C<sz_family> methods of the B<FIG> object. |
1565 |
|
|
1566 |
|
=over 4 |
1567 |
|
|
1568 |
|
=item RETURNS |
1569 |
|
|
1570 |
|
Returns a statistics object for the loads. |
1571 |
|
|
1572 |
|
=back |
1573 |
|
|
1574 |
|
=cut |
1575 |
|
#: Return Type $%; |
1576 |
|
sub LoadFamilyData { |
1577 |
|
# Get this object instance. |
1578 |
|
my ($self) = @_; |
1579 |
|
# Get the FIG object. |
1580 |
|
my $fig = $self->{fig}; |
1581 |
|
# Get the genome hash. |
1582 |
|
my $genomeHash = $self->{genomes}; |
1583 |
|
# Create load objects for the tables we're loading. |
1584 |
|
my $loadFamily = $self->_TableLoader('Family'); |
1585 |
|
my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature'); |
1586 |
|
if ($self->{options}->{loadOnly}) { |
1587 |
|
Trace("Loading from existing files.") if T(2); |
1588 |
|
} else { |
1589 |
|
Trace("Generating family data.") if T(2); |
1590 |
|
# Create a hash for the family IDs. |
1591 |
|
my %familyHash = (); |
1592 |
# Loop through the genomes. |
# Loop through the genomes. |
1593 |
my $line; |
for my $genomeID (sort keys %{$genomeHash}) { |
1594 |
for my $genomeID (keys %{$genomeHash}) { |
Trace("Processing features for $genomeID.") if T(2); |
1595 |
Trace("Processing $genomeID.") if T(3); |
# Loop through this genome's PEGs. |
1596 |
# Open the NMPDR group file for this genome. |
for my $fid ($fig->all_features($genomeID, "peg")) { |
1597 |
if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") && |
$loadIsFamilyForFeature->Add("features", 1); |
1598 |
defined($line = <TMP>)) { |
# Get this feature's families. |
1599 |
# Clean the line ending. |
my @families = $fig->families_for_protein($fid); |
1600 |
chomp $line; |
# Loop through the families, connecting them to the feature. |
1601 |
# Add the group to the table. Note that there can only be one group |
for my $family (@families) { |
1602 |
# per genome. |
$loadIsFamilyForFeature->Put($family, $fid); |
1603 |
$loadGenomeGroups->Put($genomeID, $line); |
# If this is a new family, create a record for it. |
1604 |
|
if (! exists $familyHash{$family}) { |
1605 |
|
$familyHash{$family} = 1; |
1606 |
|
$loadFamily->Add("families", 1); |
1607 |
|
my $size = $fig->sz_family($family); |
1608 |
|
my $func = $fig->family_function($family); |
1609 |
|
$loadFamily->Put($family, $size, $func); |
1610 |
|
} |
1611 |
|
} |
1612 |
} |
} |
|
close TMP; |
|
1613 |
} |
} |
1614 |
} |
} |
1615 |
# Finish the load. |
# Finish the load. |
1617 |
return $retVal; |
return $retVal; |
1618 |
} |
} |
1619 |
|
|
1620 |
|
|
1621 |
|
|
1622 |
=head2 Internal Utility Methods |
=head2 Internal Utility Methods |
1623 |
|
|
1624 |
=head3 TableLoader |
=head3 TableLoader |
1686 |
my $retVal = Stats->new(); |
my $retVal = Stats->new(); |
1687 |
# Get the loader list. |
# Get the loader list. |
1688 |
my $loadList = $self->{loaders}; |
my $loadList = $self->{loaders}; |
1689 |
|
# Create a hash to hold the statistics objects, keyed on relation name. |
1690 |
|
my %loaderHash = (); |
1691 |
# Loop through the list, finishing the loads. Note that if the finish fails, we die |
# Loop through the list, finishing the loads. Note that if the finish fails, we die |
1692 |
# ignominiously. At some future point, we want to make the loads restartable. |
# ignominiously. At some future point, we want to make the loads more restartable. |
1693 |
while (my $loader = pop @{$loadList}) { |
while (my $loader = pop @{$loadList}) { |
1694 |
# Get the relation name. |
# Get the relation name. |
1695 |
my $relName = $loader->RelName; |
my $relName = $loader->RelName; |
1700 |
# Here we really need to finish. |
# Here we really need to finish. |
1701 |
Trace("Finishing $relName.") if T(2); |
Trace("Finishing $relName.") if T(2); |
1702 |
my $stats = $loader->Finish(); |
my $stats = $loader->Finish(); |
1703 |
|
$loaderHash{$relName} = $stats; |
1704 |
|
} |
1705 |
|
} |
1706 |
|
# Now we loop through again, actually loading the tables. We want to finish before |
1707 |
|
# loading so that if something goes wrong at this point, all the load files are usable |
1708 |
|
# and we don't have to redo all that work. |
1709 |
|
for my $relName (sort keys %loaderHash) { |
1710 |
|
# Get the statistics for this relation. |
1711 |
|
my $stats = $loaderHash{$relName}; |
1712 |
|
# Check for a database load. |
1713 |
if ($self->{options}->{dbLoad}) { |
if ($self->{options}->{dbLoad}) { |
1714 |
# Here we want to use the load file just created to load the database. |
# Here we want to use the load file just created to load the database. |
1715 |
Trace("Loading relation $relName.") if T(2); |
Trace("Loading relation $relName.") if T(2); |
1720 |
$retVal->Accumulate($stats); |
$retVal->Accumulate($stats); |
1721 |
Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); |
Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); |
1722 |
} |
} |
|
} |
|
1723 |
# Return the load statistics. |
# Return the load statistics. |
1724 |
return $retVal; |
return $retVal; |
1725 |
} |
} |