136 |
# We only need it if load-only is NOT specified. |
# We only need it if load-only is NOT specified. |
137 |
if (! $options->{loadOnly}) { |
if (! $options->{loadOnly}) { |
138 |
if (! defined $subsysFile || $subsysFile eq '') { |
if (! defined $subsysFile || $subsysFile eq '') { |
139 |
# Here we want all the NMPDR subsystems. First we get the whole list. |
# Here we want all the usable subsystems. First we get the whole list. |
140 |
my @subs = $fig->all_subsystems(); |
my @subs = $fig->all_subsystems(); |
141 |
# Loop through, checking for the NMPDR file. |
# Loop through, checking for usability. |
142 |
for my $sub (@subs) { |
for my $sub (@subs) { |
143 |
if (-e "$FIG_Config::data/Subsystems/$sub/NMPDR") { |
if ($fig->usable_subsystem($sub)) { |
144 |
$subsystems{$sub} = 1; |
$subsystems{$sub} = 1; |
145 |
} |
} |
146 |
} |
} |
340 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
341 |
# Get the genome hash. |
# Get the genome hash. |
342 |
my $genomeFilter = $self->{genomes}; |
my $genomeFilter = $self->{genomes}; |
343 |
my $genomeCount = (keys %{$genomeFilter}); |
# Set up an ID counter for the PCHs. |
344 |
my $featureCount = $genomeCount * 4000; |
my $pchID = 0; |
345 |
# Start the loads. |
# Start the loads. |
346 |
my $loadCoupling = $self->_TableLoader('Coupling'); |
my $loadCoupling = $self->_TableLoader('Coupling'); |
347 |
my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly); |
my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly); |
375 |
for my $coupleData (@couplings) { |
for my $coupleData (@couplings) { |
376 |
my ($peg2, $score) = @{$coupleData}; |
my ($peg2, $score) = @{$coupleData}; |
377 |
# Compute the coupling ID. |
# Compute the coupling ID. |
378 |
my $coupleID = Sprout::CouplingID($peg1, $peg2); |
my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2); |
379 |
if (! exists $dupHash{$coupleID}) { |
if (! exists $dupHash{$coupleID}) { |
380 |
$loadCoupling->Add("couplingIn"); |
$loadCoupling->Add("couplingIn"); |
381 |
# Here we have a new coupling to store in the load files. |
# Here we have a new coupling to store in the load files. |
411 |
} |
} |
412 |
} |
} |
413 |
for my $evidenceID (keys %evidenceMap) { |
for my $evidenceID (keys %evidenceMap) { |
414 |
|
# Get the ID for this evidence. |
415 |
|
$pchID++; |
416 |
# Create the evidence record. |
# Create the evidence record. |
417 |
my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}}; |
my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}}; |
418 |
$loadPCH->Put($evidenceID, $usage); |
$loadPCH->Put($pchID, $usage); |
419 |
# Connect it to the coupling. |
# Connect it to the coupling. |
420 |
$loadIsEvidencedBy->Put($coupleID, $evidenceID); |
$loadIsEvidencedBy->Put($coupleID, $pchID); |
421 |
# Connect it to the features. |
# Connect it to the features. |
422 |
$loadUsesAsEvidence->Put($evidenceID, $peg3, 1); |
$loadUsesAsEvidence->Put($pchID, $peg3, 1); |
423 |
$loadUsesAsEvidence->Put($evidenceID, $peg4, 2); |
$loadUsesAsEvidence->Put($pchID, $peg4, 2); |
424 |
} |
} |
425 |
} |
} |
426 |
} |
} |
488 |
$loadFeature->Add("genomeIn"); |
$loadFeature->Add("genomeIn"); |
489 |
# Get the feature list for this genome. |
# Get the feature list for this genome. |
490 |
my $features = $fig->all_features_detailed($genomeID); |
my $features = $fig->all_features_detailed($genomeID); |
491 |
|
# Sort and count the list. |
492 |
|
my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features}; |
493 |
|
my $count = scalar @featureTuples; |
494 |
|
Trace("$count features found for genome $genomeID.") if T(3); |
495 |
|
# Set up for our duplicate-feature check. |
496 |
|
my $oldFeatureID = ""; |
497 |
# Loop through the features. |
# Loop through the features. |
498 |
for my $featureData (@{$features}) { |
for my $featureTuple (@featureTuples) { |
|
$loadFeature->Add("featureIn"); |
|
499 |
# Split the tuple. |
# Split the tuple. |
500 |
my ($featureID, $locations, undef, $type) = @{$featureData}; |
my ($featureID, $locations, undef, $type) = @{$featureTuple}; |
501 |
|
# Check for duplicates. |
502 |
|
if ($featureID eq $oldFeatureID) { |
503 |
|
Trace("Duplicate feature $featureID found.") if T(1); |
504 |
|
} else { |
505 |
|
$oldFeatureID = $featureID; |
506 |
|
# Count this feature. |
507 |
|
$loadFeature->Add("featureIn"); |
508 |
# Create the feature record. |
# Create the feature record. |
509 |
$loadFeature->Put($featureID, 1, $type); |
$loadFeature->Put($featureID, 1, $type); |
510 |
# Link it to the parent genome. |
# Link it to the parent genome. |
560 |
} |
} |
561 |
} |
} |
562 |
} |
} |
563 |
|
} |
564 |
# Finish the loads. |
# Finish the loads. |
565 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
566 |
return $retVal; |
return $retVal; |
858 |
} |
} |
859 |
} |
} |
860 |
} |
} |
861 |
|
} |
862 |
# Now we loop through the diagrams. We need to create the diagram records |
# Now we loop through the diagrams. We need to create the diagram records |
863 |
# and link each diagram to its roles. Note that only roles which occur |
# and link each diagram to its roles. Note that only roles which occur |
864 |
# in subsystems (and therefore appear in the %ecToRoles hash) are |
# in subsystems (and therefore appear in the %ecToRoles hash) are |
892 |
} |
} |
893 |
} |
} |
894 |
} |
} |
|
} |
|
895 |
# Finish the load. |
# Finish the load. |
896 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
897 |
return $retVal; |
return $retVal; |
1226 |
} else { |
} else { |
1227 |
Trace("Generating external data.") if T(2); |
Trace("Generating external data.") if T(2); |
1228 |
# We loop through the files one at a time. First, the organism file. |
# We loop through the files one at a time. First, the organism file. |
1229 |
Open(\*ORGS, "<$FIG_Config::global/ext_org.table"); |
Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |"); |
1230 |
my $orgLine; |
my $orgLine; |
1231 |
while (defined($orgLine = <ORGS>)) { |
while (defined($orgLine = <ORGS>)) { |
1232 |
# Clean the input line. |
# Clean the input line. |
1238 |
close ORGS; |
close ORGS; |
1239 |
# Now the function file. |
# Now the function file. |
1240 |
my $funcLine; |
my $funcLine; |
1241 |
Open(\*FUNCS, "<$FIG_Config::global/ext_func.table"); |
Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |"); |
1242 |
while (defined($funcLine = <FUNCS>)) { |
while (defined($funcLine = <FUNCS>)) { |
1243 |
# Clean the line ending. |
# Clean the line ending. |
1244 |
chomp $funcLine; |
chomp $funcLine; |
1429 |
IsSynonymGroupFor |
IsSynonymGroupFor |
1430 |
|
|
1431 |
The source information for these relations is taken from the C<maps_to_id> method |
The source information for these relations is taken from the C<maps_to_id> method |
1432 |
of the B<FIG> object. The process starts from the features, so it is possible |
of the B<FIG> object. Unfortunately, to make this work, we need to use direct |
1433 |
that there will be duplicates in the SynonymGroup load file, since the relationship |
SQL against the FIG database. |
|
is one-to-many toward the features. The automatic sort on primary entity relations |
|
|
will fix this for us. |
|
1434 |
|
|
1435 |
=over 4 |
=over 4 |
1436 |
|
|
1456 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
1457 |
} else { |
} else { |
1458 |
Trace("Generating synonym group data.") if T(2); |
Trace("Generating synonym group data.") if T(2); |
1459 |
|
# Get the database handle. |
1460 |
|
my $dbh = $fig->db_handle(); |
1461 |
|
# Ask for the synonyms. |
1462 |
|
my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to"); |
1463 |
|
my $result = $sth->execute(); |
1464 |
|
if (! defined($result)) { |
1465 |
|
Confess("Database error in Synonym load: " . $sth->errstr()); |
1466 |
|
} else { |
1467 |
|
# Remember the current synonym. |
1468 |
|
my $current_syn = ""; |
1469 |
|
# Count the features. |
1470 |
|
my $featureCount = 0; |
1471 |
|
# Loop through the synonym/peg pairs. |
1472 |
|
while (my @row = $sth->fetchrow()) { |
1473 |
|
# Get the synonym ID and feature ID. |
1474 |
|
my ($syn_id, $peg) = @row; |
1475 |
|
# Ensure it's for one of our genomes. |
1476 |
|
my $genomeID = FIG::genome_of($peg); |
1477 |
|
if (exists $genomeHash->{$genomeID}) { |
1478 |
|
# Verify the synonym. |
1479 |
|
if ($syn_id ne $current_syn) { |
1480 |
|
# It's new, so put it in the group table. |
1481 |
|
$loadSynonymGroup->Put($syn_id); |
1482 |
|
$current_syn = $syn_id; |
1483 |
|
} |
1484 |
|
# Connect the synonym to the peg. |
1485 |
|
$loadIsSynonymGroupFor->Put($syn_id, $peg); |
1486 |
|
# Count this feature. |
1487 |
|
$featureCount++; |
1488 |
|
if ($featureCount % 1000 == 0) { |
1489 |
|
Trace("$featureCount features processed.") if T(3); |
1490 |
|
} |
1491 |
|
} |
1492 |
|
} |
1493 |
|
} |
1494 |
|
} |
1495 |
|
# Finish the load. |
1496 |
|
my $retVal = $self->_FinishAll(); |
1497 |
|
return $retVal; |
1498 |
|
} |
1499 |
|
|
1500 |
|
=head3 LoadFamilyData |
1501 |
|
|
1502 |
|
C<< my $stats = $spl->LoadFamilyData(); >> |
1503 |
|
|
1504 |
|
Load the protein families into Sprout. |
1505 |
|
|
1506 |
|
The following relations are loaded by this method. |
1507 |
|
|
1508 |
|
Family |
1509 |
|
ContainsFeature |
1510 |
|
|
1511 |
|
The source information for these relations is taken from the C<families_for_protein>, |
1512 |
|
C<family_function>, and C<sz_family> methods of the B<FIG> object. |
1513 |
|
|
1514 |
|
=over 4 |
1515 |
|
|
1516 |
|
=item RETURNS |
1517 |
|
|
1518 |
|
Returns a statistics object for the loads. |
1519 |
|
|
1520 |
|
=back |
1521 |
|
|
1522 |
|
=cut |
1523 |
|
#: Return Type $%; |
1524 |
|
sub LoadFamilyData { |
1525 |
|
# Get this object instance. |
1526 |
|
my ($self) = @_; |
1527 |
|
# Get the FIG object. |
1528 |
|
my $fig = $self->{fig}; |
1529 |
|
# Get the genome hash. |
1530 |
|
my $genomeHash = $self->{genomes}; |
1531 |
|
# Create load objects for the tables we're loading. |
1532 |
|
my $loadFamily = $self->_TableLoader('Family'); |
1533 |
|
my $loadContainsFeature = $self->_TableLoader('ContainsFeature'); |
1534 |
|
if ($self->{options}->{loadOnly}) { |
1535 |
|
Trace("Loading from existing files.") if T(2); |
1536 |
|
} else { |
1537 |
|
Trace("Generating family data.") if T(2); |
1538 |
|
# Create a hash for the family IDs. |
1539 |
|
my %familyHash = (); |
1540 |
# Loop through the genomes. |
# Loop through the genomes. |
1541 |
for my $genomeID (sort keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
1542 |
Trace("Processing $genomeID.") if T(3); |
Trace("Processing features for $genomeID.") if T(2); |
1543 |
# Get all of the features for this genome. The only method that does this is |
# Loop through this genome's PEGs. |
1544 |
# all_features_detailed, which returns extra baggage that we discard. |
for my $fid ($fig->all_features($genomeID, "peg")) { |
1545 |
my $featureData = $fig->all_features_detailed($genomeID); |
$loadContainsFeature->Add("features", 1); |
1546 |
my @fids = map { $_->[0] } @{$featureData}; |
# Get this feature's families. |
1547 |
Trace(scalar(@fids) . " features found for genome $genomeID.") if T(3); |
my @families = $fig->families_for_protein($fid); |
1548 |
# Loop through the feature IDs. |
# Loop through the families, connecting them to the feature. |
1549 |
for my $fid (@fids) { |
for my $family (@families) { |
1550 |
# Get the group for this feature. |
$loadContainsFeature->Put($family, $fid); |
1551 |
my $synonym = $fig->maps_to_id($fid); |
# If this is a new family, create a record for it. |
1552 |
# Only proceed if the synonym is a real group. |
if (! exists $familyHash{$family}) { |
1553 |
if ($synonym ne $fid) { |
$familyHash{$family} = 1; |
1554 |
$loadSynonymGroup->Put($synonym); |
$loadFamily->Add("families", 1); |
1555 |
$loadIsSynonymGroupFor->Put($synonym, $fid); |
my $size = $fig->sz_family($family); |
1556 |
|
my $func = $fig->family_function($family); |
1557 |
|
$loadFamily->Put($family, $size, $func); |
1558 |
|
} |
1559 |
} |
} |
1560 |
} |
} |
1561 |
} |
} |
1565 |
return $retVal; |
return $retVal; |
1566 |
} |
} |
1567 |
|
|
|
|
|
1568 |
=head2 Internal Utility Methods |
=head2 Internal Utility Methods |
1569 |
|
|
1570 |
=head3 TableLoader |
=head3 TableLoader |
1632 |
my $retVal = Stats->new(); |
my $retVal = Stats->new(); |
1633 |
# Get the loader list. |
# Get the loader list. |
1634 |
my $loadList = $self->{loaders}; |
my $loadList = $self->{loaders}; |
1635 |
|
# Create a hash to hold the statistics objects, keyed on relation name. |
1636 |
|
my %loaderHash = (); |
1637 |
# Loop through the list, finishing the loads. Note that if the finish fails, we die |
# Loop through the list, finishing the loads. Note that if the finish fails, we die |
1638 |
# ignominiously. At some future point, we want to make the loads restartable. |
# ignominiously. At some future point, we want to make the loads more restartable. |
1639 |
while (my $loader = pop @{$loadList}) { |
while (my $loader = pop @{$loadList}) { |
1640 |
# Get the relation name. |
# Get the relation name. |
1641 |
my $relName = $loader->RelName; |
my $relName = $loader->RelName; |
1646 |
# Here we really need to finish. |
# Here we really need to finish. |
1647 |
Trace("Finishing $relName.") if T(2); |
Trace("Finishing $relName.") if T(2); |
1648 |
my $stats = $loader->Finish(); |
my $stats = $loader->Finish(); |
1649 |
|
$loaderHash{$relName} = $stats; |
1650 |
|
} |
1651 |
|
} |
1652 |
|
# Now we loop through again, actually loading the tables. We want to finish before |
1653 |
|
# loading so that if something goes wrong at this point, all the load files are usable |
1654 |
|
# and we don't have to redo all that work. |
1655 |
|
for my $relName (sort keys %loaderHash) { |
1656 |
|
# Get the statistics for this relation. |
1657 |
|
my $stats = $loaderHash{$relName}; |
1658 |
|
# Check for a database load. |
1659 |
if ($self->{options}->{dbLoad}) { |
if ($self->{options}->{dbLoad}) { |
1660 |
# Here we want to use the load file just created to load the database. |
# Here we want to use the load file just created to load the database. |
1661 |
Trace("Loading relation $relName.") if T(2); |
Trace("Loading relation $relName.") if T(2); |
1666 |
$retVal->Accumulate($stats); |
$retVal->Accumulate($stats); |
1667 |
Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); |
Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); |
1668 |
} |
} |
|
} |
|
1669 |
# Return the load statistics. |
# Return the load statistics. |
1670 |
return $retVal; |
return $retVal; |
1671 |
} |
} |