12 |
use Stats; |
use Stats; |
13 |
use BasicLocation; |
use BasicLocation; |
14 |
use HTML; |
use HTML; |
15 |
|
use AliasAnalysis; |
16 |
|
|
17 |
=head1 Sprout Load Methods |
=head1 Sprout Load Methods |
18 |
|
|
52 |
|
|
53 |
=head3 new |
=head3 new |
54 |
|
|
55 |
C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >> |
my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); |
56 |
|
|
57 |
Construct a new Sprout Loader object, specifying the two participating databases and |
Construct a new Sprout Loader object, specifying the two participating databases and |
58 |
the names of the files containing the lists of genomes and subsystems to use. |
the names of the files containing the lists of genomes and subsystems to use. |
103 |
# Here we want all the complete genomes and an access code of 1. |
# Here we want all the complete genomes and an access code of 1. |
104 |
my @genomeList = $fig->genomes(1); |
my @genomeList = $fig->genomes(1); |
105 |
%genomes = map { $_ => 1 } @genomeList; |
%genomes = map { $_ => 1 } @genomeList; |
106 |
|
Trace(scalar(keys %genomes) . " genomes found.") if T(3); |
107 |
} else { |
} else { |
108 |
my $type = ref $genomeFile; |
my $type = ref $genomeFile; |
109 |
Trace("Genome file parameter type is \"$type\".") if T(3); |
Trace("Genome file parameter type is \"$type\".") if T(3); |
170 |
for my $subsystem (keys %subsystems) { |
for my $subsystem (keys %subsystems) { |
171 |
my $name = $subsystem; |
my $name = $subsystem; |
172 |
$name =~ s/_/ /g; |
$name =~ s/_/ /g; |
173 |
my $classes = $fig->subsystem_classification($subsystem); |
# my $classes = $fig->subsystem_classification($subsystem); |
174 |
$name .= " " . join(" ", @{$classes}); |
# $name .= " " . join(" ", @{$classes}); |
175 |
$subsystems{$subsystem} = $name; |
$subsystems{$subsystem} = $name; |
176 |
} |
} |
177 |
} |
} |
198 |
|
|
199 |
=head3 LoadOnly |
=head3 LoadOnly |
200 |
|
|
201 |
C<< my $flag = $spl->LoadOnly; >> |
my $flag = $spl->LoadOnly; |
202 |
|
|
203 |
Return TRUE if we are in load-only mode, else FALSE. |
Return TRUE if we are in load-only mode, else FALSE. |
204 |
|
|
212 |
|
|
213 |
=head3 LoadGenomeData |
=head3 LoadGenomeData |
214 |
|
|
215 |
C<< my $stats = $spl->LoadGenomeData(); >> |
my $stats = $spl->LoadGenomeData(); |
216 |
|
|
217 |
Load the Genome, Contig, and Sequence data from FIG into Sprout. |
Load the Genome, Contig, and Sequence data from FIG into Sprout. |
218 |
|
|
328 |
|
|
329 |
=head3 LoadFeatureData |
=head3 LoadFeatureData |
330 |
|
|
331 |
C<< my $stats = $spl->LoadFeatureData(); >> |
my $stats = $spl->LoadFeatureData(); |
332 |
|
|
333 |
Load the feature data from FIG into Sprout. |
Load the feature data from FIG into Sprout. |
334 |
|
|
399 |
} else { |
} else { |
400 |
Trace("Generating feature data.") if T(2); |
Trace("Generating feature data.") if T(2); |
401 |
# Now we loop through the genomes, generating the data for each one. |
# Now we loop through the genomes, generating the data for each one. |
402 |
for my $genomeID (sort keys %{$genomeHash}) { |
my @allGenomes = sort keys %{$genomeHash}; |
403 |
|
Trace(scalar(@allGenomes) . " genomes found in list.") if T(3); |
404 |
|
for my $genomeID (@allGenomes) { |
405 |
Trace("Loading features for genome $genomeID.") if T(3); |
Trace("Loading features for genome $genomeID.") if T(3); |
406 |
$loadFeature->Add("genomeIn"); |
$loadFeature->Add("genomeIn"); |
407 |
# Get the feature list for this genome. |
# Get the feature list for this genome. |
413 |
Trace("$count features found for genome $genomeID.") if T(3); |
Trace("$count features found for genome $genomeID.") if T(3); |
414 |
# Get the attributes for this genome and put them in a hash by feature ID. |
# Get the attributes for this genome and put them in a hash by feature ID. |
415 |
my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys); |
my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys); |
416 |
|
Trace("Looping through features for $genomeID.") if T(3); |
417 |
# Set up for our duplicate-feature check. |
# Set up for our duplicate-feature check. |
418 |
my $oldFeatureID = ""; |
my $oldFeatureID = ""; |
419 |
# Loop through the features. |
# Loop through the features. |
479 |
} |
} |
480 |
# Now we need to find the subsystems this feature participates in. |
# Now we need to find the subsystems this feature participates in. |
481 |
# We also add the subsystems to the keyword list. Before we do that, |
# We also add the subsystems to the keyword list. Before we do that, |
482 |
# we must convert underscores to spaces and tack on the classifications. |
# we must convert underscores to spaces. |
483 |
my @subsystems = $fig->peg_to_subsystems($featureID); |
my @subsystems = $fig->peg_to_subsystems($featureID); |
484 |
for my $subsystem (@subsystems) { |
for my $subsystem (@subsystems) { |
485 |
# Only proceed if we like this subsystem. |
# Only proceed if we like this subsystem. |
557 |
my @cddData = sort keys %{$cddHash}; |
my @cddData = sort keys %{$cddHash}; |
558 |
for my $cdd (@cddData) { |
for my $cdd (@cddData) { |
559 |
# Extract the score for this CDD and decode it. |
# Extract the score for this CDD and decode it. |
560 |
my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[0]); |
my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[1]); |
561 |
my $realScore = FIGRules::DecodeScore($codeScore); |
my $realScore = FIGRules::DecodeScore($codeScore); |
562 |
|
# We can't afford to crash because of a bad attribute |
563 |
|
# value, hence the IF below. |
564 |
|
if (! defined($realScore)) { |
565 |
|
# Bad score, so count it. |
566 |
|
$loadFeature->Add('badCDDscore'); |
567 |
|
} else { |
568 |
# Create the connection. |
# Create the connection. |
569 |
$loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore); |
$loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore); |
570 |
# If this CDD does not yet exist, create its record. |
# If this CDD does not yet exist, create its record. |
574 |
} |
} |
575 |
} |
} |
576 |
} |
} |
577 |
|
} |
578 |
# Now we need to bust up hyphenated words in the keyword |
# Now we need to bust up hyphenated words in the keyword |
579 |
# list. We keep them separate and put them at the end so |
# list. We keep them separate and put them at the end so |
580 |
# the original word order is available. |
# the original word order is available. |
599 |
my @locationList = split /\s*,\s*/, $locations; |
my @locationList = split /\s*,\s*/, $locations; |
600 |
# Next, we convert them to Sprout location objects. |
# Next, we convert them to Sprout location objects. |
601 |
my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList; |
my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList; |
602 |
|
# Assemble them into a sprout location string for later. |
603 |
|
my $locationString = join(", ", map { $_->String } @locObjectList); |
604 |
# This part is the roughest. We need to relate the features to contig |
# This part is the roughest. We need to relate the features to contig |
605 |
# locations, and the locations must be split so that none of them exceed |
# locations, and the locations must be split so that none of them exceed |
606 |
# the maximum segment size. This simplifies the genes_in_region processing |
# the maximum segment size. This simplifies the genes_in_region processing |
624 |
} |
} |
625 |
} |
} |
626 |
# Finally, reassemble the location objects into a list of Sprout location strings. |
# Finally, reassemble the location objects into a list of Sprout location strings. |
|
$locations = join(", ", map { $_->String } @locObjectList); |
|
627 |
# Create the feature record. |
# Create the feature record. |
628 |
$loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locations); |
$loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locationString); |
629 |
} |
} |
630 |
} |
} |
631 |
|
Trace("Genome $genomeID processed.") if T(3); |
632 |
} |
} |
633 |
} |
} |
634 |
# Finish the loads. |
# Finish the loads. |
638 |
|
|
639 |
=head3 LoadSubsystemData |
=head3 LoadSubsystemData |
640 |
|
|
641 |
C<< my $stats = $spl->LoadSubsystemData(); >> |
my $stats = $spl->LoadSubsystemData(); |
642 |
|
|
643 |
Load the subsystem data from FIG into Sprout. |
Load the subsystem data from FIG into Sprout. |
644 |
|
|
886 |
# Now we need to link all the map's roles to it. |
# Now we need to link all the map's roles to it. |
887 |
# A hash is used to prevent duplicates. |
# A hash is used to prevent duplicates. |
888 |
my %roleHash = (); |
my %roleHash = (); |
889 |
for my $role ($fig->map_to_ecs($map)) { |
for my $ec ($fig->map_to_ecs($map)) { |
890 |
if (exists $ecToRoles{$role} && ! $roleHash{$role}) { |
if (exists $ecToRoles{$ec}) { |
891 |
$loadRoleOccursIn->Put($ecToRoles{$role}, $map); |
for my $role (@{$ecToRoles{$ec}}) { |
892 |
|
if (! $roleHash{$role}) { |
893 |
|
$loadRoleOccursIn->Put($role, $map); |
894 |
$roleHash{$role} = 1; |
$roleHash{$role} = 1; |
895 |
} |
} |
896 |
} |
} |
897 |
} |
} |
898 |
|
} |
899 |
|
} |
900 |
# Before we leave, we must create the Catalyzes table. We start with the reactions, |
# Before we leave, we must create the Catalyzes table. We start with the reactions, |
901 |
# then use the "ecToRoles" table to convert EC numbers to role IDs. |
# then use the "ecToRoles" table to convert EC numbers to role IDs. |
902 |
my @reactions = $fig->all_reactions(); |
my @reactions = $fig->all_reactions(); |
920 |
|
|
921 |
=head3 LoadPropertyData |
=head3 LoadPropertyData |
922 |
|
|
923 |
C<< my $stats = $spl->LoadPropertyData(); >> |
my $stats = $spl->LoadPropertyData(); |
924 |
|
|
925 |
Load the attribute data from FIG into Sprout. |
Load the attribute data from FIG into Sprout. |
926 |
|
|
1008 |
|
|
1009 |
=head3 LoadAnnotationData |
=head3 LoadAnnotationData |
1010 |
|
|
1011 |
C<< my $stats = $spl->LoadAnnotationData(); >> |
my $stats = $spl->LoadAnnotationData(); |
1012 |
|
|
1013 |
Load the annotation data from FIG into Sprout. |
Load the annotation data from FIG into Sprout. |
1014 |
|
|
1115 |
|
|
1116 |
=head3 LoadSourceData |
=head3 LoadSourceData |
1117 |
|
|
1118 |
C<< my $stats = $spl->LoadSourceData(); >> |
my $stats = $spl->LoadSourceData(); |
1119 |
|
|
1120 |
Load the source data from FIG into Sprout. |
Load the source data from FIG into Sprout. |
1121 |
|
|
1193 |
|
|
1194 |
=head3 LoadExternalData |
=head3 LoadExternalData |
1195 |
|
|
1196 |
C<< my $stats = $spl->LoadExternalData(); >> |
my $stats = $spl->LoadExternalData(); |
1197 |
|
|
1198 |
Load the external data from FIG into Sprout. |
Load the external data from FIG into Sprout. |
1199 |
|
|
1273 |
|
|
1274 |
=head3 LoadReactionData |
=head3 LoadReactionData |
1275 |
|
|
1276 |
C<< my $stats = $spl->LoadReactionData(); >> |
my $stats = $spl->LoadReactionData(); |
1277 |
|
|
1278 |
Load the reaction data from FIG into Sprout. |
Load the reaction data from FIG into Sprout. |
1279 |
|
|
1387 |
|
|
1388 |
=head3 LoadSynonymData |
=head3 LoadSynonymData |
1389 |
|
|
1390 |
C<< my $stats = $spl->LoadSynonymData(); >> |
my $stats = $spl->LoadSynonymData(); |
1391 |
|
|
1392 |
Load the synonym groups into Sprout. |
Load the synonym groups into Sprout. |
1393 |
|
|
1432 |
if (! defined($result)) { |
if (! defined($result)) { |
1433 |
Confess("Database error in Synonym load: " . $sth->errstr()); |
Confess("Database error in Synonym load: " . $sth->errstr()); |
1434 |
} else { |
} else { |
1435 |
|
Trace("Processing synonym results.") if T(2); |
1436 |
# Remember the current synonym. |
# Remember the current synonym. |
1437 |
my $current_syn = ""; |
my $current_syn = ""; |
1438 |
# Count the features. |
# Count the features. |
1439 |
my $featureCount = 0; |
my $featureCount = 0; |
1440 |
|
my $entryCount = 0; |
1441 |
# Loop through the synonym/peg pairs. |
# Loop through the synonym/peg pairs. |
1442 |
while (my @row = $sth->fetchrow()) { |
while (my @row = $sth->fetchrow()) { |
1443 |
# Get the synonym group ID and feature ID. |
# Get the synonym group ID and feature ID. |
1444 |
my ($syn_id, $peg) = @row; |
my ($syn_id, $peg) = @row; |
1445 |
|
# Count this row. |
1446 |
|
$entryCount++; |
1447 |
|
if ($entryCount % 1000 == 0) { |
1448 |
|
Trace("$entryCount rows processed.") if T(3); |
1449 |
|
} |
1450 |
# Insure it's for one of our genomes. |
# Ensure it's for one of our genomes. |
# Ensure it's for one of our genomes. |
my $genomeID = FIG::genome_of($peg); |
my $genomeID = FIG::genome_of($peg); |
1452 |
if (exists $genomeHash->{$genomeID}) { |
if (exists $genomeHash->{$genomeID}) { |
1465 |
} |
} |
1466 |
} |
} |
1467 |
} |
} |
1468 |
|
Trace("$entryCount rows produced $featureCount features.") if T(2); |
1469 |
} |
} |
1470 |
} |
} |
1471 |
# Finish the load. |
# Finish the load. |
1475 |
|
|
1476 |
=head3 LoadFamilyData |
=head3 LoadFamilyData |
1477 |
|
|
1478 |
C<< my $stats = $spl->LoadFamilyData(); >> |
my $stats = $spl->LoadFamilyData(); |
1479 |
|
|
1480 |
Load the protein families into Sprout. |
Load the protein families into Sprout. |
1481 |
|
|
1543 |
|
|
1544 |
=head3 LoadDrugData |
=head3 LoadDrugData |
1545 |
|
|
1546 |
C<< my $stats = $spl->LoadDrugData(); >> |
my $stats = $spl->LoadDrugData(); |
1547 |
|
|
1548 |
Load the drug target data into Sprout. |
Load the drug target data into Sprout. |
1549 |
|
|
1677 |
# Decode the score. |
# Decode the score. |
1678 |
my $realScore = FIGRules::DecodeScore($score); |
my $realScore = FIGRules::DecodeScore($score); |
1679 |
# Connect the PDB to the feature. |
# Connect the PDB to the feature. |
1680 |
$loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end); |
$loadIsProteinForFeature->Put($pdbID, $pdbData->[0], $start, $realScore, $end); |
1681 |
} |
} |
1682 |
} |
} |
1683 |
} |
} |
1742 |
|
|
1743 |
=head3 SpecialAttribute |
=head3 SpecialAttribute |
1744 |
|
|
1745 |
C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >> |
my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); |
1746 |
|
|
1747 |
Look for special attributes of a given type. A special attribute is found by comparing one of |
Look for special attributes of a given type. A special attribute is found by comparing one of |
1748 |
the columns of the incoming attribute list to a search pattern. If a match is found, then |
the columns of the incoming attribute list to a search pattern. If a match is found, then |
1918 |
|
|
1919 |
=head3 GetGenomeAttributes |
=head3 GetGenomeAttributes |
1920 |
|
|
1921 |
C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys); >> |
my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys); |
1922 |
|
|
1923 |
Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related |
Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related |
1924 |
attributes for all the features of a genome in a single call, then organizes them into |
attributes for all the features of a genome in a single call, then organizes them into |