@@ 13 @@
 use BasicLocation;
 use HTML;
 use AliasAnalysis;
+use BioWords;

 =head1 Sprout Load Methods

@@ 171 @@
 for my $subsystem (keys %subsystems) {
 my $name = $subsystem;
 $name =~ s/_/ /g;
-# my $classes = $fig->subsystem_classification($subsystem);
-# $name .= " " . join(" ", @{$classes});
 $subsystems{$subsystem} = $name;
 }
 }
@@ 256 @@
 Trace("Loading from existing files.") if T(2);
 } else {
 Trace("Generating genome data.") if T(2);
+# Get the full info for the FIG genomes.
+my %genomeInfo = map { $_->[0] => { gname => $_->[1], szdna => $_->[2], maindomain => $_->[3],
+pegs => $_->[4], rnas => $_->[5], complete => $_->[6] } } @{$fig->genome_info()};
 # Now we loop through the genomes, generating the data for each one.
 for my $genomeID (sort keys %{$genomeHash}) {
 Trace("Generating data for genome $genomeID.") if T(3);
@@ 288 @@
 $group = $FIG_Config::otherGroup;
 }
 close TMP;
+# Get the contigs.
+my @contigs = $fig->all_contigs($genomeID);
+# Get this genome's info array.
+my $info = $genomeInfo{$genomeID};
 # Output the genome record.
-$loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID),
-$dnaSize, $genus, $group, $species, $extra, $version, $taxonomy);
+$loadGenome->Put($genomeID, $accessCode, $info->{complete}, scalar(@contigs),
+$dnaSize, $genus, $info->{pegs}, $group, $info->{rnas}, $species, $extra, $version, $taxonomy);
 # Now we loop through each of the genome's contigs.
-my @contigs = $fig->all_contigs($genomeID);
 for my $contigID (@contigs) {
 Trace("Processing contig $contigID for $genomeID.") if T(4);
 $loadContig->Add("contigIn");
@@ 355 @@
 FeatureIEDB
 CDD
 IsPresentOnProteinOf
+CellLocation
+IsPossiblePlaceFor
+ExternalDatabase
+IsAlsoFoundIn
+Keyword

 =over 4

@@ 394 @@
 my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
 my $loadCDD = $self->_TableLoader('CDD');
 my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
+my $loadCellLocation = $self->_TableLoader('CellLocation');
+my $loadIsPossiblePlaceFor = $self->_TableLoader('IsPossiblePlaceFor');
+my $loadIsAlsoFoundIn = $self->_TableLoader('IsAlsoFoundIn');
+my $loadExternalDatabase = $self->_TableLoader('ExternalDatabase');
+my $loadKeyword = $self->_TableLoader('Keyword');
 # Get the subsystem hash.
 my $subHash = $self->{subsystems};
 # Get the property keys.
 my $propKeys = $self->{propKeys};
-# Create a hashes to hold CDD and alias values.
+# Create hashes to hold CDD, Cell Location (PSORT), External Database, and alias values.
 my %CDD = ();
 my %alias = ();
+my %cellLocation = ();
+my %xdb = ();
+# Create the bio-words object.
+my $biowords = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt",
+stops => "$FIG_Config::sproutData/StopWords.txt",
+cache => 0);
+# One of the things we have to do here is build the keyword table, and the keyword
+# table needs to contain the originating text and feature count for each stem. Unfortunately,
+# the number of distinct keywords is so large it causes Perl to hang if we try to
+# keep them in memory. As a result, we need to track them using disk files.
+# Our approach will be to use two sequential files. One will contain stems and phonexes.
+# Each time a stem occurs in a feature, a record will be written to that file. The stem
+# file can then be sorted and collated to determine the number of features for each
+# stem. A separate file will contain keywords and stems. This last file
+# will be subjected to a sort unique on stem/keyword. The file is then merged
+# with the stem file to create the keyword table relation (keyword, stem, phonex, count).
+my $stemFileName = "$FIG_Config::temp/stems$$.tbl";
+my $keyFileName = "$FIG_Config::temp/keys$$.tbl";
+my $stemh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -k1,1 >$stemFileName");
+my $keyh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -u -k1,1 -k2,2 >$keyFileName");
 # Get the maximum sequence size. We need this later for splitting up the
 # locations.
 my $chunkSize = $self->{sprout}->MaxSegment();
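
The two Open() calls at lines 427-428 write through sort pipes, so the stem file comes out ordered by stem and the key file holds unique (stem, keyword) pairs; the keyword relation itself is produced later (lines 756-797) by collating the stem file and merging it with the key file. A minimal, self-contained sketch of that collate-and-merge idea, with invented stems, phonexes, and keywords standing in for the real disk files:

    use strict;
    use warnings;

    # Stem-file stand-in: one (stem, phonex) record per feature occurrence.
    my @stemFile = ( ['motil', 'MTL'], ['motil', 'MTL'], ['protein', 'PRTN'] );
    # Key-file stand-in: unique (stem, keyword) pairs.
    my @keyFile  = ( ['motil', 'motility'], ['protein', 'protein'], ['protein', 'proteins'] );

    # Collate the stem file: count occurrences (features) per stem and remember the phonex.
    my (%count, %phonex);
    for my $rec (@stemFile) {
        $count{$rec->[0]}++;
        $phonex{$rec->[0]} = $rec->[1];
    }

    # Merge with the keyword file to emit the (keyword, count, phonex, stem) relation.
    for my $rec (sort { $a->[0] cmp $b->[0] } @keyFile) {
        my ($stem, $keyword) = @$rec;
        print join("\t", $keyword, $count{$stem}, $phonex{$stem}, $stem), "\n";
    }
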
@@ 488 @@
 $alias{$alias} = 1;
 }
 }
+# Add the corresponding IDs. We ask for 2-tuples of the form (id, database).
+my @corresponders = $fig->get_corresponding_ids($featureID, 1);
+for my $tuple (@corresponders) {
+my ($id, $xdb) = @{$tuple};
+# Ignore SEED: that's us.
+if ($xdb ne 'SEED') {
+# Connect this ID to the feature.
+$loadIsAlsoFoundIn->Put($featureID, $xdb, $id);
+# Add it as a keyword.
+push @keywords, $id;
+# If this is a new database, create a record for it.
+if (! exists $xdb{$xdb}) {
+$xdb{$xdb} = 1;
+$loadExternalDatabase->Put($xdb);
+}
+}
+}
 Trace("Assignment for $featureID is: $assignment") if T(4);
 # Break the assignment into words and shove it onto the
 # keyword list.
@@ 580 @@
 push @keywords, 'iedb';
 $loadFeature->Add('iedb');
 }
-# Now we have some other attributes we need to process. Currently,
-# this is CDD and CELLO, but we expect the number to increase.
+# Now we have some other attributes we need to process. To get
+# through them, we convert the attribute list for this feature
+# into a two-layer hash: key => subkey => value.
 my %attributeHash = ();
 for my $attrRow (@{$attributes->{$featureID}}) {
 my (undef, $key, @values) = @{$attrRow};
-$key =~ /^([^:]+)::(.+)/;
-if (exists $attributeHash{$1}) {
-$attributeHash{$1}->{$2} = \@values;
-} else {
-$attributeHash{$1} = {$2 => \@values};
+my ($realKey, $subKey);
+if ($key =~ /^([^:]+)::(.+)/) {
+($realKey, $subKey) = ($1, $2);
+} else {
+($realKey, $subKey) = ($key, "");
+}
+if (exists $attributeHash{$realKey}) {
+$attributeHash{$realKey}->{$subKey} = \@values;
+} else {
+$attributeHash{$realKey} = {$subKey => \@values};
 }
 }
-my $celloValue = "unknown";
-# Pull in the CELLO attribute. There will never be more than one.
-# If we have one, it's a feature attribute AND a keyword.
-my @celloData = keys %{$attributeHash{CELLO}};
-if (@celloData) {
-$celloValue = $celloData[0];
-push @keywords, $celloValue;
-}
-# Now we handle CDD. This is a bit more complicated, because
+# First we handle CDD. This is a bit complicated, because
 # there are multiple CDDs per protein.
 if (exists $attributeHash{CDD}) {
 # Get the hash of CDD IDs to scores for this feature. We
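
The attribute loop above (lines 586-600) flattens each feature's attribute rows into a two-layer hash keyed by the main attribute name and an optional ::subkey, with attributes that have no subkey filed under the empty string. A small standalone illustration of that split; the attribute rows below are invented (the leading feature-ID element is ignored, as in the loader):

    use strict;
    use warnings;

    # Invented attribute rows in the (featureID, key, value, ...) shape handled above.
    my @rows = (
        [ 'fig|83333.1.peg.4', 'CDD::9876',          '123;4.5e-20' ],
        [ 'fig|83333.1.peg.4', 'PSORT::Cytoplasmic', '7.5' ],
        [ 'fig|83333.1.peg.4', 'isoelectric_point',  '5.2' ],
    );

    my %attributeHash;
    for my $row (@rows) {
        my (undef, $key, @values) = @$row;
        # Split "main::sub" keys; plain keys get an empty subkey.
        my ($realKey, $subKey) = $key =~ /^([^:]+)::(.+)/ ? ($1, $2) : ($key, "");
        $attributeHash{$realKey}{$subKey} = \@values;
    }

    # CDD IDs for the feature, and the value stored under the bare isoelectric_point key.
    print join(", ", keys %{$attributeHash{CDD}}), "\n";    # 9876
    print $attributeHash{isoelectric_point}{""}[0], "\n";   # 5.2
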
@@ 607 @@
 my @cddData = sort keys %{$cddHash};
 for my $cdd (@cddData) {
 # Extract the score for this CDD and decode it.
-my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[1]);
+my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]);
 my $realScore = FIGRules::DecodeScore($codeScore);
 # We can't afford to crash because of a bad attribute
 # value, hence the IF below.
 if (! defined($realScore)) {
 # Bad score, so count it.
 $loadFeature->Add('badCDDscore');
+Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(3);
 } else {
 # Create the connection.
 $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
@@ 626 @@
 }
 }
 }
-# Now we need to bust up hyphenated words in the keyword
-# list. We keep them separate and put them at the end so
-# the original word order is available.
-my $keywordString = "";
-my $bustedString = "";
-for my $keyword (@keywords) {
-if (length $keyword >= 3) {
-$keywordString .= " $keyword";
-if ($keyword =~ /-/) {
-my @words = split /-/, $keyword;
-$bustedString .= join(" ", "", @words);
+# Next we do PSORT cell locations. Here the confidence value
+# could have the value "unknown", which we translate to -1.
+if (exists $attributeHash{PSORT}) {
+# This will be a hash of cell locations to confidence
+# factors.
+my $psortHash = $attributeHash{PSORT};
+for my $psort (keys %{$psortHash}) {
+# Get the confidence (the attribute's first value), and convert it to a number if necessary.
+my $confidence = $psortHash->{$psort}->[0];
+if ($confidence eq 'unknown') {
+$confidence = -1;
+}
+$loadIsPossiblePlaceFor->Put($psort, $featureID, $confidence);
+# If this cell location does not yet exist, create its record.
+if (! exists $cellLocation{$psort}) {
+$cellLocation{$psort} = 1;
+$loadCellLocation->Put($psort);
+}
+# If this is a significant location, add it as a keyword.
+if ($confidence > 2.5) {
+push @keywords, $psort;
 }
 }
 }
-$keywordString .= $bustedString;
+# Phobius data is next. This consists of the signal peptide location and
+# the transmembrane locations.
+my $signalList = "";
+my $transList = "";
+if (exists $attributeHash{Phobius}) {
+# This will be a hash of two keys (transmembrane and signal) to
+# location strings. If there's no value, we stuff in an empty string.
+$signalList = ($attributeHash{Phobius}->{signal}->[0] || "");
+$transList = ($attributeHash{Phobius}->{transmembrane}->[0] || "");
+}
+# Here are some more numbers: isoelectric point, molecular weight, and
+# the similar-to-human flag.
+my $isoelectric = 0;
+if (exists $attributeHash{isoelectric_point}) {
+$isoelectric = $attributeHash{isoelectric_point}->{""}->[0];
+}
+my $similarToHuman = 0;
+if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""}->[0] eq 'yes') {
+$similarToHuman = 1;
+}
+my $molecularWeight = 0;
+if (exists $attributeHash{molecular_weight}) {
+$molecularWeight = $attributeHash{molecular_weight}->{""}->[0];
+}
+# Create the keyword string.
+my $keywordString = join(" ", @keywords);
+Trace("Real keyword string for $featureID: $keywordString.") if T(4);
 # Get rid of annoying punctuation.
-$keywordString =~ s/[();]//g;
-# Clean the keyword list.
-my $cleanWords = $sprout->CleanKeywords($keywordString);
+$keywordString =~ s/[();@#\/]/ /g;
+# Get the list of keywords in the keyword string.
+my @realKeywords = grep { $biowords->IsWord($_) } $biowords->Split($keywordString);
+# We need to do two things here: create the keyword string for the feature table
+# and write records to the keyword and stem files. The stuff we write to
+# the files will be taken from the following two hashes. The stuff used
+# to create the keyword string will be taken from the list.
+my (%keys, %stems, @realStems);
+for my $keyword (@realKeywords) {
+# Compute the stem and phonex for this keyword.
+my ($stem, $phonex) = $biowords->StemLookup($keyword);
+# Only proceed if a stem comes back. If no stem came back, it's a
+# stop word and we throw it away.
+if ($stem) {
+$keys{$keyword} = $stem;
+$stems{$stem} = $phonex;
+push @realStems, $stem;
+}
+}
+# Now create the keyword string.
+my $cleanWords = join(" ", @realStems);
 Trace("Keyword string for $featureID: $cleanWords") if T(4);
+# Write the stem and keyword records.
+for my $stem (keys %stems) {
+Tracer::PutLine($stemh, [$stem, $stems{$stem}]);
+}
+for my $key (keys %keys) {
+# The stem goes first in this file, because we want to sort
+# by stem and then keyword.
+Tracer::PutLine($keyh, [$keys{$key}, $key]);
+}
 # Now we need to process the feature's locations. First, we split them up.
 my @locationList = split /\s*,\s*/, $locations;
 # Next, we convert them to Sprout location objects.
 my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
 # Assemble them into a sprout location string for later.
 my $locationString = join(", ", map { $_->String } @locObjectList);
+# We'll store the sequence length in here.
+my $sequenceLength = 0;
 # This part is the roughest. We need to relate the features to contig
 # locations, and the locations must be split so that none of them exceed
 # the maximum segment size. This simplifies the genes_in_region processing
@@ 724 @@
 my $i = 1;
 # Loop through the locations.
 for my $locObject (@locObjectList) {
+# Record the length.
+$sequenceLength += $locObject->Length;
 # Split this location into a list of chunks.
 my @locOList = ();
 while (my $peeling = $locObject->Peel($chunkSize)) {
@@ 741 @@
 $i++;
 }
 }
-# Finally, reassemble the location objects into a list of Sprout location strings.
+# Now we get some ancillary flags.
+my $locked = $fig->is_locked_fid($featureID);
+my $in_genbank = $fig->peg_in_gendb($featureID);
 # Create the feature record.
-$loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locationString);
+$loadFeature->Put($featureID, 1, $user, $quality, $type, $in_genbank, $isoelectric, $locked, $molecularWeight,
+$sequenceLength, $signalList, $similarToHuman, $assignment, $cleanWords, $locationString,
+$transList);
 }
 }
 Trace("Genome $genomeID processed.") if T(3);
 }
 }
+Trace("Sorting keywords.") if T(2);
+# Now we need to load the keyword table from the key and stem files.
+close $keyh;
+close $stemh;
+Trace("Loading keywords.") if T(2);
+$keyh = Open(undef, "<$keyFileName");
+$stemh = Open(undef, "<$stemFileName");
+# We'll count the keywords in here, for tracing purposes.
+my $count = 0;
+# These variables track the current stem's data. When an incoming
+# keyword's stem changes, these will be recomputed.
+my ($currentStem, $currentPhonex, $currentCount);
+# Prime the loop by reading the first stem in the stem file.
+my ($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
+# Loop through the keyword file.
+while (! eof $keyh) {
+# Read this keyword.
+my ($thisStem, $thisKey) = Tracer::GetLine($keyh);
+# Check to see if it's the new stem yet.
+if ($thisStem ne $currentStem) {
+# Yes. It's a terrible error if it's not also the next stem.
+if ($thisStem ne $nextStem) {
+Confess("Error in stem file. Expected \"$nextStem\", but found \"$thisStem\".");
+} else {
+# Here we're okay.
+($currentStem, $currentPhonex) = ($nextStem, $nextPhonex);
+# Count the number of features for this stem.
+$currentCount = 0;
+while ($nextStem eq $thisStem) {
+($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
+$currentCount++;
+}
+}
+}
+# Now $currentStem is the same as $thisStem, and the other $current-vars
+# contain the stem's data (phonex and count).
+$loadKeyword->Put($thisKey, $currentCount, $currentPhonex, $currentStem);
+if (++$count % 1000 == 0 && T(3)) {
+Trace("$count keywords loaded.");
+}
+}
+Trace("$count keywords loaded into keyword table.") if T(2);
 # Finish the loads.
 my $retVal = $self->_FinishAll();
 return $retVal;
@@ 832 @@
 ConsistsOfGenomes
 GenomeSubset
 HasGenomeSubset
-Catalyzes
 Diagram
 RoleOccursIn
+SubsystemHopeNotes

 =over 4

@@ 881 @@
 my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
 my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
 my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
+my $loadSubsystemHopeNotes = $self->_TableLoader('SubsystemHopeNotes');
 if ($self->{options}->{loadOnly}) {
 Trace("Loading from existing files.") if T(2);
 } else {
@@ 906 @@
 # Create the subsystem record.
 my $curator = $sub->get_curator();
 my $notes = $sub->get_notes();
-$loadSubsystem->Put($subsysID, $curator, $notes);
+my $version = $sub->get_version();
+my $description = $sub->get_description();
+$loadSubsystem->Put($subsysID, $curator, $version, $description, $notes);
+# Add the hope notes.
+my $hopeNotes = $sub->get_hope_curation_notes();
+if ($hopeNotes) {
+$loadSubsystemHopeNotes->Put($subsysID, $hopeNotes);
+}
 # Now for the classification string. This comes back as a list
 # reference and we convert it to a space-delimited string.
 my $classList = $fig->subsystem_classification($subsysID);
@@ 923 @@
 for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
 # Get the role's abbreviation.
 my $abbr = $sub->get_role_abbr($col);
+# Determine whether it is an auxiliary role in this subsystem.
+my $aux = $fig->is_aux_role_in_subsystem($subsysID, $roleID);
+# Get its reaction note.
+my $hope_note = $sub->get_hope_reaction_notes($roleID) || "";
 # Connect to this role.
 $loadOccursInSubsystem->Add("roleIn");
-$loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $col);
+$loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $aux, $col, $hope_note);
 # If it's a new role, add it to the role table.
 if (! exists $roleData{$roleID}) {
 # Get the role's abbreviation.
@@ 1073 @@
 }
 }
 }
-# Before we leave, we must create the Catalyzes table. We start with the reactions,
-# then use the "ecToRoles" table to convert EC numbers to role IDs.
-my @reactions = $fig->all_reactions();
-for my $reactionID (@reactions) {
-# Get this reaction's list of roles. The results will be EC numbers.
-my @ecs = $fig->catalyzed_by($reactionID);
-# Loop through the roles, creating catalyzation records.
-for my $thisEC (@ecs) {
-if (exists $ecToRoles{$thisEC}) {
-for my $thisRole (@{$ecToRoles{$thisEC}}) {
-$loadCatalyzes->Put($thisRole, $reactionID);
-}
-}
-}
-}
 }
 # Finish the load.
 my $retVal = $self->_FinishAll();
@@ 1450 @@
 IsIdentifiedByCAS
 HasCompoundName
 IsAComponentOf
+Scenario
+Catalyzes
+HasScenario
+IsInputFor
+IsOutputOf
+ExcludesReaction
+IncludesReaction
+IsOnDiagram

 This method proceeds reaction by reaction rather than genome by genome.

@@ 1486 @@
 my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
 my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
 my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
+my $loadScenario = $self->_TableLoader('Scenario');
+my $loadHasScenario = $self->_TableLoader('HasScenario');
+my $loadIsInputFor = $self->_TableLoader('IsInputFor');
+my $loadIsOutputOf = $self->_TableLoader('IsOutputOf');
+my $loadIsOnDiagram = $self->_TableLoader('IsOnDiagram');
+my $loadIncludesReaction = $self->_TableLoader('IncludesReaction');
+my $loadExcludesReaction = $self->_TableLoader('ExcludesReaction');
+my $loadCatalyzes = $self->_TableLoader('Catalyzes');
 if ($self->{options}->{loadOnly}) {
 Trace("Loading from existing files.") if T(2);
 } else {
@@ 1502 @@
 my %compoundNames = ();
 my %compoundCASes = ();
 # First we create the compounds.
-my @compounds = $fig->all_compounds();
-for my $cid (@compounds) {
+my %compounds = map { $_ => 1 } $fig->all_compounds();
+for my $cid (keys %compounds) {
 # Check for names.
 my @names = $fig->names_of_compound($cid);
 # Each name will be given a priority number, starting with 1.
@@ 1533 @@
 # we initialize the discriminator index. This is a single integer used to insure
 # duplicate elements in a reaction are not accidentally collapsed.
 my $discrim = 0;
-my @reactions = $fig->all_reactions();
-for my $reactionID (@reactions) {
+my %reactions = map { $_ => 1 } $fig->all_reactions();
+for my $reactionID (keys %reactions) {
 # Create the reaction record.
 $loadReaction->Put($reactionID, $fig->reversible($reactionID));
 # Compute the reaction's URL.
@@ 1557 @@
 }
 }
 }
+# Now we run through the subsystems and roles, generating the scenarios
+# and connecting the reactions. We'll need some hashes to prevent
+# duplicates and a counter for compound group keys.
+my %roles = ();
+my %scenarios = ();
+my @subsystems = $fig->all_subsystems();
+for my $subName (@subsystems) {
+my $sub = $fig->get_subsystem($subName);
+Trace("Processing $subName reactions.") if T(3);
+# Get the subsystem's reactions.
+my %reactions = $sub->get_hope_reactions();
+# Loop through the roles, connecting them to the reactions.
+for my $role (keys %reactions) {
+# Only process this role if it is new.
+if (! $roles{$role}) {
+$roles{$role} = 1;
+my @reactions = @{$reactions{$role}};
+for my $reaction (@reactions) {
+$loadCatalyzes->Put($role, $reaction);
+}
+}
+}
+Trace("Processing $subName scenarios.") if T(3);
+# Get the subsystem's scenarios.
+my @scenarioNames = $sub->get_hope_scenario_names();
+# Loop through the scenarios, creating scenario data.
+for my $scenarioName (@scenarioNames) {
+# Link this scenario to this subsystem.
+$loadHasScenario->Put($subName, $scenarioName);
+# If this scenario is new, we need to create it.
+if (! $scenarios{$scenarioName}) {
+Trace("Creating scenario $scenarioName.") if T(3);
+$scenarios{$scenarioName} = 1;
+# Create the scenario itself.
+$loadScenario->Put($scenarioName);
+# Attach the input compounds.
+for my $input ($sub->get_hope_input_compounds($scenarioName)) {
+$loadIsInputFor->Put($input, $scenarioName);
+}
+# Now we need to set up the output compounds. They come in two
+# groups, which we mark 0 and 1.
+my $outputGroup = 0;
+# Set up the output compounds.
+for my $outputGroupList ($sub->get_hope_output_compounds($scenarioName)) {
+# Attach the compounds.
+for my $compound (@$outputGroupList) {
+$loadIsOutputOf->Put($scenarioName, $compound, $outputGroup);
+}
+# Move on to the next output group number.
+$outputGroup++;
+}
+# Create the reaction lists.
+my @addReactions = $sub->get_hope_additional_reactions($scenarioName);
+for my $reaction (@addReactions) {
+$loadIncludesReaction->Put($scenarioName, $reaction);
+}
+my @notReactions = $sub->get_hope_ignore_reactions($scenarioName);
+for my $reaction (@notReactions) {
+$loadExcludesReaction->Put($scenarioName, $reaction);
+}
+# Link the maps.
+my @maps = $sub->get_hope_map_ids($scenarioName);
+for my $map (@maps) {
+$loadIsOnDiagram->Put($scenarioName, "map$map");
+}
+}
+}
+}
 }
 # Finish the load.
 my $retVal = $self->_FinishAll();
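
get_hope_output_compounds() returns the scenario's output compounds as a list of array references, one per output group, and each compound is written to IsOutputOf together with its group number (0 or 1). A minimal sketch of that numbering, with invented compound IDs standing in for the real hope data:

    use strict;
    use warnings;

    # Invented stand-in for $sub->get_hope_output_compounds($scenarioName):
    # a list of array references, one per output group.
    my @outputGroups = ( [ 'C00022', 'C00033' ], [ 'C00011' ] );

    my $outputGroup = 0;
    for my $outputGroupList (@outputGroups) {
        for my $compound (@$outputGroupList) {
            # Each compound is recorded with the number of the group it belongs to.
            print "$compound belongs to output group $outputGroup\n";
        }
        $outputGroup++;
    }
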