80 |
Either the name of the file containing the list of trusted subsystems or a reference |
Either the name of the file containing the list of trusted subsystems or a reference |
81 |
to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be |
to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be |
82 |
considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR> |
considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR> |
83 |
in its data directory.) Only subsystem data related to the trusted subsystems is loaded. |
in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded. |
84 |
|
|
85 |
=item options |
=item options |
86 |
|
|
120 |
# an omitted access code can be defaulted to 1. |
# an omitted access code can be defaulted to 1. |
121 |
for my $genomeLine (@genomeList) { |
for my $genomeLine (@genomeList) { |
122 |
my ($genomeID, $accessCode) = split("\t", $genomeLine); |
my ($genomeID, $accessCode) = split("\t", $genomeLine); |
123 |
if (undef $accessCode) { |
if (! defined($accessCode)) { |
124 |
$accessCode = 1; |
$accessCode = 1; |
125 |
} |
} |
126 |
$genomes{$genomeID} = $accessCode; |
$genomes{$genomeID} = $accessCode; |
138 |
if (! defined $subsysFile || $subsysFile eq '') { |
if (! defined $subsysFile || $subsysFile eq '') { |
139 |
# Here we want all the usable subsystems. First we get the whole list. |
# Here we want all the usable subsystems. First we get the whole list. |
140 |
my @subs = $fig->all_subsystems(); |
my @subs = $fig->all_subsystems(); |
141 |
# Loop through, checking for usability. |
# Loop through, checking for the NMPDR file. |
142 |
for my $sub (@subs) { |
for my $sub (@subs) { |
143 |
if ($fig->usable_subsystem($sub)) { |
if ($fig->nmpdr_subsystem($sub)) { |
144 |
$subsystems{$sub} = 1; |
$subsystems{$sub} = 1; |
145 |
} |
} |
146 |
} |
} |
163 |
Confess("Invalid subsystem parameter in SproutLoad constructor."); |
Confess("Invalid subsystem parameter in SproutLoad constructor."); |
164 |
} |
} |
165 |
} |
} |
166 |
|
# Go through the subsys hash again, creating the keyword list for each subsystem. |
167 |
|
for my $subsystem (keys %subsystems) { |
168 |
|
my $name = $subsystem; |
169 |
|
$name =~ s/_/ /g; |
170 |
|
my $classes = $fig->subsystem_classification($subsystem); |
171 |
|
$name .= " " . join(" ", @{$classes}); |
172 |
|
$subsystems{$subsystem} = $name; |
173 |
|
} |
174 |
} |
} |
175 |
# Get the data directory from the Sprout object. |
# Get the data directory from the Sprout object. |
176 |
my ($directory) = $sprout->LoadInfo(); |
my ($directory) = $sprout->LoadInfo(); |
274 |
my $extra = join " ", @extraData; |
my $extra = join " ", @extraData; |
275 |
# Get the full taxonomy. |
# Get the full taxonomy. |
276 |
my $taxonomy = $fig->taxonomy_of($genomeID); |
my $taxonomy = $fig->taxonomy_of($genomeID); |
277 |
|
# Open the NMPDR group file for this genome. |
278 |
|
my $group; |
279 |
|
if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") && |
280 |
|
defined($group = <TMP>)) { |
281 |
|
# Clean the line ending. |
282 |
|
chomp $group; |
283 |
|
} else { |
284 |
|
# No group, so use the default. |
285 |
|
$group = $FIG_Config::otherGroup; |
286 |
|
} |
287 |
|
close TMP; |
288 |
# Output the genome record. |
# Output the genome record. |
289 |
$loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus, |
$loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus, |
290 |
$species, $extra, $taxonomy); |
$group, $species, $extra, $taxonomy); |
291 |
# Now we loop through each of the genome's contigs. |
# Now we loop through each of the genome's contigs. |
292 |
my @contigs = $fig->all_contigs($genomeID); |
my @contigs = $fig->all_contigs($genomeID); |
293 |
for my $contigID (@contigs) { |
for my $contigID (@contigs) { |
468 |
FeatureUpstream |
FeatureUpstream |
469 |
IsLocatedIn |
IsLocatedIn |
470 |
HasFeature |
HasFeature |
471 |
|
HasRoleInSubsystem |
472 |
|
FeatureEssential |
473 |
|
FeatureVirulent |
474 |
|
FeatureIEDB |
475 |
|
|
476 |
=over 4 |
=over 4 |
477 |
|
|
486 |
sub LoadFeatureData { |
sub LoadFeatureData { |
487 |
# Get this object instance. |
# Get this object instance. |
488 |
my ($self) = @_; |
my ($self) = @_; |
489 |
# Get the FIG object. |
# Get the FIG and Sprout objects. |
490 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
491 |
|
my $sprout = $self->{sprout}; |
492 |
# Get the table of genome IDs. |
# Get the table of genome IDs. |
493 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
494 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
498 |
my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
499 |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); |
500 |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); |
501 |
my $loadHasFeature = $self->_TableLoader('HasFeature'); |
my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly); |
502 |
|
my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly); |
503 |
|
my $loadFeatureEssential = $self->_TableLoader('FeatureEssential'); |
504 |
|
my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent'); |
505 |
|
my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB'); |
506 |
|
# Get the subsystem hash. |
507 |
|
my $subHash = $self->{subsystems}; |
508 |
# Get the maximum sequence size. We need this later for splitting up the |
# Get the maximum sequence size. We need this later for splitting up the |
509 |
# locations. |
# locations. |
510 |
my $chunkSize = $self->{sprout}->MaxSegment(); |
my $chunkSize = $self->{sprout}->MaxSegment(); |
521 |
# Sort and count the list. |
# Sort and count the list. |
522 |
my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features}; |
my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features}; |
523 |
my $count = scalar @featureTuples; |
my $count = scalar @featureTuples; |
524 |
|
my @fids = map { $_->[0] } @featureTuples; |
525 |
Trace("$count features found for genome $genomeID.") if T(3); |
Trace("$count features found for genome $genomeID.") if T(3); |
526 |
|
# Get the attributes for this genome and put them in a hash by feature ID. |
527 |
|
my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids); |
528 |
# Set up for our duplicate-feature check. |
# Set up for our duplicate-feature check. |
529 |
my $oldFeatureID = ""; |
my $oldFeatureID = ""; |
530 |
# Loop through the features. |
# Loop through the features. |
538 |
$oldFeatureID = $featureID; |
$oldFeatureID = $featureID; |
539 |
# Count this feature. |
# Count this feature. |
540 |
$loadFeature->Add("featureIn"); |
$loadFeature->Add("featureIn"); |
541 |
# Create the feature record. |
# Begin building the keywords. We start with the genome ID, the |
542 |
$loadFeature->Put($featureID, 1, $type); |
# feature ID, the taxonomy, and the organism name. |
543 |
# Link it to the parent genome. |
my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID), |
544 |
$loadHasFeature->Put($genomeID, $featureID, $type); |
$fig->taxonomy_of($genomeID)); |
545 |
|
# Get the functional assignment and aliases. |
546 |
|
my $assignment = $fig->function_of($featureID); |
547 |
# Create the aliases. |
# Create the aliases. |
548 |
for my $alias ($fig->feature_aliases($featureID)) { |
for my $alias ($fig->feature_aliases($featureID)) { |
549 |
$loadFeatureAlias->Put($featureID, $alias); |
$loadFeatureAlias->Put($featureID, $alias); |
550 |
|
push @keywords, $alias; |
551 |
} |
} |
552 |
|
Trace("Assignment for $featureID is: $assignment") if T(4); |
553 |
|
# Break the assignment into words and shove it onto the |
554 |
|
# keyword list. |
555 |
|
push @keywords, split(/\s+/, $assignment); |
556 |
|
# Link this feature to the parent genome. |
557 |
|
$loadHasFeature->Put($genomeID, $featureID, $type); |
558 |
# Get the links. |
# Get the links. |
559 |
my @links = $fig->fid_links($featureID); |
my @links = $fig->fid_links($featureID); |
560 |
for my $link (@links) { |
for my $link (@links) { |
573 |
$loadFeatureUpstream->Put($featureID, $upstream); |
$loadFeatureUpstream->Put($featureID, $upstream); |
574 |
} |
} |
575 |
} |
} |
576 |
|
# Now we need to find the subsystems this feature participates in. |
577 |
|
# We also add the subsystems to the keyword list. Before we do that, |
578 |
|
# we must convert underscores to spaces and tack on the classifications. |
579 |
|
my @subsystems = $fig->peg_to_subsystems($featureID); |
580 |
|
for my $subsystem (@subsystems) { |
581 |
|
# Only proceed if we like this subsystem. |
582 |
|
if (exists $subHash->{$subsystem}) { |
583 |
|
# Store the has-role link. |
584 |
|
$loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type); |
585 |
|
# Save the subsystem's keyword data. |
586 |
|
my $subKeywords = $subHash->{$subsystem}; |
587 |
|
push @keywords, split /\s+/, $subKeywords; |
588 |
|
# Now we need to get this feature's role in the subsystem. |
589 |
|
my $subObject = $fig->get_subsystem($subsystem); |
590 |
|
my @roleColumns = $subObject->get_peg_roles($featureID); |
591 |
|
my @allRoles = $subObject->get_roles(); |
592 |
|
for my $col (@roleColumns) { |
593 |
|
my $role = $allRoles[$col]; |
594 |
|
push @keywords, split /\s+/, $role; |
595 |
|
push @keywords, $subObject->get_role_abbr($col); |
596 |
|
} |
597 |
|
} |
598 |
|
} |
599 |
|
# There are three special attributes computed from property |
600 |
|
# data that we build next. If the special attribute is non-empty, |
601 |
|
# its name will be added to the keyword list. First, we get all |
602 |
|
# the attributes for this feature. They will come back as |
603 |
|
# 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead: |
604 |
|
# [name, value, value with URL]. (We don't need the PEG, since |
605 |
|
# we already know it.) |
606 |
|
my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] } |
607 |
|
@{$attributes->{$featureID}}; |
608 |
|
# Now we process each of the special attributes. |
609 |
|
if (SpecialAttribute($featureID, \@attributes, |
610 |
|
1, [0,2], '^(essential|potential_essential)$', |
611 |
|
$loadFeatureEssential)) { |
612 |
|
push @keywords, 'essential'; |
613 |
|
$loadFeature->Add('essential'); |
614 |
|
} |
615 |
|
if (SpecialAttribute($featureID, \@attributes, |
616 |
|
0, [2], '^virulen', |
617 |
|
$loadFeatureVirulent)) { |
618 |
|
push @keywords, 'virulent'; |
619 |
|
$loadFeature->Add('virulent'); |
620 |
|
} |
621 |
|
if (SpecialAttribute($featureID, \@attributes, |
622 |
|
0, [0,2], '^iedb_', |
623 |
|
$loadFeatureIEDB)) { |
624 |
|
push @keywords, 'iedb'; |
625 |
|
$loadFeature->Add('iedb'); |
626 |
|
} |
627 |
|
# Now we need to bust up hyphenated words in the keyword |
628 |
|
# list. We keep them separate and put them at the end so |
629 |
|
# the original word order is available. |
630 |
|
my $keywordString = ""; |
631 |
|
my $bustedString = ""; |
632 |
|
for my $keyword (@keywords) { |
633 |
|
if (length $keyword >= 3) { |
634 |
|
$keywordString .= " $keyword"; |
635 |
|
if ($keyword =~ /-/) { |
636 |
|
my @words = split /-/, $keyword; |
637 |
|
$bustedString .= join(" ", "", @words); |
638 |
|
} |
639 |
|
} |
640 |
|
} |
641 |
|
$keywordString .= $bustedString; |
642 |
|
# Get rid of annoying punctuation. |
643 |
|
$keywordString =~ s/[();]//g; |
644 |
|
# Clean the keyword list. |
645 |
|
my $cleanWords = $sprout->CleanKeywords($keywordString); |
646 |
|
Trace("Keyword string for $featureID: $cleanWords") if T(4); |
647 |
|
# Create the feature record. |
648 |
|
$loadFeature->Put($featureID, 1, $type, $assignment, $cleanWords); |
649 |
# This part is the roughest. We need to relate the features to contig |
# This part is the roughest. We need to relate the features to contig |
650 |
# locations, and the locations must be split so that none of them exceed |
# locations, and the locations must be split so that none of them exceed |
651 |
# the maximum segment size. This simplifies the genes_in_region processing |
# the maximum segment size. This simplifies the genes_in_region processing |
681 |
return $retVal; |
return $retVal; |
682 |
} |
} |
683 |
|
|
|
=head3 LoadBBHData |
|
|
|
|
|
C<< my $stats = $spl->LoadBBHData(); >> |
|
|
|
|
|
Load the bidirectional best hit data from FIG into Sprout. |
|
|
|
|
|
Sprout does not store information on similarities. Instead, it has only the |
|
|
bi-directional best hits. Even so, the BBH table is one of the largest in |
|
|
the database. |
|
|
|
|
|
The following relations are loaded by this method. |
|
|
|
|
|
IsBidirectionalBestHitOf |
|
|
|
|
|
=over 4 |
|
|
|
|
|
=item RETURNS |
|
|
|
|
|
Returns a statistics object for the loads. |
|
|
|
|
|
=back |
|
|
|
|
|
=cut |
|
|
#: Return Type $%; |
|
|
sub LoadBBHData { |
|
|
# Get this object instance. |
|
|
my ($self) = @_; |
|
|
# Get the FIG object. |
|
|
my $fig = $self->{fig}; |
|
|
# Get the table of genome IDs. |
|
|
my $genomeHash = $self->{genomes}; |
|
|
# Create load objects for each of the tables we're loading. |
|
|
my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf'); |
|
|
if ($self->{options}->{loadOnly}) { |
|
|
Trace("Loading from existing files.") if T(2); |
|
|
} else { |
|
|
Trace("Generating BBH data.") if T(2); |
|
|
# Now we loop through the genomes, generating the data for each one. |
|
|
for my $genomeID (sort keys %{$genomeHash}) { |
|
|
$loadIsBidirectionalBestHitOf->Add("genomeIn"); |
|
|
Trace("Processing features for genome $genomeID.") if T(3); |
|
|
# Get the feature list for this genome. |
|
|
my $features = $fig->all_features_detailed($genomeID); |
|
|
# Loop through the features. |
|
|
for my $featureData (@{$features}) { |
|
|
# Split the tuple. |
|
|
my ($featureID, $locations, $aliases, $type) = @{$featureData}; |
|
|
# Get the bi-directional best hits. |
|
|
my @bbhList = $fig->bbhs($featureID); |
|
|
for my $bbhEntry (@bbhList) { |
|
|
# Get the target feature ID and the score. |
|
|
my ($targetID, $score) = @{$bbhEntry}; |
|
|
# Check the target feature's genome. |
|
|
my $targetGenomeID = $fig->genome_of($targetID); |
|
|
# Only proceed if it's one of our genomes. |
|
|
if ($genomeHash->{$targetGenomeID}) { |
|
|
$loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID, |
|
|
$score); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
# Finish the loads. |
|
|
my $retVal = $self->_FinishAll(); |
|
|
return $retVal; |
|
|
} |
|
|
|
|
684 |
=head3 LoadSubsystemData |
=head3 LoadSubsystemData |
685 |
|
|
686 |
C<< my $stats = $spl->LoadSubsystemData(); >> |
C<< my $stats = $spl->LoadSubsystemData(); >> |
785 |
my $curator = $sub->get_curator(); |
my $curator = $sub->get_curator(); |
786 |
my $notes = $sub->get_notes(); |
my $notes = $sub->get_notes(); |
787 |
$loadSubsystem->Put($subsysID, $curator, $notes); |
$loadSubsystem->Put($subsysID, $curator, $notes); |
788 |
my $class = $fig->subsystem_classification($subsysID); |
# Now for the classification string. This comes back as a list |
789 |
if ($class) { |
# reference and we convert it to a space-delimited string. |
790 |
$loadSubsystemClass->Put($subsysID, $class); |
my $classList = $fig->subsystem_classification($subsysID); |
791 |
} |
my $classString = join($FIG_Config::splitter, grep { $_ } @$classList); |
792 |
|
$loadSubsystemClass->Put($subsysID, $classString); |
793 |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
794 |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
795 |
# Connect to this role. |
# Connect to this role. |
992 |
my %propertyKeys = (); |
my %propertyKeys = (); |
993 |
my $nextID = 1; |
my $nextID = 1; |
994 |
# Loop through the genomes. |
# Loop through the genomes. |
995 |
for my $genomeID (keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
996 |
$loadProperty->Add("genomeIn"); |
$loadProperty->Add("genomeIn"); |
997 |
Trace("Generating properties for $genomeID.") if T(3); |
Trace("Generating properties for $genomeID.") if T(3); |
998 |
# Get the genome's features. The feature ID is the first field in the |
# Get the genome's features. The feature ID is the first field in the |
1001 |
my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)}; |
my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)}; |
1002 |
my $featureCount = 0; |
my $featureCount = 0; |
1003 |
my $propertyCount = 0; |
my $propertyCount = 0; |
1004 |
|
# Get the properties for this genome's features. |
1005 |
|
my $attributes = GetGenomeAttributes($fig, $genomeID, \@features); |
1006 |
|
Trace("Property hash built for $genomeID.") if T(3); |
1007 |
# Loop through the features, creating HasProperty records. |
# Loop through the features, creating HasProperty records. |
1008 |
for my $fid (@features) { |
for my $fid (@features) { |
1009 |
# Get all attributes for this feature. We do this one feature at a time |
# Get all attributes for this feature. We do this one feature at a time |
1010 |
# to insure we do not get any genome attributes. |
# to insure we do not get any genome attributes. |
1011 |
my @attributeList = $fig->get_attributes($fid, '', '', ''); |
my @attributeList = @{$attributes->{$fid}}; |
1012 |
if (scalar @attributeList) { |
if (scalar @attributeList) { |
1013 |
$featureCount++; |
$featureCount++; |
1014 |
} |
} |
1421 |
|
|
1422 |
GenomeGroups |
GenomeGroups |
1423 |
|
|
1424 |
There is no direct support for genome groups in FIG, so we access the SEED |
Currently, we do not use groups. We used to use them for NMPDR groups, |
1425 |
|
butThere is no direct support for genome groups in FIG, so we access the SEED |
1426 |
files directly. |
files directly. |
1427 |
|
|
1428 |
=over 4 |
=over 4 |
1448 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
1449 |
} else { |
} else { |
1450 |
Trace("Generating group data.") if T(2); |
Trace("Generating group data.") if T(2); |
1451 |
# Loop through the genomes. |
# Currently there are no groups. |
|
my $line; |
|
|
for my $genomeID (keys %{$genomeHash}) { |
|
|
Trace("Processing $genomeID.") if T(3); |
|
|
# Open the NMPDR group file for this genome. |
|
|
if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") && |
|
|
defined($line = <TMP>)) { |
|
|
# Clean the line ending. |
|
|
chomp $line; |
|
|
# Add the group to the table. Note that there can only be one group |
|
|
# per genome. |
|
|
$loadGenomeGroups->Put($genomeID, $line); |
|
|
} |
|
|
close TMP; |
|
|
} |
|
1452 |
} |
} |
1453 |
# Finish the load. |
# Finish the load. |
1454 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
1544 |
The following relations are loaded by this method. |
The following relations are loaded by this method. |
1545 |
|
|
1546 |
Family |
Family |
1547 |
ContainsFeature |
IsFamilyForFeature |
1548 |
|
|
1549 |
The source information for these relations is taken from the C<families_for_protein>, |
The source information for these relations is taken from the C<families_for_protein>, |
1550 |
C<family_function>, and C<sz_family> methods of the B<FIG> object. |
C<family_function>, and C<sz_family> methods of the B<FIG> object. |
1568 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
1569 |
# Create load objects for the tables we're loading. |
# Create load objects for the tables we're loading. |
1570 |
my $loadFamily = $self->_TableLoader('Family'); |
my $loadFamily = $self->_TableLoader('Family'); |
1571 |
my $loadContainsFeature = $self->_TableLoader('ContainsFeature'); |
my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature'); |
1572 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
1573 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
1574 |
} else { |
} else { |
1580 |
Trace("Processing features for $genomeID.") if T(2); |
Trace("Processing features for $genomeID.") if T(2); |
1581 |
# Loop through this genome's PEGs. |
# Loop through this genome's PEGs. |
1582 |
for my $fid ($fig->all_features($genomeID, "peg")) { |
for my $fid ($fig->all_features($genomeID, "peg")) { |
1583 |
$loadContainsFeature->Add("features", 1); |
$loadIsFamilyForFeature->Add("features", 1); |
1584 |
# Get this feature's families. |
# Get this feature's families. |
1585 |
my @families = $fig->families_for_protein($fid); |
my @families = $fig->families_for_protein($fid); |
1586 |
# Loop through the families, connecting them to the feature. |
# Loop through the families, connecting them to the feature. |
1587 |
for my $family (@families) { |
for my $family (@families) { |
1588 |
$loadContainsFeature->Put($family, $fid); |
$loadIsFamilyForFeature->Put($family, $fid); |
1589 |
# If this is a new family, create a record for it. |
# If this is a new family, create a record for it. |
1590 |
if (! exists $familyHash{$family}) { |
if (! exists $familyHash{$family}) { |
1591 |
$familyHash{$family} = 1; |
$familyHash{$family} = 1; |
1603 |
return $retVal; |
return $retVal; |
1604 |
} |
} |
1605 |
|
|
1606 |
|
=head3 LoadDrugData |
1607 |
|
|
1608 |
|
C<< my $stats = $spl->LoadDrugData(); >> |
1609 |
|
|
1610 |
|
Load the drug target data into Sprout. |
1611 |
|
|
1612 |
|
The following relations are loaded by this method. |
1613 |
|
|
1614 |
|
DrugProject |
1615 |
|
ContainsTopic |
1616 |
|
DrugTopic |
1617 |
|
ContainsAnalysisOf |
1618 |
|
PDB |
1619 |
|
IncludesBound |
1620 |
|
IsBoundIn |
1621 |
|
BindsWith |
1622 |
|
Ligand |
1623 |
|
DescribesProteinForFeature |
1624 |
|
FeatureConservation |
1625 |
|
|
1626 |
|
The source information for these relations is taken from flat files in the |
1627 |
|
C<$FIG_Config::drug_directory>. The file C<master_tables.list> contains |
1628 |
|
a list of drug project names paired with file names. The named file (in the |
1629 |
|
same directory) contains all the data for the project. |
1630 |
|
|
1631 |
|
=over 4 |
1632 |
|
|
1633 |
|
=item RETURNS |
1634 |
|
|
1635 |
|
Returns a statistics object for the loads. |
1636 |
|
|
1637 |
|
=back |
1638 |
|
|
1639 |
|
=cut |
1640 |
|
#: Return Type $%; |
1641 |
|
sub LoadDrugData { |
1642 |
|
# Get this object instance. |
1643 |
|
my ($self) = @_; |
1644 |
|
# Get the FIG object. |
1645 |
|
my $fig = $self->{fig}; |
1646 |
|
# Get the genome hash. |
1647 |
|
my $genomeHash = $self->{genomes}; |
1648 |
|
# Create load objects for the tables we're loading. |
1649 |
|
my $loadDrugProject = $self->_TableLoader('DrugProject'); |
1650 |
|
my $loadContainsTopic = $self->_TableLoader('ContainsTopic'); |
1651 |
|
my $loadDrugTopic = $self->_TableLoader('DrugTopic'); |
1652 |
|
my $loadContainsAnalysisOf = $self->_TableLoader('ContainsAnalysisOf'); |
1653 |
|
my $loadPDB = $self->_TableLoader('PDB'); |
1654 |
|
my $loadIncludesBound = $self->_TableLoader('IncludesBound'); |
1655 |
|
my $loadIsBoundIn = $self->_TableLoader('IsBoundIn'); |
1656 |
|
my $loadBindsWith = $self->_TableLoader('BindsWith'); |
1657 |
|
my $loadLigand = $self->_TableLoader('Ligand'); |
1658 |
|
my $loadDescribesProteinForFeature = $self->_TableLoader('DescribesProteinForFeature'); |
1659 |
|
my $loadFeatureConservation = $self->_TableLoader('FeatureConservation'); |
1660 |
|
if ($self->{options}->{loadOnly}) { |
1661 |
|
Trace("Loading from existing files.") if T(2); |
1662 |
|
} else { |
1663 |
|
Trace("Generating drug target data.") if T(2); |
1664 |
|
# Load the project list. The file comes in as a list of chomped lines, |
1665 |
|
# and we split them on the TAB character to make the project name the |
1666 |
|
# key and the file name the value of the resulting hash. |
1667 |
|
my %projects = map { split /\t/, $_ } Tracer::GetFile("$FIG_Config::drug_directory/master_tables.list"); |
1668 |
|
# Create hashes for the derived objects: PDBs, Features, and Ligands. These objects |
1669 |
|
# may occur multiple times in a single project file or even in multiple project |
1670 |
|
# files. |
1671 |
|
my %ligands = (); |
1672 |
|
my %pdbs = (); |
1673 |
|
my %features = (); |
1674 |
|
my %bindings = (); |
1675 |
|
# Set up a counter for drug topics. This will be used as the key. |
1676 |
|
my $topicCounter = 0; |
1677 |
|
# Loop through the projects. We sort the keys not because we need them sorted, but |
1678 |
|
# because it makes it easier to infer our progress from trace messages. |
1679 |
|
for my $project (sort keys %projects) { |
1680 |
|
Trace("Processing project $project.") if T(3); |
1681 |
|
# Only proceed if the download file exists. |
1682 |
|
my $projectFile = "$FIG_Config::drug_directory/$projects{$project}"; |
1683 |
|
if (! -f $projectFile) { |
1684 |
|
Trace("Project file $projectFile not found.") if T(0); |
1685 |
|
} else { |
1686 |
|
# Create the project record. |
1687 |
|
$loadDrugProject->Put($project); |
1688 |
|
# Create a hash for the topics. Each project has one or more topics. The |
1689 |
|
# topic is identified by a URL, a category, and an identifier. |
1690 |
|
my %topics = (); |
1691 |
|
# Now we can open the project file. |
1692 |
|
Trace("Reading project file $projectFile.") if T(3); |
1693 |
|
Open(\*PROJECT, "<$projectFile"); |
1694 |
|
# Get the first record, which is a list of column headers. We don't use this |
1695 |
|
# for anything, but it may be useful for debugging. |
1696 |
|
my $headerLine = <PROJECT>; |
1697 |
|
# Loop through the rest of the records. |
1698 |
|
while (! eof PROJECT) { |
1699 |
|
# Get the current line of data. Note that not all lines will have all |
1700 |
|
# the fields. In particular, the CLIBE data is fairly rare. |
1701 |
|
my ($authorOrganism, $category, $tag, $refURL, $peg, $conservation, |
1702 |
|
$pdbBound, $pdbBoundEval, $pdbFree, $pdbFreeEval, $pdbFreeTitle, |
1703 |
|
$protDistInfo, $passAspInfo, $passAspFile, $passWeightInfo, |
1704 |
|
$passWeightFile, $clibeInfo, $clibeURL, $clibeTotalEnergy, |
1705 |
|
$clibeVanderwaals, $clibeHBonds, $clibeEI, $clibeSolvationE) |
1706 |
|
= Tracer::GetLine(\*PROJECT); |
1707 |
|
# The tag contains an identifier for the current line of data followed |
1708 |
|
# by a text statement that generally matches a property name in the |
1709 |
|
# main database. We split it up, since the identifier goes with |
1710 |
|
# the PDB data and the text statement is part of the topic. |
1711 |
|
my ($lineID, $topicTag) = split /\s*,\s*/, $tag; |
1712 |
|
$loadDrugProject->Add("data line"); |
1713 |
|
# Check for a new topic. |
1714 |
|
my $topicData = "$category\t$topicTag\t$refURL"; |
1715 |
|
if (! exists $topics{$topicData}) { |
1716 |
|
# Here we have a new topic. Compute its ID. |
1717 |
|
$topicCounter++; |
1718 |
|
$topics{$topicData} = $topicCounter; |
1719 |
|
# Create its database record. |
1720 |
|
$loadDrugTopic->Put($topicCounter, $refURL, $category, $authorOrganism, |
1721 |
|
$topicTag); |
1722 |
|
# Connect it to the project. |
1723 |
|
$loadContainsTopic->Put($project, $topicCounter); |
1724 |
|
$loadDrugTopic->Add("topic"); |
1725 |
|
} |
1726 |
|
# Now we know the topic ID exists in the hash and the topic will |
1727 |
|
# appear in the database, so we get this topic's ID. |
1728 |
|
my $topicID = $topics{$topicData}; |
1729 |
|
# If the feature in this line is new, we need to save its conservation |
1730 |
|
# number. |
1731 |
|
if (! exists $features{$peg}) { |
1732 |
|
$loadFeatureConservation->Put($peg, $conservation); |
1733 |
|
$features{$peg} = 1; |
1734 |
|
} |
1735 |
|
# Now we have two PDBs to deal with-- a bound PDB and a free PDB. |
1736 |
|
# The free PDB will have data about docking points; the bound PDB |
1737 |
|
# will have data about docking. We store both types as PDBs, and |
1738 |
|
# the special data comes from relationships. First we process the |
1739 |
|
# bound PDB. |
1740 |
|
if ($pdbBound) { |
1741 |
|
$loadPDB->Add("bound line"); |
1742 |
|
# Insure this PDB is in the database. |
1743 |
|
$self->CreatePDB($pdbBound, lc "$pdbFreeTitle (bound)", "bound", \%pdbs, $loadPDB); |
1744 |
|
# Connect it to this topic. |
1745 |
|
$loadIncludesBound->Put($topicID, $pdbBound); |
1746 |
|
# Check for CLIBE data. |
1747 |
|
if ($clibeInfo) { |
1748 |
|
$loadLigand->Add("clibes"); |
1749 |
|
# We have CLIBE data, so we create a ligand and relate it to the PDB. |
1750 |
|
if (! exists $ligands{$clibeInfo}) { |
1751 |
|
# This is a new ligand, so create its record. |
1752 |
|
$loadLigand->Put($clibeInfo); |
1753 |
|
$loadLigand->Add("ligand"); |
1754 |
|
# Make sure we know this ligand already exists. |
1755 |
|
$ligands{$clibeInfo} = 1; |
1756 |
|
} |
1757 |
|
# Now connect the PDB to the ligand using the CLIBE data. |
1758 |
|
$loadBindsWith->Put($pdbBound, $clibeInfo, $clibeURL, $clibeHBonds, $clibeEI, |
1759 |
|
$clibeSolvationE, $clibeVanderwaals); |
1760 |
|
} |
1761 |
|
# Connect this PDB to the feature. |
1762 |
|
$loadDescribesProteinForFeature->Put($pdbBound, $peg, $protDistInfo, $pdbBoundEval); |
1763 |
|
} |
1764 |
|
# Next is the free PDB. |
1765 |
|
if ($pdbFree) { |
1766 |
|
$loadPDB->Add("free line"); |
1767 |
|
# Insure this PDB is in the database. |
1768 |
|
$self->CreatePDB($pdbFree, lc $pdbFreeTitle, "free", \%pdbs, $loadPDB); |
1769 |
|
# Connect it to this topic. |
1770 |
|
$loadContainsAnalysisOf->Put($topicID, $pdbFree, $passAspInfo, |
1771 |
|
$passWeightFile, $passWeightInfo, $passAspFile); |
1772 |
|
# Connect this PDB to the feature. |
1773 |
|
$loadDescribesProteinForFeature->Put($pdbFree, $peg, $protDistInfo, $pdbFreeEval); |
1774 |
|
} |
1775 |
|
# If we have both PDBs, we may need to link them. |
1776 |
|
if ($pdbFree && $pdbBound) { |
1777 |
|
$loadIsBoundIn->Add("connection"); |
1778 |
|
# Insure we only link them once. |
1779 |
|
my $bindingKey = "$pdbFree\t$pdbBound"; |
1780 |
|
if (! exists $bindings{$bindingKey}) { |
1781 |
|
$loadIsBoundIn->Add("newConnection"); |
1782 |
|
$loadIsBoundIn->Put($pdbFree, $pdbBound); |
1783 |
|
$bindings{$bindingKey} = 1; |
1784 |
|
} |
1785 |
|
} |
1786 |
|
} |
1787 |
|
# Close off this project. |
1788 |
|
close PROJECT; |
1789 |
|
} |
1790 |
|
} |
1791 |
|
} |
1792 |
|
# Finish the load. |
1793 |
|
my $retVal = $self->_FinishAll(); |
1794 |
|
return $retVal; |
1795 |
|
} |
1796 |
|
|
1797 |
|
|
1798 |
=head2 Internal Utility Methods |
=head2 Internal Utility Methods |
1799 |
|
|
1800 |
|
=head3 SpecialAttribute |
1801 |
|
|
1802 |
|
C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >> |
1803 |
|
|
1804 |
|
Look for special attributes of a given type. A special attribute is found by comparing one of |
1805 |
|
the columns of the incoming attribute list to a search pattern. If a match is found, then |
1806 |
|
a set of columns is put into an output table connected to the specified ID. |
1807 |
|
|
1808 |
|
For example, when processing features, the attribute list we look at has three columns: attribute |
1809 |
|
name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name |
1810 |
|
begins with C<iedb_>. The call signature is therefore |
1811 |
|
|
1812 |
|
my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB); |
1813 |
|
|
1814 |
|
The pattern is matched against column 0, and if we have a match, then column 2's value is put |
1815 |
|
to the output along with the specified feature ID. |
1816 |
|
|
1817 |
|
=over 4 |
1818 |
|
|
1819 |
|
=item id |
1820 |
|
|
1821 |
|
ID of the object whose special attributes are being loaded. This forms the first column of the |
1822 |
|
output. |
1823 |
|
|
1824 |
|
=item attributes |
1825 |
|
|
1826 |
|
Reference to a list of tuples. |
1827 |
|
|
1828 |
|
=item idxMatch |
1829 |
|
|
1830 |
|
Index in each tuple of the column to be matched against the pattern. If the match is |
1831 |
|
successful, an output record will be generated. |
1832 |
|
|
1833 |
|
=item idxValues |
1834 |
|
|
1835 |
|
Reference to a list containing the indexes in each tuple of the columns to be put as |
1836 |
|
the second column of the output. |
1837 |
|
|
1838 |
|
=item pattern |
1839 |
|
|
1840 |
|
Pattern to be matched against the specified column. The match will be case-insensitive. |
1841 |
|
|
1842 |
|
=item loader |
1843 |
|
|
1844 |
|
An object to which each output record will be put. Usually this is an B<ERDBLoad> object, |
1845 |
|
but technically it could be anything with a C<Put> method. |
1846 |
|
|
1847 |
|
=item RETURN |
1848 |
|
|
1849 |
|
Returns a count of the matches found. |
1850 |
|
|
1851 |
|
=item |
1852 |
|
|
1853 |
|
=back |
1854 |
|
|
1855 |
|
=cut |
1856 |
|
|
1857 |
|
sub SpecialAttribute { |
1858 |
|
# Get the parameters. |
1859 |
|
my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_; |
1860 |
|
# Declare the return variable. |
1861 |
|
my $retVal = 0; |
1862 |
|
# Loop through the attribute rows. |
1863 |
|
for my $row (@{$attributes}) { |
1864 |
|
# Check for a match. |
1865 |
|
if ($row->[$idxMatch] =~ m/$pattern/i) { |
1866 |
|
# We have a match, so output a row. This is a bit tricky, since we may |
1867 |
|
# be putting out multiple columns of data from the input. |
1868 |
|
my $value = join(" ", map { $row->[$_] } @{$idxValues}); |
1869 |
|
$loader->Put($id, $value); |
1870 |
|
$retVal++; |
1871 |
|
} |
1872 |
|
} |
1873 |
|
Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".") if T(4) && $retVal; |
1874 |
|
# Return the number of matches. |
1875 |
|
return $retVal; |
1876 |
|
} |
1877 |
|
|
1878 |
|
=head3 CreatePDB |
1879 |
|
|
1880 |
|
C<< $loader->CreatePDB($pdbID, $title, $type, \%pdbHash); >> |
1881 |
|
|
1882 |
|
Insure that a PDB record exists for the identified PDB. If one does not exist, it will be |
1883 |
|
created. |
1884 |
|
|
1885 |
|
=over 4 |
1886 |
|
|
1887 |
|
=item pdbID |
1888 |
|
|
1889 |
|
ID string (usually an unqualified file name) for the desired PDB. |
1890 |
|
|
1891 |
|
=item title |
1892 |
|
|
1893 |
|
Title to use if the PDB must be created. |
1894 |
|
|
1895 |
|
=item type |
1896 |
|
|
1897 |
|
Type of PDB: C<free> or C<bound> |
1898 |
|
|
1899 |
|
=item pdbHash |
1900 |
|
|
1901 |
|
Hash containing the IDs of PDBs that have already been created. |
1902 |
|
|
1903 |
|
=item pdbLoader |
1904 |
|
|
1905 |
|
Load object for the PDB table. |
1906 |
|
|
1907 |
|
=back |
1908 |
|
|
1909 |
|
=cut |
1910 |
|
|
1911 |
|
sub CreatePDB { |
1912 |
|
# Get the parameters. |
1913 |
|
my ($self, $pdbID, $title, $type, $pdbHash, $pdbLoader) = @_; |
1914 |
|
$pdbLoader->Add("PDB check"); |
1915 |
|
# Check to see if this is a new PDB. |
1916 |
|
if (! exists $pdbHash->{$pdbID}) { |
1917 |
|
# It is, so we create it. |
1918 |
|
$pdbLoader->Put($pdbID, $title, $type); |
1919 |
|
$pdbHash->{$pdbID} = 1; |
1920 |
|
# Count it. |
1921 |
|
$pdbLoader->Add("PDB-$type"); |
1922 |
|
} |
1923 |
|
} |
1924 |
|
|
1925 |
=head3 TableLoader |
=head3 TableLoader |
1926 |
|
|
1927 |
Create an ERDBLoad object for the specified table. The object is also added to |
Create an ERDBLoad object for the specified table. The object is also added to |
2024 |
# Return the load statistics. |
# Return the load statistics. |
2025 |
return $retVal; |
return $retVal; |
2026 |
} |
} |
2027 |
|
=head3 GetGenomeAttributes |
2028 |
|
|
2029 |
|
C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids); >> |
2030 |
|
|
2031 |
|
Return a hash of attributes keyed on feature ID. This method gets all the attributes |
2032 |
|
for all the features of a genome in a single call, then organizes them into a hash. |
2033 |
|
|
2034 |
|
=over 4 |
2035 |
|
|
2036 |
|
=item fig |
2037 |
|
|
2038 |
|
FIG-like object for accessing attributes. |
2039 |
|
|
2040 |
|
=item genomeID |
2041 |
|
|
2042 |
|
ID of the genome who's attributes are desired. |
2043 |
|
|
2044 |
|
=item fids |
2045 |
|
|
2046 |
|
Reference to a list of the feature IDs whose attributes are to be kept. |
2047 |
|
|
2048 |
|
=item RETURN |
2049 |
|
|
2050 |
|
Returns a reference to a hash. The key of the hash is the feature ID. The value is the |
2051 |
|
reference to a list of the feature's attribute tuples. Each tuple contains the feature ID, |
2052 |
|
the attribute key, and one or more attribute values. |
2053 |
|
|
2054 |
|
=back |
2055 |
|
|
2056 |
|
=cut |
2057 |
|
|
2058 |
|
sub GetGenomeAttributes { |
2059 |
|
# Get the parameters. |
2060 |
|
my ($fig, $genomeID, $fids) = @_; |
2061 |
|
# Declare the return variable. |
2062 |
|
my $retVal = {}; |
2063 |
|
# Get the attributes. |
2064 |
|
my @aList = $fig->get_attributes("fig|$genomeID%"); |
2065 |
|
# Initialize the hash. This not only enables us to easily determine which FIDs to |
2066 |
|
# keep, it insures that the caller sees a list reference for every known fid, |
2067 |
|
# simplifying the logic. |
2068 |
|
for my $fid (@{$fids}) { |
2069 |
|
$retVal->{$fid} = []; |
2070 |
|
} |
2071 |
|
# Populate the hash. |
2072 |
|
for my $aListEntry (@aList) { |
2073 |
|
my $fid = $aListEntry->[0]; |
2074 |
|
if (exists $retVal->{$fid}) { |
2075 |
|
push @{$retVal->{$fid}}, $aListEntry; |
2076 |
|
} |
2077 |
|
} |
2078 |
|
# Return the result. |
2079 |
|
return $retVal; |
2080 |
|
} |
2081 |
|
|
2082 |
1; |
1; |