120 |
# an omitted access code can be defaulted to 1. |
# an omitted access code can be defaulted to 1. |
121 |
for my $genomeLine (@genomeList) { |
for my $genomeLine (@genomeList) { |
122 |
my ($genomeID, $accessCode) = split("\t", $genomeLine); |
my ($genomeID, $accessCode) = split("\t", $genomeLine); |
123 |
if (undef $accessCode) { |
if (! defined($accessCode)) { |
124 |
$accessCode = 1; |
$accessCode = 1; |
125 |
} |
} |
126 |
$genomes{$genomeID} = $accessCode; |
$genomes{$genomeID} = $accessCode; |
163 |
Confess("Invalid subsystem parameter in SproutLoad constructor."); |
Confess("Invalid subsystem parameter in SproutLoad constructor."); |
164 |
} |
} |
165 |
} |
} |
166 |
|
# Go through the subsys hash again, creating the keyword list for each subsystem. |
167 |
|
for my $subsystem (keys %subsystems) { |
168 |
|
my $name = $subsystem; |
169 |
|
$name =~ s/_/ /g; |
170 |
|
my $classes = $fig->subsystem_classification($subsystem); |
171 |
|
my @classList = map { " $_" } @{$classes}; |
172 |
|
$name .= join("", @classList); |
173 |
|
$subsystems{$subsystem} = $name; |
174 |
|
} |
175 |
} |
} |
176 |
# Get the data directory from the Sprout object. |
# Get the data directory from the Sprout object. |
177 |
my ($directory) = $sprout->LoadInfo(); |
my ($directory) = $sprout->LoadInfo(); |
275 |
my $extra = join " ", @extraData; |
my $extra = join " ", @extraData; |
276 |
# Get the full taxonomy. |
# Get the full taxonomy. |
277 |
my $taxonomy = $fig->taxonomy_of($genomeID); |
my $taxonomy = $fig->taxonomy_of($genomeID); |
278 |
|
# Open the NMPDR group file for this genome. |
279 |
|
my $group; |
280 |
|
if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") && |
281 |
|
defined($group = <TMP>)) { |
282 |
|
# Clean the line ending. |
283 |
|
chomp $group; |
284 |
|
} else { |
285 |
|
# No group, so use the default. |
286 |
|
$group = $FIG_Config::otherGroup; |
287 |
|
} |
288 |
|
close TMP; |
289 |
# Output the genome record. |
# Output the genome record. |
290 |
$loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus, |
$loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus, |
291 |
$species, $extra, $taxonomy); |
$group, $species, $extra, $taxonomy); |
292 |
# Now we loop through each of the genome's contigs. |
# Now we loop through each of the genome's contigs. |
293 |
my @contigs = $fig->all_contigs($genomeID); |
my @contigs = $fig->all_contigs($genomeID); |
294 |
for my $contigID (@contigs) { |
for my $contigID (@contigs) { |
469 |
FeatureUpstream |
FeatureUpstream |
470 |
IsLocatedIn |
IsLocatedIn |
471 |
HasFeature |
HasFeature |
472 |
|
HasRoleInSubsystem |
473 |
|
|
474 |
=over 4 |
=over 4 |
475 |
|
|
484 |
sub LoadFeatureData { |
sub LoadFeatureData { |
485 |
# Get this object instance. |
# Get this object instance. |
486 |
my ($self) = @_; |
my ($self) = @_; |
487 |
# Get the FIG object. |
# Get the FIG and Sprout objects. |
488 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
489 |
|
my $sprout = $self->{sprout}; |
490 |
# Get the table of genome IDs. |
# Get the table of genome IDs. |
491 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
492 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
496 |
my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
497 |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); |
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); |
498 |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); |
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); |
499 |
my $loadHasFeature = $self->_TableLoader('HasFeature'); |
my $loadHasFeature = $self->_TableLoader('HasFeature', $self->PrimaryOnly); |
500 |
|
my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem', $self->PrimaryOnly); |
501 |
|
# Get the subsystem hash. |
502 |
|
my $subHash = $self->{subsystems}; |
503 |
# Get the maximum sequence size. We need this later for splitting up the |
# Get the maximum sequence size. We need this later for splitting up the |
504 |
# locations. |
# locations. |
505 |
my $chunkSize = $self->{sprout}->MaxSegment(); |
my $chunkSize = $self->{sprout}->MaxSegment(); |
530 |
$oldFeatureID = $featureID; |
$oldFeatureID = $featureID; |
531 |
# Count this feature. |
# Count this feature. |
532 |
$loadFeature->Add("featureIn"); |
$loadFeature->Add("featureIn"); |
533 |
# Create the feature record. |
# Begin building the keywords. |
534 |
$loadFeature->Put($featureID, 1, $type); |
my @keywords = ($genomeID); |
535 |
# Link it to the parent genome. |
# Get the functional assignment and aliases. This |
536 |
$loadHasFeature->Put($genomeID, $featureID, $type); |
# depends on the feature type. |
537 |
|
my $assignment; |
538 |
|
if ($type eq "peg") { |
539 |
|
$assignment = $fig->function_of($featureID); |
540 |
# Create the aliases. |
# Create the aliases. |
541 |
for my $alias ($fig->feature_aliases($featureID)) { |
for my $alias ($fig->feature_aliases($featureID)) { |
542 |
$loadFeatureAlias->Put($featureID, $alias); |
$loadFeatureAlias->Put($featureID, $alias); |
543 |
|
push @keywords, $alias; |
544 |
} |
} |
545 |
|
} else { |
546 |
|
# For other types, the assignment is the first (and ONLY) alias. |
547 |
|
($assignment) = $fig->feature_aliases($featureID); |
548 |
|
} |
549 |
|
Trace("Assignment for $featureID is: $assignment") if T(4); |
550 |
|
# Break the assignment into words and shove it onto the |
551 |
|
# keyword list. |
552 |
|
push @keywords, split(/\s+/, $assignment); |
553 |
|
# Link this feature to the parent genome. |
554 |
|
$loadHasFeature->Put($genomeID, $featureID, $type); |
555 |
# Get the links. |
# Get the links. |
556 |
my @links = $fig->fid_links($featureID); |
my @links = $fig->fid_links($featureID); |
557 |
for my $link (@links) { |
for my $link (@links) { |
570 |
$loadFeatureUpstream->Put($featureID, $upstream); |
$loadFeatureUpstream->Put($featureID, $upstream); |
571 |
} |
} |
572 |
} |
} |
573 |
|
# Now we need to find the subsystems this feature participates in. |
574 |
|
# We also add the subsystems to the keyword list. Before we do that, |
575 |
|
# we must convert underscores to spaces and tack on the classifications. |
576 |
|
my @subsystems = $fig->peg_to_subsystems($featureID); |
577 |
|
for my $subsystem (@subsystems) { |
578 |
|
# Only proceed if we like this subsystem. |
579 |
|
if (exists $subHash->{$subsystem}) { |
580 |
|
# Store the has-role link. |
581 |
|
$loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type); |
582 |
|
# Save the subsystem's keyword data. |
583 |
|
my $subKeywords = $subHash->{$subsystem}; |
584 |
|
push @keywords, split /\s+/, $subKeywords; |
585 |
|
# Now we need to get this feature's role in the subsystem. |
586 |
|
my $subObject = $fig->get_subsystem($subsystem); |
587 |
|
my @roleColumns = $subObject->get_peg_roles($featureID); |
588 |
|
my @allRoles = $subObject->get_roles(); |
589 |
|
for my $col (@roleColumns) { |
590 |
|
my $role = $allRoles[$col]; |
591 |
|
push @keywords, split /\s+/, $role; |
592 |
|
push @keywords, $subObject->get_role_abbr($col); |
593 |
|
} |
594 |
|
} |
595 |
|
} |
596 |
|
# The final task is to add virulence and essentiality attributes. |
597 |
|
if ($fig->virulent($featureID)) { |
598 |
|
push @keywords, "virulent"; |
599 |
|
} |
600 |
|
if ($fig->essential($featureID)) { |
601 |
|
push @keywords, "essential"; |
602 |
|
} |
603 |
|
# Now we need to bust up hyphenated words in the keyword |
604 |
|
# list. |
605 |
|
my $keywordString = ""; |
606 |
|
for my $keyword (@keywords) { |
607 |
|
if (length $keyword >= 4) { |
608 |
|
$keywordString .= " $keyword"; |
609 |
|
if ($keyword =~ /-/) { |
610 |
|
my @words = grep { length($_) >= 4 } split /-/, $keyword; |
611 |
|
$keywordString .= join(" ", "", @words); |
612 |
|
} |
613 |
|
} |
614 |
|
} |
615 |
|
# Clean the keyword list. |
616 |
|
my $cleanWords = $sprout->CleanKeywords($keywordString); |
617 |
|
Trace("Keyword string for $featureID: $cleanWords") if T(4); |
618 |
|
# Create the feature record. |
619 |
|
$loadFeature->Put($featureID, 1, $type, $assignment, $cleanWords); |
620 |
# This part is the roughest. We need to relate the features to contig |
# This part is the roughest. We need to relate the features to contig |
621 |
# locations, and the locations must be split so that none of them exceed |
# locations, and the locations must be split so that none of them exceed |
622 |
# the maximum segment size. This simplifies the genes_in_region processing |
# the maximum segment size. This simplifies the genes_in_region processing |
652 |
return $retVal; |
return $retVal; |
653 |
} |
} |
654 |
|
|
|
=head3 LoadBBHData |
|
|
|
|
|
C<< my $stats = $spl->LoadBBHData(); >> |
|
|
|
|
|
Load the bidirectional best hit data from FIG into Sprout. |
|
|
|
|
|
Sprout does not store information on similarities. Instead, it has only the |
|
|
bi-directional best hits. Even so, the BBH table is one of the largest in |
|
|
the database. |
|
|
|
|
|
The following relations are loaded by this method. |
|
|
|
|
|
IsBidirectionalBestHitOf |
|
|
|
|
|
=over 4 |
|
|
|
|
|
=item RETURNS |
|
|
|
|
|
Returns a statistics object for the loads. |
|
|
|
|
|
=back |
|
|
|
|
|
=cut |
|
|
#: Return Type $%; |
|
|
sub LoadBBHData { |
|
|
# Get this object instance. |
|
|
my ($self) = @_; |
|
|
# Get the FIG object. |
|
|
my $fig = $self->{fig}; |
|
|
# Get the table of genome IDs. |
|
|
my $genomeHash = $self->{genomes}; |
|
|
# Create load objects for each of the tables we're loading. |
|
|
my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf'); |
|
|
if ($self->{options}->{loadOnly}) { |
|
|
Trace("Loading from existing files.") if T(2); |
|
|
} else { |
|
|
Trace("Generating BBH data.") if T(2); |
|
|
# Now we loop through the genomes, generating the data for each one. |
|
|
for my $genomeID (sort keys %{$genomeHash}) { |
|
|
$loadIsBidirectionalBestHitOf->Add("genomeIn"); |
|
|
Trace("Processing features for genome $genomeID.") if T(3); |
|
|
# Get the feature list for this genome. |
|
|
my $features = $fig->all_features_detailed($genomeID); |
|
|
# Loop through the features. |
|
|
for my $featureData (@{$features}) { |
|
|
# Split the tuple. |
|
|
my ($featureID, $locations, $aliases, $type) = @{$featureData}; |
|
|
# Get the bi-directional best hits. |
|
|
my @bbhList = $fig->bbhs($featureID); |
|
|
for my $bbhEntry (@bbhList) { |
|
|
# Get the target feature ID and the score. |
|
|
my ($targetID, $score) = @{$bbhEntry}; |
|
|
# Check the target feature's genome. |
|
|
my $targetGenomeID = $fig->genome_of($targetID); |
|
|
# Only proceed if it's one of our genomes. |
|
|
if ($genomeHash->{$targetGenomeID}) { |
|
|
$loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID, |
|
|
$score); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
# Finish the loads. |
|
|
my $retVal = $self->_FinishAll(); |
|
|
return $retVal; |
|
|
} |
|
|
|
|
655 |
=head3 LoadSubsystemData |
=head3 LoadSubsystemData |
656 |
|
|
657 |
C<< my $stats = $spl->LoadSubsystemData(); >> |
C<< my $stats = $spl->LoadSubsystemData(); >> |
756 |
my $curator = $sub->get_curator(); |
my $curator = $sub->get_curator(); |
757 |
my $notes = $sub->get_notes(); |
my $notes = $sub->get_notes(); |
758 |
$loadSubsystem->Put($subsysID, $curator, $notes); |
$loadSubsystem->Put($subsysID, $curator, $notes); |
759 |
my $class = $fig->subsystem_classification($subsysID); |
# Now for the classification string. This comes back as a list |
760 |
if ($class) { |
# reference and we convert it to a space-delimited string. |
761 |
$loadSubsystemClass->Put($subsysID, $class); |
my $classList = $fig->subsystem_classification($subsysID); |
762 |
} |
my $classString = join(" : ", grep { $_ } @$classList); |
763 |
|
$loadSubsystemClass->Put($subsysID, $classString); |
764 |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
# Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
765 |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
766 |
# Connect to this role. |
# Connect to this role. |
963 |
my %propertyKeys = (); |
my %propertyKeys = (); |
964 |
my $nextID = 1; |
my $nextID = 1; |
965 |
# Loop through the genomes. |
# Loop through the genomes. |
966 |
for my $genomeID (keys %{$genomeHash}) { |
for my $genomeID (sort keys %{$genomeHash}) { |
967 |
$loadProperty->Add("genomeIn"); |
$loadProperty->Add("genomeIn"); |
968 |
Trace("Generating properties for $genomeID.") if T(3); |
Trace("Generating properties for $genomeID.") if T(3); |
969 |
# Get the genome's features. The feature ID is the first field in the |
# Get the genome's features. The feature ID is the first field in the |
977 |
# Get all attributes for this feature. We do this one feature at a time |
# Get all attributes for this feature. We do this one feature at a time |
978 |
# to insure we do not get any genome attributes. |
# to insure we do not get any genome attributes. |
979 |
my @attributeList = $fig->get_attributes($fid, '', '', ''); |
my @attributeList = $fig->get_attributes($fid, '', '', ''); |
980 |
|
# Add essentiality and virulence attributes. |
981 |
|
if ($fig->essential($fid)) { |
982 |
|
push @attributeList, [$fid, 'essential', 1, '']; |
983 |
|
} |
984 |
|
if ($fig->virulent($fid)) { |
985 |
|
push @attributeList, [$fid, 'virulent', 1, '']; |
986 |
|
} |
987 |
if (scalar @attributeList) { |
if (scalar @attributeList) { |
988 |
$featureCount++; |
$featureCount++; |
989 |
} |
} |
1396 |
|
|
1397 |
GenomeGroups |
GenomeGroups |
1398 |
|
|
1399 |
There is no direct support for genome groups in FIG, so we access the SEED |
Currently, we do not use groups. We used to use them for NMPDR groups, |
1400 |
|
butThere is no direct support for genome groups in FIG, so we access the SEED |
1401 |
files directly. |
files directly. |
1402 |
|
|
1403 |
=over 4 |
=over 4 |
1423 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
1424 |
} else { |
} else { |
1425 |
Trace("Generating group data.") if T(2); |
Trace("Generating group data.") if T(2); |
1426 |
# Loop through the genomes. |
# Currently there are no groups. |
|
my $line; |
|
|
for my $genomeID (keys %{$genomeHash}) { |
|
|
Trace("Processing $genomeID.") if T(3); |
|
|
# Open the NMPDR group file for this genome. |
|
|
if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") && |
|
|
defined($line = <TMP>)) { |
|
|
# Clean the line ending. |
|
|
chomp $line; |
|
|
# Add the group to the table. Note that there can only be one group |
|
|
# per genome. |
|
|
$loadGenomeGroups->Put($genomeID, $line); |
|
|
} |
|
|
close TMP; |
|
|
} |
|
1427 |
} |
} |
1428 |
# Finish the load. |
# Finish the load. |
1429 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
1519 |
The following relations are loaded by this method. |
The following relations are loaded by this method. |
1520 |
|
|
1521 |
Family |
Family |
1522 |
ContainsFeature |
IsFamilyForFeature |
1523 |
|
|
1524 |
The source information for these relations is taken from the C<families_for_protein>, |
The source information for these relations is taken from the C<families_for_protein>, |
1525 |
C<family_function>, and C<sz_family> methods of the B<FIG> object. |
C<family_function>, and C<sz_family> methods of the B<FIG> object. |
1543 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
1544 |
# Create load objects for the tables we're loading. |
# Create load objects for the tables we're loading. |
1545 |
my $loadFamily = $self->_TableLoader('Family'); |
my $loadFamily = $self->_TableLoader('Family'); |
1546 |
my $loadContainsFeature = $self->_TableLoader('ContainsFeature'); |
my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature'); |
1547 |
if ($self->{options}->{loadOnly}) { |
if ($self->{options}->{loadOnly}) { |
1548 |
Trace("Loading from existing files.") if T(2); |
Trace("Loading from existing files.") if T(2); |
1549 |
} else { |
} else { |
1555 |
Trace("Processing features for $genomeID.") if T(2); |
Trace("Processing features for $genomeID.") if T(2); |
1556 |
# Loop through this genome's PEGs. |
# Loop through this genome's PEGs. |
1557 |
for my $fid ($fig->all_features($genomeID, "peg")) { |
for my $fid ($fig->all_features($genomeID, "peg")) { |
1558 |
$loadContainsFeature->Add("features", 1); |
$loadIsFamilyForFeature->Add("features", 1); |
1559 |
# Get this feature's families. |
# Get this feature's families. |
1560 |
my @families = $fig->families_for_protein($fid); |
my @families = $fig->families_for_protein($fid); |
1561 |
# Loop through the families, connecting them to the feature. |
# Loop through the families, connecting them to the feature. |
1562 |
for my $family (@families) { |
for my $family (@families) { |
1563 |
$loadContainsFeature->Put($family, $fid); |
$loadIsFamilyForFeature->Put($family, $fid); |
1564 |
# If this is a new family, create a record for it. |
# If this is a new family, create a record for it. |
1565 |
if (! exists $familyHash{$family}) { |
if (! exists $familyHash{$family}) { |
1566 |
|
$familyHash{$family} = 1; |
1567 |
$loadFamily->Add("families", 1); |
$loadFamily->Add("families", 1); |
1568 |
my $size = $fig->sz_family($family); |
my $size = $fig->sz_family($family); |
1569 |
my $func = $fig->family_function($family); |
my $func = $fig->family_function($family); |
1578 |
return $retVal; |
return $retVal; |
1579 |
} |
} |
1580 |
|
|
1581 |
|
|
1582 |
|
|
1583 |
=head2 Internal Utility Methods |
=head2 Internal Utility Methods |
1584 |
|
|
1585 |
=head3 TableLoader |
=head3 TableLoader |