51 |
|
|
52 |
=head3 new |
=head3 new |
53 |
|
|
54 |
C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile); >> |
C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >> |
55 |
|
|
56 |
Construct a new Sprout Loader object, specifying the two participating databases and |
Construct a new Sprout Loader object, specifying the two participating databases and |
57 |
the names of the files containing the lists of genomes and subsystems to use. |
the names of the files containing the lists of genomes and subsystems to use. |
82 |
to a list of subsystem names. If nothing is specified, all known subsystems will be |
to a list of subsystem names. If nothing is specified, all known subsystems will be |
83 |
considered trusted. Only subsystem data related to the trusted subsystems is loaded. |
considered trusted. Only subsystem data related to the trusted subsystems is loaded. |
84 |
|
|
85 |
|
=item options |
86 |
|
|
87 |
|
Reference to a hash of command-line options. |
88 |
|
|
89 |
=back |
=back |
90 |
|
|
91 |
=cut |
=cut |
92 |
|
|
93 |
sub new { |
sub new { |
94 |
# Get the parameters. |
# Get the parameters. |
95 |
my ($class, $sprout, $fig, $genomeFile, $subsysFile) = @_; |
my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_; |
96 |
# Load the list of genomes into a hash. |
# Load the list of genomes into a hash. |
97 |
my %genomes; |
my %genomes; |
98 |
if (! defined($genomeFile) || $genomeFile eq '') { |
if (! defined($genomeFile) || $genomeFile eq '') { |
162 |
sprout => $sprout, |
sprout => $sprout, |
163 |
loadDirectory => $directory, |
loadDirectory => $directory, |
164 |
erdb => $sprout->{_erdb}, |
erdb => $sprout->{_erdb}, |
165 |
loaders => [] |
loaders => [], |
166 |
|
options => $options |
167 |
}; |
}; |
168 |
# Bless and return it. |
# Bless and return it. |
169 |
bless $retVal, $class; |
bless $retVal, $class; |
382 |
$loadIsEvidencedBy->Put($coupleID, $evidenceID); |
$loadIsEvidencedBy->Put($coupleID, $evidenceID); |
383 |
# Connect it to the features. |
# Connect it to the features. |
384 |
$loadUsesAsEvidence->Put($evidenceID, $peg3, 1); |
$loadUsesAsEvidence->Put($evidenceID, $peg3, 1); |
385 |
$loadUsesAsEvidence->Put($evidenceID, $peg4, 1); |
$loadUsesAsEvidence->Put($evidenceID, $peg4, 2); |
386 |
} |
} |
387 |
} |
} |
388 |
} |
} |
425 |
my ($self) = @_; |
my ($self) = @_; |
426 |
# Get the FIG object. |
# Get the FIG object. |
427 |
my $fig = $self->{fig}; |
my $fig = $self->{fig}; |
428 |
|
# Find out if this is a limited run. |
429 |
|
my $limited = $self->{options}->{limitedFeatures}; |
430 |
# Get the table of genome IDs. |
# Get the table of genome IDs. |
431 |
my $genomeHash = $self->{genomes}; |
my $genomeHash = $self->{genomes}; |
432 |
my $genomeCount = (keys %{$genomeHash}); |
my $genomeCount = (keys %{$genomeHash}); |
433 |
my $featureCount = $genomeCount * 4000; |
my $featureCount = $genomeCount * 4000; |
434 |
# Create load objects for each of the tables we're loading. |
# Create load objects for each of the tables we're loading. |
435 |
my $loadFeature = $self->_TableLoader('Feature', $featureCount); |
my $loadFeature = $self->_TableLoader('Feature', $featureCount); |
|
my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6); |
|
|
my $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10); |
|
|
my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount); |
|
|
my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount); |
|
436 |
my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount); |
my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount); |
437 |
|
my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6); |
438 |
|
my ($loadFeatureLink, $loadFeatureTranslation, $loadFeatureUpstream); |
439 |
|
if (! $limited) { |
440 |
|
$loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10); |
441 |
|
$loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount); |
442 |
|
$loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount); |
443 |
|
} |
444 |
# Get the maximum sequence size. We need this later for splitting up the |
# Get the maximum sequence size. We need this later for splitting up the |
445 |
# locations. |
# locations. |
446 |
my $chunkSize = $self->{sprout}->MaxSegment(); |
my $chunkSize = $self->{sprout}->MaxSegment(); |
455 |
for my $featureData (@{$features}) { |
for my $featureData (@{$features}) { |
456 |
$loadFeature->Add("featureIn"); |
$loadFeature->Add("featureIn"); |
457 |
# Split the tuple. |
# Split the tuple. |
458 |
my ($featureID, $locations, $aliases, $type) = @{$featureData}; |
my ($featureID, $locations, undef, $type) = @{$featureData}; |
459 |
# Create the feature record. |
# Create the feature record. |
460 |
$loadFeature->Put($featureID, 1, $type); |
$loadFeature->Put($featureID, 1, $type); |
461 |
# Create the aliases. |
# Create the aliases. |
462 |
for my $alias (split /\s*,\s*/, $aliases) { |
for my $alias ($fig->feature_aliases($featureID)) { |
463 |
$loadFeatureAlias->Put($featureID, $alias); |
$loadFeatureAlias->Put($featureID, $alias); |
464 |
} |
} |
465 |
|
# The next stuff is for a full load only. |
466 |
|
if (! $limited) { |
467 |
# Get the links. |
# Get the links. |
468 |
my @links = $fig->fid_links($featureID); |
my @links = $fig->fid_links($featureID); |
469 |
for my $link (@links) { |
for my $link (@links) { |
482 |
$loadFeatureUpstream->Put($featureID, $upstream); |
$loadFeatureUpstream->Put($featureID, $upstream); |
483 |
} |
} |
484 |
} |
} |
485 |
|
} |
486 |
# This part is the roughest. We need to relate the features to contig |
# This part is the roughest. We need to relate the features to contig |
487 |
# locations, and the locations must be split so that none of them exceed |
# locations, and the locations must be split so that none of them exceed |
488 |
# the maximum segment size. This simplifies the genes_in_region processing |
# the maximum segment size. This simplifies the genes_in_region processing |
489 |
# for Sprout. |
# for Sprout. |
490 |
my @locationList = split /\s*,\s*/, $locations; |
my @locationList = split /\s*,\s*/, $locations; |
491 |
|
# Create the location position indicator. |
492 |
|
my $i = 1; |
493 |
# Loop through the locations. |
# Loop through the locations. |
494 |
for my $location (@locationList) { |
for my $location (@locationList) { |
495 |
# Parse the location. |
# Parse the location. |
496 |
my $locObject = BasicLocation->new($location); |
my $locObject = BasicLocation->new("$genomeID:$location"); |
497 |
# Split it into a list of chunks. |
# Split it into a list of chunks. |
498 |
my @locOList = (); |
my @locOList = (); |
499 |
while (my $peeling = $locObject->Peel($chunkSize)) { |
while (my $peeling = $locObject->Peel($chunkSize)) { |
503 |
push @locOList, $locObject; |
push @locOList, $locObject; |
504 |
# Loop through the chunks, creating IsLocatedIn records. The variable |
# Loop through the chunks, creating IsLocatedIn records. The variable |
505 |
# "$i" will be used to keep the location index. |
# "$i" will be used to keep the location index. |
|
my $i = 1; |
|
506 |
for my $locChunk (@locOList) { |
for my $locChunk (@locOList) { |
507 |
$loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left, |
$loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left, |
508 |
$locChunk->Dir, $locChunk->Length, $i); |
$locChunk->Dir, $locChunk->Length, $i); |
650 |
# roles. We do this by looping through the subsystems and creating a |
# roles. We do this by looping through the subsystems and creating a |
651 |
# role hash. The hash tracks each role ID so that we don't create |
# role hash. The hash tracks each role ID so that we don't create |
652 |
# duplicates. As we move along, we'll connect the roles and subsystems. |
# duplicates. As we move along, we'll connect the roles and subsystems. |
653 |
|
my ($genomeID, $roleID); |
654 |
my %roleData = (); |
my %roleData = (); |
655 |
for my $subsysID (@subsysIDs) { |
for my $subsysID (@subsysIDs) { |
656 |
Trace("Creating subsystem $subsysID.") if T(3); |
Trace("Creating subsystem $subsysID.") if T(3); |
657 |
$loadSubsystem->Add("subsystemIn"); |
$loadSubsystem->Add("subsystemIn"); |
658 |
# Create the subsystem record. |
# Create the subsystem record. |
659 |
$loadSubsystem->Put($subsysID); |
$loadSubsystem->Put($subsysID); |
660 |
# Get the subsystem's roles. |
# Get the subsystem object. |
661 |
my @roles = $fig->subsystem_to_roles($subsysID); |
my $sub = $fig->get_subsystem($subsysID); |
662 |
# Connect the roles to the subsystem. If a role is new, we create |
# Connect it to its roles. |
663 |
# a role record for it. |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
|
for my $roleID (@roles) { |
|
664 |
$loadOccursInSubsystem->Add("roleIn"); |
$loadOccursInSubsystem->Add("roleIn"); |
665 |
$loadOccursInSubsystem->Put($roleID, $subsysID); |
$loadOccursInSubsystem->Put($roleID, $subsysID); |
666 |
if (! exists $roleData{$roleID}) { |
if (! exists $roleData{$roleID}) { |
668 |
$roleData{$roleID} = 1; |
$roleData{$roleID} = 1; |
669 |
} |
} |
670 |
} |
} |
671 |
# Now all roles for this subsystem have been filled in. We create the |
# Now we create the spreadsheet for the subsystem by matching roles to |
672 |
# spreadsheet by matches roles to genomes. To do this, we need to |
# genomes. Each genome is a row and each role is a column. We may need |
673 |
# get the genomes on the sheet. |
# to actually create the roles as we find them. |
674 |
Trace("Creating subsystem $subsysID spreadsheet.") if T(3); |
Trace("Creating subsystem $subsysID spreadsheet.") if T(3); |
675 |
my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)}; |
for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) { |
676 |
for my $genomeID (@genomes) { |
# Only proceed if this is one of our genomes. |
|
# Only process this genome if it's one of ours. |
|
677 |
if (exists $genomeHash->{$genomeID}) { |
if (exists $genomeHash->{$genomeID}) { |
678 |
# Connect the genome to the subsystem. |
# Count the PEGs and cells found for verification purposes. |
679 |
$loadParticipatesIn->Put($genomeID, $subsysID); |
my $pegCount = 0; |
680 |
|
my $cellCount = 0; |
681 |
# Loop through the subsystem's roles. We use an index because it is |
# Loop through the subsystem's roles. We use an index because it is |
682 |
# part of the spreadsheet cell ID. |
# part of the spreadsheet cell ID. |
683 |
for (my $i = 0; $i <= $#roles; $i++) { |
for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { |
|
my $role = $roles[$i]; |
|
684 |
# Get the features in the spreadsheet cell for this genome and role. |
# Get the features in the spreadsheet cell for this genome and role. |
685 |
my @pegs = $fig->pegs_in_subsystem_cell($subsysID, $genomeID, $i); |
my @pegs = $sub->get_pegs_from_cell($row, $col); |
686 |
# Only proceed if features exist. |
# Only proceed if features exist. |
687 |
if (@pegs > 0) { |
if (@pegs > 0) { |
688 |
# Create the spreadsheet cell. |
# Create the spreadsheet cell. |
689 |
my $cellID = "$subsysID:$genomeID:$i"; |
$cellCount++; |
690 |
|
my $cellID = "$subsysID:$genomeID:$col"; |
691 |
$loadSSCell->Put($cellID); |
$loadSSCell->Put($cellID); |
692 |
$loadIsGenomeOf->Put($genomeID, $cellID); |
$loadIsGenomeOf->Put($genomeID, $cellID); |
693 |
$loadIsRoleOf->Put($role, $cellID); |
$loadIsRoleOf->Put($roleID, $cellID); |
694 |
$loadHasSSCell->Put($subsysID, $cellID); |
$loadHasSSCell->Put($subsysID, $cellID); |
695 |
# Attach the features to it. |
# Attach the features to it. |
696 |
for my $pegID (@pegs) { |
for my $pegID (@pegs) { |
697 |
$loadContainsFeature->Put($cellID, $pegID); |
$loadContainsFeature->Put($cellID, $pegID); |
698 |
|
$pegCount++; |
699 |
} |
} |
700 |
} |
} |
701 |
} |
} |
702 |
|
# If we found some cells for this genome, denote it participates in the |
703 |
|
# subsystem. |
704 |
|
if ($pegCount > 0) { |
705 |
|
Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3); |
706 |
|
$loadParticipatesIn->Put($genomeID, $subsysID); |
707 |
|
} |
708 |
} |
} |
709 |
} |
} |
710 |
} |
} |
1031 |
chomp $line; |
chomp $line; |
1032 |
my($sourceID, $desc, $url) = split(/\t/,$line); |
my($sourceID, $desc, $url) = split(/\t/,$line); |
1033 |
$loadComesFrom->Put($genomeID, $sourceID); |
$loadComesFrom->Put($genomeID, $sourceID); |
1034 |
if ($url && ! exists $sourceURL{$genomeID}) { |
if ($url && ! exists $sourceURL{$sourceID}) { |
1035 |
$loadSourceURL->Put($sourceID, $url); |
$loadSourceURL->Put($sourceID, $url); |
1036 |
$sourceURL{$sourceID} = 1; |
$sourceURL{$sourceID} = 1; |
1037 |
} |
} |
1038 |
if ($desc && ! exists $sourceDesc{$sourceID}) { |
if ($desc) { |
1039 |
$loadSource->Put($sourceID, $desc); |
$sourceDesc{$sourceID} = $desc; |
1040 |
$sourceDesc{$sourceID} = 1; |
} elsif (! exists $sourceDesc{$sourceID}) { |
1041 |
|
$sourceDesc{$sourceID} = $sourceID; |
1042 |
} |
} |
1043 |
} |
} |
1044 |
close TMP; |
close TMP; |
1045 |
} |
} |
1046 |
|
# Write the source descriptions. |
1047 |
|
for my $sourceID (keys %sourceDesc) { |
1048 |
|
$loadSource->Put($sourceID, $sourceDesc{$sourceID}); |
1049 |
|
} |
1050 |
# Finish the load. |
# Finish the load. |
1051 |
my $retVal = $self->_FinishAll(); |
my $retVal = $self->_FinishAll(); |
1052 |
return $retVal; |
return $retVal; |