--- Sprout.pm 2006/06/23 01:34:42 1.69
+++ Sprout.pm 2006/09/19 00:14:04 1.85
@@ -12,9 +12,10 @@
use DBObject;
use Tracer;
use FIGRules;
+ use FidCheck;
use Stats;
use POSIX qw(strftime);
-
+ use BasicLocation;
=head1 Sprout Database Manipulation Object
@@ -91,6 +92,9 @@
sub new {
# Get the parameters.
my ($class, $dbName, $options) = @_;
+ # Compute the DBD directory.
+ my $dbd_dir = (defined($FIG_Config::dbd_dir) ? $FIG_Config::dbd_dir :
+ $FIG_Config::fig );
# Compute the options. We do this by starting with a table of defaults and overwriting with
# the incoming data.
my $optionTable = Tracer::GetOptions({
@@ -98,13 +102,14 @@
# database type
dataDir => $FIG_Config::sproutData,
# data file directory
- xmlFileName => "$FIG_Config::fig/SproutDBD.xml",
+ xmlFileName => "$dbd_dir/SproutDBD.xml",
# database definition file name
userData => "$FIG_Config::dbuser/$FIG_Config::dbpass",
# user name and password
port => $FIG_Config::dbport,
# database connection port
sock => $FIG_Config::dbsock,
+ host => $FIG_Config::dbhost,
maxSegmentLength => 4500, # maximum feature segment length
maxSequenceLength => 8000, # maximum contig sequence length
noDBOpen => 0, # 1 to suppress the database open
@@ -118,7 +123,7 @@
my $dbh;
if (! $optionTable->{noDBOpen}) {
$dbh = DBKernel->new($optionTable->{dbType}, $dbName, $userName,
- $password, $optionTable->{port}, undef, $optionTable->{sock});
+ $password, $optionTable->{port}, $optionTable->{host}, $optionTable->{sock});
}
# Create the ERDB object.
my $xmlFileName = "$optionTable->{xmlFileName}";
@@ -126,6 +131,8 @@
# Add the option table and XML file name.
$retVal->{_options} = $optionTable;
$retVal->{_xmlName} = $xmlFileName;
+ # Set up space for the group file data.
+ $retVal->{groupHash} = undef;
# Return it.
return $retVal;
}
@@ -335,7 +342,7 @@
=head3 GeneMenu
-C<< my $selectHtml = $sprout->GeneMenu(\%attributes, $filterString, \@params); >>
+C<< my $selectHtml = $sprout->GeneMenu(\%attributes, $filterString, \@params, $selected); >>
Return an HTML select menu of genomes. Each genome will be an option in the menu,
and will be displayed by name with the ID and a contig count attached. The selection
@@ -357,6 +364,14 @@
Reference to a list of values to be substituted in for the parameter marks in
the filter string.
+=item selected (optional)
+
+ID of the genome to be initially selected.
+
+=item fast (optional)
+
+If specified and TRUE, the contig counts will be omitted to improve performance.
+
=item RETURN
Returns an HTML select menu with the specified genomes as selectable options.
@@ -367,7 +382,12 @@
sub GeneMenu {
# Get the parameters.
- my ($self, $attributes, $filterString, $params) = @_;
+ my ($self, $attributes, $filterString, $params, $selected, $fast) = @_;
+ my $slowMode = ! $fast;
+ # Default to nothing selected. This prevents an execution warning if "$selected"
+ # is undefined.
+ $selected = "" unless defined $selected;
+ Trace("Gene Menu called with slow mode \"$slowMode\" and selection \"$selected\".") if T(3);
# Start the menu.
my $retVal = "\n";
# Return the result.
return $retVal;
}
+
=head3 Build
C<< $sprout->Build(); >>
@@ -628,6 +655,8 @@
return ($contigID, $start, $dir, $len);
}
+
+
=head3 PointLocation
C<< my $found = Sprout::PointLocation($location, $point); >>
@@ -1470,14 +1499,12 @@
my %retVal = ();
# Loop through the incoming features.
for my $featureID (@{$featureList}) {
- # Create a query to get the feature's best hit.
- my $query = $self->Get(['IsBidirectionalBestHitOf'],
- "IsBidirectionalBestHitOf(from-link) = ? AND IsBidirectionalBestHitOf(genome) = ?",
- [$featureID, $genomeID]);
+ # Ask the server for the feature's best hit.
+ my @bbhData = FIGRules::BBHData($featureID);
# Peel off the BBHs found.
my @found = ();
- while (my $bbh = $query->Fetch) {
- push @found, $bbh->Value('IsBidirectionalBestHitOf(to-link)');
+ for my $bbh (@bbhData) {
+ push @found, $bbh->[0];
}
$retVal{$featureID} = \@found;
}
@@ -1491,8 +1518,7 @@
Return a list of the similarities to the specified feature.
-Sprout does not support real similarities, so this method just returns the bidirectional
-best hits.
+This method just returns the bidirectional best hits for performance reasons.
=over 4
@@ -1512,10 +1538,7 @@
# Get the parameters.
my ($self, $featureID, $count) = @_;
# Ask for the best hits.
- my @lists = $self->GetAll(['IsBidirectionalBestHitOf'],
- "IsBidirectionalBestHitOf(from-link) = ? ORDER BY IsBidirectionalBestHitOf(score) DESC",
- [$featureID], ['IsBidirectionalBestHitOf(to-link)', 'IsBidirectionalBestHitOf(score)'],
- $count);
+ my @lists = FIGRules::BBHData($featureID);
# Create the return value.
my %retVal = ();
for my $tuple (@lists) {
@@ -1525,8 +1548,6 @@
return %retVal;
}
-
-
=head3 IsComplete
C<< my $flag = $sprout->IsComplete($genomeID); >>
@@ -1654,6 +1675,7 @@
sub CoupledFeatures {
# Get the parameters.
my ($self, $featureID) = @_;
+ Trace("Looking for features coupled to $featureID.") if T(coupling => 3);
# Create a query to retrieve the functionally-coupled features.
my $query = $self->Get(['ParticipatesInCoupling', 'Coupling'],
"ParticipatesInCoupling(from-link) = ?", [$featureID]);
@@ -1666,10 +1688,12 @@
# Get the ID and score of the coupling.
my ($couplingID, $score) = $clustering->Values(['Coupling(id)',
'Coupling(score)']);
+ Trace("$featureID coupled with score $score to ID $couplingID.") if T(coupling => 4);
# Get the other feature that participates in the coupling.
my ($otherFeatureID) = $self->GetFlat(['ParticipatesInCoupling'],
"ParticipatesInCoupling(to-link) = ? AND ParticipatesInCoupling(from-link) <> ?",
[$couplingID, $featureID], 'ParticipatesInCoupling(from-link)');
+ Trace("$couplingID target feature is $otherFeatureID.") if T(coupling => 4);
# Attach the other feature's score to its ID.
$retVal{$otherFeatureID} = $score;
$found = 1;
@@ -1802,7 +1826,7 @@
my ($self, $peg1, $peg2) = @_;
# Declare the return values. We'll start with the coupling ID and undefine the
# flag and score until we have more information.
- my ($retVal, $inverted, $score) = (CouplingID($peg1, $peg2), undef, undef);
+ my ($retVal, $inverted, $score) = ($self->CouplingID($peg1, $peg2), undef, undef);
# Find the coupling data.
my @pegs = $self->GetAll(['Coupling', 'ParticipatesInCoupling'],
"Coupling(id) = ? ORDER BY ParticipatesInCoupling(pos)",
@@ -1823,9 +1847,112 @@
return ($retVal, $inverted, $score);
}
+=head3 GetSynonymGroup
+
+C<< my $id = $sprout->GetSynonymGroup($fid); >>
+
+Return the synonym group name for the specified feature.
+
+=over 4
+
+=item fid
+
+ID of the feature whose synonym group is desired.
+
+=item RETURN
+
+The name of the synonym group to which the feature belongs. If the feature does
+not belong to a synonym group, the feature ID itself is returned.
+
+=back
+
+=cut
+
+sub GetSynonymGroup {
+ # Get the parameters.
+ my ($self, $fid) = @_;
+ # Declare the return variable.
+ my $retVal;
+ # Find the synonym group.
+ my @groups = $self->GetFlat(['IsSynonymGroupFor'], "IsSynonymGroupFor(to-link) = ?",
+ [$fid], 'IsSynonymGroupFor(from-link)');
+ # Check to see if we found anything.
+ if (@groups) {
+ $retVal = $groups[0];
+ } else {
+ $retVal = $fid;
+ }
+ # Return the result.
+ return $retVal;
+}
+
+=head3 GetBoundaries
+
+C<< my ($contig, $beg, $end) = $sprout->GetBoundaries(@locList); >>
+
+Determine the begin and end boundaries for the locations in a list. All of the
+locations must belong to the same contig and have mostly the same direction in
+order for this method to produce a meaningful result. The resulting
+begin/end pair will contain all of the bases in any of the locations.
+
+=over 4
+
+=item locList
+
+List of locations to process.
+
+=item RETURN
+
+Returns a 3-tuple consisting of the contig ID, the beginning boundary,
+and the ending boundary. The beginning boundary will be left of the
+end for mostly-forward locations and right of the end for mostly-backward
+locations.
+
+=back
+
+=cut
+
+sub GetBoundaries {
+ # Get the parameters.
+ my ($self, @locList) = @_;
+ # Set up the counters used to determine the most popular direction.
+ my %counts = ( '+' => 0, '-' => 0 );
+ # Get the last location and parse it.
+ my $locObject = BasicLocation->new(pop @locList);
+ # Prime the loop with its data.
+ my ($contig, $beg, $end) = ($locObject->Contig, $locObject->Left, $locObject->Right);
+ # Count its direction.
+ $counts{$locObject->Dir}++;
+ # Loop through the remaining locations. Note that in most situations, this loop
+ # will not iterate at all, because most of the time we will be dealing with a
+ # singleton list.
+ for my $loc (@locList) {
+ # Create a location object.
+ my $locObject = BasicLocation->new($loc);
+ # Count the direction.
+ $counts{$locObject->Dir}++;
+ # Get the left end and the right end.
+ my $left = $locObject->Left;
+ my $right = $locObject->Right;
+ # Merge them into the return variables.
+ if ($left < $beg) {
+ $beg = $left;
+ }
+ if ($right > $end) {
+ $end = $right;
+ }
+ }
+ # If the most common direction is reverse, flip the begin and end markers.
+ if ($counts{'-'} > $counts{'+'}) {
+ ($beg, $end) = ($end, $beg);
+ }
+ # Return the result.
+ return ($contig, $beg, $end);
+}
+
=head3 CouplingID
-C<< my $couplingID = Sprout::CouplingID($peg1, $peg2); >>
+C<< my $couplingID = $sprout->CouplingID($peg1, $peg2); >>
Return the coupling ID for a pair of feature IDs.
@@ -1858,7 +1985,8 @@
=cut
#: Return Type $;
sub CouplingID {
- return join " ", sort @_;
+ my ($self, @pegs) = @_;
+ return $self->DigestKey(join " ", sort @pegs);
}
=head3 ReadFasta
@@ -2217,41 +2345,6 @@
return @retVal;
}
-=head3 Exists
-
-C<< my $found = $sprout->Exists($entityName, $entityID); >>
-
-Return TRUE if an entity exists, else FALSE.
-
-=over 4
-
-=item entityName
-
-Name of the entity type (e.g. C) relevant to the existence check.
-
-=item entityID
-
-ID of the entity instance whose existence is to be checked.
-
-=item RETURN
-
-Returns TRUE if the entity instance exists, else FALSE.
-
-=back
-
-=cut
-#: Return Type $;
-sub Exists {
- # Get the parameters.
- my ($self, $entityName, $entityID) = @_;
- # Check for the entity instance.
- Trace("Checking existence of $entityName with ID=$entityID.") if T(4);
- my $testInstance = $self->GetEntity($entityName, $entityID);
- # Return an existence indicator.
- my $retVal = ($testInstance ? 1 : 0);
- return $retVal;
-}
-
=head3 FeatureTranslation
C<< my $translation = $sprout->FeatureTranslation($featureID); >>
@@ -2789,7 +2882,65 @@
return @retVal;
}
+=head3 GenomeSubsystemData
+
+C<< my %featureData = $sprout->GenomeSubsystemData($genomeID); >>
+Return a hash mapping genome features to their subsystem roles.
+
+=over 4
+
+=item genomeID
+
+ID of the genome whose subsystem feature map is desired.
+
+=item RETURN
+
+Returns a hash mapping each feature of the genome to a list of 2-tuples. Eacb
+2-tuple contains a subsystem name followed by a role ID.
+
+=back
+
+=cut
+
+sub GenomeSubsystemData {
+ # Get the parameters.
+ my ($self, $genomeID) = @_;
+ # Declare the return variable.
+ my %retVal = ();
+ # Get a list of the genome features that participate in subsystems. For each
+ # feature we get its spreadsheet cells and the corresponding roles.
+ my @roleData = $self->GetAll(['HasFeature', 'ContainsFeature', 'IsRoleOf'],
+ "HasFeature(from-link) = ?", [$genomeID],
+ ['HasFeature(to-link)', 'IsRoleOf(to-link)', 'IsRoleOf(from-link)']);
+ # Now we get a list of the spreadsheet cells and their associated subsystems. Subsystems
+ # with an unknown variant code (-1) are skipped. Note the genome ID is at both ends of the
+ # list. We use it at the beginning to get all the spreadsheet cells for the genome and
+ # again at the end to filter out participation in subsystems with a negative variant code.
+ my @cellData = $self->GetAll(['IsGenomeOf', 'HasSSCell', 'ParticipatesIn'],
+ "IsGenomeOf(from-link) = ? AND ParticipatesIn(variant-code) >= 0 AND ParticipatesIn(from-link) = ?",
+ [$genomeID, $genomeID], ['HasSSCell(to-link)', 'HasSSCell(from-link)']);
+ # Now "@roleData" lists the spreadsheet cell and role for each of the genome's features.
+ # "@cellData" lists the subsystem name for each of the genome's spreadsheet cells. We
+ # link these two lists together to create the result. First, we want a hash mapping
+ # spreadsheet cells to subsystem names.
+ my %subHash = map { $_->[0] => $_->[1] } @cellData;
+ # We loop through @cellData to build the hash.
+ for my $roleEntry (@roleData) {
+ # Get the data for this feature and cell.
+ my ($fid, $cellID, $role) = @{$roleEntry};
+ # Check for a subsystem name.
+ my $subsys = $subHash{$cellID};
+ if ($subsys) {
+ # Insure this feature has an entry in the return hash.
+ if (! exists $retVal{$fid}) { $retVal{$fid} = []; }
+ # Merge in this new data.
+ push @{$retVal{$fid}}, [$subsys, $role];
+ }
+ }
+ # Return the result.
+ return %retVal;
+}
=head3 RelatedFeatures
@@ -2826,9 +2977,7 @@
# Get the parameters.
my ($self, $featureID, $function, $userID) = @_;
# Get a list of the features that are BBHs of the incoming feature.
- my @bbhFeatures = $self->GetFlat(['IsBidirectionalBestHitOf'],
- "IsBidirectionalBestHitOf(from-link) = ?", [$featureID],
- 'IsBidirectionalBestHitOf(to-link)');
+ my @bbhFeatures = map { $_->[0] } FIGRules::BBHData($featureID);
# Now we loop through the features, pulling out the ones that have the correct
# functional assignment.
my @retVal = ();
@@ -3029,11 +3178,8 @@
my ($self, $featureID, $cutoff) = @_;
# Create the return hash.
my %retVal = ();
- # Create a query to get the desired BBHs.
- my @bbhList = $self->GetAll(['IsBidirectionalBestHitOf'],
- 'IsBidirectionalBestHitOf(sc) <= ? AND IsBidirectionalBestHitOf(from-link) = ?',
- [$cutoff, $featureID],
- ['IsBidirectionalBestHitOf(to-link)', 'IsBidirectionalBestHitOf(sc)']);
+ # Query for the desired BBHs.
+ my @bbhList = FIGRules::BBHData($featureID, $cutoff);
# Form the results into the return hash.
for my $pair (@bbhList) {
$retVal{$pair->[0]} = $pair->[1];
@@ -3042,6 +3188,71 @@
return %retVal;
}
+=head3 Sims
+
+C<< my $simList = $sprout->Sims($fid, $maxN, $maxP, $select, $max_expand, $filters); >>
+
+Get a list of similarities for a specified feature. Similarity information is not kept in the
+Sprout database; rather, they are retrieved from a network server. The similarities are
+returned as B objects. A Sim object is actually a list reference that has been blessed
+so that its elements can be accessed by name.
+
+Similarities can be either raw or expanded. The raw similarities are basic
+hits between features with similar DNA. Expanding a raw similarity drags in any
+features considered substantially identical. So, for example, if features B,
+B, and B are all substatially identical to B, then a raw similarity
+B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>.
+
+=over 4
+
+=item fid
+
+ID of the feature whose similarities are desired.
+
+=item maxN
+
+Maximum number of similarities to return.
+
+=item maxP
+
+Minumum allowable similarity score.
+
+=item select
+
+Selection criterion: C means only raw similarities are returned; C
+means only similarities to FIG features are returned; C means all expanded
+similarities are returned; and C means similarities are expanded until the
+number of FIG features equals the maximum.
+
+=item max_expand
+
+The maximum number of features to expand.
+
+=item filters
+
+Reference to a hash containing filter information, or a subroutine that can be
+used to filter the sims.
+
+=item RETURN
+
+Returns a reference to a list of similarity objects, or C if an error
+occurred.
+
+=back
+
+=cut
+
+sub Sims {
+ # Get the parameters.
+ my ($self, $fid, $maxN, $maxP, $select, $max_expand, $filters) = @_;
+ # Create the shim object to test for deleted FIDs.
+ my $shim = FidCheck->new($self);
+ # Ask the network for sims.
+ my $retVal = FIGRules::GetNetworkSims($shim, $fid, {}, $maxN, $maxP, $select, $max_expand, $filters);
+ # Return the result.
+ return $retVal;
+}
+
=head3 GetGroups
C<< my %groups = $sprout->GetGroups(\@groupList); >>
@@ -3199,6 +3410,153 @@
return $retVal;
}
+=head3 Fix
+
+C<< my %fixedHash = Sprout::Fix(%groupHash); >>
+
+Prepare a genome group hash (like that returned by L for processing.
+Groups with the same primary name will be combined. The primary name is the
+first capitalized word in the group name.
+
+=over 4
+
+=item groupHash
+
+Hash to be fixed up.
+
+=item RETURN
+
+Returns a fixed-up version of the hash.
+
+=back
+
+=cut
+
+sub Fix {
+ # Get the parameters.
+ my (%groupHash) = @_;
+ # Create the result hash.
+ my %retVal = ();
+ # Copy over the genomes.
+ for my $groupID (keys %groupHash) {
+ # Make a safety copy of the group ID.
+ my $realGroupID = $groupID;
+ # Yank the primary name.
+ if ($groupID =~ /([A-Z]\w+)/) {
+ $realGroupID = $1;
+ }
+ # Append this group's genomes into the result hash.
+ Tracer::AddToListMap(\%retVal, $realGroupID, @{$groupHash{$groupID}});
+ }
+ # Return the result hash.
+ return %retVal;
+}
+
+=head3 GroupPageName
+
+C<< my $name = $sprout->GroupPageName($group); >>
+
+Return the name of the page for the specified NMPDR group.
+
+=over 4
+
+=item group
+
+Name of the relevant group.
+
+=item RETURN
+
+Returns the relative page name (e.g. C<../content/campy.php>). If the group file is not in
+memory it will be read in.
+
+=back
+
+=cut
+
+sub GroupPageName {
+ # Get the parameters.
+ my ($self, $group) = @_;
+ # Declare the return variable.
+ my $retVal;
+ # Check for the group file data.
+ if (! defined $self->{groupHash}) {
+ # Read the group file.
+ my %groupData = Sprout::ReadGroupFile($self->{_options}->{dataDir} . "/groups.tbl");
+ # Store it in our object.
+ $self->{groupHash} = \%groupData;
+ }
+ # Compute the real group name.
+ my $realGroup = $group;
+ if ($group =~ /([A-Z]\w+)/) {
+ $realGroup = $1;
+ }
+ # Return the page name.
+ $retVal = "../content/" . $self->{groupHash}->{$realGroup}->[1];
+ # Return the result.
+ return $retVal;
+}
+
+=head3 ReadGroupFile
+
+C<< my %groupData = Sprout::ReadGroupFile($groupFileName); >>
+
+Read in the data from the specified group file. The group file contains information
+about each of the NMPDR groups.
+
+=over 4
+
+=item name
+
+Name of the group.
+
+=item page
+
+Name of the group's page on the web site (e.g. C for
+Campylobacter)
+
+=item genus
+
+Genus of the group
+
+=item species
+
+Species of the group, or an empty string if the group is for an entire
+genus. If the group contains more than one species, the species names
+should be separated by commas.
+
+=back
+
+The parameters to this method are as follows
+
+=over 4
+
+=item groupFile
+
+Name of the file containing the group data.
+
+=item RETURN
+
+Returns a hash keyed on group name. The value of each hash
+
+=back
+
+=cut
+
+sub ReadGroupFile {
+ # Get the parameters.
+ my ($groupFileName) = @_;
+ # Declare the return variable.
+ my %retVal;
+ # Read the group file.
+ my @groupLines = Tracer::GetFile($groupFileName);
+ for my $groupLine (@groupLines) {
+ my ($name, $page, $genus, $species) = split(/\t/, $groupLine);
+ $retVal{$name} = [$page, $genus, $species];
+ }
+ # Return the result.
+ return %retVal;
+}
+
=head2 Internal Utility Methods
=head3 ParseAssignment
@@ -3255,7 +3613,7 @@
}
# If we have an assignment, we need to clean the function text. There may be
# extra junk at the end added as a note from the user.
- if (@retVal) {
+ if (defined( $retVal[1] )) {
$retVal[1] =~ s/(\t\S)?\s*$//;
}
# Return the result list.
@@ -3346,4 +3704,4 @@
}
-1;
\ No newline at end of file
+1;