--- Sprout.pm 2006/06/23 01:34:42 1.69 +++ Sprout.pm 2006/09/19 00:14:04 1.85 @@ -12,9 +12,10 @@ use DBObject; use Tracer; use FIGRules; + use FidCheck; use Stats; use POSIX qw(strftime); - + use BasicLocation; =head1 Sprout Database Manipulation Object @@ -91,6 +92,9 @@ sub new { # Get the parameters. my ($class, $dbName, $options) = @_; + # Compute the DBD directory. + my $dbd_dir = (defined($FIG_Config::dbd_dir) ? $FIG_Config::dbd_dir : + $FIG_Config::fig ); # Compute the options. We do this by starting with a table of defaults and overwriting with # the incoming data. my $optionTable = Tracer::GetOptions({ @@ -98,13 +102,14 @@ # database type dataDir => $FIG_Config::sproutData, # data file directory - xmlFileName => "$FIG_Config::fig/SproutDBD.xml", + xmlFileName => "$dbd_dir/SproutDBD.xml", # database definition file name userData => "$FIG_Config::dbuser/$FIG_Config::dbpass", # user name and password port => $FIG_Config::dbport, # database connection port sock => $FIG_Config::dbsock, + host => $FIG_Config::dbhost, maxSegmentLength => 4500, # maximum feature segment length maxSequenceLength => 8000, # maximum contig sequence length noDBOpen => 0, # 1 to suppress the database open @@ -118,7 +123,7 @@ my $dbh; if (! $optionTable->{noDBOpen}) { $dbh = DBKernel->new($optionTable->{dbType}, $dbName, $userName, - $password, $optionTable->{port}, undef, $optionTable->{sock}); + $password, $optionTable->{port}, $optionTable->{host}, $optionTable->{sock}); } # Create the ERDB object. my $xmlFileName = "$optionTable->{xmlFileName}"; @@ -126,6 +131,8 @@ # Add the option table and XML file name. $retVal->{_options} = $optionTable; $retVal->{_xmlName} = $xmlFileName; + # Set up space for the group file data. + $retVal->{groupHash} = undef; # Return it. return $retVal; } @@ -335,7 +342,7 @@ =head3 GeneMenu -C<< my $selectHtml = $sprout->GeneMenu(\%attributes, $filterString, \@params); >> +C<< my $selectHtml = $sprout->GeneMenu(\%attributes, $filterString, \@params, $selected); >> Return an HTML select menu of genomes. Each genome will be an option in the menu, and will be displayed by name with the ID and a contig count attached. The selection @@ -357,6 +364,14 @@ Reference to a list of values to be substituted in for the parameter marks in the filter string. +=item selected (optional) + +ID of the genome to be initially selected. + +=item fast (optional) + +If specified and TRUE, the contig counts will be omitted to improve performance. + =item RETURN Returns an HTML select menu with the specified genomes as selectable options. @@ -367,7 +382,12 @@ sub GeneMenu { # Get the parameters. - my ($self, $attributes, $filterString, $params) = @_; + my ($self, $attributes, $filterString, $params, $selected, $fast) = @_; + my $slowMode = ! $fast; + # Default to nothing selected. This prevents an execution warning if "$selected" + # is undefined. + $selected = "" unless defined $selected; + Trace("Gene Menu called with slow mode \"$slowMode\" and selection \"$selected\".") if T(3); # Start the menu. my $retVal = "\n"; # Return the result. return $retVal; } + =head3 Build C<< $sprout->Build(); >> @@ -628,6 +655,8 @@ return ($contigID, $start, $dir, $len); } + + =head3 PointLocation C<< my $found = Sprout::PointLocation($location, $point); >> @@ -1470,14 +1499,12 @@ my %retVal = (); # Loop through the incoming features. for my $featureID (@{$featureList}) { - # Create a query to get the feature's best hit. - my $query = $self->Get(['IsBidirectionalBestHitOf'], - "IsBidirectionalBestHitOf(from-link) = ? AND IsBidirectionalBestHitOf(genome) = ?", - [$featureID, $genomeID]); + # Ask the server for the feature's best hit. + my @bbhData = FIGRules::BBHData($featureID); # Peel off the BBHs found. my @found = (); - while (my $bbh = $query->Fetch) { - push @found, $bbh->Value('IsBidirectionalBestHitOf(to-link)'); + for my $bbh (@bbhData) { + push @found, $bbh->[0]; } $retVal{$featureID} = \@found; } @@ -1491,8 +1518,7 @@ Return a list of the similarities to the specified feature. -Sprout does not support real similarities, so this method just returns the bidirectional -best hits. +This method just returns the bidirectional best hits for performance reasons. =over 4 @@ -1512,10 +1538,7 @@ # Get the parameters. my ($self, $featureID, $count) = @_; # Ask for the best hits. - my @lists = $self->GetAll(['IsBidirectionalBestHitOf'], - "IsBidirectionalBestHitOf(from-link) = ? ORDER BY IsBidirectionalBestHitOf(score) DESC", - [$featureID], ['IsBidirectionalBestHitOf(to-link)', 'IsBidirectionalBestHitOf(score)'], - $count); + my @lists = FIGRules::BBHData($featureID); # Create the return value. my %retVal = (); for my $tuple (@lists) { @@ -1525,8 +1548,6 @@ return %retVal; } - - =head3 IsComplete C<< my $flag = $sprout->IsComplete($genomeID); >> @@ -1654,6 +1675,7 @@ sub CoupledFeatures { # Get the parameters. my ($self, $featureID) = @_; + Trace("Looking for features coupled to $featureID.") if T(coupling => 3); # Create a query to retrieve the functionally-coupled features. my $query = $self->Get(['ParticipatesInCoupling', 'Coupling'], "ParticipatesInCoupling(from-link) = ?", [$featureID]); @@ -1666,10 +1688,12 @@ # Get the ID and score of the coupling. my ($couplingID, $score) = $clustering->Values(['Coupling(id)', 'Coupling(score)']); + Trace("$featureID coupled with score $score to ID $couplingID.") if T(coupling => 4); # Get the other feature that participates in the coupling. my ($otherFeatureID) = $self->GetFlat(['ParticipatesInCoupling'], "ParticipatesInCoupling(to-link) = ? AND ParticipatesInCoupling(from-link) <> ?", [$couplingID, $featureID], 'ParticipatesInCoupling(from-link)'); + Trace("$couplingID target feature is $otherFeatureID.") if T(coupling => 4); # Attach the other feature's score to its ID. $retVal{$otherFeatureID} = $score; $found = 1; @@ -1802,7 +1826,7 @@ my ($self, $peg1, $peg2) = @_; # Declare the return values. We'll start with the coupling ID and undefine the # flag and score until we have more information. - my ($retVal, $inverted, $score) = (CouplingID($peg1, $peg2), undef, undef); + my ($retVal, $inverted, $score) = ($self->CouplingID($peg1, $peg2), undef, undef); # Find the coupling data. my @pegs = $self->GetAll(['Coupling', 'ParticipatesInCoupling'], "Coupling(id) = ? ORDER BY ParticipatesInCoupling(pos)", @@ -1823,9 +1847,112 @@ return ($retVal, $inverted, $score); } +=head3 GetSynonymGroup + +C<< my $id = $sprout->GetSynonymGroup($fid); >> + +Return the synonym group name for the specified feature. + +=over 4 + +=item fid + +ID of the feature whose synonym group is desired. + +=item RETURN + +The name of the synonym group to which the feature belongs. If the feature does +not belong to a synonym group, the feature ID itself is returned. + +=back + +=cut + +sub GetSynonymGroup { + # Get the parameters. + my ($self, $fid) = @_; + # Declare the return variable. + my $retVal; + # Find the synonym group. + my @groups = $self->GetFlat(['IsSynonymGroupFor'], "IsSynonymGroupFor(to-link) = ?", + [$fid], 'IsSynonymGroupFor(from-link)'); + # Check to see if we found anything. + if (@groups) { + $retVal = $groups[0]; + } else { + $retVal = $fid; + } + # Return the result. + return $retVal; +} + +=head3 GetBoundaries + +C<< my ($contig, $beg, $end) = $sprout->GetBoundaries(@locList); >> + +Determine the begin and end boundaries for the locations in a list. All of the +locations must belong to the same contig and have mostly the same direction in +order for this method to produce a meaningful result. The resulting +begin/end pair will contain all of the bases in any of the locations. + +=over 4 + +=item locList + +List of locations to process. + +=item RETURN + +Returns a 3-tuple consisting of the contig ID, the beginning boundary, +and the ending boundary. The beginning boundary will be left of the +end for mostly-forward locations and right of the end for mostly-backward +locations. + +=back + +=cut + +sub GetBoundaries { + # Get the parameters. + my ($self, @locList) = @_; + # Set up the counters used to determine the most popular direction. + my %counts = ( '+' => 0, '-' => 0 ); + # Get the last location and parse it. + my $locObject = BasicLocation->new(pop @locList); + # Prime the loop with its data. + my ($contig, $beg, $end) = ($locObject->Contig, $locObject->Left, $locObject->Right); + # Count its direction. + $counts{$locObject->Dir}++; + # Loop through the remaining locations. Note that in most situations, this loop + # will not iterate at all, because most of the time we will be dealing with a + # singleton list. + for my $loc (@locList) { + # Create a location object. + my $locObject = BasicLocation->new($loc); + # Count the direction. + $counts{$locObject->Dir}++; + # Get the left end and the right end. + my $left = $locObject->Left; + my $right = $locObject->Right; + # Merge them into the return variables. + if ($left < $beg) { + $beg = $left; + } + if ($right > $end) { + $end = $right; + } + } + # If the most common direction is reverse, flip the begin and end markers. + if ($counts{'-'} > $counts{'+'}) { + ($beg, $end) = ($end, $beg); + } + # Return the result. + return ($contig, $beg, $end); +} + =head3 CouplingID -C<< my $couplingID = Sprout::CouplingID($peg1, $peg2); >> +C<< my $couplingID = $sprout->CouplingID($peg1, $peg2); >> Return the coupling ID for a pair of feature IDs. @@ -1858,7 +1985,8 @@ =cut #: Return Type $; sub CouplingID { - return join " ", sort @_; + my ($self, @pegs) = @_; + return $self->DigestKey(join " ", sort @pegs); } =head3 ReadFasta @@ -2217,41 +2345,6 @@ return @retVal; } -=head3 Exists - -C<< my $found = $sprout->Exists($entityName, $entityID); >> - -Return TRUE if an entity exists, else FALSE. - -=over 4 - -=item entityName - -Name of the entity type (e.g. C) relevant to the existence check. - -=item entityID - -ID of the entity instance whose existence is to be checked. - -=item RETURN - -Returns TRUE if the entity instance exists, else FALSE. - -=back - -=cut -#: Return Type $; -sub Exists { - # Get the parameters. - my ($self, $entityName, $entityID) = @_; - # Check for the entity instance. - Trace("Checking existence of $entityName with ID=$entityID.") if T(4); - my $testInstance = $self->GetEntity($entityName, $entityID); - # Return an existence indicator. - my $retVal = ($testInstance ? 1 : 0); - return $retVal; -} - =head3 FeatureTranslation C<< my $translation = $sprout->FeatureTranslation($featureID); >> @@ -2789,7 +2882,65 @@ return @retVal; } +=head3 GenomeSubsystemData + +C<< my %featureData = $sprout->GenomeSubsystemData($genomeID); >> +Return a hash mapping genome features to their subsystem roles. + +=over 4 + +=item genomeID + +ID of the genome whose subsystem feature map is desired. + +=item RETURN + +Returns a hash mapping each feature of the genome to a list of 2-tuples. Eacb +2-tuple contains a subsystem name followed by a role ID. + +=back + +=cut + +sub GenomeSubsystemData { + # Get the parameters. + my ($self, $genomeID) = @_; + # Declare the return variable. + my %retVal = (); + # Get a list of the genome features that participate in subsystems. For each + # feature we get its spreadsheet cells and the corresponding roles. + my @roleData = $self->GetAll(['HasFeature', 'ContainsFeature', 'IsRoleOf'], + "HasFeature(from-link) = ?", [$genomeID], + ['HasFeature(to-link)', 'IsRoleOf(to-link)', 'IsRoleOf(from-link)']); + # Now we get a list of the spreadsheet cells and their associated subsystems. Subsystems + # with an unknown variant code (-1) are skipped. Note the genome ID is at both ends of the + # list. We use it at the beginning to get all the spreadsheet cells for the genome and + # again at the end to filter out participation in subsystems with a negative variant code. + my @cellData = $self->GetAll(['IsGenomeOf', 'HasSSCell', 'ParticipatesIn'], + "IsGenomeOf(from-link) = ? AND ParticipatesIn(variant-code) >= 0 AND ParticipatesIn(from-link) = ?", + [$genomeID, $genomeID], ['HasSSCell(to-link)', 'HasSSCell(from-link)']); + # Now "@roleData" lists the spreadsheet cell and role for each of the genome's features. + # "@cellData" lists the subsystem name for each of the genome's spreadsheet cells. We + # link these two lists together to create the result. First, we want a hash mapping + # spreadsheet cells to subsystem names. + my %subHash = map { $_->[0] => $_->[1] } @cellData; + # We loop through @cellData to build the hash. + for my $roleEntry (@roleData) { + # Get the data for this feature and cell. + my ($fid, $cellID, $role) = @{$roleEntry}; + # Check for a subsystem name. + my $subsys = $subHash{$cellID}; + if ($subsys) { + # Insure this feature has an entry in the return hash. + if (! exists $retVal{$fid}) { $retVal{$fid} = []; } + # Merge in this new data. + push @{$retVal{$fid}}, [$subsys, $role]; + } + } + # Return the result. + return %retVal; +} =head3 RelatedFeatures @@ -2826,9 +2977,7 @@ # Get the parameters. my ($self, $featureID, $function, $userID) = @_; # Get a list of the features that are BBHs of the incoming feature. - my @bbhFeatures = $self->GetFlat(['IsBidirectionalBestHitOf'], - "IsBidirectionalBestHitOf(from-link) = ?", [$featureID], - 'IsBidirectionalBestHitOf(to-link)'); + my @bbhFeatures = map { $_->[0] } FIGRules::BBHData($featureID); # Now we loop through the features, pulling out the ones that have the correct # functional assignment. my @retVal = (); @@ -3029,11 +3178,8 @@ my ($self, $featureID, $cutoff) = @_; # Create the return hash. my %retVal = (); - # Create a query to get the desired BBHs. - my @bbhList = $self->GetAll(['IsBidirectionalBestHitOf'], - 'IsBidirectionalBestHitOf(sc) <= ? AND IsBidirectionalBestHitOf(from-link) = ?', - [$cutoff, $featureID], - ['IsBidirectionalBestHitOf(to-link)', 'IsBidirectionalBestHitOf(sc)']); + # Query for the desired BBHs. + my @bbhList = FIGRules::BBHData($featureID, $cutoff); # Form the results into the return hash. for my $pair (@bbhList) { $retVal{$pair->[0]} = $pair->[1]; @@ -3042,6 +3188,71 @@ return %retVal; } +=head3 Sims + +C<< my $simList = $sprout->Sims($fid, $maxN, $maxP, $select, $max_expand, $filters); >> + +Get a list of similarities for a specified feature. Similarity information is not kept in the +Sprout database; rather, they are retrieved from a network server. The similarities are +returned as B objects. A Sim object is actually a list reference that has been blessed +so that its elements can be accessed by name. + +Similarities can be either raw or expanded. The raw similarities are basic +hits between features with similar DNA. Expanding a raw similarity drags in any +features considered substantially identical. So, for example, if features B, +B, and B are all substatially identical to B, then a raw similarity +B<[C,A]> would be expanded to B<[C,A] [C,A1] [C,A2] [C,A3]>. + +=over 4 + +=item fid + +ID of the feature whose similarities are desired. + +=item maxN + +Maximum number of similarities to return. + +=item maxP + +Minumum allowable similarity score. + +=item select + +Selection criterion: C means only raw similarities are returned; C +means only similarities to FIG features are returned; C means all expanded +similarities are returned; and C means similarities are expanded until the +number of FIG features equals the maximum. + +=item max_expand + +The maximum number of features to expand. + +=item filters + +Reference to a hash containing filter information, or a subroutine that can be +used to filter the sims. + +=item RETURN + +Returns a reference to a list of similarity objects, or C if an error +occurred. + +=back + +=cut + +sub Sims { + # Get the parameters. + my ($self, $fid, $maxN, $maxP, $select, $max_expand, $filters) = @_; + # Create the shim object to test for deleted FIDs. + my $shim = FidCheck->new($self); + # Ask the network for sims. + my $retVal = FIGRules::GetNetworkSims($shim, $fid, {}, $maxN, $maxP, $select, $max_expand, $filters); + # Return the result. + return $retVal; +} + =head3 GetGroups C<< my %groups = $sprout->GetGroups(\@groupList); >> @@ -3199,6 +3410,153 @@ return $retVal; } +=head3 Fix + +C<< my %fixedHash = Sprout::Fix(%groupHash); >> + +Prepare a genome group hash (like that returned by L for processing. +Groups with the same primary name will be combined. The primary name is the +first capitalized word in the group name. + +=over 4 + +=item groupHash + +Hash to be fixed up. + +=item RETURN + +Returns a fixed-up version of the hash. + +=back + +=cut + +sub Fix { + # Get the parameters. + my (%groupHash) = @_; + # Create the result hash. + my %retVal = (); + # Copy over the genomes. + for my $groupID (keys %groupHash) { + # Make a safety copy of the group ID. + my $realGroupID = $groupID; + # Yank the primary name. + if ($groupID =~ /([A-Z]\w+)/) { + $realGroupID = $1; + } + # Append this group's genomes into the result hash. + Tracer::AddToListMap(\%retVal, $realGroupID, @{$groupHash{$groupID}}); + } + # Return the result hash. + return %retVal; +} + +=head3 GroupPageName + +C<< my $name = $sprout->GroupPageName($group); >> + +Return the name of the page for the specified NMPDR group. + +=over 4 + +=item group + +Name of the relevant group. + +=item RETURN + +Returns the relative page name (e.g. C<../content/campy.php>). If the group file is not in +memory it will be read in. + +=back + +=cut + +sub GroupPageName { + # Get the parameters. + my ($self, $group) = @_; + # Declare the return variable. + my $retVal; + # Check for the group file data. + if (! defined $self->{groupHash}) { + # Read the group file. + my %groupData = Sprout::ReadGroupFile($self->{_options}->{dataDir} . "/groups.tbl"); + # Store it in our object. + $self->{groupHash} = \%groupData; + } + # Compute the real group name. + my $realGroup = $group; + if ($group =~ /([A-Z]\w+)/) { + $realGroup = $1; + } + # Return the page name. + $retVal = "../content/" . $self->{groupHash}->{$realGroup}->[1]; + # Return the result. + return $retVal; +} + +=head3 ReadGroupFile + +C<< my %groupData = Sprout::ReadGroupFile($groupFileName); >> + +Read in the data from the specified group file. The group file contains information +about each of the NMPDR groups. + +=over 4 + +=item name + +Name of the group. + +=item page + +Name of the group's page on the web site (e.g. C for +Campylobacter) + +=item genus + +Genus of the group + +=item species + +Species of the group, or an empty string if the group is for an entire +genus. If the group contains more than one species, the species names +should be separated by commas. + +=back + +The parameters to this method are as follows + +=over 4 + +=item groupFile + +Name of the file containing the group data. + +=item RETURN + +Returns a hash keyed on group name. The value of each hash + +=back + +=cut + +sub ReadGroupFile { + # Get the parameters. + my ($groupFileName) = @_; + # Declare the return variable. + my %retVal; + # Read the group file. + my @groupLines = Tracer::GetFile($groupFileName); + for my $groupLine (@groupLines) { + my ($name, $page, $genus, $species) = split(/\t/, $groupLine); + $retVal{$name} = [$page, $genus, $species]; + } + # Return the result. + return %retVal; +} + =head2 Internal Utility Methods =head3 ParseAssignment @@ -3255,7 +3613,7 @@ } # If we have an assignment, we need to clean the function text. There may be # extra junk at the end added as a note from the user. - if (@retVal) { + if (defined( $retVal[1] )) { $retVal[1] =~ s/(\t\S)?\s*$//; } # Return the result list. @@ -3346,4 +3704,4 @@ } -1; \ No newline at end of file +1;