--- Sprout.pm 2008/02/14 19:13:33 1.108 +++ Sprout.pm 2009/01/19 21:46:21 1.122 @@ -4,7 +4,7 @@ use strict; use DBKernel; use XML::Simple; - use DBQuery; + use ERDBQuery; use ERDBObject; use Tracer; use FIGRules; @@ -14,6 +14,9 @@ use BasicLocation; use CustomAttributes; use RemoteCustomAttributes; + use CGI qw(-nosticky); + use WikiTools; + use BioWords; use base qw(ERDB); =head1 Sprout Database Manipulation Object @@ -38,8 +41,6 @@ =cut -#: Constructor SFXlate->new_sprout_only(); - =head2 Public Methods =head3 new @@ -54,7 +55,7 @@ =item dbName -Name of the database. +Name of the database. If omitted, the default Sprout database name is used. =item options @@ -88,11 +89,29 @@ my $sprout = Sprout->new('Sprout', { userData => 'fig/admin', dataDir => '/usr/fig/SproutData' }); +In order to work properly with [[ERDBGeneratorPl]], the constructor has an alternate +form. + + my $sprout = Sprout->new(dbd => $filename); + +Where I<$fileName> is the name of the DBD file. This enables us to specify an alternate +DBD for the loader, which is important when the database format changes. + =cut sub new { # Get the parameters. my ($class, $dbName, $options) = @_; + # Check for the alternate signature, and default the database name if it is missing. + if ($dbName eq 'dbd') { + $dbName = $FIG_Config::sproutDB; + $options = { xmlFileName => $options }; + } elsif (! defined $dbName) { + $dbName = $FIG_Config::sproutDB; + } elsif (ref $dbName eq 'HASH') { + $options = $dbName; + $dbName = $FIG_Config::sproutDB; + } # Compute the DBD directory. my $dbd_dir = (defined($FIG_Config::dbd_dir) ? $FIG_Config::dbd_dir : $FIG_Config::fig ); @@ -105,11 +124,11 @@ # data file directory xmlFileName => "$dbd_dir/SproutDBD.xml", # database definition file name - userData => "$FIG_Config::dbuser/$FIG_Config::dbpass", + userData => "$FIG_Config::sproutUser/$FIG_Config::sproutPass", # user name and password - port => $FIG_Config::dbport, + port => $FIG_Config::sproutPort, # database connection port - sock => $FIG_Config::dbsock, + sock => $FIG_Config::sproutSock, host => $FIG_Config::sprout_host, maxSegmentLength => 4500, # maximum feature segment length maxSequenceLength => 8000, # maximum contig sequence length @@ -135,21 +154,47 @@ $retVal->{_xmlName} = $xmlFileName; # Set up space for the group file data. $retVal->{groupHash} = undef; - # Set up space for the genome hash. We use this to identify NMPDR genomes. - $retVal->{genomeHash} = undef; - # Connect to the attributes. - if ($FIG_Config::attrURL) { - Trace("Remote attribute server $FIG_Config::attrURL chosen.") if T(3); - $retVal->{_ca} = RemoteCustomAttributes->new($FIG_Config::attrURL); - } elsif ($FIG_Config::attrDbName) { - Trace("Local attribute database $FIG_Config::attrDbName chosen.") if T(3); - my $user = ($FIG_Config::arch eq 'win' ? 'self' : scalar(getpwent())); - $retVal->{_ca} = CustomAttributes->new(user => $user); - } + # Set up space for the genome hash. We use this to identify NMPDR genomes + # and remember genome data. + $retVal->{genomeHash} = {}; + $retVal->{genomeHashFilled} = 0; + # Remember the data directory name. + $retVal->{dataDir} = $dataDir; # Return it. return $retVal; } +=head3 ca + + my $ca = $sprout->ca():; + +Return the [[CustomAttributesPm]] object for retrieving object +properties. + +=cut + +sub ca { + # Get the parameters. + my ($self) = @_; + # Do we already have an attribute object? + my $retVal = $self->{_ca}; + if (! defined $retVal) { + # No, create one. How we do it depends on the configuration. + if ($FIG_Config::attrURL) { + Trace("Remote attribute server $FIG_Config::attrURL chosen.") if T(3); + $retVal = RemoteCustomAttributes->new($FIG_Config::attrURL); + } elsif ($FIG_Config::attrDbName) { + Trace("Local attribute database $FIG_Config::attrDbName chosen.") if T(3); + my $user = ($FIG_Config::arch eq 'win' ? 'self' : scalar(getpwent())); + $retVal = CustomAttributes->new(user => $user); + } + # Save it for next time. + $self->{_ca} = $retVal; + } + # Return the result. + return $retVal; +} + =head3 CoreGenomes my @genomes = $sprout->CoreGenomes($scope); @@ -308,7 +353,7 @@ The files are loaded based on the presumption that each line of the file is a record in the relation, and the individual fields are delimited by tabs. Tab and new-line characters inside fields must be represented by the escape sequences C<\t> and C<\n>, respectively. The fields must -be presented in the order given in the relation tables produced by the L method. +be presented in the order given in the relation tables produced by the database documentation. =over 4 @@ -465,87 +510,284 @@ return $retVal; } -=head3 GeneMenu +=head3 GenomeMenu - my $selectHtml = $sprout->GeneMenu(\%attributes, $filterString, \@params, $selected); + my $html = $sprout->GenomeMenu(%options); -Return an HTML select menu of genomes. Each genome will be an option in the menu, -and will be displayed by name with the ID and a contig count attached. The selection -value will be the genome ID. The genomes will be sorted by genus/species name. +Generate a genome selection control with the specified name and options. +This control is almost but not quite the same as the genome control in the +B class. Eventually, the two will be combined. =over 4 -=item attributes +=item options -Reference to a hash mapping attributes to values for the SELECT tag generated. +Optional parameters for the control (see below). -=item filterString +=item RETURN -A filter string for use in selecting the genomes. The filter string must conform -to the rules for the C<< ERDB->Get >> method. +Returns the HTML for a genome selection control on a form (sometimes called a popup menu). -=item params +=back -Reference to a list of values to be substituted in for the parameter marks in -the filter string. +The valid options are as follows. -=item selected (optional) +=over 4 -ID of the genome to be initially selected. +=item name -=item fast (optional) +Name to give this control for use in passing it to the form. The default is C. +Terrible things will happen if you have two controls with the same name on the same page. -If specified and TRUE, the contig counts will be omitted to improve performance. +=item filter -=item RETURN +If specified, a filter for the list of genomes to display. The filter should be in the form of a +list reference, a string, or a hash reference. If it is a list reference, the first element +of the list should be the filter string, and the remaining elements the filter parameters. If it is a +string, it will be split into a list at each included tab. If it is a hash reference, it should be +a hash that maps genomes which should be included to a TRUE value. + +=item multiSelect + +If TRUE, then the user can select multiple genomes. If FALSE, the user can only select one genome. -Returns an HTML select menu with the specified genomes as selectable options. +=item size + +Number of rows to display in the control. The default is C<10> + +=item id + +ID to give this control. The default is the value of the C option. Nothing will work correctly +unless this ID is unique. + +=item selected + +A comma-delimited list of selected genomes, or a reference to a list of selected genomes. The +default is none. + +=item class + +If specified, a style class to assign to the genome control. =back =cut -sub GeneMenu { +sub GenomeMenu { # Get the parameters. - my ($self, $attributes, $filterString, $params, $selected, $fast) = @_; - my $slowMode = ! $fast; - # Default to nothing selected. This prevents an execution warning if "$selected" - # is undefined. - $selected = "" unless defined $selected; - Trace("Gene Menu called with slow mode \"$slowMode\" and selection \"$selected\".") if T(3); - # Start the menu. - my $retVal = ""); + # Loop through the groups. + for my $group (@groups) { + # Get the genomes in the group. + for my $genome (@{$gHash{$group}}) { + # If this is an NMPDR organism, we add an extra style and count it. + my $nmpdrStyle = ""; + if ($nmpdrGroupCount > 0) { + $nmpdrCount++; + $nmpdrStyle = " Core"; + } + # Get the organism ID, name, contig count, and domain. + my ($genomeID, $name, $contigCount, $domain) = @{$genome}; + # See if we're pre-selected. + my $selectTag = ($selected{$genomeID} ? " SELECTED" : ""); + # Compute the display name. + my $nameString = "$name ($genomeID$contigCount)"; + # Generate the option tag. + my $optionTag = ""; + push @lines, " $optionTag"; + } + # Record this group in the nmpdrGroup count. When that gets to 0, we've finished the NMPDR + # groups. + $nmpdrGroupCount--; } # Close the SELECT tag. - $retVal .= "\n"; + push @lines, ""; + if ($rows > 1) { + # We're in a non-compact mode, so we need to add some selection helpers. First is + # the search box. This allows the user to type text and change which genomes are + # displayed. For multiple-select mode, we include a button that selects the displayed + # genes. For single-select mode, we use a plain label instead. + my $searchThingName = "${menuID}_SearchThing"; + my $searchThingLabel = ($multiSelect ? "" + : "Show genomes containing"); + push @lines, "
$searchThingLabel " . + "" . + Hint("GenomeControl", "Type here to filter the genomes displayed.") . "
"; + # For multi-select mode, we also have buttons to set and clear selections. + if ($multiSelect) { + push @lines, ""; + push @lines, ""; + push @lines, ""; + } + # Add a hidden field we can use to generate organism page hyperlinks. + push @lines, ""; + # Add the status display. This tells the user what's selected no matter where the list is scrolled. + push @lines, "
"; + } + # Assemble all the lines into a string. + my $retVal = join("\n", @lines, ""); # Return the result. return $retVal; } + +=head3 Stem + + my $stem = $sprout->Stem($word); + +Return the stem of the specified word, or C if the word is not +stemmable. Note that even if the word is stemmable, the stem may be +the same as the original word. + +=over 4 + +=item word + +Word to convert into a stem. + +=item RETURN + +Returns a stem of the word (which may be the word itself), or C if +the word is not stemmable. + +=back + +=cut + +sub Stem { + # Get the parameters. + my ($self, $word) = @_; + # Get the stemmer object. + my $stemmer = $self->{stemmer}; + if (! defined $stemmer) { + # We don't have one pre-built, so we build and save it now. + $stemmer = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt", + stops => "$FIG_Config::sproutData/StopWords.txt", + cache => 0); + $self->{stemmer} = $stemmer; + } + # Try to stem the word. + my $retVal = $stemmer->Process($word); + # Return the result. + return $retVal; +} + + =head3 Build $sprout->Build(); @@ -604,11 +846,15 @@ sub GenusSpecies { # Get the parameters. my ($self, $genomeID) = @_; - # Get the data for the specified genome. - my @values = $self->GetEntityValues('Genome', $genomeID, ['Genome(genus)', 'Genome(species)', - 'Genome(unique-characterization)']); - # Format the result and return it. - my $retVal = join(' ', @values); + # Declare the return value. + my $retVal; + # Get the genome data. + my $genomeData = $self->_GenomeData($genomeID); + # Only proceed if we found the genome. + if (defined $genomeData) { + $retVal = $genomeData->PrimaryValue('Genome(scientific-name)'); + } + # Return it. return $retVal; } @@ -683,7 +929,8 @@ =item RETURN Returns a list of the feature's contig segments. The locations are returned as a list in a list -context and as a comma-delimited string in a scalar context. +context and as a comma-delimited string in a scalar context. An empty list means the feature +wasn't found. =back @@ -692,13 +939,17 @@ sub FeatureLocation { # Get the parameters. my ($self, $featureID) = @_; + # Declare the return variable. + my @retVal = (); # Get the feature record. my $object = $self->GetEntity('Feature', $featureID); - Confess("Feature $featureID not found.") if ! defined($object); - # Get the location string. - my $locString = $object->PrimaryValue('Feature(location-string)'); - # Create the return list. - my @retVal = split /\s*,\s*/, $locString; + # Only proceed if we found it. + if (defined $object) { + # Get the location string. + my $locString = $object->PrimaryValue('Feature(location-string)'); + # Create the return list. + @retVal = split /\s*,\s*/, $locString; + } # Return the list in the format indicated by the context. return (wantarray ? @retVal : join(',', @retVal)); } @@ -941,11 +1192,12 @@ my ($self, $genomeID) = @_; # Declare the return variable. my $retVal = 0; - # Get the genome's contig sequence lengths. - my @lens = $self->GetFlat(['HasContig', 'IsMadeUpOf'], 'HasContig(from-link) = ?', - [$genomeID], 'IsMadeUpOf(len)'); - # Sum the lengths. - map { $retVal += $_ } @lens; + # Get the genome data. + my $genomeData = $self->_GenomeData($genomeID); + # Only proceed if it exists. + if (defined $genomeData) { + $retVal = $genomeData->PrimaryValue('Genome(dna-size)'); + } # Return the result. return $retVal; } @@ -1430,9 +1682,8 @@ the specified user and FIG are considered trusted. If the user ID is omitted, only FIG is trusted. -If the feature is B identified by a FIG ID, then the functional assignment -information is taken from the B table. If the table does -not contain an entry for the feature, an undefined value is returned. +If the feature is B identified by a FIG ID, then we search the aliases for it. +If no matching alias is found, we return an undefined value. =over 4 @@ -1458,12 +1709,14 @@ my ($self, $featureID, $userID) = @_; # Declare the return value. my $retVal; - # Determine the ID type. - if ($featureID =~ m/^fig\|/) { + # Find a FIG ID for this feature. + my ($fid) = $self->FeaturesByAlias($featureID); + # Only proceed if we have an ID. + if ($fid) { # Here we have a FIG feature ID. if (!$userID) { # Use the primary assignment. - ($retVal) = $self->GetEntityValues('Feature', $featureID, ['Feature(assignment)']); + ($retVal) = $self->GetEntityValues('Feature', $fid, ['Feature(assignment)']); } else { # We must build the list of trusted users. my %trusteeTable = (); @@ -1489,7 +1742,7 @@ # Build a query for all of the feature's annotations, sorted by date. my $query = $self->Get(['IsTargetOfAnnotation', 'Annotation', 'MadeAnnotation'], "IsTargetOfAnnotation(from-link) = ? ORDER BY Annotation(time) DESC", - [$featureID]); + [$fid]); my $timeSelected = 0; # Loop until we run out of annotations. while (my $annotation = $query->Fetch()) { @@ -1509,11 +1762,6 @@ } } } - } else { - # Here we have a non-FIG feature ID. In this case the user ID does not - # matter. We simply get the information from the External Alias Function - # table. - ($retVal) = $self->GetEntityValues('ExternalAliasFunc', $featureID, ['ExternalAliasFunc(func)']); } # Return the assignment found. return $retVal; @@ -1532,10 +1780,6 @@ annotation itself because it's a text field; however, this is not a big problem because most features only have a small number of annotations. -If the feature is B identified by a FIG ID, then the functional assignment -information is taken from the B table. If the table does -not contain an entry for the feature, an empty list is returned. - =over 4 =item featureID @@ -1556,15 +1800,17 @@ my ($self, $featureID) = @_; # Declare the return value. my @retVal = (); - # Determine the ID type. - if ($featureID =~ m/^fig\|/) { + # Convert to a FIG ID. + my ($fid) = $self->FeaturesByAlias($featureID); + # Only proceed if we found one. + if ($fid) { # Here we have a FIG feature ID. We must build the list of trusted # users. my %trusteeTable = (); # Build a query for all of the feature's annotations, sorted by date. my $query = $self->Get(['IsTargetOfAnnotation', 'Annotation', 'MadeAnnotation'], "IsTargetOfAnnotation(from-link) = ? ORDER BY Annotation(time) DESC", - [$featureID]); + [$fid]); my $timeSelected = 0; # Loop until we run out of annotations. while (my $annotation = $query->Fetch()) { @@ -1579,13 +1825,6 @@ push @retVal, [$actualUser, $function]; } } - } else { - # Here we have a non-FIG feature ID. In this case the user ID does not - # matter. We simply get the information from the External Alias Function - # table. - my @assignments = $self->GetEntityValues('ExternalAliasFunc', $featureID, - ['ExternalAliasFunc(func)']); - push @retVal, map { ['master', $_] } @assignments; } # Return the assignments found. return @retVal; @@ -1625,10 +1864,10 @@ # Loop through the incoming features. for my $featureID (@{$featureList}) { # Ask the server for the feature's best hit. - my @bbhData = FIGRules::BBHData($featureID); + my $bbhData = FIGRules::BBHData($featureID); # Peel off the BBHs found. my @found = (); - for my $bbh (@bbhData) { + for my $bbh (@$bbhData) { my $fid = $bbh->[0]; my $bbGenome = $self->GenomeOf($fid); if ($bbGenome eq $genomeID) { @@ -1667,10 +1906,10 @@ # Get the parameters. my ($self, $featureID, $count) = @_; # Ask for the best hits. - my @lists = FIGRules::BBHData($featureID); + my $lists = FIGRules::BBHData($featureID); # Create the return value. my %retVal = (); - for my $tuple (@lists) { + for my $tuple (@$lists) { $retVal{$tuple->[0]} = $tuple->[1]; } # Return the result. @@ -1704,8 +1943,9 @@ # Declare the return variable. my $retVal; # Get the genome's data. - my $genomeData = $self->GetEntity('Genome', $genomeID); - if ($genomeData) { + my $genomeData = $self->_GenomeData($genomeID); + # Only proceed if it exists. + if (defined $genomeData) { # The genome exists, so get the completeness flag. $retVal = $genomeData->PrimaryValue('Genome(complete)'); } @@ -1773,7 +2013,11 @@ if ($featureID =~ /^fig\|(\d+\.\d+)/) { $retVal = $1; } else { - Confess("Invalid feature ID $featureID."); + # Find the feature by alias. + my ($realFeatureID) = $self->FeaturesByAlias($featureID); + if ($realFeatureID && $realFeatureID =~ /^fig\|(\d+\.\d+)/) { + $retVal = $1; + } } # Return the value found. return $retVal; @@ -1817,11 +2061,6 @@ $retVal{$featureID2} = $score; } } - # Functional coupling is reflexive. If we found at least one coupled feature, we must add - # the incoming feature as well. - if (keys %retVal) { - $retVal{$featureID} = 9999; - } # Return the hash. return %retVal; } @@ -2178,11 +2417,18 @@ sub GetGenomeNameData { # Get the parameters. my ($self, $genomeID) = @_; - # Get the desired values. - my ($genus, $species, $strain) = $self->GetEntityValues('Genome', $genomeID => - [qw(Genome(genus) Genome(species) Genome(unique-characterization))]); - # Throw an error if they were not found. - if (! defined $genus) { + # Declare the return variables. + my ($genus, $species, $strain); + # Get the genome's data. + my $genomeData = $self->_GenomeData($genomeID); + # Only proceed if the genome exists. + if (defined $genomeData) { + # Get the desired values. + ($genus, $species, $strain) = $genomeData->Values(['Genome(genus)', + 'Genome(species)', + 'Genome(unique-characterization)']); + } else { + # Throw an error because they were not found. Confess("Genome $genomeID not found in database."); } # Return the results. @@ -2483,14 +2729,16 @@ sub Taxonomy { # Get the parameters. my ($self, $genome) = @_; - # Find the specified genome's taxonomy string. - my ($list) = $self->GetEntityValues('Genome', $genome, ['Genome(taxonomy)']); # Declare the return variable. my @retVal = (); - # If we found the genome, return its taxonomy string. - if ($list) { - @retVal = split /\s*;\s*/, $list; + # Get the genome data. + my $genomeData = $self->_GenomeData($genome); + # Only proceed if it exists. + if (defined $genomeData) { + # Create the taxonomy from the taxonomy string. + @retVal = split /\s*;\s*/, $genomeData->PrimaryValue('Genome(taxonomy)'); } else { + # Genome doesn't exist, so emit a warning. Trace("Genome \"$genome\" does not have a taxonomy in the database.\n") if T(0); } # Return the value found. @@ -2535,17 +2783,8 @@ } my @taxA = $self->Taxonomy($genomeA); my @taxB = $self->Taxonomy($genomeB); - # Initialize the distance to 1. We'll reduce it each time we find a match between the - # taxonomies. - my $retVal = 1.0; - # Initialize the subtraction amount. This amount determines the distance reduction caused - # by a mismatch at the current level. - my $v = 0.5; - # Loop through the taxonomies. - for (my $i = 0; ($i < @taxA) && ($i < @taxB) && ($taxA[$i] eq $taxB[$i]); $i++) { - $retVal -= $v; - $v /= 2; - } + # Compute the distance. + my $retVal = FIGRules::CrudeDistanceFormula(\@taxA, \@taxB); return $retVal; } @@ -2613,92 +2852,6 @@ return @retVal; } -=head3 GetProperties - - my @list = $sprout->GetProperties($fid, $key, $value, $url); - -Return a list of the properties with the specified characteristics. - -Properties are the Sprout analog of the FIG attributes. The call is -passed directly to the CustomAttributes or RemoteCustomAttributes object -contained in this object. - -This method returns a series of tuples that match the specified criteria. Each tuple -will contain an object ID, a key, and one or more values. The parameters to this -method therefore correspond structurally to the values expected in each tuple. In -addition, you can ask for a generic search by suffixing a percent sign (C<%>) to any -of the parameters. So, for example, - - my @attributeList = $sprout->GetProperties('fig|100226.1.peg.1004', 'structure%', 1, 2); - -would return something like - - ['fig}100226.1.peg.1004', 'structure', 1, 2] - ['fig}100226.1.peg.1004', 'structure1', 1, 2] - ['fig}100226.1.peg.1004', 'structure2', 1, 2] - ['fig}100226.1.peg.1004', 'structureA', 1, 2] - -Use of C in any position acts as a wild card (all values). You can also specify -a list reference in the ID column. Thus, - - my @attributeList = $sprout->GetProperties(['100226.1', 'fig|100226.1.%'], 'PUBMED'); - -would get the PUBMED attribute data for Streptomyces coelicolor A3(2) and all its -features. - -In addition to values in multiple sections, a single attribute key can have multiple -values, so even - - my @attributeList = $sprout->GetProperties($peg, 'virulent'); - -which has no wildcard in the key or the object ID, may return multiple tuples. - -=over 4 - -=item objectID - -ID of object whose attributes are desired. If the attributes are desired for multiple -objects, this parameter can be specified as a list reference. If the attributes are -desired for all objects, specify C or an empty string. Finally, you can specify -attributes for a range of object IDs by putting a percent sign (C<%>) at the end. - -=item key - -Attribute key name. A value of C or an empty string will match all -attribute keys. If the values are desired for multiple keys, this parameter can be -specified as a list reference. Finally, you can specify attributes for a range of -keys by putting a percent sign (C<%>) at the end. - -=item values - -List of the desired attribute values, section by section. If C -or an empty string is specified, all values in that section will match. A -generic match can be requested by placing a percent sign (C<%>) at the end. -In that case, all values that match up to and not including the percent sign -will match. You may also specify a regular expression enclosed -in slashes. All values that match the regular expression will be returned. For -performance reasons, only values have this extra capability. - -=item RETURN - -Returns a list of tuples. The first element in the tuple is an object ID, the -second is an attribute key, and the remaining elements are the sections of -the attribute value. All of the tuples will match the criteria set forth in -the parameter list. - -=back - -=cut - -sub GetProperties { - # Get the parameters. - my ($self, @parms) = @_; - # Declare the return variable. - my @retVal = $self->{_ca}->GetAttributes(@parms); - # Return the result. - return @retVal; -} - =head3 FeatureProperties my @properties = $sprout->FeatureProperties($featureID); @@ -2727,7 +2880,7 @@ # Get the parameters. my ($self, $featureID) = @_; # Get the properties. - my @attributes = $self->{_ca}->GetAttributes($featureID); + my @attributes = $self->ca->GetAttributes($featureID); # Strip the feature ID off each tuple. my @retVal = (); for my $attributeRow (@attributes) { @@ -2999,8 +3152,9 @@ sub SubsystemList { # Get the parameters. my ($self, $featureID) = @_; - # Get the list of names. - my @retVal = $self->GetFlat(['HasRoleInSubsystem'], "HasRoleInSubsystem(from-link) = ?", + # Get the list of names. We do a join to the Subsystem table because we have missing subsystems in + # the Sprout database! + my @retVal = $self->GetFlat(['HasRoleInSubsystem', 'Subsystem'], "HasRoleInSubsystem(from-link) = ?", [$featureID], 'HasRoleInSubsystem(to-link)'); # Return the result, sorted. return sort @retVal; @@ -3033,29 +3187,23 @@ # Declare the return variable. my %retVal = (); # Get a list of the genome features that participate in subsystems. For each - # feature we get its spreadsheet cells and the corresponding roles. - my @roleData = $self->GetAll(['HasFeature', 'ContainsFeature', 'IsRoleOf'], - "HasFeature(from-link) = ?", [$genomeID], - ['HasFeature(to-link)', 'IsRoleOf(to-link)', 'IsRoleOf(from-link)']); - # Now we get a list of the spreadsheet cells and their associated subsystems. Subsystems - # with an unknown variant code (-1) are skipped. Note the genome ID is at both ends of the - # list. We use it at the beginning to get all the spreadsheet cells for the genome and - # again at the end to filter out participation in subsystems with a negative variant code. - my @cellData = $self->GetAll(['IsGenomeOf', 'HasSSCell', 'ParticipatesIn'], - "IsGenomeOf(from-link) = ? AND ParticipatesIn(variant-code) >= 0 AND ParticipatesIn(from-link) = ?", - [$genomeID, $genomeID], ['HasSSCell(to-link)', 'HasSSCell(from-link)']); - # Now "@roleData" lists the spreadsheet cell and role for each of the genome's features. - # "@cellData" lists the subsystem name for each of the genome's spreadsheet cells. We - # link these two lists together to create the result. First, we want a hash mapping - # spreadsheet cells to subsystem names. - my %subHash = map { $_->[0] => $_->[1] } @cellData; - # We loop through @cellData to build the hash. + # feature we get its subsystem ID and the corresponding roles. + my @roleData = $self->GetAll(['HasFeature', 'ContainsFeature', 'IsRoleOf', 'HasSSCell'], + "HasFeature(from-link) = ?", [$genomeID], + ['HasFeature(to-link)', 'IsRoleOf(from-link)', 'HasSSCell(from-link)']); + # Now we get a list of valid subsystems. These are the subsystems connected to the genome with + # a non-negative variant code. + my %subs = map { $_ => 1 } $self->GetFlat(['ParticipatesIn'], + "ParticipatesIn(from-link) = ? AND ParticipatesIn(variant-code) >= 0", + [$genomeID], 'ParticipatesIn(to-link)'); + # We loop through @roleData to build the hash. for my $roleEntry (@roleData) { # Get the data for this feature and cell. - my ($fid, $cellID, $role) = @{$roleEntry}; - # Check for a subsystem name. - my $subsys = $subHash{$cellID}; - if ($subsys) { + my ($fid, $role, $subsys) = @{$roleEntry}; + Trace("Subsystem for $fid is $subsys.") if T(4); + # Check the subsystem; + if ($subs{$subsys}) { + Trace("Subsystem found.") if T(4); # Insure this feature has an entry in the return hash. if (! exists $retVal{$fid}) { $retVal{$fid} = []; } # Merge in this new data. @@ -3101,7 +3249,8 @@ # Get the parameters. my ($self, $featureID, $function, $userID) = @_; # Get a list of the features that are BBHs of the incoming feature. - my @bbhFeatures = map { $_->[0] } FIGRules::BBHData($featureID); + my $bbhData = FIGRules::BBHData($featureID); + my @bbhFeatures = map { $_->[0] } @$bbhData; # Now we loop through the features, pulling out the ones that have the correct # functional assignment. my @retVal = (); @@ -3274,7 +3423,7 @@ =head3 BBHMatrix - my %bbhMap = $sprout->BBHMatrix($genomeID, $cutoff, @targets); + my $bbhMap = $sprout->BBHMatrix($genomeID, $cutoff, @targets); Find all the bidirectional best hits for the features of a genome in a specified list of target genomes. The return value will be a hash mapping @@ -3298,8 +3447,8 @@ =item RETURN -Returns a hash mapping each feature in the original genome to a hash mapping its -BBH pegs in the target genomes to their scores. +Returns a reference to a hash mapping each feature in the original genome +to a sub-hash mapping its BBH pegs in the target genomes to their scores. =back @@ -3312,6 +3461,7 @@ my %retVal = (); # Ask for the BBHs. my @bbhList = FIGRules::BatchBBHs("fig|$genomeID.%", $cutoff, @targets); + Trace("Retrieved " . scalar(@bbhList) . " BBH results.") if T(3); # We now have a set of 4-tuples that we need to convert into a hash of hashes. for my $bbhData (@bbhList) { my ($peg1, $peg2, $score) = @{$bbhData}; @@ -3322,7 +3472,7 @@ } } # Return the result. - return %retVal; + return \%retVal; } @@ -3428,9 +3578,9 @@ # Create the return hash. my %retVal = (); # Query for the desired BBHs. - my @bbhList = FIGRules::BBHData($featureID, $cutoff); + my $bbhList = FIGRules::BBHData($featureID, $cutoff); # Form the results into the return hash. - for my $pair (@bbhList) { + for my $pair (@$bbhList) { my $fid = $pair->[0]; if ($self->Exists('Feature', $fid)) { $retVal{$fid} = $pair->[1]; @@ -3808,7 +3958,7 @@ # Get the parameters. my ($self, $featureID, $key, @values) = @_; # Add the property using the attached attributes object. - $self->{_ca}->AddAttribute($featureID, $key, @values); + $self->ca->AddAttribute($featureID, $key, @values); } =head3 CheckGroupFile @@ -3842,7 +3992,7 @@ =cut -sub CheckGroupFile{ +sub CheckGroupFile { # Get the parameters. my ($self) = @_; # Check to see if we already have this hash. @@ -3895,22 +4045,158 @@ sub CleanKeywords { # Get the parameters. my ($self, $searchExpression) = @_; - # Perform the standard cleanup. - my $retVal = $self->ERDB::CleanKeywords($searchExpression); - # Fix the periods in EC and TC numbers. - $retVal =~ s/(\d+|\-)\.(\d+|-)\.(\d+|-)\.(\d+|-)/$1_$2_$3_$4/g; - # Fix non-trailing periods. - $retVal =~ s/\.(\w)/_$1/g; - # Fix non-leading minus signs. - $retVal =~ s/(\w)[\-]/$1_/g; - # Fix the vertical bars and colons - $retVal =~ s/(\w)[|:](\w)/$1'$2/g; + # Get the stemmer. + my $stemmer = $self->GetStemmer(); + # Convert the search expression using the stemmer. + my $retVal = $stemmer->PrepareSearchExpression($searchExpression); + Trace("Cleaned keyword list for \"$searchExpression\" is \"$retVal\".") if T(3); # Return the result. return $retVal; } +=head3 GetSourceObject + + my $source = $erdb->GetSourceObject(); + +Return the object to be used in creating load files for this database. + +=cut + +sub GetSourceObject { + # Get the parameters. + my ($self) = @_; + # Check to see if we already have a source object. + my $retVal = $self->{_fig}; + if (! defined $retVal) { + # No, so create one. + require FIG; + $retVal = FIG->new(); + } + # Return the object. + return $retVal; +} + +=head3 SectionList + + my @sections = $erdb->SectionList(); + +Return a list of the names for the different data sections used when loading this database. +The default is a single string, in which case there is only one section representing the +entire database. + +=cut + +sub SectionList { + # Get the parameters. + my ($self, $source) = @_; + # Ask the BaseSproutLoader for a section list. + require BaseSproutLoader; + my @retVal = BaseSproutLoader::GetSectionList($self); + # Return the list. + return @retVal; +} + +=head3 Loader + + my $groupLoader = $erdb->Loader($groupName, $options); + +Return an [[ERDBLoadGroupPm]] object for the specified load group. This method is used +by [[ERDBGeneratorPl]] to create the load group objects. If you are not using +[[ERDBGeneratorPl]], you don't need to override this method. + +=over 4 + +=item groupName + +Name of the load group whose object is to be returned. The group name is +guaranteed to be a single word with only the first letter capitalized. + +=item options + +Reference to a hash of command-line options. + +=item RETURN + +Returns an [[ERDBLoadGroupPm]] object that can be used to process the specified load group +for this database. + +=back + +=cut + +sub Loader { + # Get the parameters. + my ($self, $groupName, $options) = @_; + # Compute the loader name. + my $loaderClass = "${groupName}SproutLoader"; + # Pull in its definition. + require "$loaderClass.pm"; + # Create an object for it. + my $retVal = eval("$loaderClass->new(\$self, \$options)"); + # Insure it worked. + Confess("Could not create $loaderClass object: $@") if $@; + # Return it to the caller. + return $retVal; +} + + +=head3 LoadGroupList + + my @groups = $erdb->LoadGroupList(); + +Returns a list of the names for this database's load groups. This method is used +by [[ERDBGeneratorPl]] when the user wishes to load all table groups. The default +is a single group called 'All' that loads everything. + +=cut + +sub LoadGroupList { + # Return the list. + return qw(Genome Subsystem Annotation Property Source Reaction Synonym Feature Drug); +} + +=head3 LoadDirectory + + my $dirName = $erdb->LoadDirectory(); + +Return the name of the directory in which load files are kept. The default is +the FIG temporary directory, which is a really bad choice, but it's always there. + +=cut + +sub LoadDirectory { + # Get the parameters. + my ($self) = @_; + # Return the directory name. + return $self->{dataDir}; +} + =head2 Internal Utility Methods +=head3 GetStemmer + + my $stermmer = $sprout->GetStemmer(); + +Return the stemmer object for this database. + +=cut + +sub GetStemmer { + # Get the parameters. + my ($self) = @_; + # Declare the return variable. + my $retVal = $self->{stemmer}; + if (! defined $retVal) { + # We don't have one pre-built, so we build and save it now. + $retVal = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt", + stops => "$FIG_Config::sproutData/StopWords.txt", + cache => 0); + $self->{stemmer} = $retVal; + } + # Return the result. + return $retVal; +} + =head3 ParseAssignment Parse annotation text to determine whether or not it is a functional assignment. If it is, @@ -3997,10 +4283,7 @@ # Get the parameters. my ($self, $fid) = @_; # Insure we have a genome hash. - if (! defined $self->{genomeHash}) { - my %genomeHash = map { $_ => 1 } $self->GetFlat(['Genome'], "", [], 'Genome(id)'); - $self->{genomeHash} = \%genomeHash; - } + my $genomes = $self->_GenomeHash(); # Get the feature's genome ID. my ($genomeID) = FIGRules::ParseFeatureID($fid); # Return an indicator of whether or not the genome ID is in the hash. @@ -4034,4 +4317,136 @@ } +=head3 Hint + + my $htmlText = SearchHelper::Hint($wikiPage, $hintText); + +Return the HTML for a small question mark that displays the specified hint text when it is clicked. +This HTML can be put in forms to provide a useful hinting mechanism. + +=over 4 + +=item wikiPage + +Name of the wiki page to be popped up when the hint mark is clicked. + +=item hintText + +Text to display for the hint. It is raw html, but may not contain any double quotes. + +=item RETURN + +Returns the html for the hint facility. The resulting html shows a small button-like thing that +uses the standard FIG popup technology. + +=back + +=cut + +sub Hint { + # Get the parameters. + my ($wikiPage, $hintText) = @_; + # Escape the single quotes in the hint text. + my $quotedText = $hintText; + $quotedText =~ s/'/\\'/g; + # Convert the wiki page name to a URL. + my $wikiURL = join("", map { ucfirst $_ } split /\s+/, $wikiPage); + $wikiURL = "$FIG_Config::cgi_url/wiki/view.cgi/FIG/$wikiURL"; + # Compute the mouseover script. + my $mouseOver = "doTooltip(this, '$quotedText')"; + # Create the html. + my $retVal = " "; + # Return it. + return $retVal; +} + +=head3 _GenomeHash + + my $gHash = $sprout->_GenomeHash(); + +Return a hash mapping all NMPDR genome IDs to [[ERDBObjectPm]] genome objects. + +=cut + +sub _GenomeHash { + # Get the parameters. + my ($self) = @_; + # Do we already have a filled hash? + if (! $self->{genomeHashFilled}) { + # No, create it. + my %gHash = map { $_->PrimaryValue('id') => $_ } $self->GetList("Genome", "", []); + $self->{genomeHash} = \%gHash; + # Denote we have it. + $self->{genomeHashFilled} = 1; + } + # Return the hash. + return $self->{genomeHash}; +} + +=head3 _GenomeData + + my $genomeData = $sprout->_GenomeData($genomeID); + +Return an [[ERDBObjectPm]] object for the specified genome, or an undefined +value if the genome does not exist. + +=over 4 + +=item genomeID + +ID of the desired genome. + +=item RETURN + +Returns either an [[ERDBObjectPm]] containing the genome, or an undefined value. +If the genome exists, it will have been read into the genome cache. + +=back + +=cut + +sub _GenomeData { + # Get the parameters. + my ($self, $genomeID) = @_; + # Are we in the genome hash? + if (! exists $self->{genomeHash}->{$genomeID} && ! $self->{genomeHashFilled}) { + # The genome isn't in the hash, and the hash is not complete, so we try to + # read it. + $self->{genomeHash}->{$genomeID} = $self->GetEntity(Genome => $genomeID); + } + # Return the result. + return $self->{genomeHash}->{$genomeID}; +} + +=head3 _CacheGenome + + $sprout->_CacheGenome($genomeID, $genomeData); + +Store the specified genome object in the genome cache if it is already there. + +=over 4 + +=item genomeID + +ID of the genome to store in the cache. + +=item genomeData + +An [[ERDBObjectPm]] containing at least the data for the specified genome. +Note that the Genome may not be the primary object in it, so a fully-qualified +field name has to be used to retrieve data from it. + +=back + +=cut + +sub _CacheGenome { + # Get the parameters. + my ($self, $genomeID, $genomeData) = @_; + # Only proceed if we don't already have the genome. + if (! exists $self->{genomeHash}->{$genomeID}) { + $self->{genomeHash}->{$genomeID} = $genomeData; + } +} + 1; \ No newline at end of file