--- Sprout.pm 2005/05/04 03:24:43 1.12 +++ Sprout.pm 2005/09/11 17:29:52 1.25 @@ -1,16 +1,16 @@ package Sprout; - use Data::Dumper; - use strict; - use Carp; - use DBKernel; - use XML::Simple; - use DBQuery; - use DBObject; - use ERDB; - use Tracer; - use FIGRules; - use Stats; + use Data::Dumper; + use strict; + use Carp; + use DBKernel; + use XML::Simple; + use DBQuery; + use DBObject; + use ERDB; + use Tracer; + use FIGRules; + use Stats; use POSIX qw(strftime); @@ -70,6 +70,8 @@ * B maximum number of residues per sequence, (default C<8000>) +* B suppresses the connection to the database if TRUE, else FALSE + =back For example, the following constructor call specifies a database named I and a user name of @@ -81,34 +83,44 @@ =cut sub new { - # Get the parameters. - my ($class, $dbName, $options) = @_; - # Compute the options. We do this by starting with a table of defaults and overwriting with - # the incoming data. - my $optionTable = Tracer::GetOptions({ - dbType => 'mysql', # database type - dataDir => 'Data', # data file directory - xmlFileName => 'SproutDBD.xml', # database definition file name - userData => 'root/', # user name and password - port => 0, # database connection port - maxSegmentLength => 4500, # maximum feature segment length - maxSequenceLength => 8000, # maximum contig sequence length - }, $options); - # Get the data directory. - my $dataDir = $optionTable->{dataDir}; - # Extract the user ID and password. - $optionTable->{userData} =~ m!([^/]*)/(.*)$!; - my ($userName, $password) = ($1, $2); - # Connect to the database. - my $dbh = DBKernel->new($optionTable->{dbType}, $dbName, $userName, $password, $optionTable->{port}); - # Create the ERDB object. - my $xmlFileName = "$optionTable->{xmlFileName}"; - my $erdb = ERDB->new($dbh, $xmlFileName); - # Create this object. - my $self = { _erdb => $erdb, _options => $optionTable, _xmlName => $xmlFileName }; - # Bless and return it. - bless $self; - return $self; + # Get the parameters. + my ($class, $dbName, $options) = @_; + # Compute the options. We do this by starting with a table of defaults and overwriting with + # the incoming data. + my $optionTable = Tracer::GetOptions({ + dbType => $FIG_Config::dbms, + # database type + dataDir => $FIG_Config::sproutData, + # data file directory + xmlFileName => "$FIG_Config::sproutData/SproutDBD.xml", + # database definition file name + userData => "$FIG_Config::dbuser/$FIG_Config::dbpass", + # user name and password + port => $FIG_Config::dbport, + # database connection port + maxSegmentLength => 4500, # maximum feature segment length + maxSequenceLength => 8000, # maximum contig sequence length + noDBOpen => 0, # 1 to suppress the database open + }, $options); + # Get the data directory. + my $dataDir = $optionTable->{dataDir}; + # Extract the user ID and password. + $optionTable->{userData} =~ m!([^/]*)/(.*)$!; + my ($userName, $password) = ($1, $2); + # Connect to the database. + my $dbh; + if (! $optionTable->{noDBOpen}) { + $dbh = DBKernel->new($optionTable->{dbType}, $dbName, $userName, + $password, $optionTable->{port}); + } + # Create the ERDB object. + my $xmlFileName = "$optionTable->{xmlFileName}"; + my $erdb = ERDB->new($dbh, $xmlFileName); + # Create this object. + my $self = { _erdb => $erdb, _options => $optionTable, _xmlName => $xmlFileName }; + # Bless and return it. + bless $self; + return $self; } =head3 MaxSegment @@ -124,8 +136,8 @@ =cut #: Return Type $; sub MaxSegment { - my ($self) = @_; - return $self->{_options}->{maxSegmentLength}; + my ($self) = @_; + return $self->{_options}->{maxSegmentLength}; } =head3 MaxSequence @@ -139,8 +151,8 @@ =cut #: Return Type $; sub MaxSequence { - my ($self) = @_; - return $self->{_options}->{maxSequenceLength}; + my ($self) = @_; + return $self->{_options}->{maxSequenceLength}; } =head3 Get @@ -231,14 +243,14 @@ =cut sub Get { - # Get the parameters. - my ($self, $objectNames, $filterClause, $parameterList) = @_; - # We differ from the ERDB Get method in that the parameter list is passed in as a list reference - # rather than a list of parameters. The next step is to convert the parameters from a reference - # to a real list. We can only do this if the parameters have been specified. - my @parameters; - if ($parameterList) { @parameters = @{$parameterList}; } - return $self->{_erdb}->Get($objectNames, $filterClause, @parameters); + # Get the parameters. + my ($self, $objectNames, $filterClause, $parameterList) = @_; + # We differ from the ERDB Get method in that the parameter list is passed in as a list reference + # rather than a list of parameters. The next step is to convert the parameters from a reference + # to a real list. We can only do this if the parameters have been specified. + my @parameters; + if ($parameterList) { @parameters = @{$parameterList}; } + return $self->{_erdb}->Get($objectNames, $filterClause, @parameters); } =head3 GetEntity @@ -267,10 +279,10 @@ =cut sub GetEntity { - # Get the parameters. - my ($self, $entityType, $ID) = @_; - # Call the ERDB method. - return $self->{_erdb}->GetEntity($entityType, $ID); + # Get the parameters. + my ($self, $entityType, $ID) = @_; + # Call the ERDB method. + return $self->{_erdb}->GetEntity($entityType, $ID); } =head3 GetEntityValues @@ -302,10 +314,10 @@ =cut #: Return Type @; sub GetEntityValues { - # Get the parameters. - my ($self, $entityType, $ID, $fields) = @_; - # Call the ERDB method. - return $self->{_erdb}->GetEntityValues($entityType, $ID, $fields); + # Get the parameters. + my ($self, $entityType, $ID, $fields) = @_; + # Call the ERDB method. + return $self->{_erdb}->GetEntityValues($entityType, $ID, $fields); } =head3 ShowMetaData @@ -325,12 +337,12 @@ =cut sub ShowMetaData { - # Get the parameters. - my ($self, $fileName) = @_; - # Compute the file name. - my $options = $self->{_options}; - # Call the show method on the underlying ERDB object. - $self->{_erdb}->ShowMetaData($fileName); + # Get the parameters. + my ($self, $fileName) = @_; + # Compute the file name. + my $options = $self->{_options}; + # Call the show method on the underlying ERDB object. + $self->{_erdb}->ShowMetaData($fileName); } =head3 Load @@ -365,14 +377,14 @@ =cut #: Return Type %; sub Load { - # Get the parameters. - my ($self, $rebuild) = @_; - # Get the database object. - my $erdb = $self->{_erdb}; - # Load the tables from the data directory. - my $retVal = $erdb->LoadTables($self->{_options}->{dataDir}, $rebuild); - # Return the statistics. - return $retVal; + # Get the parameters. + my ($self, $rebuild) = @_; + # Get the database object. + my $erdb = $self->{_erdb}; + # Load the tables from the data directory. + my $retVal = $erdb->LoadTables($self->{_options}->{dataDir}, $rebuild); + # Return the statistics. + return $retVal; } =head3 LoadUpdate @@ -408,29 +420,30 @@ =cut #: Return Type $%; sub LoadUpdate { - # Get the parameters. - my ($self, $truncateFlag, $tableList) = @_; - # Get the database object. - my $erdb = $self->{_erdb}; - # Declare the return value. - my $retVal = Stats->new(); - # Get the data directory. - my $optionTable = $self->{_options}; - my $dataDir = $optionTable->{dataDir}; - # Loop through the incoming table names. - for my $tableName (@{$tableList}) { - # Find the table's file. - my $fileName = "$dataDir/$tableName"; - if (! -e $fileName) { - $fileName = "$fileName.dtx"; - } - # Attempt to load this table. - my $result = $erdb->LoadTable($fileName, $tableName, $truncateFlag); - # Accumulate the resulting statistics. - $retVal->Accumulate($result); - } - # Return the statistics. - return $retVal; + # Get the parameters. + my ($self, $truncateFlag, $tableList) = @_; + # Get the database object. + my $erdb = $self->{_erdb}; + # Declare the return value. + my $retVal = Stats->new(); + # Get the data directory. + my $optionTable = $self->{_options}; + my $dataDir = $optionTable->{dataDir}; + # Loop through the incoming table names. + for my $tableName (@{$tableList}) { + # Find the table's file. + my $fileName = LoadFileName($dataDir, $tableName); + if (! $fileName) { + Trace("No load file found for $tableName in $dataDir.") if T(0); + } else { + # Attempt to load this table. + my $result = $erdb->LoadTable($fileName, $tableName, $truncateFlag); + # Accumulate the resulting statistics. + $retVal->Accumulate($result); + } + } + # Return the statistics. + return $retVal; } =head3 Build @@ -444,10 +457,10 @@ =cut #: Return Type ; sub Build { - # Get the parameters. - my ($self) = @_; - # Create the tables. - $self->{_erdb}->CreateTables; + # Get the parameters. + my ($self) = @_; + # Create the tables. + $self->{_erdb}->CreateTables; } =head3 Genomes @@ -459,12 +472,12 @@ =cut #: Return Type @; sub Genomes { - # Get the parameters. - my ($self) = @_; - # Get all the genomes. - my @retVal = $self->GetFlat(['Genome'], "", [], 'Genome(id)'); - # Return the list of IDs. - return @retVal; + # Get the parameters. + my ($self) = @_; + # Get all the genomes. + my @retVal = $self->GetFlat(['Genome'], "", [], 'Genome(id)'); + # Return the list of IDs. + return @retVal; } =head3 GenusSpecies @@ -489,14 +502,14 @@ =cut #: Return Type $; sub GenusSpecies { - # Get the parameters. - my ($self, $genomeID) = @_; - # Get the data for the specified genome. - my @values = $self->GetEntityValues('Genome', $genomeID, ['Genome(genus)', 'Genome(species)', - 'Genome(unique-characterization)']); - # Format the result and return it. - my $retVal = join(' ', @values); - return $retVal; + # Get the parameters. + my ($self, $genomeID) = @_; + # Get the data for the specified genome. + my @values = $self->GetEntityValues('Genome', $genomeID, ['Genome(genus)', 'Genome(species)', + 'Genome(unique-characterization)']); + # Format the result and return it. + my $retVal = join(' ', @values); + return $retVal; } =head3 FeaturesOf @@ -525,23 +538,23 @@ =cut #: Return Type @; sub FeaturesOf { - # Get the parameters. - my ($self, $genomeID,$ftype) = @_; - # Get the features we want. - my @features; - if (!$ftype) { - @features = $self->GetFlat(['HasContig', 'IsLocatedIn'], "HasContig(from-link) = ?", - [$genomeID], 'IsLocatedIn(from-link)'); - } else { - @features = $self->GetFlat(['HasContig', 'IsLocatedIn', 'Feature'], - "HasContig(from-link) = ? AND Feature(feature-type) = ?", - [$genomeID, $ftype], 'IsLocatedIn(from-link)'); - } - # Return the list with duplicates merged out. We need to merge out duplicates because - # a feature will appear twice if it spans more than one contig. - my @retVal = Tracer::Merge(@features); - # Return the list of feature IDs. - return @retVal; + # Get the parameters. + my ($self, $genomeID,$ftype) = @_; + # Get the features we want. + my @features; + if (!$ftype) { + @features = $self->GetFlat(['HasContig', 'IsLocatedIn'], "HasContig(from-link) = ?", + [$genomeID], 'IsLocatedIn(from-link)'); + } else { + @features = $self->GetFlat(['HasContig', 'IsLocatedIn', 'Feature'], + "HasContig(from-link) = ? AND Feature(feature-type) = ?", + [$genomeID, $ftype], 'IsLocatedIn(from-link)'); + } + # Return the list with duplicates merged out. We need to merge out duplicates because + # a feature will appear twice if it spans more than one contig. + my @retVal = Tracer::Merge(@features); + # Return the list of feature IDs. + return @retVal; } =head3 FeatureLocation @@ -570,7 +583,7 @@ =item RETURN Returns a list of the feature's contig segments. The locations are returned as a list in a list -context and as a space-delimited string in a scalar context. +context and as a comma-delimited string in a scalar context. =back @@ -578,42 +591,42 @@ #: Return Type @; #: Return Type $; sub FeatureLocation { - # Get the parameters. - my ($self, $featureID) = @_; - # Create a query for the feature locations. - my $query = $self->Get(['IsLocatedIn'], "IsLocatedIn(from-link) = ? ORDER BY IsLocatedIn(locN)", - [$featureID]); - # Create the return list. - my @retVal = (); - # Set up the variables used to determine if we have adjacent segments. This initial setup will - # not match anything. - my ($prevContig, $prevBeg, $prevDir, $prevLen) = ("", 0, "0", 0); - # Loop through the query results, creating location specifiers. - while (my $location = $query->Fetch()) { - # Get the location parameters. - my ($contigID, $beg, $dir, $len) = $location->Values(['IsLocatedIn(to-link)', - 'IsLocatedIn(beg)', 'IsLocatedIn(dir)', 'IsLocatedIn(len)']); - # Check to see if we are adjacent to the previous segment. - if ($prevContig eq $contigID && $dir eq $prevDir) { - # Here the new segment is in the same direction on the same contig. Insure the - # new segment's beginning is next to the old segment's end. - if (($dir eq "-" && $beg == $prevBeg - $prevLen) || - ($dir eq "+" && $beg == $prevBeg + $prevLen)) { - # Here we need to merge two segments. Adjust the beginning and length values - # to include both segments. - $beg = $prevBeg; - $len += $prevLen; - # Pop the old segment off. The new one will replace it later. - pop @retVal; - } - } - # Remember this specifier for the adjacent-segment test the next time through. - ($prevContig, $prevBeg, $prevDir, $prevLen) = ($contigID, $beg, $dir, $len); - # Add the specifier to the list. - push @retVal, "${contigID}_$beg$dir$len"; - } - # Return the list in the format indicated by the context. - return (wantarray ? @retVal : join(' ', @retVal)); + # Get the parameters. + my ($self, $featureID) = @_; + # Create a query for the feature locations. + my $query = $self->Get(['IsLocatedIn'], "IsLocatedIn(from-link) = ? ORDER BY IsLocatedIn(locN)", + [$featureID]); + # Create the return list. + my @retVal = (); + # Set up the variables used to determine if we have adjacent segments. This initial setup will + # not match anything. + my ($prevContig, $prevBeg, $prevDir, $prevLen) = ("", 0, "0", 0); + # Loop through the query results, creating location specifiers. + while (my $location = $query->Fetch()) { + # Get the location parameters. + my ($contigID, $beg, $dir, $len) = $location->Values(['IsLocatedIn(to-link)', + 'IsLocatedIn(beg)', 'IsLocatedIn(dir)', 'IsLocatedIn(len)']); + # Check to see if we are adjacent to the previous segment. + if ($prevContig eq $contigID && $dir eq $prevDir) { + # Here the new segment is in the same direction on the same contig. Insure the + # new segment's beginning is next to the old segment's end. + if (($dir eq "-" && $beg == $prevBeg - $prevLen) || + ($dir eq "+" && $beg == $prevBeg + $prevLen)) { + # Here we need to merge two segments. Adjust the beginning and length values + # to include both segments. + $beg = $prevBeg; + $len += $prevLen; + # Pop the old segment off. The new one will replace it later. + pop @retVal; + } + } + # Remember this specifier for the adjacent-segment test the next time through. + ($prevContig, $prevBeg, $prevDir, $prevLen) = ($contigID, $beg, $dir, $len); + # Add the specifier to the list. + push @retVal, "${contigID}_$beg$dir$len"; + } + # Return the list in the format indicated by the context. + return (wantarray ? @retVal : join(',', @retVal)); } =head3 ParseLocation @@ -639,25 +652,25 @@ =cut #: Return Type @; sub ParseLocation { - # Get the parameter. Note that if we're called as an instance method, we ignore + # Get the parameter. Note that if we're called as an instance method, we ignore # the first parameter. shift if UNIVERSAL::isa($_[0],__PACKAGE__); - my ($location) = @_; - # Parse it into segments. - $location =~ /^(.*)_(\d*)([+-_])(\d*)$/; - my ($contigID, $start, $dir, $len) = ($1, $2, $3, $4); - # If the direction is an underscore, convert it to a + or -. - if ($dir eq "_") { - if ($start < $len) { - $dir = "+"; - $len = $len - $start + 1; - } else { - $dir = "-"; - $len = $start - $len + 1; - } - } - # Return the result. - return ($contigID, $start, $dir, $len); + my ($location) = @_; + # Parse it into segments. + $location =~ /^(.*)_(\d*)([+-_])(\d*)$/; + my ($contigID, $start, $dir, $len) = ($1, $2, $3, $4); + # If the direction is an underscore, convert it to a + or -. + if ($dir eq "_") { + if ($start < $len) { + $dir = "+"; + $len = $len - $start + 1; + } else { + $dir = "-"; + $len = $start - $len + 1; + } + } + # Return the result. + return ($contigID, $start, $dir, $len); } =head3 PointLocation @@ -666,8 +679,8 @@ Return the offset into the specified location of the specified point on the contig. If the specified point is before the location, a negative value will be returned. If it is -beyond the location, an undefined value will be returned. It is assumed that the offset -is for the location's contig. The location can either be new-style (using a C<+> or C<-> +beyond the location, an undefined value will be returned. It is assumed that the offset +is for the location's contig. The location can either be new-style (using a C<+> or C<-> and a length) or old-style (using C<_> and start and end positions. =over 4 @@ -693,10 +706,10 @@ =cut #: Return Type $; sub PointLocation { - # Get the parameter. Note that if we're called as an instance method, we ignore + # Get the parameter. Note that if we're called as an instance method, we ignore # the first parameter. shift if UNIVERSAL::isa($_[0],__PACKAGE__); - my ($location, $point) = @_; + my ($location, $point) = @_; # Parse out the location elements. Note that this works on both old-style and new-style # locations. my ($contigID, $start, $dir, $len) = ParseLocation($location); @@ -736,56 +749,56 @@ =cut #: Return Type $; sub DNASeq { - # Get the parameters. - my ($self, $locationList) = @_; - # Create the return string. - my $retVal = ""; - # Loop through the locations. - for my $location (@{$locationList}) { - # Set up a variable to contain the DNA at this location. - my $locationDNA = ""; - # Parse out the contig ID, the beginning point, the direction, and the end point. - my ($contigID, $beg, $dir, $len) = ParseLocation($location); - # Now we must create a query to return all the sequences in the contig relevant to the region - # specified. First, we compute the start and stop points when reading through the sequences. - # For a forward transcription, the start point is the beginning; for a backward transcription, - # the start point is the ending. Note that in the latter case we must reverse the DNA string - # before putting it in the return value. - my ($start, $stop); - if ($dir eq "+") { - $start = $beg; - $stop = $beg + $len - 1; - } else { - $start = $beg + $len + 1; - $stop = $beg; - } - my $query = $self->Get(['IsMadeUpOf','Sequence'], - "IsMadeUpOf(from-link) = ? AND IsMadeUpOf(start-position) + IsMadeUpOf(len) > ? AND " . - " IsMadeUpOf(start-position) <= ? ORDER BY IsMadeUpOf(start-position)", - [$contigID, $start, $stop]); - # Loop through the sequences. - while (my $sequence = $query->Fetch()) { - # Determine whether the location starts, stops, or continues through this sequence. - my ($startPosition, $sequenceData, $sequenceLength) = - $sequence->Values(['IsMadeUpOf(start-position)', 'Sequence(sequence)', - 'IsMadeUpOf(len)']); - my $stopPosition = $startPosition + $sequenceLength; - # Figure out the start point and length of the relevant section. - my $pos1 = ($start < $startPosition ? 0 : $start - $startPosition); - my $len = ($stopPosition <= $stop ? $stopPosition : $stop) - $startPosition - $pos1; - # Add the relevant data to the location data. - $locationDNA .= substr($sequenceData, $pos1, $len); - } - # Add this location's data to the return string. Note that we may need to reverse it. - if ($dir eq '+') { - $retVal .= $locationDNA; - } else { - $locationDNA = join('', reverse split //, $locationDNA); - $retVal .= $locationDNA; - } - } - # Return the result. - return $retVal; + # Get the parameters. + my ($self, $locationList) = @_; + # Create the return string. + my $retVal = ""; + # Loop through the locations. + for my $location (@{$locationList}) { + # Set up a variable to contain the DNA at this location. + my $locationDNA = ""; + # Parse out the contig ID, the beginning point, the direction, and the end point. + my ($contigID, $beg, $dir, $len) = ParseLocation($location); + # Now we must create a query to return all the sequences in the contig relevant to the region + # specified. First, we compute the start and stop points when reading through the sequences. + # For a forward transcription, the start point is the beginning; for a backward transcription, + # the start point is the ending. Note that in the latter case we must reverse the DNA string + # before putting it in the return value. + my ($start, $stop); + if ($dir eq "+") { + $start = $beg; + $stop = $beg + $len - 1; + } else { + $start = $beg + $len + 1; + $stop = $beg; + } + my $query = $self->Get(['IsMadeUpOf','Sequence'], + "IsMadeUpOf(from-link) = ? AND IsMadeUpOf(start-position) + IsMadeUpOf(len) > ? AND " . + " IsMadeUpOf(start-position) <= ? ORDER BY IsMadeUpOf(start-position)", + [$contigID, $start, $stop]); + # Loop through the sequences. + while (my $sequence = $query->Fetch()) { + # Determine whether the location starts, stops, or continues through this sequence. + my ($startPosition, $sequenceData, $sequenceLength) = + $sequence->Values(['IsMadeUpOf(start-position)', 'Sequence(sequence)', + 'IsMadeUpOf(len)']); + my $stopPosition = $startPosition + $sequenceLength; + # Figure out the start point and length of the relevant section. + my $pos1 = ($start < $startPosition ? 0 : $start - $startPosition); + my $len = ($stopPosition <= $stop ? $stopPosition : $stop) - $startPosition - $pos1; + # Add the relevant data to the location data. + $locationDNA .= substr($sequenceData, $pos1, $len); + } + # Add this location's data to the return string. Note that we may need to reverse it. + if ($dir eq '+') { + $retVal .= $locationDNA; + } else { + $locationDNA = join('', reverse split //, $locationDNA); + $retVal .= $locationDNA; + } + } + # Return the result. + return $retVal; } =head3 AllContigs @@ -809,13 +822,13 @@ =cut #: Return Type @; sub AllContigs { - # Get the parameters. - my ($self, $genomeID) = @_; - # Ask for the genome's Contigs. - my @retVal = $self->GetFlat(['HasContig'], "HasContig(from-link) = ?", [$genomeID], - 'HasContig(to-link)'); - # Return the list of Contigs. - return @retVal; + # Get the parameters. + my ($self, $genomeID) = @_; + # Ask for the genome's Contigs. + my @retVal = $self->GetFlat(['HasContig'], "HasContig(from-link) = ?", [$genomeID], + 'HasContig(to-link)'); + # Return the list of Contigs. + return @retVal; } =head3 ContigLength @@ -839,22 +852,22 @@ =cut #: Return Type $; sub ContigLength { - # Get the parameters. - my ($self, $contigID) = @_; - # Get the contig's last sequence. - my $query = $self->Get(['IsMadeUpOf'], - "IsMadeUpOf(from-link) = ? ORDER BY IsMadeUpOf(start-position) DESC", - [$contigID]); - my $sequence = $query->Fetch(); - # Declare the return value. - my $retVal = 0; - # Set it from the sequence data, if any. - if ($sequence) { - my ($start, $len) = $sequence->Values(['IsMadeUpOf(start-position)', 'IsMadeUpOf(len)']); - $retVal = $start + $len; - } - # Return the result. - return $retVal; + # Get the parameters. + my ($self, $contigID) = @_; + # Get the contig's last sequence. + my $query = $self->Get(['IsMadeUpOf'], + "IsMadeUpOf(from-link) = ? ORDER BY IsMadeUpOf(start-position) DESC", + [$contigID]); + my $sequence = $query->Fetch(); + # Declare the return value. + my $retVal = 0; + # Set it from the sequence data, if any. + if ($sequence) { + my ($start, $len) = $sequence->Values(['IsMadeUpOf(start-position)', 'IsMadeUpOf(len)']); + $retVal = $start + $len; + } + # Return the result. + return $retVal; } =head3 GenesInRegion @@ -890,83 +903,83 @@ =cut #: Return Type @@; sub GenesInRegion { - # Get the parameters. - my ($self, $contigID, $start, $stop) = @_; - # Get the maximum segment length. - my $maximumSegmentLength = $self->MaxSegment; - # Create a hash to receive the feature list. We use a hash so that we can eliminate - # duplicates easily. The hash key will be the feature ID. The value will be a two-element - # containing the minimum and maximum offsets. We will use the offsets to sort the results - # when we're building the result set. - my %featuresFound = (); - # Prime the values we'll use for the returned beginning and end. - my @initialMinMax = ($self->ContigLength($contigID), 0); - my ($min, $max) = @initialMinMax; - # Create a table of parameters for each query. Each query looks for features travelling in - # a particular direction. The query parameters include the contig ID, the feature direction, - # the lowest possible start position, and the highest possible start position. This works - # because each feature segment length must be no greater than the maximum segment length. - my %queryParms = (forward => [$contigID, '+', $start - $maximumSegmentLength + 1, $stop], - reverse => [$contigID, '-', $start, $stop + $maximumSegmentLength - 1]); - # Loop through the query parameters. - for my $parms (values %queryParms) { - # Create the query. - my $query = $self->Get(['IsLocatedIn'], - "IsLocatedIn(to-link)= ? AND IsLocatedIn(dir) = ? AND IsLocatedIn(beg) >= ? AND IsLocatedIn(beg) <= ?", - $parms); - # Loop through the feature segments found. - while (my $segment = $query->Fetch) { - # Get the data about this segment. - my ($featureID, $dir, $beg, $len) = $segment->Values(['IsLocatedIn(from-link)', - 'IsLocatedIn(dir)', 'IsLocatedIn(beg)', 'IsLocatedIn(len)']); - # Determine if this feature actually overlaps the region. The query insures that - # this will be the case if the segment is the maximum length, so to fine-tune - # the results we insure that the inequality from the query holds using the actual - # length. - my ($found, $end) = (0, 0); - if ($dir eq '+') { - $end = $beg + $len; - if ($end >= $start) { - # Denote we found a useful feature. - $found = 1; - } - } elsif ($dir eq '-') { - # Note we switch things around so that the beginning is to the left of the - # ending. - ($beg, $end) = ($beg - $len, $beg); - if ($beg <= $stop) { - # Denote we found a useful feature. - $found = 1; - } - } - if ($found) { - # Here we need to record the feature and update the minima and maxima. First, - # get the current entry for the specified feature. - my ($loc1, $loc2) = (exists $featuresFound{$featureID} ? @{$featuresFound{$featureID}} : - @initialMinMax); - # Merge the current segment's begin and end into the feature begin and end and the - # global min and max. - if ($beg < $loc1) { - $loc1 = $beg; - $min = $beg if $beg < $min; - } - if ($end > $loc2) { - $loc2 = $end; - $max = $end if $end > $max; - } - # Store the entry back into the hash table. - $featuresFound{$featureID} = [$loc1, $loc2]; - } - } - } - # Now we must compute the list of the IDs for the features found. We start with a list - # of midpoints / feature ID pairs. (It's not really a midpoint, it's twice the midpoint, - # but the result of the sort will be the same.) - my @list = map { [$featuresFound{$_}->[0] + $featuresFound{$_}->[1], $_] } keys %featuresFound; - # Now we sort by midpoint and yank out the feature IDs. - my @retVal = map { $_->[1] } sort { $a->[0] <=> $b->[0] } @list; - # Return it along with the min and max. - return (\@retVal, $min, $max); + # Get the parameters. + my ($self, $contigID, $start, $stop) = @_; + # Get the maximum segment length. + my $maximumSegmentLength = $self->MaxSegment; + # Create a hash to receive the feature list. We use a hash so that we can eliminate + # duplicates easily. The hash key will be the feature ID. The value will be a two-element + # containing the minimum and maximum offsets. We will use the offsets to sort the results + # when we're building the result set. + my %featuresFound = (); + # Prime the values we'll use for the returned beginning and end. + my @initialMinMax = ($self->ContigLength($contigID), 0); + my ($min, $max) = @initialMinMax; + # Create a table of parameters for each query. Each query looks for features travelling in + # a particular direction. The query parameters include the contig ID, the feature direction, + # the lowest possible start position, and the highest possible start position. This works + # because each feature segment length must be no greater than the maximum segment length. + my %queryParms = (forward => [$contigID, '+', $start - $maximumSegmentLength + 1, $stop], + reverse => [$contigID, '-', $start, $stop + $maximumSegmentLength - 1]); + # Loop through the query parameters. + for my $parms (values %queryParms) { + # Create the query. + my $query = $self->Get(['IsLocatedIn'], + "IsLocatedIn(to-link)= ? AND IsLocatedIn(dir) = ? AND IsLocatedIn(beg) >= ? AND IsLocatedIn(beg) <= ?", + $parms); + # Loop through the feature segments found. + while (my $segment = $query->Fetch) { + # Get the data about this segment. + my ($featureID, $dir, $beg, $len) = $segment->Values(['IsLocatedIn(from-link)', + 'IsLocatedIn(dir)', 'IsLocatedIn(beg)', 'IsLocatedIn(len)']); + # Determine if this feature actually overlaps the region. The query insures that + # this will be the case if the segment is the maximum length, so to fine-tune + # the results we insure that the inequality from the query holds using the actual + # length. + my ($found, $end) = (0, 0); + if ($dir eq '+') { + $end = $beg + $len; + if ($end >= $start) { + # Denote we found a useful feature. + $found = 1; + } + } elsif ($dir eq '-') { + # Note we switch things around so that the beginning is to the left of the + # ending. + ($beg, $end) = ($beg - $len, $beg); + if ($beg <= $stop) { + # Denote we found a useful feature. + $found = 1; + } + } + if ($found) { + # Here we need to record the feature and update the minima and maxima. First, + # get the current entry for the specified feature. + my ($loc1, $loc2) = (exists $featuresFound{$featureID} ? @{$featuresFound{$featureID}} : + @initialMinMax); + # Merge the current segment's begin and end into the feature begin and end and the + # global min and max. + if ($beg < $loc1) { + $loc1 = $beg; + $min = $beg if $beg < $min; + } + if ($end > $loc2) { + $loc2 = $end; + $max = $end if $end > $max; + } + # Store the entry back into the hash table. + $featuresFound{$featureID} = [$loc1, $loc2]; + } + } + } + # Now we must compute the list of the IDs for the features found. We start with a list + # of midpoints / feature ID pairs. (It's not really a midpoint, it's twice the midpoint, + # but the result of the sort will be the same.) + my @list = map { [$featuresFound{$_}->[0] + $featuresFound{$_}->[1], $_] } keys %featuresFound; + # Now we sort by midpoint and yank out the feature IDs. + my @retVal = map { $_->[1] } sort { $a->[0] <=> $b->[0] } @list; + # Return it along with the min and max. + return (\@retVal, $min, $max); } =head3 FType @@ -991,12 +1004,12 @@ =cut #: Return Type $; sub FType { - # Get the parameters. - my ($self, $featureID) = @_; - # Get the specified feature's type. - my ($retVal) = $self->GetEntityValues('Feature', $featureID, ['Feature(feature-type)']); - # Return the result. - return $retVal; + # Get the parameters. + my ($self, $featureID) = @_; + # Get the specified feature's type. + my ($retVal) = $self->GetEntityValues('Feature', $featureID, ['Feature(feature-type)']); + # Return the result. + return $retVal; } =head3 FeatureAnnotations @@ -1028,29 +1041,29 @@ =cut #: Return Type @%; sub FeatureAnnotations { - # Get the parameters. - my ($self, $featureID) = @_; - # Create a query to get the feature's annotations and the associated users. - my $query = $self->Get(['IsTargetOfAnnotation', 'Annotation', 'MadeAnnotation'], - "IsTargetOfAnnotation(from-link) = ?", [$featureID]); - # Create the return list. - my @retVal = (); - # Loop through the annotations. - while (my $annotation = $query->Fetch) { - # Get the fields to return. - my ($featureID, $timeStamp, $user, $text) = - $annotation->Values(['IsTargetOfAnnotation(from-link)', - 'Annotation(time)', 'MadeAnnotation(from-link)', - 'Annotation(annotation)']); - # Assemble them into a hash. + # Get the parameters. + my ($self, $featureID) = @_; + # Create a query to get the feature's annotations and the associated users. + my $query = $self->Get(['IsTargetOfAnnotation', 'Annotation', 'MadeAnnotation'], + "IsTargetOfAnnotation(from-link) = ?", [$featureID]); + # Create the return list. + my @retVal = (); + # Loop through the annotations. + while (my $annotation = $query->Fetch) { + # Get the fields to return. + my ($featureID, $timeStamp, $user, $text) = + $annotation->Values(['IsTargetOfAnnotation(from-link)', + 'Annotation(time)', 'MadeAnnotation(from-link)', + 'Annotation(annotation)']); + # Assemble them into a hash. my $annotationHash = { featureID => $featureID, timeStamp => FriendlyTimestamp($timeStamp), - user => $user, text => $text }; - # Add it to the return list. - push @retVal, $annotationHash; - } - # Return the result list. - return @retVal; + user => $user, text => $text }; + # Add it to the return list. + push @retVal, $annotationHash; + } + # Return the result list. + return @retVal; } =head3 AllFunctionsOf @@ -1059,10 +1072,10 @@ Return all of the functional assignments for a particular feature. The data is returned as a hash of functional assignments to user IDs. A functional assignment is a type of annotation, -Functional assignments are described in the L function. Its worth noting that -we cannot filter on the content of the annotation itself because it's a text field; however, -this is not a big problem because most features only have a small number of annotations. -Finally, if a single user has multiple functional assignments, we will only keep the most +Functional assignments are described in the L function. Its worth noting that +we cannot filter on the content of the annotation itself because it's a text field; however, +this is not a big problem because most features only have a small number of annotations. +Finally, if a single user has multiple functional assignments, we will only keep the most recent one. =over 4 @@ -1080,35 +1093,35 @@ =cut #: Return Type %; sub AllFunctionsOf { - # Get the parameters. - my ($self, $featureID) = @_; - # Get all of the feature's annotations. + # Get the parameters. + my ($self, $featureID) = @_; + # Get all of the feature's annotations. my @query = $self->GetAll(['IsTargetOfAnnotation', 'Annotation'], - "IsTargetOfAnnotation(from-link) = ?", + "IsTargetOfAnnotation(from-link) = ?", [$featureID], ['Annotation(time)', 'Annotation(annotation)']); - # Declare the return hash. - my %retVal; + # Declare the return hash. + my %retVal; # Declare a hash for insuring we only make one assignment per user. my %timeHash = (); # Now we sort the assignments by timestamp in reverse. my @sortedQuery = sort { -($a->[0] <=> $b->[0]) } @query; - # Loop until we run out of annotations. + # Loop until we run out of annotations. for my $annotation (@sortedQuery) { # Get the annotation fields. my ($timeStamp, $text) = @{$annotation}; - # Check to see if this is a functional assignment. - my ($user, $function) = _ParseAssignment($text); + # Check to see if this is a functional assignment. + my ($user, $function) = _ParseAssignment($text); if ($user && ! exists $timeHash{$user}) { # Here it is a functional assignment and there has been no # previous assignment for this user, so we stuff it in the # return hash. - $retVal{$function} = $user; + $retVal{$function} = $user; # Insure we don't assign to this user again. $timeHash{$user} = 1; - } - } - # Return the hash of assignments found. - return %retVal; + } + } + # Return the hash of assignments found. + return %retVal; } =head3 FunctionOf @@ -1120,8 +1133,8 @@ The functional assignment is handled differently depending on the type of feature. If the feature is identified by a FIG ID (begins with the string C), then a functional assignment is a type of annotation. The format of an assignment is described in -L. Its worth noting that we cannot filter on the content of the -annotation itself because it's a text field; however, this is not a big problem because +L. Its worth noting that we cannot filter on the content of the +annotation itself because it's a text field; however, this is not a big problem because most features only have a small number of annotations. Each user has an associated list of trusted users. The assignment returned will be the most @@ -1153,8 +1166,8 @@ =cut #: Return Type $; sub FunctionOf { - # Get the parameters. - my ($self, $featureID, $userID) = @_; + # Get the parameters. + my ($self, $featureID, $userID) = @_; # Declare the return value. my $retVal; # Determine the ID type. @@ -1207,8 +1220,8 @@ # table. ($retVal) = $self->GetEntityValues('ExternalAliasFunc', $featureID, ['ExternalAliasFunc(func)']); } - # Return the assignment found. - return $retVal; + # Return the assignment found. + return $retVal; } =head3 BBHList @@ -1230,33 +1243,111 @@ =item RETURN -Returns a reference to a hash that maps the IDs of the incoming features to the IDs of -their best hits. +Returns a reference to a hash that maps the IDs of the incoming features to the best hits +on the target genome. =back =cut #: Return Type %; sub BBHList { - # Get the parameters. - my ($self, $genomeID, $featureList) = @_; - # Create the return structure. - my %retVal = (); - # Loop through the incoming features. - for my $featureID (@{$featureList}) { - # Create a query to get the feature's best hit. - my $query = $self->Get(['IsBidirectionalBestHitOf'], - "IsBidirectionalBestHitOf(from-link) = ? AND IsBidirectionalBestHitOf(genome) = ?", - [$featureID, $genomeID]); - # Look for the best hit. - my $bbh = $query->Fetch; - if ($bbh) { - my ($targetFeature) = $bbh->Value('IsBidirectionalBestHitOf(to-link)'); - $retVal{$featureID} = $targetFeature; - } - } - # Return the mapping. - return \%retVal; + # Get the parameters. + my ($self, $genomeID, $featureList) = @_; + # Create the return structure. + my %retVal = (); + # Loop through the incoming features. + for my $featureID (@{$featureList}) { + # Create a query to get the feature's best hit. + my $query = $self->Get(['IsBidirectionalBestHitOf'], + "IsBidirectionalBestHitOf(from-link) = ? AND IsBidirectionalBestHitOf(genome) = ?", + [$featureID, $genomeID]); + # Look for the best hit. + my $bbh = $query->Fetch; + if ($bbh) { + my ($targetFeature) = $bbh->Value('IsBidirectionalBestHitOf(to-link)'); + $retVal{$featureID} = $targetFeature; + } + } + # Return the mapping. + return \%retVal; +} + +=head3 SimList + +C<< my %similarities = $sprout->SimList($featureID, $count); >> + +Return a list of the similarities to the specified feature. + +Sprout does not support real similarities, so this method just returns the bidirectional +best hits. + +=over 4 + +=item featureID + +ID of the feature whose similarities are desired. + +=item count + +Maximum number of similar features to be returned, or C<0> to return them all. + +=back + +=cut +#: Return Type %; +sub SimList { + # Get the parameters. + my ($self, $featureID, $count) = @_; + # Ask for the best hits. + my @lists = $self->GetAll(['IsBidirectionalBestHitOf'], + "IsBidirectionalBestHitOf(from-link) = ? ORDER BY IsBidirectionalBestHitOf(score) DESC", + [$featureID], ['IsBidirectionalBestHitOf(to-link)', 'IsBidirectionalBestHitOf(score)'], + $count); + # Create the return value. + my %retVal = (); + for my $tuple (@lists) { + $retVal{$tuple->[0]} = $tuple->[1]; + } + # Return the result. + return %retVal; +} + + + +=head3 IsComplete + +C<< my $flag = $sprout->IsComplete($genomeID); >> + +Return TRUE if the specified genome is complete, else FALSE. + +=over 4 + +=item genomeID + +ID of the genome whose completeness status is desired. + +=item RETURN + +Returns TRUE if the genome is complete, FALSE if it is incomplete, and C if it is +not found. + +=back + +=cut +#: Return Type $; +sub IsComplete { + # Get the parameters. + my ($self, $genomeID) = @_; + # Declare the return variable. + my $retVal; + # Get the genome's data. + my $genomeData = $self->GetEntity('Genome', $genomeID); + if ($genomeData) { + # The genome exists, so get the completeness flag. + ($retVal) = $genomeData->Value('complete'); + } + # Return the result. + return $retVal; } =head3 FeatureAliases @@ -1281,12 +1372,12 @@ =cut #: Return Type @; sub FeatureAliases { - # Get the parameters. - my ($self, $featureID) = @_; - # Get the desired feature's aliases - my @retVal = $self->GetEntityValues('Feature', $featureID, ['Feature(alias)']); - # Return the result. - return @retVal; + # Get the parameters. + my ($self, $featureID) = @_; + # Get the desired feature's aliases + my @retVal = $self->GetEntityValues('Feature', $featureID, ['Feature(alias)']); + # Return the result. + return @retVal; } =head3 GenomeOf @@ -1311,18 +1402,18 @@ =cut #: Return Type $; sub GenomeOf { - # Get the parameters. - my ($self, $featureID) = @_; - # Create a query to find the genome associated with the feature. - my $query = $self->Get(['IsLocatedIn', 'HasContig'], "IsLocatedIn(from-link) = ?", [$featureID]); - # Declare the return value. - my $retVal; - # Get the genome ID. - if (my $relationship = $query->Fetch()) { - ($retVal) = $relationship->Value('HasContig(from-link)'); - } - # Return the value found. - return $retVal; + # Get the parameters. + my ($self, $featureID) = @_; + # Create a query to find the genome associated with the feature. + my $query = $self->Get(['IsLocatedIn', 'HasContig'], "IsLocatedIn(from-link) = ?", [$featureID]); + # Declare the return value. + my $retVal; + # Get the genome ID. + if (my $relationship = $query->Fetch()) { + ($retVal) = $relationship->Value('HasContig(from-link)'); + } + # Return the value found. + return $retVal; } =head3 CoupledFeatures @@ -1347,31 +1438,213 @@ =cut #: Return Type %; sub CoupledFeatures { - # Get the parameters. - my ($self, $featureID) = @_; - # Create a query to retrieve the functionally-coupled features. Note that we depend on the - # fact that the functional coupling is physically paired. If (A,B) is in the database, then - # (B,A) will also be found. - my $query = $self->Get(['IsClusteredOnChromosomeWith'], - "IsClusteredOnChromosomeWith(from-link) = ?", [$featureID]); - # This value will be set to TRUE if we find at least one coupled feature. - my $found = 0; - # Create the return hash. - my %retVal = (); - # Retrieve the relationship records and store them in the hash. - while (my $clustering = $query->Fetch()) { - my ($otherFeatureID, $score) = $clustering->Values(['IsClusteredOnChromosomeWith(to-link)', - 'IsClusteredOnChromosomeWith(score)']); - $retVal{$otherFeatureID} = $score; - $found = 1; - } - # Functional coupling is reflexive. If we found at least one coupled feature, we must add - # the incoming feature as well. - if ($found) { - $retVal{$featureID} = 9999; + # Get the parameters. + my ($self, $featureID) = @_; + # Create a query to retrieve the functionally-coupled features. + my $query = $self->Get(['ParticipatesInCoupling', 'Coupling'], + "ParticipatesInCoupling(from-link) = ?", [$featureID]); + # This value will be set to TRUE if we find at least one coupled feature. + my $found = 0; + # Create the return hash. + my %retVal = (); + # Retrieve the relationship records and store them in the hash. + while (my $clustering = $query->Fetch()) { + # Get the ID and score of the coupling. + my ($couplingID, $score) = $clustering->Values(['Coupling(id)', + 'Coupling(score)']); + # The coupling ID contains the two feature IDs separated by a space. We use + # this information to find the ID of the other feature. + my ($fid1, $fid2) = split / /, $couplingID; + my $otherFeatureID = ($featureID eq $fid1 ? $fid2 : $fid1); + # Attach the other feature's score to its ID. + $retVal{$otherFeatureID} = $score; + $found = 1; + } + # Functional coupling is reflexive. If we found at least one coupled feature, we must add + # the incoming feature as well. + if ($found) { + $retVal{$featureID} = 9999; + } + # Return the hash. + return %retVal; +} + +=head3 CouplingEvidence + +C<< my @evidence = $sprout->CouplingEvidence($peg1, $peg2); >> + +Return the evidence for a functional coupling. + +A pair of features is considered evidence of a coupling between two other +features if they occur close together on a contig and both are similar to +the coupled features. So, if B and B are close together on a contig, +B and B are considered evidence for the coupling if (1) B and +B are close together, (2) B is similar to B, and (3) B is +similar to B. + +The score of a coupling is determined by the number of pieces of evidence +that are considered I. If several evidence items belong to +a group of genomes that are close to each other, only one of those items +is considered representative. The other evidence items are presumed to be +there because of the relationship between the genomes rather than because +the two proteins generated by the features have a related functionality. + +Each evidence item is returned as a three-tuple in the form C<[>I<$peg1a>C<,> +I<$peg2a>C<,> I<$rep>C<]>, where I<$peg1a> is similar to I<$peg1>, I<$peg2a> +is similar to I<$peg2>, and I<$rep> is TRUE if the evidence is representative +and FALSE otherwise. + +=over 4 + +=item peg1 + +ID of the feature of interest. + +=item peg2 + +ID of a feature functionally coupled to the feature of interest. + +=item RETURN + +Returns a list of 3-tuples. Each tuple consists of a feature similar to the feature +of interest, a feature similar to the functionally coupled feature, and a flag +that is TRUE for a representative piece of evidence and FALSE otherwise. + +=back + +=cut +#: Return Type @@; +sub CouplingEvidence { + # Get the parameters. + my ($self, $peg1, $peg2) = @_; + # Declare the return variable. + my @retVal = (); + # Our first task is to find out the nature of the coupling: whether or not + # it exists, its score, and whether the features are stored in the same + # order as the ones coming in. + my ($couplingID, $inverted, $score) = $self->GetCoupling($peg1, $peg2); + # Only proceed if a coupling exists. + if ($couplingID) { + # Determine the ordering to place on the evidence items. If we're + # inverted, we want to see feature 2 before feature 1 (descending); otherwise, + # we want feature 1 before feature 2 (normal). + Trace("Coupling evidence for ($peg1, $peg2) with inversion flag $inverted.") if T(Coupling => 4); + my $ordering = ($inverted ? "DESC" : ""); + # Get the coupling evidence. + my @evidenceList = $self->GetAll(['IsEvidencedBy', 'PCH', 'UsesAsEvidence'], + "IsEvidencedBy(from-link) = ? ORDER BY PCH(id), UsesAsEvidence(pos) $ordering", + [$couplingID], + ['PCH(used)', 'UsesAsEvidence(to-link)']); + # Loop through the evidence items. Each piece of evidence is represented by two + # positions in the evidence list, one for each feature on the other side of the + # evidence link. If at some point we want to generalize to couplings with + # more than two positions, this section of code will need to be re-done. + while (@evidenceList > 0) { + my $peg1Data = shift @evidenceList; + my $peg2Data = shift @evidenceList; + Trace("Peg 1 is " . $peg1Data->[1] . " and Peg 2 is " . $peg2Data->[1] . ".") if T(Coupling => 4); + push @retVal, [$peg1Data->[1], $peg2Data->[1], $peg1Data->[0]]; + } + Trace("Last index in evidence result is is $#retVal.") if T(Coupling => 4); + } + # Return the result. + return @retVal; +} + +=head3 GetCoupling + +C<< my ($couplingID, $inverted, $score) = $sprout->GetCoupling($peg1, $peg2); >> + +Return the coupling (if any) for the specified pair of PEGs. If a coupling +exists, we return the coupling ID along with an indicator of whether the +coupling is stored as C<(>I<$peg1>C<, >I<$peg2>C<)> or C<(>I<$peg2>C<, >I<$peg1>C<)>. +In the second case, we say the coupling is I. The importance of an +inverted coupling is that the PEGs in the evidence will appear in reverse order. + +=over 4 + +=item peg1 + +ID of the feature of interest. + +=item peg2 + +ID of the potentially coupled feature. + +=item RETURN + +Returns a three-element list. The first element contains the database ID of +the coupling. The second element is FALSE if the coupling is stored in the +database in the caller specified order and TRUE if it is stored in the +inverted order. The third element is the coupling's score. If the coupling +does not exist, all three list elements will be C. + +=back + +=cut +#: Return Type $%@; +sub GetCoupling { + # Get the parameters. + my ($self, $peg1, $peg2) = @_; + # Declare the return values. We'll start with the coupling ID and undefine the + # flag and score until we have more information. + my ($retVal, $inverted, $score) = (CouplingID($peg1, $peg2), undef, undef); + # Find the coupling data. + my @pegs = $self->GetAll(['Coupling', 'ParticipatesInCoupling'], + "Coupling(id) = ? ORDER BY ParticipatesInCoupling(pos)", + [$retVal], ["ParticipatesInCoupling(from-link)", "Coupling(score)"]); + # Check to see if we found anything. + if (!@pegs) { + Trace("No coupling found.") if T(Coupling => 4); + # No coupling, so undefine the return value. + $retVal = undef; + } else { + # We have a coupling! Get the score and check for inversion. + $score = $pegs[0]->[1]; + my $firstFound = $pegs[0]->[0]; + $inverted = ($firstFound ne $peg1); + Trace("Coupling score is $score. First peg is $firstFound, peg 1 is $peg1.") if T(Coupling => 4); } - # Return the hash. - return %retVal; + # Return the result. + return ($retVal, $inverted, $score); +} + +=head3 CouplingID + +C<< my $couplingID = Sprout::CouplingID($peg1, $peg2); >> + +Return the coupling ID for a pair of feature IDs. + +The coupling ID is currently computed by joining the feature IDs in +sorted order with a space. Client modules (that is, modules which +use Sprout) should not, however, count on this always being the +case. This method provides a way for abstracting the concept of a +coupling ID. All that we know for sure about it is that it can be +generated easily from the feature IDs and the order of the IDs +in the parameter list does not matter (i.e. C +will have the same value as C. + +=over 4 + +=item peg1 + +First feature of interest. + +=item peg2 + +Second feature of interest. + +=item RETURN + +Returns the ID that would be used to represent a functional coupling of +the two specified PEGs. + +=back + +=cut +#: Return Type $; +sub CouplingID { + return join " ", sort @_; } =head3 GetEntityTypes @@ -1383,12 +1656,12 @@ =cut #: Return Type @; sub GetEntityTypes { - # Get the parameters. - my ($self) = @_; - # Get the underlying database object. - my $erdb = $self->{_erdb}; - # Get its entity type list. - my @retVal = $erdb->GetEntityTypes(); + # Get the parameters. + my ($self) = @_; + # Get the underlying database object. + my $erdb = $self->{_erdb}; + # Get its entity type list. + my @retVal = $erdb->GetEntityTypes(); } =head3 ReadFasta @@ -1418,43 +1691,43 @@ =cut #: Return Type %; sub ReadFasta { - # Get the parameters. - my ($fileName, $prefix) = @_; - # Create the return hash. - my %retVal = (); - # Open the file for input. - open FASTAFILE, '<', $fileName; - # Declare the ID variable and clear the sequence accumulator. - my $sequence = ""; - my $id = ""; - # Loop through the file. - while () { - # Get the current line. - my $line = $_; - # Check for a header line. - if ($line =~ m/^>\s*(.+?)(\s|\n)/) { - # Here we have a new header. Store the current sequence if we have one. - if ($id) { - $retVal{$id} = uc $sequence; - } - # Clear the sequence accumulator and save the new ID. - ($id, $sequence) = ("$prefix$1", ""); - } else { - # Here we have a data line, so we add it to the sequence accumulator. - # First, we get the actual data out. Note that we normalize to upper - # case. - $line =~ /^\s*(.*?)(\s|\n)/; - $sequence .= $1; - } - } - # Flush out the last sequence (if any). - if ($sequence) { - $retVal{$id} = uc $sequence; - } - # Close the file. - close FASTAFILE; - # Return the hash constructed from the file. - return %retVal; + # Get the parameters. + my ($fileName, $prefix) = @_; + # Create the return hash. + my %retVal = (); + # Open the file for input. + open FASTAFILE, '<', $fileName; + # Declare the ID variable and clear the sequence accumulator. + my $sequence = ""; + my $id = ""; + # Loop through the file. + while () { + # Get the current line. + my $line = $_; + # Check for a header line. + if ($line =~ m/^>\s*(.+?)(\s|\n)/) { + # Here we have a new header. Store the current sequence if we have one. + if ($id) { + $retVal{$id} = lc $sequence; + } + # Clear the sequence accumulator and save the new ID. + ($id, $sequence) = ("$prefix$1", ""); + } else { + # Here we have a data line, so we add it to the sequence accumulator. + # First, we get the actual data out. Note that we normalize to lower + # case. + $line =~ /^\s*(.*?)(\s|\n)/; + $sequence .= $1; + } + } + # Flush out the last sequence (if any). + if ($sequence) { + $retVal{$id} = lc $sequence; + } + # Close the file. + close FASTAFILE; + # Return the hash constructed from the file. + return %retVal; } =head3 FormatLocations @@ -1464,7 +1737,7 @@ Insure that a list of feature locations is in the Sprout format. The Sprout feature location format is I_I where I<*> is C<+> for a forward gene and C<-> for a backward gene. The old format is I_I_I. If a feature is in the new format already, -it will not be changed; otherwise, it will be converted. This method can also be used to +it will not be changed; otherwise, it will be converted. This method can also be used to perform the reverse task-- insuring that all the locations are in the old format. =over 4 @@ -1491,35 +1764,35 @@ =cut #: Return Type @; sub FormatLocations { - # Get the parameters. - my ($self, $prefix, $locations, $oldFormat) = @_; - # Create the return list. - my @retVal = (); - # Check to see if any locations were passed in. - if ($locations eq '') { - Confess("No locations specified."); - } else { - # Loop through the locations, converting them to the new format. - for my $location (@{$locations}) { - # Parse the location elements. - my ($contig, $beg, $dir, $len) = ParseLocation($location); - # Process according to the desired output format. - if (!$oldFormat) { - # Here we're producing the new format. Add the location to the return list. - push @retVal, "$prefix${contig}_$beg$dir$len"; - } elsif ($dir eq '+') { - # Here we're producing the old format and it's a forward gene. - my $end = $beg + $len - 1; - push @retVal, "$prefix${contig}_${beg}_$end"; - } else { - # Here we're producting the old format and it's a backward gene. - my $end = $beg - $len + 1; - push @retVal, "$prefix${contig}_${beg}_$end"; - } - } - } - # Return the normalized list. - return @retVal; + # Get the parameters. + my ($self, $prefix, $locations, $oldFormat) = @_; + # Create the return list. + my @retVal = (); + # Check to see if any locations were passed in. + if ($locations eq '') { + Confess("No locations specified."); + } else { + # Loop through the locations, converting them to the new format. + for my $location (@{$locations}) { + # Parse the location elements. + my ($contig, $beg, $dir, $len) = ParseLocation($location); + # Process according to the desired output format. + if (!$oldFormat) { + # Here we're producing the new format. Add the location to the return list. + push @retVal, "$prefix${contig}_$beg$dir$len"; + } elsif ($dir eq '+') { + # Here we're producing the old format and it's a forward gene. + my $end = $beg + $len - 1; + push @retVal, "$prefix${contig}_${beg}_$end"; + } else { + # Here we're producting the old format and it's a backward gene. + my $end = $beg - $len + 1; + push @retVal, "$prefix${contig}_${beg}_$end"; + } + } + } + # Return the normalized list. + return @retVal; } =head3 DumpData @@ -1531,12 +1804,12 @@ =cut sub DumpData { - # Get the parameters. - my ($self) = @_; - # Get the data directory name. - my $outputDirectory = $self->{_options}->{dataDir}; - # Dump the relations. - $self->{_erdb}->DumpRelations($outputDirectory); + # Get the parameters. + my ($self) = @_; + # Get the data directory name. + my $outputDirectory = $self->{_options}->{dataDir}; + # Dump the relations. + $self->{_erdb}->DumpRelations($outputDirectory); } =head3 XMLFileName @@ -1548,8 +1821,8 @@ =cut #: Return Type $; sub XMLFileName { - my ($self) = @_; - return $self->{_xmlName}; + my ($self) = @_; + return $self->{_xmlName}; } =head3 Insert @@ -1568,7 +1841,7 @@ The next statement inserts a C relationship between feature C and property C<4> with an evidence URL of C. -C<< $sprout->InsertObject('HasProperty', { 'from-link' => 'fig|158879.1.peg.1', 'to-link' => 4, evidence = 'http://seedu.uchicago.edu/query.cgi?article_id=142'}); >> +C<< $sprout->InsertObject('HasProperty', { 'from-link' => 'fig|158879.1.peg.1', 'to-link' => 4, evidence => 'http://seedu.uchicago.edu/query.cgi?article_id=142'}); >> =over 4 @@ -1585,10 +1858,10 @@ =cut #: Return Type ; sub Insert { - # Get the parameters. - my ($self, $objectType, $fieldHash) = @_; - # Call the underlying method. - $self->{_erdb}->InsertObject($objectType, $fieldHash); + # Get the parameters. + my ($self, $objectType, $fieldHash) = @_; + # Call the underlying method. + $self->{_erdb}->InsertObject($objectType, $fieldHash); } =head3 Annotate @@ -1626,23 +1899,23 @@ =cut #: Return Type $; sub Annotate { - # Get the parameters. - my ($self, $fid, $timestamp, $user, $text) = @_; - # Create the annotation ID. - my $aid = "$fid:$timestamp"; - # Insert the Annotation object. - my $retVal = $self->Insert('Annotation', { id => $aid, time => $timestamp, annotation => $text }); - if ($retVal) { - # Connect it to the user. - $retVal = $self->Insert('MadeAnnotation', { 'from-link' => $user, 'to-link' => $aid }); - if ($retVal) { - # Connect it to the feature. - $retVal = $self->Insert('IsTargetOfAnnotation', { 'from-link' => $fid, - 'to-link' => $aid }); - } - } - # Return the success indicator. - return $retVal; + # Get the parameters. + my ($self, $fid, $timestamp, $user, $text) = @_; + # Create the annotation ID. + my $aid = "$fid:$timestamp"; + # Insert the Annotation object. + my $retVal = $self->Insert('Annotation', { id => $aid, time => $timestamp, annotation => $text }); + if ($retVal) { + # Connect it to the user. + $retVal = $self->Insert('MadeAnnotation', { 'from-link' => $user, 'to-link' => $aid }); + if ($retVal) { + # Connect it to the feature. + $retVal = $self->Insert('IsTargetOfAnnotation', { 'from-link' => $fid, + 'to-link' => $aid }); + } + } + # Return the success indicator. + return $retVal; } =head3 AssignFunction @@ -1679,30 +1952,30 @@ =cut #: Return Type $; sub AssignFunction { - # Get the parameters. - my ($self, $featureID, $user, $function, $assigningUser) = @_; + # Get the parameters. + my ($self, $featureID, $user, $function, $assigningUser) = @_; # Default the assigning user. if (! $assigningUser) { $assigningUser = $user; } - # Create an annotation string from the parameters. - my $annotationText = "$assigningUser\nset $user function to\n$function"; - # Get the current time. - my $now = time; - # Declare the return variable. - my $retVal = 1; - # Locate the genome containing the feature. - my $genome = $self->GenomeOf($featureID); - if (!$genome) { - # Here the genome was not found. This probably means the feature ID is invalid. - Trace("No genome found for feature $featureID.") if T(0); - $retVal = 0; - } else { - # Here we know we have a feature with a genome. Store the annotation. + # Create an annotation string from the parameters. + my $annotationText = "$assigningUser\nset $user function to\n$function"; + # Get the current time. + my $now = time; + # Declare the return variable. + my $retVal = 1; + # Locate the genome containing the feature. + my $genome = $self->GenomeOf($featureID); + if (!$genome) { + # Here the genome was not found. This probably means the feature ID is invalid. + Trace("No genome found for feature $featureID.") if T(0); + $retVal = 0; + } else { + # Here we know we have a feature with a genome. Store the annotation. $retVal = $self->Annotate($featureID, $now, $user, $annotationText); - } - # Return the success indicator. - return $retVal; + } + # Return the success indicator. + return $retVal; } =head3 FeaturesByAlias @@ -1730,21 +2003,21 @@ =cut #: Return Type @; sub FeaturesByAlias { - # Get the parameters. - my ($self, $alias) = @_; - # Declare the return variable. - my @retVal = (); - # Parse the alias. - my ($mappedAlias, $flag) = FIGRules::NormalizeAlias($alias); - # If it's a FIG alias, we're done. - if ($flag) { - push @retVal, $mappedAlias; - } else { - # Here we have a non-FIG alias. Get the features with the normalized alias. - @retVal = $self->GetFlat(['Feature'], 'Feature(alias) = ?', [$mappedAlias], 'Feature(id)'); - } - # Return the result. - return @retVal; + # Get the parameters. + my ($self, $alias) = @_; + # Declare the return variable. + my @retVal = (); + # Parse the alias. + my ($mappedAlias, $flag) = FIGRules::NormalizeAlias($alias); + # If it's a FIG alias, we're done. + if ($flag) { + push @retVal, $mappedAlias; + } else { + # Here we have a non-FIG alias. Get the features with the normalized alias. + @retVal = $self->GetFlat(['Feature'], 'Feature(alias) = ?', [$mappedAlias], 'Feature(id)'); + } + # Return the result. + return @retVal; } =head3 Exists @@ -1772,13 +2045,13 @@ =cut #: Return Type $; sub Exists { - # Get the parameters. - my ($self, $entityName, $entityID) = @_; - # Check for the entity instance. - my $testInstance = $self->GetEntity($entityName, $entityID); - # Return an existence indicator. - my $retVal = ($testInstance ? 1 : 0); - return $retVal; + # Get the parameters. + my ($self, $entityName, $entityID) = @_; + # Check for the entity instance. + my $testInstance = $self->GetEntity($entityName, $entityID); + # Return an existence indicator. + my $retVal = ($testInstance ? 1 : 0); + return $retVal; } =head3 FeatureTranslation @@ -1802,11 +2075,11 @@ =cut #: Return Type $; sub FeatureTranslation { - # Get the parameters. - my ($self, $featureID) = @_; - # Get the specified feature's translation. - my ($retVal) = $self->GetEntityValues("Feature", $featureID, ['Feature(translation)']); - return $retVal; + # Get the parameters. + my ($self, $featureID) = @_; + # Get the specified feature's translation. + my ($retVal) = $self->GetEntityValues("Feature", $featureID, ['Feature(translation)']); + return $retVal; } =head3 Taxonomy @@ -1834,20 +2107,20 @@ =cut #: Return Type @; sub Taxonomy { - # Get the parameters. - my ($self, $genome) = @_; - # Find the specified genome's taxonomy string. - my ($list) = $self->GetEntityValues('Genome', $genome, ['Genome(taxonomy)']); - # Declare the return variable. - my @retVal = (); - # If we found the genome, return its taxonomy string. - if ($list) { - @retVal = split /\s*;\s*/, $list; - } else { - Trace("Genome \"$genome\" does not have a taxonomy in the database.\n") if T(0); - } - # Return the value found. - return @retVal; + # Get the parameters. + my ($self, $genome) = @_; + # Find the specified genome's taxonomy string. + my ($list) = $self->GetEntityValues('Genome', $genome, ['Genome(taxonomy)']); + # Declare the return variable. + my @retVal = (); + # If we found the genome, return its taxonomy string. + if ($list) { + @retVal = split /\s*;\s*/, $list; + } else { + Trace("Genome \"$genome\" does not have a taxonomy in the database.\n") if T(0); + } + # Return the value found. + return @retVal; } =head3 CrudeDistance @@ -1877,28 +2150,28 @@ =cut #: Return Type $; sub CrudeDistance { - # Get the parameters. - my ($self, $genome1, $genome2) = @_; - # Insure that the distance is commutative by sorting the genome IDs. - my ($genomeA, $genomeB); - if ($genome2 < $genome2) { - ($genomeA, $genomeB) = ($genome1, $genome2); - } else { - ($genomeA, $genomeB) = ($genome2, $genome1); - } - my @taxA = $self->Taxonomy($genomeA); - my @taxB = $self->Taxonomy($genomeB); - # Initialize the distance to 1. We'll reduce it each time we find a match between the - # taxonomies. - my $retVal = 1.0; - # Initialize the subtraction amount. This amount determines the distance reduction caused - # by a mismatch at the current level. - my $v = 0.5; - # Loop through the taxonomies. - for (my $i = 0; ($i < @taxA) && ($i < @taxB) && ($taxA[$i] eq $taxB[$i]); $i++) { - $retVal -= $v; - $v /= 2; - } + # Get the parameters. + my ($self, $genome1, $genome2) = @_; + # Insure that the distance is commutative by sorting the genome IDs. + my ($genomeA, $genomeB); + if ($genome2 < $genome2) { + ($genomeA, $genomeB) = ($genome1, $genome2); + } else { + ($genomeA, $genomeB) = ($genome2, $genome1); + } + my @taxA = $self->Taxonomy($genomeA); + my @taxB = $self->Taxonomy($genomeB); + # Initialize the distance to 1. We'll reduce it each time we find a match between the + # taxonomies. + my $retVal = 1.0; + # Initialize the subtraction amount. This amount determines the distance reduction caused + # by a mismatch at the current level. + my $v = 0.5; + # Loop through the taxonomies. + for (my $i = 0; ($i < @taxA) && ($i < @taxB) && ($taxA[$i] eq $taxB[$i]); $i++) { + $retVal -= $v; + $v /= 2; + } return $retVal; } @@ -1924,16 +2197,16 @@ =cut #: Return Type $; sub RoleName { - # Get the parameters. - my ($self, $roleID) = @_; - # Get the specified role's name. - my ($retVal) = $self->GetEntityValues('Role', $roleID, ['Role(name)']); - # Use the ID if the role has no name. - if (!$retVal) { - $retVal = $roleID; - } - # Return the name. - return $retVal; + # Get the parameters. + my ($self, $roleID) = @_; + # Get the specified role's name. + my ($retVal) = $self->GetEntityValues('Role', $roleID, ['Role(name)']); + # Use the ID if the role has no name. + if (!$retVal) { + $retVal = $roleID; + } + # Return the name. + return $retVal; } =head3 RoleDiagrams @@ -1957,13 +2230,96 @@ =cut #: Return Type @; sub RoleDiagrams { - # Get the parameters. - my ($self, $roleID) = @_; - # Query for the diagrams. - my @retVal = $self->GetFlat(['RoleOccursIn'], "RoleOccursIn(from-link) = ?", [$roleID], - 'RoleOccursIn(to-link)'); - # Return the result. - return @retVal; + # Get the parameters. + my ($self, $roleID) = @_; + # Query for the diagrams. + my @retVal = $self->GetFlat(['RoleOccursIn'], "RoleOccursIn(from-link) = ?", [$roleID], + 'RoleOccursIn(to-link)'); + # Return the result. + return @retVal; +} + +=head3 GetProperties + +C<< my @list = $sprout->GetProperties($fid, $key, $value, $url); >> + +Return a list of the properties with the specified characteristics. + +Properties are arbitrary key-value pairs associated with a feature. (At some point they +will also be associated with genomes.) A property value is represented by a 4-tuple of +the form B<($fid, $key, $value, $url)>. These exactly correspond to the parameter + +=over 4 + +=item fid + +ID of the feature possessing the property. + +=item key + +Name or key of the property. + +=item value + +Value of the property. + +=item url + +URL of the document that indicated the property should have this particular value, or an +empty string if no such document exists. + +=back + +The parameters act as a filter for the desired data. Any non-null parameter will +automatically match all the tuples returned. So, specifying just the I<$fid> will +return all the properties of the specified feature; similarly, specifying the I<$key> +and I<$value> parameters will return all the features having the specified property +value. + +A single property key can have many values, representing different ideas about the +feature in question. For example, one paper may declare that a feature C is +virulent, and another may declare that it is not virulent. A query about the virulence of +C would be coded as + + my @list = $sprout->GetProperties('fig|83333.1.peg.10', 'virulence', '', ''); + +Here the I<$value> and I<$url> fields are left blank, indicating that those fields are +not to be filtered. The tuples returned would be + + ('fig|83333.1.peg.10', 'virulence', 'yes', 'http://www.somewhere.edu/first.paper.pdf') + ('fig|83333.1.peg.10', 'virulence', 'no', 'http://www.somewhere.edu/second.paper.pdf') + +=cut +#: Return Type @@; +sub GetProperties { + # Get the parameters. + my ($self, @parms) = @_; + # Declare the return variable. + my @retVal = (); + # Now we need to create a WHERE clause that will get us the data we want. First, + # we create a list of the columns containing the data for each parameter. + my @colNames = ('HasProperty(from-link)', 'Property(property-name)', + 'Property(property-value)', 'HasProperty(evidence)'); + # Now we build the WHERE clause and the list of parameter values. + my @where = (); + my @values = (); + for (my $i = 0; $i <= $#colNames; $i++) { + my $parm = $parms[$i]; + if (defined $parm && ($parm ne '')) { + push @where, "$colNames[$i] = ?"; + push @values, $parm; + } + } + # Format the WHERE clause. + my $filter = (@values > 0 ? (join " AND ", @where) : undef); + # Ask for all the propertie values with the desired characteristics. + my $query = $self->Get(['HasProperty', 'Property'], $filter, \@values); + while (my $valueObject = $query->Fetch()) { + my @tuple = $valueObject->Values(\@colNames); + push @retVal, \@tuple; + } + # Return the result. + return @retVal; } =head3 FeatureProperties @@ -1994,14 +2350,14 @@ =cut #: Return Type @@; sub FeatureProperties { - # Get the parameters. - my ($self, $featureID) = @_; - # Get the properties. - my @retVal = $self->GetAll(['HasProperty', 'Property'], "HasProperty(from-link) = ?", [$featureID], - ['Property(property-name)', 'Property(property-value)', - 'HasProperty(evidence)']); - # Return the resulting list. - return @retVal; + # Get the parameters. + my ($self, $featureID) = @_; + # Get the properties. + my @retVal = $self->GetAll(['HasProperty', 'Property'], "HasProperty(from-link) = ?", [$featureID], + ['Property(property-name)', 'Property(property-value)', + 'HasProperty(evidence)']); + # Return the resulting list. + return @retVal; } =head3 DiagramName @@ -2025,11 +2381,11 @@ =cut #: Return Type $; sub DiagramName { - # Get the parameters. - my ($self, $diagramID) = @_; - # Get the specified diagram's name and return it. - my ($retVal) = $self->GetEntityValues('Diagram', $diagramID, ['Diagram(name)']); - return $retVal; + # Get the parameters. + my ($self, $diagramID) = @_; + # Get the specified diagram's name and return it. + my ($retVal) = $self->GetEntityValues('Diagram', $diagramID, ['Diagram(name)']); + return $retVal; } =head3 MergedAnnotations @@ -2057,28 +2413,28 @@ =cut #: Return Type @; sub MergedAnnotations { - # Get the parameters. - my ($self, $list) = @_; - # Create a list to hold the annotation tuples found. - my @tuples = (); - # Loop through the features in the input list. - for my $fid (@{$list}) { - # Create a list of this feature's annotation tuples. - my @newTuples = $self->GetAll(['IsTargetOfAnnotation', 'Annotation', 'MadeAnnotation'], - "IsTargetOfAnnotation(from-link) = ?", [$fid], - ['IsTargetOfAnnotation(from-link)', 'Annotation(time)', - 'MadeAnnotation(from-link)', 'Annotation(annotation)']); - # Put it in the result list. - push @tuples, @newTuples; - } - # Sort the result list by timestamp. - my @retVal = sort { $a->[1] <=> $b->[1] } @tuples; + # Get the parameters. + my ($self, $list) = @_; + # Create a list to hold the annotation tuples found. + my @tuples = (); + # Loop through the features in the input list. + for my $fid (@{$list}) { + # Create a list of this feature's annotation tuples. + my @newTuples = $self->GetAll(['IsTargetOfAnnotation', 'Annotation', 'MadeAnnotation'], + "IsTargetOfAnnotation(from-link) = ?", [$fid], + ['IsTargetOfAnnotation(from-link)', 'Annotation(time)', + 'MadeAnnotation(from-link)', 'Annotation(annotation)']); + # Put it in the result list. + push @tuples, @newTuples; + } + # Sort the result list by timestamp. + my @retVal = sort { $a->[1] <=> $b->[1] } @tuples; # Loop through and make the time stamps friendly. for my $tuple (@retVal) { $tuple->[1] = FriendlyTimestamp($tuple->[1]); } - # Return the sorted list. - return @retVal; + # Return the sorted list. + return @retVal; } =head3 RoleNeighbors @@ -2105,23 +2461,23 @@ =cut #: Return Type @; sub RoleNeighbors { - # Get the parameters. - my ($self, $roleID) = @_; - # Get all the diagrams containing this role. - my @diagrams = $self->GetFlat(['RoleOccursIn'], "RoleOccursIn(from-link) = ?", [$roleID], - 'RoleOccursIn(to-link)'); - # Create the return list. - my @retVal = (); - # Loop through the diagrams. - for my $diagramID (@diagrams) { - # Get all the roles in this diagram. - my @roles = $self->GetFlat(['RoleOccursIn'], "RoleOccursIn(to-link) = ?", [$diagramID], - 'RoleOccursIn(from-link)'); - # Add them to the return list. - push @retVal, @roles; - } - # Merge the duplicates from the list. - return Tracer::Merge(@retVal); + # Get the parameters. + my ($self, $roleID) = @_; + # Get all the diagrams containing this role. + my @diagrams = $self->GetFlat(['RoleOccursIn'], "RoleOccursIn(from-link) = ?", [$roleID], + 'RoleOccursIn(to-link)'); + # Create the return list. + my @retVal = (); + # Loop through the diagrams. + for my $diagramID (@diagrams) { + # Get all the roles in this diagram. + my @roles = $self->GetFlat(['RoleOccursIn'], "RoleOccursIn(to-link) = ?", [$diagramID], + 'RoleOccursIn(from-link)'); + # Add them to the return list. + push @retVal, @roles; + } + # Merge the duplicates from the list. + return Tracer::Merge(@retVal); } =head3 FeatureLinks @@ -2147,12 +2503,12 @@ =cut #: Return Type @; sub FeatureLinks { - # Get the parameters. - my ($self, $featureID) = @_; - # Get the feature's links. - my @retVal = $self->GetEntityValues('Feature', $featureID, ['Feature(link)']); - # Return the feature's links. - return @retVal; + # Get the parameters. + my ($self, $featureID) = @_; + # Get the feature's links. + my @retVal = $self->GetEntityValues('Feature', $featureID, ['Feature(link)']); + # Return the feature's links. + return @retVal; } =head3 SubsystemsOf @@ -2160,7 +2516,7 @@ C<< my %subsystems = $sprout->SubsystemsOf($featureID); >> Return a hash describing all the subsystems in which a feature participates. Each subsystem is mapped -to the role the feature performs. +to the roles the feature performs. =over 4 @@ -2170,27 +2526,64 @@ =item RETURN -Returns a hash mapping all the feature's subsystems to the feature's role. +Returns a hash mapping all the feature's subsystems to a list of the feature's roles. =back =cut -#: Return Type %; +#: Return Type %@; sub SubsystemsOf { - # Get the parameters. - my ($self, $featureID) = @_; - # Use the SSCell to connect features to subsystems. - my @subsystems = $self->GetAll(['ContainsFeature', 'HasSSCell', 'IsRoleOf'], - "ContainsFeature(to-link) = ?", [$featureID], - ['HasSSCell(from-link)', 'IsRoleOf(from-link)']); - # Create the return value. - my %retVal = (); - # Loop through the results, adding them to the hash. - for my $record (@subsystems) { - $retVal{$record->[0]} = $record->[1]; - } - # Return the hash. - return %retVal; + # Get the parameters. + my ($self, $featureID) = @_; + # Get the subsystem list. + my @subsystems = $self->GetAll(['ContainsFeature', 'HasSSCell', 'IsRoleOf'], + "ContainsFeature(to-link) = ?", [$featureID], + ['HasSSCell(from-link)', 'IsRoleOf(from-link)']); + # Create the return value. + my %retVal = (); + # Loop through the results, adding them to the hash. + for my $record (@subsystems) { + my ($subsys, $role) = @{$record}; + if (exists $retVal{$subsys}) { + push @{$retVal{$subsys}}, $role; + } else { + $retVal{$subsys} = [$role]; + } + } + # Return the hash. + return %retVal; +} + +=head3 SubsystemList + +C<< my @subsystems = $sprout->SubsystemList($featureID); >> + +Return a list containing the names of the subsystems in which the specified +feature participates. Unlike L, this method only returns the +subsystem names, not the roles. + +=over 4 + +=item featureID + +ID of the feature whose subsystem names are desired. + +=item RETURN + +Returns a list of the names of the subsystems in which the feature participates. + +=back + +=cut +#: Return Type @; +sub SubsystemList { + # Get the parameters. + my ($self, $featureID) = @_; + # Get the list of names. + my @retVal = $self->GetFlat(['ContainsFeature', 'HasSSCell'], "ContainsFeature(to-link) = ?", + [$featureID], 'HasSSCell(from-link)'); + # Return the result. + return @retVal; } =head3 RelatedFeatures @@ -2225,25 +2618,25 @@ =cut #: Return Type @; sub RelatedFeatures { - # Get the parameters. - my ($self, $featureID, $function, $userID) = @_; - # Get a list of the features that are BBHs of the incoming feature. - my @bbhFeatures = $self->GetFlat(['IsBidirectionalBestHitOf'], - "IsBidirectionalBestHitOf(from-link) = ?", [$featureID], - 'IsBidirectionalBestHitOf(to-link)'); - # Now we loop through the features, pulling out the ones that have the correct - # functional assignment. - my @retVal = (); - for my $bbhFeature (@bbhFeatures) { - # Get this feature's functional assignment. - my $newFunction = $self->FunctionOf($bbhFeature, $userID); - # If it matches, add it to the result list. - if ($newFunction eq $function) { - push @retVal, $bbhFeature; - } - } - # Return the result list. - return @retVal; + # Get the parameters. + my ($self, $featureID, $function, $userID) = @_; + # Get a list of the features that are BBHs of the incoming feature. + my @bbhFeatures = $self->GetFlat(['IsBidirectionalBestHitOf'], + "IsBidirectionalBestHitOf(from-link) = ?", [$featureID], + 'IsBidirectionalBestHitOf(to-link)'); + # Now we loop through the features, pulling out the ones that have the correct + # functional assignment. + my @retVal = (); + for my $bbhFeature (@bbhFeatures) { + # Get this feature's functional assignment. + my $newFunction = $self->FunctionOf($bbhFeature, $userID); + # If it matches, add it to the result list. + if ($newFunction eq $function) { + push @retVal, $bbhFeature; + } + } + # Return the result list. + return @retVal; } =head3 TaxonomySort @@ -2273,25 +2666,25 @@ =cut #: Return Type @; sub TaxonomySort { - # Get the parameters. - my ($self, $featureIDs) = @_; - # Create the working hash table. - my %hashBuffer = (); - # Loop through the features. - for my $fid (@{$featureIDs}) { - # Get the taxonomy of the feature's genome. - my ($taxonomy) = $self->GetFlat(['IsLocatedIn', 'HasContig', 'Genome'], "IsLocatedIn(from-link) = ?", - [$fid], 'Genome(taxonomy)'); - # Add this feature to the hash buffer. + # Get the parameters. + my ($self, $featureIDs) = @_; + # Create the working hash table. + my %hashBuffer = (); + # Loop through the features. + for my $fid (@{$featureIDs}) { + # Get the taxonomy of the feature's genome. + my ($taxonomy) = $self->GetFlat(['IsLocatedIn', 'HasContig', 'Genome'], "IsLocatedIn(from-link) = ?", + [$fid], 'Genome(taxonomy)'); + # Add this feature to the hash buffer. Tracer::AddToListMap(\%hashBuffer, $taxonomy, $fid); - } - # Sort the keys and get the elements. - my @retVal = (); - for my $taxon (sort keys %hashBuffer) { - push @retVal, @{$hashBuffer{$taxon}}; - } - # Return the result. - return @retVal; + } + # Sort the keys and get the elements. + my @retVal = (); + for my $taxon (sort keys %hashBuffer) { + push @retVal, @{$hashBuffer{$taxon}}; + } + # Return the result. + return @retVal; } =head3 GetAll @@ -2351,26 +2744,13 @@ =cut #: Return Type @@; sub GetAll { - # Get the parameters. - my ($self, $objectNames, $filterClause, $parameterList, $fields, $count) = @_; - # Create the query. - my $query = $self->Get($objectNames, $filterClause, $parameterList); - # Set up a counter of the number of records read. - my $fetched = 0; - # Insure the counter has a value. - if (!defined $count) { - $count = 0; - } - # Loop through the records returned, extracting the fields. Note that if the - # counter is non-zero, we stop when the number of records read hits the count. - my @retVal = (); - while (($count == 0 || $fetched < $count) && (my $row = $query->Fetch())) { - my @rowData = $row->Values($fields); - push @retVal, \@rowData; - $fetched++; - } - # Return the resulting list. - return @retVal; + # Get the parameters. + my ($self, $objectNames, $filterClause, $parameterList, $fields, $count) = @_; + # Call the ERDB method. + my @retVal = $self->{_erdb}->GetAll($objectNames, $filterClause, $parameterList, + $fields, $count); + # Return the resulting list. + return @retVal; } =head3 GetFlat @@ -2412,18 +2792,18 @@ =cut #: Return Type @; sub GetFlat { - # Get the parameters. - my ($self, $objectNames, $filterClause, $parameterList, $field) = @_; - # Construct the query. - my $query = $self->Get($objectNames, $filterClause, $parameterList); - # Create the result list. - my @retVal = (); - # Loop through the records, adding the field values found to the result list. - while (my $row = $query->Fetch()) { - push @retVal, $row->Value($field); - } - # Return the list created. - return @retVal; + # Get the parameters. + my ($self, $objectNames, $filterClause, $parameterList, $field) = @_; + # Construct the query. + my $query = $self->Get($objectNames, $filterClause, $parameterList); + # Create the result list. + my @retVal = (); + # Loop through the records, adding the field values found to the result list. + while (my $row = $query->Fetch()) { + push @retVal, $row->Value($field); + } + # Return the list created. + return @retVal; } =head3 Protein @@ -2453,62 +2833,62 @@ # This is the translation table for protein synthesis. my $ProteinTable = { AAA => 'K', AAG => 'K', AAT => 'N', AAC => 'N', - AGA => 'R', AGG => 'R', AGT => 'S', AGC => 'S', - ATA => 'I', ATG => 'M', ATT => 'I', ATC => 'I', - ACA => 'T', ACG => 'T', ACT => 'T', ACC => 'T', - GAA => 'E', GAG => 'E', GAT => 'D', GAC => 'D', - GTA => 'V', GTG => 'V', GTT => 'V', GTC => 'V', - GGA => 'G', GGG => 'G', GGT => 'G', GGC => 'G', - GCA => 'A', GCG => 'A', GCT => 'A', GCC => 'A', - CAA => 'Q', CAG => 'Q', CAT => 'H', CAC => 'H', - CTA => 'L', CTG => 'L', CTT => 'L', CTC => 'L', - CGA => 'R', CGG => 'R', CGT => 'R', CGC => 'R', - CCA => 'P', CCG => 'P', CCT => 'P', CCC => 'P', - TAA => '*', TAG => '*', TAT => 'Y', TAC => 'Y', - TGA => '*', TGG => 'W', TGT => 'C', TGC => 'C', - TTA => 'L', TTG => 'L', TTT => 'F', TTC => 'F', - TCA => 'S', TCG => 'S', TCT => 'S', TCC => 'S', - AAR => 'K', AAY => 'N', - AGR => 'R', AGY => 'S', - ATY => 'I', - ACR => 'T', ACY => 'T', 'ACX' => 'T', - GAR => 'E', GAY => 'D', - GTR => 'V', GTY => 'V', GTX => 'V', - GGR => 'G', GGY => 'G', GGX => 'G', - GCR => 'A', GCY => 'A', GCX => 'A', - CAR => 'Q', CAY => 'H', - CTR => 'L', CTY => 'L', CTX => 'L', - CGR => 'R', CGY => 'R', CGX => 'R', - CCR => 'P', CCY => 'P', CCX => 'P', - TAR => '*', TAY => 'Y', - TGY => 'C', - TTR => 'L', TTY => 'F', - TCR => 'S', TCY => 'S', TCX => 'S' - }; + AGA => 'R', AGG => 'R', AGT => 'S', AGC => 'S', + ATA => 'I', ATG => 'M', ATT => 'I', ATC => 'I', + ACA => 'T', ACG => 'T', ACT => 'T', ACC => 'T', + GAA => 'E', GAG => 'E', GAT => 'D', GAC => 'D', + GTA => 'V', GTG => 'V', GTT => 'V', GTC => 'V', + GGA => 'G', GGG => 'G', GGT => 'G', GGC => 'G', + GCA => 'A', GCG => 'A', GCT => 'A', GCC => 'A', + CAA => 'Q', CAG => 'Q', CAT => 'H', CAC => 'H', + CTA => 'L', CTG => 'L', CTT => 'L', CTC => 'L', + CGA => 'R', CGG => 'R', CGT => 'R', CGC => 'R', + CCA => 'P', CCG => 'P', CCT => 'P', CCC => 'P', + TAA => '*', TAG => '*', TAT => 'Y', TAC => 'Y', + TGA => '*', TGG => 'W', TGT => 'C', TGC => 'C', + TTA => 'L', TTG => 'L', TTT => 'F', TTC => 'F', + TCA => 'S', TCG => 'S', TCT => 'S', TCC => 'S', + AAR => 'K', AAY => 'N', + AGR => 'R', AGY => 'S', + ATY => 'I', + ACR => 'T', ACY => 'T', 'ACX' => 'T', + GAR => 'E', GAY => 'D', + GTR => 'V', GTY => 'V', GTX => 'V', + GGR => 'G', GGY => 'G', GGX => 'G', + GCR => 'A', GCY => 'A', GCX => 'A', + CAR => 'Q', CAY => 'H', + CTR => 'L', CTY => 'L', CTX => 'L', + CGR => 'R', CGY => 'R', CGX => 'R', + CCR => 'P', CCY => 'P', CCX => 'P', + TAR => '*', TAY => 'Y', + TGY => 'C', + TTR => 'L', TTY => 'F', + TCR => 'S', TCY => 'S', TCX => 'S' + }; sub Protein { - # Get the paraeters. - my ($sequence, $table) = @_; - # If no table was specified, use the default. - if (!$table) { - $table = $ProteinTable; - } - # Create the return value. - my $retVal = ""; - # Loop through the input triples. - my $n = length $sequence; - for (my $i = 0; $i < $n; $i += 3) { - # Get the current triple from the sequence. - my $triple = substr($sequence, $i, 3); - # Translate it using the table. - my $protein = "X"; - if (exists $table->{$triple}) { $protein = $table->{$triple}; } - $retVal .= $protein; - } - # Remove the stop codon (if any). - $retVal =~ s/\*$//; - # Return the result. - return $retVal; + # Get the paraeters. + my ($sequence, $table) = @_; + # If no table was specified, use the default. + if (!$table) { + $table = $ProteinTable; + } + # Create the return value. + my $retVal = ""; + # Loop through the input triples. + my $n = length $sequence; + for (my $i = 0; $i < $n; $i += 3) { + # Get the current triple from the sequence. + my $triple = substr($sequence, $i, 3); + # Translate it using the table. + my $protein = "X"; + if (exists $table->{$triple}) { $protein = $table->{$triple}; } + $retVal .= $protein; + } + # Remove the stop codon (if any). + $retVal =~ s/\*$//; + # Return the result. + return $retVal; } =head3 LoadInfo @@ -2522,14 +2902,14 @@ =cut #: Return Type @; sub LoadInfo { - # Get the parameters. - my ($self) = @_; - # Create the return list, priming it with the name of the data directory. - my @retVal = ($self->{_options}->{dataDir}); - # Concatenate the table names. - push @retVal, $self->{_erdb}->GetTableNames(); - # Return the result. - return @retVal; + # Get the parameters. + my ($self) = @_; + # Create the return list, priming it with the name of the data directory. + my @retVal = ($self->{_options}->{dataDir}); + # Concatenate the table names. + push @retVal, $self->{_erdb}->GetTableNames(); + # Return the result. + return @retVal; } =head3 LowBBHs @@ -2559,21 +2939,21 @@ =cut #: Return Type %; sub LowBBHs { - # Get the parsameters. - my ($self, $featureID, $cutoff) = @_; - # Create the return hash. - my %retVal = (); - # Create a query to get the desired BBHs. - my @bbhList = $self->GetAll(['IsBidirectionalBestHitOf'], - 'IsBidirectionalBestHitOf(sc) <= ? AND IsBidirectionalBestHitOf(from-link) = ?', - [$cutoff, $featureID], - ['IsBidirectionalBestHitOf(to-link)', 'IsBidirectionalBestHitOf(sc)']); - # Form the results into the return hash. - for my $pair (@bbhList) { - $retVal{$pair->[0]} = $pair->[1]; - } - # Return the result. - return %retVal; + # Get the parsameters. + my ($self, $featureID, $cutoff) = @_; + # Create the return hash. + my %retVal = (); + # Create a query to get the desired BBHs. + my @bbhList = $self->GetAll(['IsBidirectionalBestHitOf'], + 'IsBidirectionalBestHitOf(sc) <= ? AND IsBidirectionalBestHitOf(from-link) = ?', + [$cutoff, $featureID], + ['IsBidirectionalBestHitOf(to-link)', 'IsBidirectionalBestHitOf(sc)']); + # Form the results into the return hash. + for my $pair (@bbhList) { + $retVal{$pair->[0]} = $pair->[1]; + } + # Return the result. + return %retVal; } =head3 GetGroups @@ -2624,18 +3004,91 @@ return %retVal; } +=head3 MyGenomes + +C<< my @genomes = Sprout::MyGenomes($dataDir); >> + +Return a list of the genomes to be included in the Sprout. + +This method is provided for use during the Sprout load. It presumes the Genome load file has +already been created. (It will be in the Sprout data directory and called either C +or C.) Essentially, it reads in the Genome load file and strips out the genome +IDs. + +=over 4 + +=item dataDir + +Directory containing the Sprout load files. + +=back + +=cut +#: Return Type @; +sub MyGenomes { + # Get the parameters. + my ($dataDir) = @_; + # Compute the genome file name. + my $genomeFileName = LoadFileName($dataDir, "Genome"); + # Extract the genome IDs from the files. + my @retVal = map { $_ =~ /^(\S+)/; $1 } Tracer::GetFile($genomeFileName); + # Return the result. + return @retVal; +} + +=head3 LoadFileName + +C<< my $fileName = Sprout::LoadFileName($dataDir, $tableName); >> + +Return the name of the load file for the specified table in the specified data +directory. + +=over 4 + +=item dataDir + +Directory containing the Sprout load files. + +=item tableName + +Name of the table whose load file is desired. + +=item RETURN + +Returns the name of the file containing the load data for the specified table, or +C if no load file is present. + +=back + +=cut +#: Return Type $; +sub LoadFileName { + # Get the parameters. + my ($dataDir, $tableName) = @_; + # Declare the return variable. + my $retVal; + # Check for the various file names. + if (-e "$dataDir/$tableName") { + $retVal = "$dataDir/$tableName"; + } elsif (-e "$dataDir/$tableName.dtx") { + $retVal = "$dataDir/$tableName.dtx"; + } + # Return the result. + return $retVal; +} + =head2 Internal Utility Methods =head3 ParseAssignment Parse annotation text to determine whether or not it is a functional assignment. If it is, -the user, function text, and assigning user will be returned as a 3-element list. If it +the user, function text, and assigning user will be returned as a 3-element list. If it isn't, an empty list will be returned. A functional assignment is always of the form IC<\nset >IC< function to\n>I - + where I is the B, I is the B, and I is the actual functional role. In most cases, the user and the assigning user will be the same, but that is not always the case. @@ -2658,19 +3111,19 @@ =cut sub _ParseAssignment { - # Get the parameters. - my ($text) = @_; - # Declare the return value. - my @retVal = (); - # Check to see if this is a functional assignment. - my ($user, $type, $function) = split(/\n/, $text); - if ($type =~ m/^set ([^ ]+) function to$/i) { - # Here it is, so we return the user name (which is in $1), the functional role text, + # Get the parameters. + my ($text) = @_; + # Declare the return value. + my @retVal = (); + # Check to see if this is a functional assignment. + my ($user, $type, $function) = split(/\n/, $text); + if ($type =~ m/^set ([^ ]+) function to$/i) { + # Here it is, so we return the user name (which is in $1), the functional role text, # and the assigning user. - @retVal = ($1, $function, $user); - } - # Return the result list. - return @retVal; + @retVal = ($1, $function, $user); + } + # Return the result list. + return @retVal; } =head3 FriendlyTimestamp @@ -2699,4 +3152,63 @@ return $retVal; } -1; \ No newline at end of file +=head3 AddProperty + +C<< my = $sprout->AddProperty($featureID, $key, $value, $url); >> + +Add a new attribute value (Property) to a feature. In the SEED system, attributes can +be added to almost any object. In Sprout, they can only be added to features. In +Sprout, attributes are implemented using I. A property represents a key/value +pair. If the particular key/value pair coming in is not already in the database, a new +B record is created to hold it. + +=over 4 + +=item peg + +ID of the feature to which the attribute is to be replied. + +=item key + +Name of the attribute (key). + +=item value + +Value of the attribute. + +=item url + +URL or text citation from which the property was obtained. + +=back + +=cut +#: Return Type ; +sub AddProperty { + # Get the parameters. + my ($self, $featureID, $key, $value, $url) = @_; + # Declare the variable to hold the desired property ID. + my $propID; + # Attempt to find a property record for this key/value pair. + my @properties = $self->GetFlat(['Property'], + "Property(property-name) = ? AND Property(property-value) = ?", + [$key, $value], 'Property(id)'); + if (@properties) { + # Here the property is already in the database. We save its ID. + $propID = $properties[0]; + # Here the property value does not exist. We need to generate an ID. It will be set + # to a number one greater than the maximum value in the database. This call to + # GetAll will stop after one record. + my @maxProperty = $self->GetAll(['Property'], "ORDER BY Property(id) DESC", [], ['Property(id)'], + 1); + $propID = $maxProperty[0]->[0] + 1; + # Insert the new property value. + $self->Insert('Property', { 'property-name' => $key, 'property-value' => $value, id => $propID }); + } + # Now we connect the incoming feature to the property. + $self->Insert('HasProperty', { 'from-link' => $featureID, 'to-link' => $propID, evidence => $url }); +} + + + +1;