--- Sprout.pm 2005/08/14 23:32:08 1.19 +++ Sprout.pm 2006/06/18 07:20:33 1.65 @@ -1,5 +1,8 @@ package Sprout; + require Exporter; + use ERDB; + @ISA = qw(Exporter ERDB); use Data::Dumper; use strict; use Carp; @@ -7,7 +10,6 @@ use XML::Simple; use DBQuery; use DBObject; - use ERDB; use Tracer; use FIGRules; use Stats; @@ -32,6 +34,8 @@ query tasks. For example, L lists the IDs of all the genomes in the database and L returns the DNA sequence for a specified genome location. +The Sprout object is a subclass of the ERDB object and inherits all its properties and methods. + =cut #: Constructor SFXlate->new_sprout_only(); @@ -62,14 +66,18 @@ * B name of the XML file containing the database definition (default C) -* B user name and password, delimited by a slash (default C) +* B user name and password, delimited by a slash (default same as SEED) * B connection port (default C<0>) +* B connection socket (default same as SEED) + * B maximum number of residues per feature segment, (default C<4500>) * B maximum number of residues per sequence, (default C<8000>) +* B suppresses the connection to the database if TRUE, else FALSE + =back For example, the following constructor call specifies a database named I and a user name of @@ -90,14 +98,16 @@ # database type dataDir => $FIG_Config::sproutData, # data file directory - xmlFileName => "$FIG_Config::sproutData/SproutDBD.xml", + xmlFileName => "$FIG_Config::fig/SproutDBD.xml", # database definition file name userData => "$FIG_Config::dbuser/$FIG_Config::dbpass", # user name and password port => $FIG_Config::dbport, # database connection port + sock => $FIG_Config::dbsock, maxSegmentLength => 4500, # maximum feature segment length maxSequenceLength => 8000, # maximum contig sequence length + noDBOpen => 0, # 1 to suppress the database open }, $options); # Get the data directory. my $dataDir = $optionTable->{dataDir}; @@ -105,15 +115,19 @@ $optionTable->{userData} =~ m!([^/]*)/(.*)$!; my ($userName, $password) = ($1, $2); # Connect to the database. - my $dbh = DBKernel->new($optionTable->{dbType}, $dbName, $userName, $password, $optionTable->{port}); + my $dbh; + if (! $optionTable->{noDBOpen}) { + $dbh = DBKernel->new($optionTable->{dbType}, $dbName, $userName, + $password, $optionTable->{port}, undef, $optionTable->{sock}); + } # Create the ERDB object. my $xmlFileName = "$optionTable->{xmlFileName}"; - my $erdb = ERDB->new($dbh, $xmlFileName); - # Create this object. - my $self = { _erdb => $erdb, _options => $optionTable, _xmlName => $xmlFileName }; - # Bless and return it. - bless $self; - return $self; + my $retVal = ERDB::new($class, $dbh, $xmlFileName); + # Add the option table and XML file name. + $retVal->{_options} = $optionTable; + $retVal->{_xmlName} = $xmlFileName; + # Return it. + return $retVal; } =head3 MaxSegment @@ -148,196 +162,6 @@ return $self->{_options}->{maxSequenceLength}; } -=head3 Get - -C<< my $query = $sprout->Get(\@objectNames, $filterClause, \@parameterList); >> - -This method allows a general query against the Sprout data using a specified filter clause. - -The filter is a standard WHERE/ORDER BY clause with question marks as parameter markers and each -field name represented in the form B(I)>. For example, the -following call requests all B objects for the genus specified in the variable -$genus. - -C<< $query = $sprout->Get(['Genome'], "Genome(genus) = ?", [$genus]); >> - -The WHERE clause contains a single question mark, so there is a single additional -parameter representing the parameter value. It would also be possible to code - -C<< $query = $sprout->Get(['Genome'], "Genome(genus) = \'$genus\'"); >> - -however, this version of the call would generate a syntax error if there were any quote -characters inside the variable C<$genus>. - -The use of the strange parenthesized notation for field names enables us to distinguish -hyphens contained within field names from minus signs that participate in the computation -of the WHERE clause. All of the methods that manipulate fields will use this same notation. - -It is possible to specify multiple entity and relationship names in order to retrieve more than -one object's data at the same time, which allows highly complex joined queries. For example, - -C<< $query = $sprout->Get(['Genome', 'ComesFrom', 'Source'], "Genome(genus) = ?", [$genus]); >> - -This query returns all the genomes for a particular genus and allows access to the -sources from which they came. The join clauses to go from Genome to Source are generated -automatically. - -Finally, the filter clause can contain sort information. To do this, simply put an C -clause at the end of the filter. Field references in the ORDER BY section follow the same rules -as they do in the filter itself; in other words, each one must be of the form B(I)>. -For example, the following filter string gets all genomes for a particular genus and sorts -them by species name. - -C<< $query = $sprout->Get(['Genome'], "Genome(genus) = ? ORDER BY Genome(species)", [$genus]); >> - -It is also permissible to specify I an ORDER BY clause. For example, the following invocation gets -all genomes ordered by genus and species. - -C<< $query = $sprout->Get(['Genome'], "ORDER BY Genome(genus), Genome(species)"); >> - -Odd things may happen if one of the ORDER BY fields is in a secondary relation. So, for example, an -attempt to order Bs by alias may (depending on the underlying database engine used) cause -a single feature to appear more than once. - -If multiple names are specified, then the query processor will automatically determine a -join path between the entities and relationships. The algorithm used is very simplistic. -In particular, you can't specify any entity or relationship more than once, and if a -relationship is recursive, the path is determined by the order in which the entity -and the relationship appear. For example, consider a recursive relationship B -which relates B objects to other B objects. If the join path is -coded as C<['People', 'IsParentOf']>, then the people returned will be parents. If, however, -the join path is C<['IsParentOf', 'People']>, then the people returned will be children. - -=over 4 - -=item objectNames - -List containing the names of the entity and relationship objects to be retrieved. - -=item filterClause - -WHERE/ORDER BY clause (without the WHERE) to be used to filter and sort the query. The WHERE clause can -be parameterized with parameter markers (C). Each field used must be specified in the standard form -B(I)>. Any parameters specified in the filter clause should be added to the -parameter list as additional parameters. The fields in a filter clause can come from primary -entity relations, relationship relations, or secondary entity relations; however, all of the -entities and relationships involved must be included in the list of object names. - -=item parameterList - -List of the parameters to be substituted in for the parameters marks in the filter clause. - -=item RETURN - -Returns a B that can be used to iterate through all of the results. - -=back - -=cut - -sub Get { - # Get the parameters. - my ($self, $objectNames, $filterClause, $parameterList) = @_; - # We differ from the ERDB Get method in that the parameter list is passed in as a list reference - # rather than a list of parameters. The next step is to convert the parameters from a reference - # to a real list. We can only do this if the parameters have been specified. - my @parameters; - if ($parameterList) { @parameters = @{$parameterList}; } - return $self->{_erdb}->Get($objectNames, $filterClause, @parameters); -} - -=head3 GetEntity - -C<< my $entityObject = $sprout->GetEntity($entityType, $ID); >> - -Return an object describing the entity instance with a specified ID. - -=over 4 - -=item entityType - -Entity type name. - -=item ID - -ID of the desired entity. - -=item RETURN - -Returns a B representing the desired entity instance, or an undefined value if no -instance is found with the specified key. - -=back - -=cut - -sub GetEntity { - # Get the parameters. - my ($self, $entityType, $ID) = @_; - # Call the ERDB method. - return $self->{_erdb}->GetEntity($entityType, $ID); -} - -=head3 GetEntityValues - -C<< my @values = GetEntityValues($entityType, $ID, \@fields); >> - -Return a list of values from a specified entity instance. - -=over 4 - -=item entityType - -Entity type name. - -=item ID - -ID of the desired entity. - -=item fields - -List of field names, each of the form IC<(>IC<)>. - -=item RETURN - -Returns a flattened list of the values of the specified fields for the specified entity. - -=back - -=cut -#: Return Type @; -sub GetEntityValues { - # Get the parameters. - my ($self, $entityType, $ID, $fields) = @_; - # Call the ERDB method. - return $self->{_erdb}->GetEntityValues($entityType, $ID, $fields); -} - -=head3 ShowMetaData - -C<< $sprout->ShowMetaData($fileName); >> - -This method outputs a description of the database to an HTML file in the data directory. - -=over 4 - -=item fileName - -Fully-qualified name to give to the output file. - -=back - -=cut - -sub ShowMetaData { - # Get the parameters. - my ($self, $fileName) = @_; - # Compute the file name. - my $options = $self->{_options}; - # Call the show method on the underlying ERDB object. - $self->{_erdb}->ShowMetaData($fileName); -} - =head3 Load C<< $sprout->Load($rebuild); >>; @@ -372,17 +196,15 @@ sub Load { # Get the parameters. my ($self, $rebuild) = @_; - # Get the database object. - my $erdb = $self->{_erdb}; # Load the tables from the data directory. - my $retVal = $erdb->LoadTables($self->{_options}->{dataDir}, $rebuild); + my $retVal = $self->LoadTables($self->{_options}->{dataDir}, $rebuild); # Return the statistics. return $retVal; } =head3 LoadUpdate -C<< my %stats = $sprout->LoadUpdate($truncateFlag, \@tableList); >> +C<< my $stats = $sprout->LoadUpdate($truncateFlag, \@tableList); >> Load updates to one or more database tables. This method enables the client to make changes to one or two tables without reloading the whole database. For each table, there must be a corresponding @@ -415,8 +237,6 @@ sub LoadUpdate { # Get the parameters. my ($self, $truncateFlag, $tableList) = @_; - # Get the database object. - my $erdb = $self->{_erdb}; # Declare the return value. my $retVal = Stats->new(); # Get the data directory. @@ -430,7 +250,7 @@ Trace("No load file found for $tableName in $dataDir.") if T(0); } else { # Attempt to load this table. - my $result = $erdb->LoadTable($fileName, $tableName, $truncateFlag); + my $result = $self->LoadTable($fileName, $tableName, $truncateFlag); # Accumulate the resulting statistics. $retVal->Accumulate($result); } @@ -439,6 +259,140 @@ return $retVal; } +=head3 GenomeCounts + +C<< my ($arch, $bact, $euk, $vir, $env, $unk) = $sprout->GenomeCounts($complete); >> + +Count the number of genomes in each domain. If I<$complete> is TRUE, only complete +genomes will be included in the counts. + +=over 4 + +=item complete + +TRUE if only complete genomes are to be counted, FALSE if all genomes are to be +counted + +=item RETURN + +A six-element list containing the number of genomes in each of six categories-- +Archaea, Bacteria, Eukaryota, Viral, Environmental, and Unknown, respectively. + +=back + +=cut + +sub GenomeCounts { + # Get the parameters. + my ($self, $complete) = @_; + # Set the filter based on the completeness flag. + my $filter = ($complete ? "Genome(complete) = 1" : ""); + # Get all the genomes and the related taxonomy information. + my @genomes = $self->GetAll(['Genome'], $filter, [], ['Genome(id)', 'Genome(taxonomy)']); + # Clear the counters. + my ($arch, $bact, $euk, $vir, $env, $unk) = (0, 0, 0, 0, 0, 0); + # Loop through, counting the domains. + for my $genome (@genomes) { + if ($genome->[1] =~ /^archaea/i) { ++$arch } + elsif ($genome->[1] =~ /^bacter/i) { ++$bact } + elsif ($genome->[1] =~ /^eukar/i) { ++$euk } + elsif ($genome->[1] =~ /^vir/i) { ++$vir } + elsif ($genome->[1] =~ /^env/i) { ++$env } + else { ++$unk } + } + # Return the counts. + return ($arch, $bact, $euk, $vir, $env, $unk); +} + +=head3 ContigCount + +C<< my $count = $sprout->ContigCount($genomeID); >> + +Return the number of contigs for the specified genome ID. + +=over 4 + +=item genomeID + +ID of the genome whose contig count is desired. + +=item RETURN + +Returns the number of contigs for the specified genome. + +=back + +=cut + +sub ContigCount { + # Get the parameters. + my ($self, $genomeID) = @_; + # Get the contig count. + my $retVal = $self->GetCount(['Contig', 'HasContig'], "HasContig(from-link) = ?", [$genomeID]); + # Return the result. + return $retVal; +} + +=head3 GeneMenu + +C<< my $selectHtml = $sprout->GeneMenu(\%attributes, $filterString, \@params); >> + +Return an HTML select menu of genomes. Each genome will be an option in the menu, +and will be displayed by name with the ID and a contig count attached. The selection +value will be the genome ID. The genomes will be sorted by genus/species name. + +=over 4 + +=item attributes + +Reference to a hash mapping attributes to values for the SELECT tag generated. + +=item filterString + +A filter string for use in selecting the genomes. The filter string must conform +to the rules for the C<< ERDB->Get >> method. + +=item params + +Reference to a list of values to be substituted in for the parameter marks in +the filter string. + +=item RETURN + +Returns an HTML select menu with the specified genomes as selectable options. + +=back + +=cut + +sub GeneMenu { + # Get the parameters. + my ($self, $attributes, $filterString, $params) = @_; + # Start the menu. + my $retVal = "\n"; + # Return the result. + return $retVal; +} =head3 Build C<< $sprout->Build(); >> @@ -453,7 +407,7 @@ # Get the parameters. my ($self) = @_; # Create the tables. - $self->{_erdb}->CreateTables; + $self->CreateTables(); } =head3 Genomes @@ -576,7 +530,7 @@ =item RETURN Returns a list of the feature's contig segments. The locations are returned as a list in a list -context and as a space-delimited string in a scalar context. +context and as a comma-delimited string in a scalar context. =back @@ -603,10 +557,15 @@ if ($prevContig eq $contigID && $dir eq $prevDir) { # Here the new segment is in the same direction on the same contig. Insure the # new segment's beginning is next to the old segment's end. - if (($dir eq "-" && $beg == $prevBeg - $prevLen) || - ($dir eq "+" && $beg == $prevBeg + $prevLen)) { - # Here we need to merge two segments. Adjust the beginning and length values - # to include both segments. + if ($dir eq "-" && $beg + $len == $prevBeg) { + # Here we're merging two backward blocks, so we keep the new begin point + # and adjust the length. + $len += $prevLen; + # Pop the old segment off. The new one will replace it later. + pop @retVal; + } elsif ($dir eq "+" && $beg == $prevBeg + $prevLen) { + # Here we need to merge two forward blocks. Adjust the beginning and + # length values to include both segments. $beg = $prevBeg; $len += $prevLen; # Pop the old segment off. The new one will replace it later. @@ -615,11 +574,13 @@ } # Remember this specifier for the adjacent-segment test the next time through. ($prevContig, $prevBeg, $prevDir, $prevLen) = ($contigID, $beg, $dir, $len); + # Compute the initial base pair. + my $start = ($dir eq "+" ? $beg : $beg + $len - 1); # Add the specifier to the list. - push @retVal, "${contigID}_$beg$dir$len"; + push @retVal, "${contigID}_$start$dir$len"; } # Return the list in the format indicated by the context. - return (wantarray ? @retVal : join(' ', @retVal)); + return (wantarray ? @retVal : join(',', @retVal)); } =head3 ParseLocation @@ -650,7 +611,7 @@ shift if UNIVERSAL::isa($_[0],__PACKAGE__); my ($location) = @_; # Parse it into segments. - $location =~ /^(.*)_(\d*)([+-_])(\d*)$/; + $location =~ /^(.+)_(\d+)([+\-_])(\d+)$/; my ($contigID, $start, $dir, $len) = ($1, $2, $3, $4); # If the direction is an underscore, convert it to a + or -. if ($dir eq "_") { @@ -726,12 +687,17 @@ should be of the form returned by L when in a list context. In other words, each location is of the form IC<_>III. +For example, the following would return the DNA sequence for contig C<83333.1:NC_000913> +between positions 1401 and 1532, inclusive. + + my $sequence = $sprout->DNASeq('83333.1:NC_000913_1401_1532'); + =over 4 =item locationList -List of location specifiers, each in the form IC<_>III (see -L for more about this format). +List of location specifiers, each in the form IC<_>III or +IC<_>IC<_>I (see L for more about this format). =item RETURN @@ -758,13 +724,15 @@ # the start point is the ending. Note that in the latter case we must reverse the DNA string # before putting it in the return value. my ($start, $stop); + Trace("Parse of \"$location\" is $beg$dir$len.") if T(SDNA => 4); if ($dir eq "+") { $start = $beg; $stop = $beg + $len - 1; } else { - $start = $beg + $len + 1; + $start = $beg - $len + 1; $stop = $beg; } + Trace("Looking for sequences containing $start through $stop.") if T(SDNA => 4); my $query = $self->Get(['IsMadeUpOf','Sequence'], "IsMadeUpOf(from-link) = ? AND IsMadeUpOf(start-position) + IsMadeUpOf(len) > ? AND " . " IsMadeUpOf(start-position) <= ? ORDER BY IsMadeUpOf(start-position)", @@ -776,18 +744,19 @@ $sequence->Values(['IsMadeUpOf(start-position)', 'Sequence(sequence)', 'IsMadeUpOf(len)']); my $stopPosition = $startPosition + $sequenceLength; + Trace("Sequence is from $startPosition to $stopPosition.") if T(SDNA => 4); # Figure out the start point and length of the relevant section. my $pos1 = ($start < $startPosition ? 0 : $start - $startPosition); - my $len = ($stopPosition <= $stop ? $stopPosition : $stop) - $startPosition - $pos1; + my $len1 = ($stopPosition < $stop ? $stopPosition : $stop) + 1 - $startPosition - $pos1; + Trace("Position is $pos1 for length $len1.") if T(SDNA => 4); # Add the relevant data to the location data. - $locationDNA .= substr($sequenceData, $pos1, $len); + $locationDNA .= substr($sequenceData, $pos1, $len1); } # Add this location's data to the return string. Note that we may need to reverse it. if ($dir eq '+') { $retVal .= $locationDNA; } else { - $locationDNA = join('', reverse split //, $locationDNA); - $retVal .= $locationDNA; + $retVal .= FIG::reverse_comp($locationDNA); } } # Return the result. @@ -824,6 +793,128 @@ return @retVal; } +=head3 GenomeLength + +C<< my $length = $sprout->GenomeLength($genomeID); >> + +Return the length of the specified genome in base pairs. + +=over 4 + +=item genomeID + +ID of the genome whose base pair count is desired. + +=item RETURN + +Returns the number of base pairs in all the contigs of the specified +genome. + +=back + +=cut + +sub GenomeLength { + # Get the parameters. + my ($self, $genomeID) = @_; + # Declare the return variable. + my $retVal = 0; + # Get the genome's contig sequence lengths. + my @lens = $self->GetFlat(['HasContig', 'IsMadeUpOf'], 'HasContig(from-link) = ?', + [$genomeID], 'IsMadeUpOf(len)'); + # Sum the lengths. + map { $retVal += $_ } @lens; + # Return the result. + return $retVal; +} + +=head3 FeatureCount + +C<< my $count = $sprout->FeatureCount($genomeID, $type); >> + +Return the number of features of the specified type in the specified genome. + +=over 4 + +=genomeID + +ID of the genome whose feature count is desired. + +=item type + +Type of feature to count (eg. C, C, etc.). + +=item RETURN + +Returns the number of features of the specified type for the specified genome. + +=back + +=cut + +sub FeatureCount { + # Get the parameters. + my ($self, $genomeID, $type) = @_; + # Compute the count. + my $retVal = $self->GetCount(['HasFeature', 'Feature'], + "HasFeature(from-link) = ? AND Feature(feature-type) = ?", + [$genomeID, $type]); + # Return the result. + return $retVal; +} + +=head3 GenomeAssignments + +C<< my $fidHash = $sprout->GenomeAssignments($genomeID); >> + +Return a list of a genome's assigned features. The return hash will contain each +assigned feature of the genome mapped to the text of its most recent functional +assignment. + +=over 4 + +=item genomeID + +ID of the genome whose functional assignments are desired. + +=item RETURN + +Returns a reference to a hash which maps each feature to its most recent +functional assignment. + +=back + +=cut + +sub GenomeAssignments { + # Get the parameters. + my ($self, $genomeID) = @_; + # Declare the return variable. + my $retVal = {}; + # Query the genome's features and annotations. We'll put the oldest annotations + # first so that the last assignment to go into the hash will be the correct one. + my $query = $self->Get(['HasFeature', 'IsTargetOfAnnotation', 'Annotation'], + "HasFeature(from-link) = ? ORDER BY Annotation(time)", + [$genomeID]); + # Loop through the annotations. + while (my $data = $query->Fetch) { + # Get the feature ID and annotation text. + my ($fid, $annotation) = $data->Values(['HasFeature(from-link)', + 'Annotation(annotation)']); + # Check to see if this is an assignment. Note that the user really + # doesn't matter to us, other than we use it to determine whether or + # not this is an assignment. + my ($user, $assignment) = $self->_ParseAssignment('fig', $annotation); + if ($user) { + # Here it's an assignment. We put it in the return hash, overwriting + # any older assignment that might be present. + $retVal->{$fid} = $assignment; + } + } + # Return the result. + return $retVal; +} + =head3 ContigLength C<< my $length = $sprout->ContigLength($contigID); >> @@ -857,7 +948,55 @@ # Set it from the sequence data, if any. if ($sequence) { my ($start, $len) = $sequence->Values(['IsMadeUpOf(start-position)', 'IsMadeUpOf(len)']); - $retVal = $start + $len; + $retVal = $start + $len - 1; + } + # Return the result. + return $retVal; +} + +=head3 ClusterPEGs + +C<< my $clusteredList = $sprout->ClusterPEGs($sub, \@pegs); >> + +Cluster the PEGs in a list according to the cluster coding scheme of the specified +subsystem. In order for this to work properly, the subsystem object must have +been used recently to retrieve the PEGs using the B method. +This causes the cluster numbers to be pulled into the subsystem's color hash. +If a PEG is not found in the color hash, it will not appear in the output +sequence. + +=over 4 + +=item sub + +Sprout subsystem object for the relevant subsystem, from the L +method. + +=item pegs + +Reference to the list of PEGs to be clustered. + +=item RETURN + +Returns a list of the PEGs, grouped into smaller lists by cluster number. + +=back + +=cut +#: Return Type $@@; +sub ClusterPEGs { + # Get the parameters. + my ($self, $sub, $pegs) = @_; + # Declare the return variable. + my $retVal = []; + # Loop through the PEGs, creating arrays for each cluster. + for my $pegID (@{$pegs}) { + my $clusterNumber = $sub->get_cluster_number($pegID); + # Only proceed if the PEG is in a cluster. + if ($clusterNumber >= 0) { + # Push this PEG onto the sub-list for the specified cluster number. + push @{$retVal->[$clusterNumber]}, $pegID; + } } # Return the result. return $retVal; @@ -1007,7 +1146,7 @@ =head3 FeatureAnnotations -C<< my @descriptors = $sprout->FeatureAnnotations($featureID); >> +C<< my @descriptors = $sprout->FeatureAnnotations($featureID, $rawFlag); >> Return the annotations of a feature. @@ -1017,13 +1156,18 @@ ID of the feature whose annotations are desired. +=item rawFlag + +If TRUE, the annotation timestamps will be returned in raw form; otherwise, they +will be returned in human-readable form. + =item RETURN Returns a list of annotation descriptors. Each descriptor is a hash with the following fields. * B ID of the relevant feature. -* B time the annotation was made, in user-friendly format. +* B time the annotation was made. * B ID of the user who made the annotation @@ -1035,7 +1179,7 @@ #: Return Type @%; sub FeatureAnnotations { # Get the parameters. - my ($self, $featureID) = @_; + my ($self, $featureID, $rawFlag) = @_; # Create a query to get the feature's annotations and the associated users. my $query = $self->Get(['IsTargetOfAnnotation', 'Annotation', 'MadeAnnotation'], "IsTargetOfAnnotation(from-link) = ?", [$featureID]); @@ -1048,9 +1192,13 @@ $annotation->Values(['IsTargetOfAnnotation(from-link)', 'Annotation(time)', 'MadeAnnotation(from-link)', 'Annotation(annotation)']); + # Convert the time, if necessary. + if (! $rawFlag) { + $timeStamp = FriendlyTimestamp($timeStamp); + } # Assemble them into a hash. my $annotationHash = { featureID => $featureID, - timeStamp => FriendlyTimestamp($timeStamp), + timeStamp => $timeStamp, user => $user, text => $text }; # Add it to the return list. push @retVal, $annotationHash; @@ -1079,7 +1227,7 @@ =item RETURN -Returns a hash mapping the functional assignment IDs to user IDs. +Returns a hash mapping the user IDs to functional assignment IDs. =back @@ -1089,28 +1237,25 @@ # Get the parameters. my ($self, $featureID) = @_; # Get all of the feature's annotations. - my @query = $self->GetAll(['IsTargetOfAnnotation', 'Annotation'], + my @query = $self->GetAll(['IsTargetOfAnnotation', 'Annotation', 'MadeAnnotation'], "IsTargetOfAnnotation(from-link) = ?", - [$featureID], ['Annotation(time)', 'Annotation(annotation)']); + [$featureID], ['Annotation(time)', 'Annotation(annotation)', + 'MadeAnnotation(from-link)']); # Declare the return hash. my %retVal; - # Declare a hash for insuring we only make one assignment per user. - my %timeHash = (); # Now we sort the assignments by timestamp in reverse. my @sortedQuery = sort { -($a->[0] <=> $b->[0]) } @query; # Loop until we run out of annotations. for my $annotation (@sortedQuery) { # Get the annotation fields. - my ($timeStamp, $text) = @{$annotation}; + my ($timeStamp, $text, $user) = @{$annotation}; # Check to see if this is a functional assignment. - my ($user, $function) = _ParseAssignment($text); - if ($user && ! exists $timeHash{$user}) { + my ($actualUser, $function) = _ParseAssignment($user, $text); + if ($actualUser && ! exists $retVal{$actualUser}) { # Here it is a functional assignment and there has been no # previous assignment for this user, so we stuff it in the # return hash. - $retVal{$function} = $user; - # Insure we don't assign to this user again. - $timeHash{$user} = 1; + $retVal{$actualUser} = $function; } } # Return the hash of assignments found. @@ -1126,7 +1271,7 @@ The functional assignment is handled differently depending on the type of feature. If the feature is identified by a FIG ID (begins with the string C), then a functional assignment is a type of annotation. The format of an assignment is described in -L. Its worth noting that we cannot filter on the content of the +L. Its worth noting that we cannot filter on the content of the annotation itself because it's a text field; however, this is not a big problem because most features only have a small number of annotations. @@ -1188,20 +1333,22 @@ } } # Build a query for all of the feature's annotations, sorted by date. - my $query = $self->Get(['IsTargetOfAnnotation', 'Annotation'], + my $query = $self->Get(['IsTargetOfAnnotation', 'Annotation', 'MadeAnnotation'], "IsTargetOfAnnotation(from-link) = ? ORDER BY Annotation(time) DESC", [$featureID]); my $timeSelected = 0; # Loop until we run out of annotations. while (my $annotation = $query->Fetch()) { # Get the annotation text. - my ($text, $time) = $annotation->Values(['Annotation(annotation)','Annotation(time)']); + my ($text, $time, $user) = $annotation->Values(['Annotation(annotation)', + 'Annotation(time)', 'MadeAnnotation(from-link)']); # Check to see if this is a functional assignment for a trusted user. - my ($user, $function) = _ParseAssignment($text); - if ($user) { + my ($actualUser, $function) = _ParseAssignment($user, $text); + Trace("Assignment user is $actualUser, text is $function.") if T(4); + if ($actualUser) { # Here it is a functional assignment. Check the time and the user # name. The time must be recent and the user must be trusted. - if ((exists $trusteeTable{$user}) && ($time > $timeSelected)) { + if ((exists $trusteeTable{$actualUser}) && ($time > $timeSelected)) { $retVal = $function; $timeSelected = $time; } @@ -1217,6 +1364,78 @@ return $retVal; } +=head3 FunctionsOf + +C<< my @functionList = $sprout->FunctionOf($featureID, $userID); >> + +Return the functional assignments of a particular feature. + +The functional assignment is handled differently depending on the type of feature. If +the feature is identified by a FIG ID (begins with the string C), then a functional +assignment is a type of annotation. The format of an assignment is described in +L. Its worth noting that we cannot filter on the content of the +annotation itself because it's a text field; however, this is not a big problem because +most features only have a small number of annotations. + +If the feature is B identified by a FIG ID, then the functional assignment +information is taken from the B table. If the table does +not contain an entry for the feature, an empty list is returned. + +=over 4 + +=item featureID + +ID of the feature whose functional assignments are desired. + +=item RETURN + +Returns a list of 2-tuples, each consisting of a user ID and the text of an assignment by +that user. + +=back + +=cut +#: Return Type @@; +sub FunctionsOf { + # Get the parameters. + my ($self, $featureID) = @_; + # Declare the return value. + my @retVal = (); + # Determine the ID type. + if ($featureID =~ m/^fig\|/) { + # Here we have a FIG feature ID. We must build the list of trusted + # users. + my %trusteeTable = (); + # Build a query for all of the feature's annotations, sorted by date. + my $query = $self->Get(['IsTargetOfAnnotation', 'Annotation', 'MadeAnnotation'], + "IsTargetOfAnnotation(from-link) = ? ORDER BY Annotation(time) DESC", + [$featureID]); + my $timeSelected = 0; + # Loop until we run out of annotations. + while (my $annotation = $query->Fetch()) { + # Get the annotation text. + my ($text, $time, $user) = $annotation->Values(['Annotation(annotation)', + 'Annotation(time)', + 'MadeAnnotation(user)']); + # Check to see if this is a functional assignment for a trusted user. + my ($actualUser, $function) = _ParseAssignment($user, $text); + if ($actualUser) { + # Here it is a functional assignment. + push @retVal, [$actualUser, $function]; + } + } + } else { + # Here we have a non-FIG feature ID. In this case the user ID does not + # matter. We simply get the information from the External Alias Function + # table. + my @assignments = $self->GetEntityValues('ExternalAliasFunc', $featureID, + ['ExternalAliasFunc(func)']); + push @retVal, map { ['master', $_] } @assignments; + } + # Return the assignments found. + return @retVal; +} + =head3 BBHList C<< my $bbhHash = $sprout->BBHList($genomeID, \@featureList); >> @@ -1254,12 +1473,12 @@ my $query = $self->Get(['IsBidirectionalBestHitOf'], "IsBidirectionalBestHitOf(from-link) = ? AND IsBidirectionalBestHitOf(genome) = ?", [$featureID, $genomeID]); - # Look for the best hit. - my $bbh = $query->Fetch; - if ($bbh) { - my ($targetFeature) = $bbh->Value('IsBidirectionalBestHitOf(to-link)'); - $retVal{$featureID} = $targetFeature; + # Peel off the BBHs found. + my @found = (); + while (my $bbh = $query->Fetch) { + push @found, $bbh->Value('IsBidirectionalBestHitOf(to-link)'); } + $retVal{$featureID} = \@found; } # Return the mapping. return \%retVal; @@ -1337,7 +1556,7 @@ my $genomeData = $self->GetEntity('Genome', $genomeID); if ($genomeData) { # The genome exists, so get the completeness flag. - ($retVal) = $genomeData->Value('complete'); + ($retVal) = $genomeData->Value('Genome(complete)'); } # Return the result. return $retVal; @@ -1377,18 +1596,18 @@ C<< my $genomeID = $sprout->GenomeOf($featureID); >> -Return the genome that contains a specified feature. +Return the genome that contains a specified feature or contig. =over 4 =item featureID -ID of the feature whose genome is desired. +ID of the feature or contig whose genome is desired. =item RETURN -Returns the ID of the genome for the specified feature. If the feature is not found, returns -an undefined value. +Returns the ID of the genome for the specified feature or contig. If the feature or contig is not +found, returns an undefined value. =back @@ -1397,8 +1616,9 @@ sub GenomeOf { # Get the parameters. my ($self, $featureID) = @_; - # Create a query to find the genome associated with the feature. - my $query = $self->Get(['IsLocatedIn', 'HasContig'], "IsLocatedIn(from-link) = ?", [$featureID]); + # Create a query to find the genome associated with the incoming ID. + my $query = $self->Get(['IsLocatedIn', 'HasContig'], "IsLocatedIn(from-link) = ? OR HasContig(to-link) = ?", + [$featureID, $featureID]); # Declare the return value. my $retVal; # Get the genome ID. @@ -1445,10 +1665,10 @@ # Get the ID and score of the coupling. my ($couplingID, $score) = $clustering->Values(['Coupling(id)', 'Coupling(score)']); - # The coupling ID contains the two feature IDs separated by a space. We use - # this information to find the ID of the other feature. - my ($fid1, $fid2) = split / /, $couplingID; - my $otherFeatureID = ($featureID eq $fid1 ? $fid2 : $fid1); + # Get the other feature that participates in the coupling. + my ($otherFeatureID) = $self->GetFlat(['ParticipatesInCoupling'], + "ParticipatesInCoupling(to-link) = ? AND ParticipatesInCoupling(from-link) <> ?", + [$couplingID, $featureID], 'ParticipatesInCoupling(from-link)'); # Attach the other feature's score to its ID. $retVal{$otherFeatureID} = $score; $found = 1; @@ -1521,6 +1741,7 @@ # Determine the ordering to place on the evidence items. If we're # inverted, we want to see feature 2 before feature 1 (descending); otherwise, # we want feature 1 before feature 2 (normal). + Trace("Coupling evidence for ($peg1, $peg2) with inversion flag $inverted.") if T(Coupling => 4); my $ordering = ($inverted ? "DESC" : ""); # Get the coupling evidence. my @evidenceList = $self->GetAll(['IsEvidencedBy', 'PCH', 'UsesAsEvidence'], @@ -1534,8 +1755,10 @@ while (@evidenceList > 0) { my $peg1Data = shift @evidenceList; my $peg2Data = shift @evidenceList; + Trace("Peg 1 is " . $peg1Data->[1] . " and Peg 2 is " . $peg2Data->[1] . ".") if T(Coupling => 4); push @retVal, [$peg1Data->[1], $peg2Data->[1], $peg1Data->[0]]; } + Trace("Last index in evidence result is is $#retVal.") if T(Coupling => 4); } # Return the result. return @retVal; @@ -1585,12 +1808,15 @@ [$retVal], ["ParticipatesInCoupling(from-link)", "Coupling(score)"]); # Check to see if we found anything. if (!@pegs) { + Trace("No coupling found.") if T(Coupling => 4); # No coupling, so undefine the return value. $retVal = undef; } else { # We have a coupling! Get the score and check for inversion. $score = $pegs[0]->[1]; - $inverted = ($pegs[0]->[0] eq $peg1); + my $firstFound = $pegs[0]->[0]; + $inverted = ($firstFound ne $peg1); + Trace("Coupling score is $score. First peg is $firstFound, peg 1 is $peg1.") if T(Coupling => 4); } # Return the result. return ($retVal, $inverted, $score); @@ -1634,23 +1860,6 @@ return join " ", sort @_; } -=head3 GetEntityTypes - -C<< my @entityList = $sprout->GetEntityTypes(); >> - -Return the list of supported entity types. - -=cut -#: Return Type @; -sub GetEntityTypes { - # Get the parameters. - my ($self) = @_; - # Get the underlying database object. - my $erdb = $self->{_erdb}; - # Get its entity type list. - my @retVal = $erdb->GetEntityTypes(); -} - =head3 ReadFasta C<< my %sequenceData = Sprout::ReadFasta($fileName, $prefix); >> @@ -1695,13 +1904,13 @@ if ($line =~ m/^>\s*(.+?)(\s|\n)/) { # Here we have a new header. Store the current sequence if we have one. if ($id) { - $retVal{$id} = uc $sequence; + $retVal{$id} = lc $sequence; } # Clear the sequence accumulator and save the new ID. ($id, $sequence) = ("$prefix$1", ""); } else { # Here we have a data line, so we add it to the sequence accumulator. - # First, we get the actual data out. Note that we normalize to upper + # First, we get the actual data out. Note that we normalize to lower # case. $line =~ /^\s*(.*?)(\s|\n)/; $sequence .= $1; @@ -1709,7 +1918,7 @@ } # Flush out the last sequence (if any). if ($sequence) { - $retVal{$id} = uc $sequence; + $retVal{$id} = lc $sequence; } # Close the file. close FASTAFILE; @@ -1796,7 +2005,7 @@ # Get the data directory name. my $outputDirectory = $self->{_options}->{dataDir}; # Dump the relations. - $self->{_erdb}->DumpRelations($outputDirectory); + $self->DumpRelations($outputDirectory); } =head3 XMLFileName @@ -1848,7 +2057,7 @@ # Get the parameters. my ($self, $objectType, $fieldHash) = @_; # Call the underlying method. - $self->{_erdb}->InsertObject($objectType, $fieldHash); + $self->InsertObject($objectType, $fieldHash); } =head3 Annotate @@ -2035,6 +2244,7 @@ # Get the parameters. my ($self, $entityName, $entityID) = @_; # Check for the entity instance. + Trace("Checking existence of $entityName with ID=$entityID.") if T(4); my $testInstance = $self->GetEntity($entityName, $entityID); # Return an existence indicator. my $retVal = ($testInstance ? 1 : 0); @@ -2503,7 +2713,7 @@ C<< my %subsystems = $sprout->SubsystemsOf($featureID); >> Return a hash describing all the subsystems in which a feature participates. Each subsystem is mapped -to the role the feature performs. +to the roles the feature performs. =over 4 @@ -2513,12 +2723,12 @@ =item RETURN -Returns a hash mapping all the feature's subsystems to the feature's role. +Returns a hash mapping all the feature's subsystems to a list of the feature's roles. =back =cut -#: Return Type %; +#: Return Type %@; sub SubsystemsOf { # Get the parameters. my ($self, $featureID) = @_; @@ -2528,9 +2738,19 @@ ['HasSSCell(from-link)', 'IsRoleOf(from-link)']); # Create the return value. my %retVal = (); + # Build a hash to weed out duplicates. Sometimes the same PEG and role appears + # in two spreadsheet cells. + my %dupHash = (); # Loop through the results, adding them to the hash. for my $record (@subsystems) { - $retVal{$record->[0]} = $record->[1]; + # Get this subsystem and role. + my ($subsys, $role) = @{$record}; + # Insure it's the first time for both. + my $dupKey = "$subsys\n$role"; + if (! exists $dupHash{"$subsys\n$role"}) { + $dupHash{$dupKey} = 1; + push @{$retVal{$subsys}}, $role; + } } # Return the hash. return %retVal; @@ -2568,6 +2788,8 @@ return @retVal; } + + =head3 RelatedFeatures C<< my @relatedList = $sprout->RelatedFeatures($featureID, $function, $userID); >> @@ -2669,125 +2891,6 @@ return @retVal; } -=head3 GetAll - -C<< my @list = $sprout->GetAll(\@objectNames, $filterClause, \@parameters, \@fields, $count); >> - -Return a list of values taken from the objects returned by a query. The first three -parameters correspond to the parameters of the L method. The final parameter is -a list of the fields desired from each record found by the query. The field name -syntax is the standard syntax used for fields in the B system-- -B(I)>-- where I is the name of the relevant entity -or relationship and I is the name of the field. - -The list returned will be a list of lists. Each element of the list will contain -the values returned for the fields specified in the fourth parameter. If one of the -fields specified returns multiple values, they are flattened in with the rest. For -example, the following call will return a list of the features in a particular -spreadsheet cell, and each feature will be represented by a list containing the -feature ID followed by all of its aliases. - -C<< $query = $sprout->Get(['ContainsFeature', 'Feature'], "ContainsFeature(from-link) = ?", [$ssCellID], ['Feature(id)', 'Feature(alias)']); >> - -=over 4 - -=item objectNames - -List containing the names of the entity and relationship objects to be retrieved. - -=item filterClause - -WHERE/ORDER BY clause (without the WHERE) to be used to filter and sort the query. The WHERE clause can -be parameterized with parameter markers (C). Each field used must be specified in the standard form -B(I)>. Any parameters specified in the filter clause should be added to the -parameter list as additional parameters. The fields in a filter clause can come from primary -entity relations, relationship relations, or secondary entity relations; however, all of the -entities and relationships involved must be included in the list of object names. - -=item parameterList - -List of the parameters to be substituted in for the parameters marks in the filter clause. - -=item fields - -List of the fields to be returned in each element of the list returned. - -=item count - -Maximum number of records to return. If omitted or 0, all available records will be returned. - -=item RETURN - -Returns a list of list references. Each element of the return list contains the values for the -fields specified in the B parameter. - -=back - -=cut -#: Return Type @@; -sub GetAll { - # Get the parameters. - my ($self, $objectNames, $filterClause, $parameterList, $fields, $count) = @_; - # Call the ERDB method. - my @retVal = $self->{_erdb}->GetAll($objectNames, $filterClause, $parameterList, - $fields, $count); - # Return the resulting list. - return @retVal; -} - -=head3 GetFlat - -C<< my @list = $sprout->GetFlat(\@objectNames, $filterClause, $parameterList, $field); >> - -This is a variation of L that asks for only a single field per record and -returns a single flattened list. - -=over 4 - -=item objectNames - -List containing the names of the entity and relationship objects to be retrieved. - -=item filterClause - -WHERE/ORDER BY clause (without the WHERE) to be used to filter and sort the query. The WHERE clause can -be parameterized with parameter markers (C). Each field used must be specified in the standard form -B(I)>. Any parameters specified in the filter clause should be added to the -parameter list as additional parameters. The fields in a filter clause can come from primary -entity relations, relationship relations, or secondary entity relations; however, all of the -entities and relationships involved must be included in the list of object names. - -=item parameterList - -List of the parameters to be substituted in for the parameters marks in the filter clause. - -=item field - -Name of the field to be used to get the elements of the list returned. - -=item RETURN - -Returns a list of values. - -=back - -=cut -#: Return Type @; -sub GetFlat { - # Get the parameters. - my ($self, $objectNames, $filterClause, $parameterList, $field) = @_; - # Construct the query. - my $query = $self->Get($objectNames, $filterClause, $parameterList); - # Create the result list. - my @retVal = (); - # Loop through the records, adding the field values found to the result list. - while (my $row = $query->Fetch()) { - push @retVal, $row->Value($field); - } - # Return the list created. - return @retVal; -} - =head3 Protein C<< my $protein = Sprout::Protein($sequence, $table); >> @@ -2889,14 +2992,14 @@ # Create the return list, priming it with the name of the data directory. my @retVal = ($self->{_options}->{dataDir}); # Concatenate the table names. - push @retVal, $self->{_erdb}->GetTableNames(); + push @retVal, $self->GetTableNames(); # Return the result. return @retVal; } =head3 LowBBHs -C<< my %bbhMap = $sprout->GoodBBHs($featureID, $cutoff); >> +C<< my %bbhMap = $sprout->LowBBHs($featureID, $cutoff); >> Return the bidirectional best hits of a feature whose score is no greater than a specified cutoff value. A higher cutoff value will allow inclusion of hits with @@ -3059,6 +3162,42 @@ return $retVal; } +=head3 DeleteGenome + +C<< my $stats = $sprout->DeleteGenome($genomeID, $testFlag); >> + +Delete a genome from the database. + +=over 4 + +=item genomeID + +ID of the genome to delete + +=item testFlag + +If TRUE, then the DELETE statements will be traced, but no deletions will occur. + +=item RETURN + +Returns a statistics object describing the rows deleted. + +=back + +=cut +#: Return Type $%; +sub DeleteGenome { + # Get the parameters. + my ($self, $genomeID, $testFlag) = @_; + # Perform the delete for the genome's features. + my $retVal = $self->Delete('Feature', "fig|$genomeID.%", $testFlag); + # Perform the delete for the primary genome data. + my $stats = $self->Delete('Genome', $genomeID, $testFlag); + $retVal->Accumulate($stats); + # Return the result. + return $retVal; +} + =head2 Internal Utility Methods =head3 ParseAssignment @@ -3069,16 +3208,23 @@ A functional assignment is always of the form - IC<\nset >IC< function to\n>I + CIC< function to\n>I + +where I is the B, and I is the actual functional role. In most cases, +the user and the assigning user (from MadeAnnotation) will be the same, but that is +not always the case. -where I is the B, I is the B, and I is the -actual functional role. In most cases, the user and the assigning user will be the -same, but that is not always the case. +In addition, the functional role may contain extra data that is stripped, such as +terminating spaces or a comment separated from the rest of the text by a tab. This is a static method. =over 4 +=item user + +Name of the assigning user. + =item text Text of the annotation. @@ -3094,15 +3240,22 @@ sub _ParseAssignment { # Get the parameters. - my ($text) = @_; + my ($user, $text) = @_; # Declare the return value. my @retVal = (); # Check to see if this is a functional assignment. - my ($user, $type, $function) = split(/\n/, $text); - if ($type =~ m/^set ([^ ]+) function to$/i) { - # Here it is, so we return the user name (which is in $1), the functional role text, - # and the assigning user. - @retVal = ($1, $function, $user); + my ($type, $function) = split(/\n/, $text); + if ($type =~ m/^set function to$/i) { + # Here we have an assignment without a user, so we use the incoming user ID. + @retVal = ($user, $function); + } elsif ($type =~ m/^set (\S+) function to$/i) { + # Here we have an assignment with a user that is passed back to the caller. + @retVal = ($1, $function); + } + # If we have an assignment, we need to clean the function text. There may be + # extra junk at the end added as a note from the user. + if (@retVal) { + $retVal[1] =~ s/(\t\S)?\s*$//; } # Return the result list. return @retVal; @@ -3130,7 +3283,7 @@ sub FriendlyTimestamp { my ($timeValue) = @_; - my $retVal = strftime("%a %b %e %H:%M:%S %Y", localtime($timeValue)); + my $retVal = localtime($timeValue); return $retVal; } @@ -3192,5 +3345,4 @@ } - -1; +1; \ No newline at end of file