#!/usr/bin/perl -w package CustomAttributes; require Exporter; use ERDB; @ISA = qw(ERDB); use strict; use Tracer; use ERDBLoad; =head1 Custom SEED Attribute Manager =head2 Introduction The Custom SEED Attributes Manager allows the user to upload and retrieve custom data for SEED objects. It uses the B database system to store the attributes. Attributes are organized by I. Attribute values are assigned to I. In the real world, objects have types and IDs; however, to the attribute database only the ID matters. This will create a problem if we have a single ID that applies to two objects of different types, but it is more consistent with the original attribute implementation in the SEED (which this implementation replaces). The actual attribute values are stored as a relationship between the attribute keys and the objects. There can be multiple values for a single key/object pair. The full suite of ERDB retrieval capabilities is provided. In addition, custom methods are provided specific to this application. To get all the values of the attribute C in a specified B, you would code my @values = $attrDB->GetAttributes($fid, 'essential'); where I<$fid> contains the ID of the desired feature. New attribute keys must be defined before they can be used. A web interface is provided for this purpose. =head2 FIG_Config Parameters The following configuration parameters are used to manage custom attributes. =over 4 =item attrDbms Type of database manager used: C for MySQL or C for PostGres. =item attrDbName Name of the attribute database. =item attrHost Name of the host server for the database. If omitted, the current host is used. =item attrUser User name for logging in to the database. =item attrPass Password for logging in to the database. =item attrPort TCP/IP port for accessing the database. =item attrSock Socket name used to access the database. If omitted, the default socket will be used. =item attrDBD Fully-qualified file name for the database definition XML file. This file functions as data to the attribute management process, so if the data is moved, this file must go with it. =back =head2 Public Methods =head3 new C<< my $attrDB = CustomAttributes->new($splitter); >> Construct a new CustomAttributes object. =over 4 =item splitter Value to be used to split attribute values into sections in the L. The default is a double colon C<::>. If you do not use the replacement methods, you do not need to worry about this parameter. =back =cut sub new { # Get the parameters. my ($class, $splitter) = @_; # Connect to the database. my $dbh = DBKernel->new($FIG_Config::attrDbms, $FIG_Config::attrDbName, $FIG_Config::attrUser, $FIG_Config::attrPass, $FIG_Config::attrPort, $FIG_Config::attrHost, $FIG_Config::attrSock); # Create the ERDB object. my $xmlFileName = $FIG_Config::attrDBD; my $retVal = ERDB::new($class, $dbh, $xmlFileName); # Store the splitter value. $retVal->{splitter} = (defined($splitter) ? $splitter : '::'); # Return the result. return $retVal; } =head3 StoreAttributeKey C<< $attrDB->StoreAttributeKey($attributeName, $type, $notes, \@groups); >> Create or update an attribute for the database. =over 4 =item attributeName Name of the attribute. It must be a valid ERDB field name, consisting entirely of letters, digits, and hyphens, with a letter at the beginning. If it does not exist already, it will be created. =item type Data type of the attribute. This must be a valid ERDB data type name. =item notes Descriptive notes about the attribute. It is presumed to be raw text, not HTML. =item groups Reference to a list of the groups to which the attribute should be associated. This will replace any groups to which the attribute is currently attached. =back =cut sub StoreAttributeKey { # Get the parameters. my ($self, $attributeName, $type, $notes, $groups) = @_; # Declare the return variable. my $retVal; # Get the data type hash. my %types = ERDB::GetDataTypes(); # Validate the initial input values. if (! ERDB::ValidateFieldName($attributeName)) { Confess("Invalid attribute name \"$attributeName\" specified."); } elsif (! $notes || length($notes) < 25) { Confess("Missing or incomplete description for $attributeName."); } elsif (! exists $types{$type}) { Confess("Invalid data type \"$type\" for $attributeName."); } else { # Okay, we're ready to begin. See if this key exists. my $attribute = $self->GetEntity('AttributeKey', $attributeName); if (defined($attribute)) { # It does, so we do an update. $self->UpdateEntity('AttributeKey', $attributeName, { description => $notes, 'data-type' => $type }); # Detach the key from its current groups. $self->Disconnect('IsInGroup', 'AttributeKey', $attributeName); } else { # It doesn't, so we do an insert. $self->InsertObject('AttributeKey', { id => $attributeName, description => $notes, 'data-type' => $type }); } # Attach the key to the specified groups. (We presume the groups already # exist.) for my $group (@{$groups}) { $self->InsertObject('IsInGroup', { 'from-link' => $attributeName, 'to-link' => $group }); } } } =head3 LoadAttributeKey C<< my $stats = $attrDB->LoadAttributeKey($keyName, $fh, $keyCol, $dataCol, %options); >> Load the specified attribute from the specified file. The file should be a tab-delimited file with internal tab and new-line characters escaped. This is the typical TBL-style file used by most FIG applications. One of the columns in the input file must contain the appropriate object id value and the other the corresponding attribute value. =over 4 =item keyName Key of the attribute to load. =item fh Open file handle for the input file. =item idCol Index (0-based) of the column containing the ID field. The ID field should contain the ID of an instance of the named entity. =item dataCol Index (0-based) of the column containing the data value field. =item options Hash specifying the options for this load. =item RETURN Returns a statistics object for the load process. =back The available options are as follows. =over 4 =item erase If TRUE, the key's values will all be erased before loading. (Doing so makes for a faster load.) =back =cut sub LoadAttributeKey { # Get the parameters. my ($self, $keyName, $fh, $idCol, $dataCol, %options) = @_; # Create the return variable. my $retVal = Stats->new("lineIn", "shortLine", "newObject"); # Compute the minimum number of fields required in each input line. my $minCols = ($idCol < $dataCol ? $idCol : $idCol) + 1; # Insure the attribute key exists. my $found = $self->GetEntity('AttributeKey', $keyName); if (! defined $found) { Confess("Attribute key \"$keyName\" not found in database."); } else { # Erase the key's current values. $self->EraseAttribute($keyName); # Save a list of the object IDs we need to add. my %objectIDs = (); # Loop through the input file. while (! eof $fh) { # Get the next line of the file. my @fields = Tracer::GetLine($fh); $retVal->Add(lineIn => 1); # Now we need to validate the line. if (scalar(@fields) < $minCols) { $retVal->Add(shortLine => 1); } else { # It's valid, so get the ID and value. my ($id, $value) = ($fields[$idCol], $fields[$dataCol]); # Denote we're using this input line. $retVal->Add(lineUsed => 1); # Now the fun begins. Find out if we need to create a target object record for this object ID. if (! exists $objectIDs{$id}) { my $found = $self->Exists('TargetObject', $id); if (! $found) { $self->InsertObject('TargetObject', { id => $id }); } $objectIDs{$id} = 1; $retVal->Add(newObject => 1); } # Now we insert the attribute. $self->InsertObject('HasValueFor', { from => $keyName, to => $id, value => $value }); $retVal->Add(newValue => 1); } } } # Return the statistics. return $retVal; } =head3 DeleteAttributeKey C<< my $stats = $attrDB->DeleteAttributeKey($attributeName); >> Delete an attribute from the custom attributes database. =over 4 =item attributeName Name of the attribute to delete. =item RETURN Returns a statistics object describing the effects of the deletion. =back =cut sub DeleteAttributeKey { # Get the parameters. my ($self, $attributeName) = @_; # Delete the attribute key. my $retVal = $self->Delete('AttributeKey', $attributeName); # Return the result. return $retVal; } =head3 NewName C<< my $text = CustomAttributes::NewName(); >> Return the string used to indicate the user wants to add a new attribute. =cut sub NewName { return "(new)"; } =head3 ControlForm C<< my $formHtml = $attrDB->ControlForm($cgi, $name, \%keys); >> Return a form that can be used to control the creation and modification of attributes. Only a subset of the attribute keys will be displayed, as determined by the incoming list. =over 4 =item cgi CGI query object used to create HTML. =item name Name to give to the form. This should be unique for the web page. =item keys Reference to a hash mapping attribute keys to n-tuples. Each tuple will contain the attribute's data type, its description, and a list of the groups in which it participates. =item RETURN Returns the HTML for a form that can be used to submit instructions to the C script for loading, creating, displaying, changing, or deleting an attribute. Note that only the form controls are generated. The form tags are left to the caller. =back =cut sub ControlForm { # Get the parameters. my ($self, $cgi, $name, $keys) = @_; # Declare the return list. my @retVal = (); # We'll put the controls in a table. Nothing else ever seems to look nice. push @retVal, $cgi->start_table({ border => 2, cellpadding => 2 }); # The first row is for selecting the field name. push @retVal, $cgi->Tr($cgi->th("Select a Field"), $cgi->td($self->FieldMenu($cgi, 10, 'fieldName', $keys, new => 1, notes => "document.$name.notes.value", type => "document.$name.dataType.value", groups => "document.$name.groups"))); # Now we set up a dropdown for the data types. The values will be the # data type names, and the labels will be the descriptions. my %types = ERDB::GetDataTypes(); my %labelMap = map { $_ => $types{$_}->{notes} } keys %types; my $typeMenu = $cgi->popup_menu(-name => 'dataType', -values => [sort keys %types], -labels => \%labelMap, -default => 'string'); # Allow the user to specify a new field name. This is required if the # user has selected the "(new)" marker. We put a little scriptlet in here that # selects the (new) marker when the user enters the field. push @retVal, "\n"; # Return the result. return $retVal; } =head3 GetGroups C<< my @groups = $attrDB->GetGroups(); >> Return a list of the available groups. =cut sub GetGroups { # Get the parameters. my ($self) = @_; # Get the groups. my @retVal = $self->GetFlat(['AttributeGroup'], "", [], 'AttributeGroup(id)'); # Return them. return @retVal; } =head3 GetAttributeData C<< my %keys = $attrDB->GetAttributeData($type, @list); >> Return attribute data for the selected attributes. The attribute data is a hash mapping each attribute key name to a n-tuple containing the data type, the description, and the groups. This is the same format expected in the L and L methods for the list of attributes to display. =over 4 =item type Type of attribute criterion: C for attributes whose names begin with the specified string, or C for attributes in the specified group. =item list List containing the names of the groups or keys for the desired attributes. =item RETURN Returns a hash mapping each attribute key name to its data type, description, and parent groups. =back =cut sub GetAttributeData { # Get the parameters. my ($self, $type, @list) = @_; # Set up a hash to store the attribute data. my %retVal = (); # Loop through the list items. for my $item (@list) { # Set up a query for the desired attributes. my $query; if ($type eq 'name') { # Here we're doing a generic name search. We need to escape it and then tack # on a %. my $parm = $item; $parm =~ s/_/\\_/g; $parm =~ s/%/\\%/g; $parm .= "%"; # Ask for matching attributes. (Note that if the user passed in a null string # he'll get everything.) $query = $self->Get(['AttributeKey'], "AttributeKey(id) LIKE ?", [$parm]); } elsif ($type eq 'group') { $query = $self->Get(['IsInGroup', 'AttributeKey'], "IsInGroup(to-link) = ?", [$item]); } else { Confess("Unknown attribute query type \"$type\"."); } while (my $row = $query->Fetch()) { # Get this attribute's data. my ($key, $type, $notes) = $row->Values(['AttributeKey(id)', 'AttributeKey(data-type)', 'AttributeKey(description)']); # If it's new, get its groups and add it to the return hash. if (! exists $retVal{$key}) { my @groups = $self->GetFlat(['IsInGroup'], "IsInGroup(from-link) = ?", [$key], 'IsInGroup(to-link)'); $retVal{$key} = [$type, $notes, @groups]; } } } # Return the result. return %retVal; } =head2 FIG Method Replacements The following methods are used by B to replace the previous attribute functionality. Some of the old functionality is no longer present: controlled vocabulary is no longer supported and there is no longer any searching by URL. Fortunately, neither of these capabilities were used in the old system. The methods here are the only ones supported by the B object. The idea is that these methods represent attribute manipulation allowed by all users, while the others are only for privileged users with access to the attribute server. In the previous implementation, an attribute had a value and a URL. In the new implementation, there is only a value. In this implementation, each attribute has only a value. These methods will treat the value as a list with the individual elements separated by the value of the splitter parameter on the constructor (L). The default is double colons C<::>. So, for example, an old-style keyword with a value of C and a URL of C using the default splitter value would be stored as essential::http://www.sciencemag.org/cgi/content/abstract/293/5538/2266 The best performance is achieved by searching for a particular key for a specified feature or genome. =head3 GetAttributes C<< my @attributeList = $attrDB->GetAttributes($objectID, $key, @values); >> In the database, attribute values are sectioned into pieces using a splitter value specified in the constructor (L). This is not a requirement of the attribute system as a whole, merely a convenience for the purpose of these methods. If a value has multiple sections, each section is matched against the corresponding criterion in the I<@valuePatterns> list. This method returns a series of tuples that match the specified criteria. Each tuple will contain an object ID, a key, and one or more values. The parameters to this method therefore correspond structurally to the values expected in each tuple. In addition, you can ask for a generic search by suffixing a percent sign (C<%>) to any of the parameters. So, for example, my @attributeList = $attrDB->GetAttributes('fig|100226.1.peg.1004', 'structure%', 1, 2); would return something like ['fig}100226.1.peg.1004', 'structure', 1, 2] ['fig}100226.1.peg.1004', 'structure1', 1, 2] ['fig}100226.1.peg.1004', 'structure2', 1, 2] ['fig}100226.1.peg.1004', 'structureA', 1, 2] Use of C in any position acts as a wild card (all values). You can also specify a list reference in the ID column. Thus, my @attributeList = $attrDB->GetAttributes(['100226.1', 'fig|100226.1.%'], 'PUBMED'); would get the PUBMED attribute data for Streptomyces coelicolor A3(2) and all its features. In addition to values in multiple sections, a single attribute key can have multiple values, so even my @attributeList = $attrDB->GetAttributes($peg, 'virulent'); which has no wildcard in the key or the object ID, may return multiple tuples. Value matching in this system works very poorly, because of the way multiple values are stored. For the object ID and key name, we create queries that filter for the desired results. For the values, we do a comparison after the attributes are retrieved from the database. As a result, queries in which filter only on value end up reading the entire attribute table to find the desired results. =over 4 =item objectID ID of object whose attributes are desired. If the attributes are desired for multiple objects, this parameter can be specified as a list reference. If the attributes are desired for all objects, specify C or an empty string. Finally, you can specify attributes for a range of object IDs by putting a percent sign (C<%>) at the end. =item key Attribute key name. A value of C or an empty string will match all attribute keys. If the values are desired for multiple keys, this parameter can be specified as a list reference. Finally, you can specify attributes for a range of keys by putting a percent sign (C<%>) at the end. =item values List of the desired attribute values, section by section. If C or an empty string is specified, all values in that section will match. A generic match can be requested by placing a percent sign (C<%>) at the end. In that case, all values that match up to and not including the percent sign will match. =item RETURN Returns a list of tuples. The first element in the tuple is an object ID, the second is an attribute key, and the remaining elements are the sections of the attribute value. All of the tuples will match the criteria set forth in the parameter list. =back =cut sub GetAttributes { # Get the parameters. my ($self, $objectID, $key, @values) = @_; # We will create one big honking query. The following hash will build the filter # clause and a parameter list. my %data = ('HasValueFor(from-link)' => $key, 'HasValueFor(to-link)' => $objectID); my @filter = (); my @parms = (); # This next loop goes through the different fields that can be specified in the # parameter list and generates filters for each. for my $field (keys %data) { # Accumulate filter information for this field. We will OR together all the # elements accumulated to create the final result. my @fieldFilter = (); # Get the specified data from the caller. my $fieldPattern = $data{$field}; # Only proceed if the pattern is one that won't match everything. if (defined($fieldPattern) && $fieldPattern ne "" && $fieldPattern ne "%") { # Convert the pattern to an array. my @patterns = (); if (ref $fieldPattern eq 'ARRAY') { push @patterns, @{$fieldPattern}; } else { push @patterns, $fieldPattern; } # Only proceed if the array is nonempty. The loop will work fine if the # array is empty, but when we build the filter string at the end we'll # get "()" in the filter list, which will result in an SQL syntax error. if (@patterns) { # Loop through the individual patterns. for my $pattern (@patterns) { # Check for a generic request. if (substr($pattern, -1, 1) ne '%') { # Here we have a normal request. push @fieldFilter, "$field = ?"; push @parms, $pattern; } else { # Here we have a generate request, so we will use the LIKE operator to # filter the field to this value pattern. push @fieldFilter, "$field LIKE ?"; # We must convert the pattern value to an SQL match pattern. First # we get a copy of it. my $actualPattern = $pattern; # Now we escape the underscores. Underscores are an SQL wild card # character, but they are used frequently in key names and object IDs. $actualPattern =~ s/_/\\_/g; # Add the escaped pattern to the bound parameter list. push @parms, $actualPattern; } } # Form the filter for this field. my $fieldFilterString = join(" OR ", @fieldFilter); push @filter, "($fieldFilterString)"; } } } # Now @filter contains one or more filter strings and @parms contains the parameter # values to bind to them. my $actualFilter = join(" AND ", @filter); # Declare the return variable. my @retVal = (); # Get the number of value sections we have to match. my $sectionCount = scalar(@values); # Now we're ready to make our query. my $query = $self->Get(['HasValueFor'], $actualFilter, \@parms); # Loop through the assignments found. while (my $row = $query->Fetch()) { # Get the current row's data. my ($id, $key, $valueString) = $row->Values(['HasValueFor(to-link)', 'HasValueFor(from-link)', 'HasValueFor(value)']); # Break the value into sections. my @sections = split($self->{splitter}, $valueString); # Match each section against the incoming values. We'll assume we're # okay unless we learn otherwise. my $matching = 1; for (my $i = 0; $i < $sectionCount && $matching; $i++) { # We need to check to see if this section is generic. if (substr($values[$i], -1, 1) eq '%') { my $matchLen = length($values[$i] - 1); $matching = substr($sections[$i], 0, $matchLen) eq substr($values[$i], 0, $matchLen); } else { $matching = ($sections[$i] eq $values[$i]); } } # If we match, output this row to the return list. if ($matching) { push @retVal, [$id, $key, @sections]; } } # Return the rows found. return @retVal; } =head3 AddAttribute C<< $attrDB->AddAttribute($objectID, $key, @values); >> Add an attribute key/value pair to an object. This method cannot add a new key, merely add a value to an existing key. Use L to create a new key. =over 4 =item objectID ID of the object to which the attribute is to be added. =item key Attribute key name. =item values One or more values to be associated with the key. The values are joined together with the splitter value before being stored as field values. This enables L to split them apart during retrieval. The splitter value defaults to double colons C<::>. =back =cut sub AddAttribute { # Get the parameters. my ($self, $objectID, $key, @values) = @_; # Don't allow undefs. if (! defined($objectID)) { Confess("No object ID specified for AddAttribute call."); } elsif (! defined($key)) { Confess("No attribute key specified for AddAttribute call."); } elsif (! @values) { Confess("No values specified in AddAttribute call for key $key."); } else { # Okay, now we have some reason to believe we can do this. Form the values # into a scalar. my $valueString = join($self->{splitter}, @values); # Connect the object to the key. $self->InsertObject('HasValueFor', { 'from-link' => $key, 'to-link' => $objectID, 'value' => $valueString, }); } # Return a one, indicating success. We do this for backward compatability. return 1; } =head3 DeleteAttribute C<< $attrDB->DeleteAttribute($objectID, $key, @values); >> Delete the specified attribute key/value combination from the database. =over 4 =item objectID ID of the object whose attribute is to be deleted. =item key Attribute key name. =item values One or more values associated with the key. If no values are specified, then all values will be deleted. Otherwise, only a matching value will be deleted. =back =cut sub DeleteAttribute { # Get the parameters. my ($self, $objectID, $key, @values) = @_; # Don't allow undefs. if (! defined($objectID)) { Confess("No object ID specified for DeleteAttribute call."); } elsif (! defined($key)) { Confess("No attribute key specified for DeleteAttribute call."); } elsif (scalar(@values) == 0) { # Here we erase the entire key. $self->EraseAttribute($key); } else { # Here we erase the matching values. my $valueString = join($self->{splitter}, @values); $self->DeleteRow('HasValueFor', $key, $objectID, { value => $valueString }); } # Return a one. This is for backward compatability. return 1; } =head3 ChangeAttribute C<< $attrDB->ChangeAttribute($objectID, $key, \@oldValues, \@newValues); >> Change the value of an attribute key/value pair for an object. =over 4 =item objectID ID of the genome or feature to which the attribute is to be changed. In general, an ID that starts with C is treated as a feature ID, and an ID that is all digits and periods is treated as a genome ID. For IDs of other types, this parameter should be a reference to a 2-tuple consisting of the entity type name followed by the object ID. =item key Attribute key name. This corresponds to the name of a field in the database. =item oldValues One or more values identifying the key/value pair to change. =item newValues One or more values to be put in place of the old values. =back =cut sub ChangeAttribute { # Get the parameters. my ($self, $objectID, $key, $oldValues, $newValues) = @_; # Don't allow undefs. if (! defined($objectID)) { Confess("No object ID specified for ChangeAttribute call."); } elsif (! defined($key)) { Confess("No attribute key specified for ChangeAttribute call."); } elsif (! defined($oldValues) || ref $oldValues ne 'ARRAY') { Confess("No old values specified in ChangeAttribute call for key $key."); } elsif (! defined($newValues) || ref $newValues ne 'ARRAY') { Confess("No new values specified in ChangeAttribute call for key $key."); } else { # We do the change as a delete/add. $self->DeleteAttribute($objectID, $key, @{$oldValues}); $self->AddAttribute($objectID, $key, @{$newValues}); } # Return a one. We do this for backward compatability. return 1; } =head3 EraseAttribute C<< $attrDB->EraseAttribute($key); >> Erase all values for the specified attribute key. This does not remove the key from the database; it merely removes all the values. =over 4 =item key Key to erase. =back =cut sub EraseAttribute { # Get the parameters. my ($self, $key) = @_; # Delete everything connected to the key. The "keepRoot" option keeps the key in the # datanase while deleting everything attached to it. $self->Delete('AttributeKey', $key, keepRoot => 1); # Return a 1, for backward compatability. return 1; } =head3 GetAttributeKeys C<< my @keyList = $attrDB->GetAttributeKeys($groupName); >> Return a list of the attribute keys for a particular group. =over 4 =item groupName Name of the group whose keys are desired. =item RETURN Returns a list of the attribute keys for the specified group. =back =cut sub GetAttributeKeys { # Get the parameters. my ($self, $groupName) = @_; # Get the attributes for the specified group. my @groups = $self->GetFlat(['IsInGroup'], "IsInGroup(to-link) = ?", [$groupName], 'IsInGroup(from-link)'); # Return the keys. return sort @groups; } 1;