--- ERDB.pm 2006/03/17 22:02:03 1.38 +++ ERDB.pm 2006/07/14 01:36:29 1.67 @@ -9,6 +9,7 @@ use DBObject; use Stats; use Time::HiRes qw(gettimeofday); + use Digest::MD5 qw(md5_base64); use FIG; =head1 Entity-Relationship Database Package @@ -90,6 +91,10 @@ 32-bit signed integer +=item counter + +32-bit unsigned integer + =item date 64-bit unsigned integer, representing a PERL date/time value @@ -109,6 +114,10 @@ compatability with certain database packages), but the only values supported are 0 and 1. +=item id-string + +variable-length string, maximum 25 characters + =item key-string variable-length string, maximum 40 characters @@ -125,8 +134,19 @@ variable-length string, maximum 255 characters +=item hash-string + +variable-length string, maximum 22 characters + =back +The hash-string data type has a special meaning. The actual key passed into the loader will +be a string, but it will be digested into a 22-character MD5 code to save space. Although the +MD5 algorithm is not perfect, it is extremely unlikely two strings will have the same +digest. Therefore, it is presumed the keys will be unique. When the database is actually +in use, the hashed keys will be presented rather than the original values. For this reason, +they should not be used for entities where the key is meaningful. + =head3 Global Tags The entire database definition must be inside a B tag. The display name of @@ -302,20 +322,25 @@ # "maxLen" is the maximum permissible length of the incoming string data used to populate a field # of the specified type. "dataGen" is PERL string that will be evaluated if no test data generation # string is specified in the field definition. "avgLen" is the average byte length for estimating -# record sizes. -my %TypeTable = ( char => { sqlType => 'CHAR(1)', maxLen => 1, avgLen => 1, dataGen => "StringGen('A')" }, - int => { sqlType => 'INTEGER', maxLen => 20, avgLen => 4, dataGen => "IntGen(0, 99999999)" }, - string => { sqlType => 'VARCHAR(255)', maxLen => 255, avgLen => 100, dataGen => "StringGen(IntGen(10,250))" }, - text => { sqlType => 'TEXT', maxLen => 1000000000, avgLen => 500, dataGen => "StringGen(IntGen(80,1000))" }, - date => { sqlType => 'BIGINT', maxLen => 80, avgLen => 8, dataGen => "DateGen(-7, 7, IntGen(0,1400))" }, - float => { sqlType => 'DOUBLE PRECISION', maxLen => 40, avgLen => 8, dataGen => "FloatGen(0.0, 100.0)" }, - boolean => { sqlType => 'SMALLINT', maxLen => 1, avgLen => 1, dataGen => "IntGen(0, 1)" }, +# record sizes. "sort" is the key modifier for the sort command. +my %TypeTable = ( char => { sqlType => 'CHAR(1)', maxLen => 1, avgLen => 1, sort => "", dataGen => "StringGen('A')" }, + int => { sqlType => 'INTEGER', maxLen => 20, avgLen => 4, sort => "n", dataGen => "IntGen(0, 99999999)" }, + counter => { sqlType => 'INTEGER UNSIGNED', maxLen => 20, avgLen => 4, sort => "n", dataGen => "IntGen(0, 99999999)" }, + string => { sqlType => 'VARCHAR(255)', maxLen => 255, avgLen => 100, sort => "", dataGen => "StringGen(IntGen(10,250))" }, + text => { sqlType => 'TEXT', maxLen => 1000000000, avgLen => 500, sort => "", dataGen => "StringGen(IntGen(80,1000))" }, + date => { sqlType => 'BIGINT', maxLen => 80, avgLen => 8, sort => "n", dataGen => "DateGen(-7, 7, IntGen(0,1400))" }, + float => { sqlType => 'DOUBLE PRECISION', maxLen => 40, avgLen => 8, sort => "g", dataGen => "FloatGen(0.0, 100.0)" }, + boolean => { sqlType => 'SMALLINT', maxLen => 1, avgLen => 1, sort => "n", dataGen => "IntGen(0, 1)" }, + 'hash-string' => + { sqlType => 'VARCHAR(22)', maxLen => 22, avgLen => 22, sort => "", dataGen => "SringGen(22)" }, + 'id-string' => + { sqlType => 'VARCHAR(25)', maxLen => 25, avgLen => 25, sort => "", dataGen => "SringGen(22)" }, 'key-string' => - { sqlType => 'VARCHAR(40)', maxLen => 40, avgLen => 10, dataGen => "StringGen(IntGen(10,40))" }, + { sqlType => 'VARCHAR(40)', maxLen => 40, avgLen => 10, sort => "", dataGen => "StringGen(IntGen(10,40))" }, 'name-string' => - { sqlType => 'VARCHAR(80)', maxLen => 80, avgLen => 40, dataGen => "StringGen(IntGen(10,80))" }, + { sqlType => 'VARCHAR(80)', maxLen => 80, avgLen => 40, sort => "", dataGen => "StringGen(IntGen(10,80))" }, 'medium-string' => - { sqlType => 'VARCHAR(160)', maxLen => 160, avgLen => 40, dataGen => "StringGen(IntGen(10,160))" }, + { sqlType => 'VARCHAR(160)', maxLen => 160, avgLen => 40, sort => "", dataGen => "StringGen(IntGen(10,160))" }, ); # Table translating arities into natural language. @@ -402,41 +427,81 @@ # Write the HTML heading stuff. print HTMLOUT "\n\n$title\n"; print HTMLOUT "\n\n"; + # Write the documentation. + print HTMLOUT $self->DisplayMetaData(); + # Close the document. + print HTMLOUT "\n\n"; + # Close the file. + close HTMLOUT; +} + +=head3 DisplayMetaData + +C<< my $html = $erdb->DisplayMetaData(); >> + +Return an HTML description of the database. This description can be used to help users create +the data to be loaded into the relations and form queries. The output is raw includable HTML +without any HEAD or BODY tags. + +=over 4 + +=item filename + +The name of the output file. + +=back + +=cut + +sub DisplayMetaData { + # Get the parameters. + my ($self) = @_; + # Get the metadata and the title string. + my $metadata = $self->{_metaData}; + # Get the title string. + my $title = $metadata->{Title}; + # Get the entity and relationship lists. + my $entityList = $metadata->{Entities}; + my $relationshipList = $metadata->{Relationships}; + # Declare the return variable. + my $retVal = ""; + # Open the output file. + Trace("Building MetaData table of contents.") if T(4); # Here we do the table of contents. It starts as an unordered list of section names. Each # section contains an ordered list of entity or relationship subsections. - print HTMLOUT "\n"; + $retVal .= "\n"; # Now we start with the actual data. Denote we're starting the entity section. - print HTMLOUT "

Entities

\n"; + $retVal .= "

Entities

\n"; # Loop through the entities. for my $key (sort keys %{$entityList}) { Trace("Building MetaData entry for $key entity.") if T(4); # Create the entity header. It contains a bookmark and the entity name. - print HTMLOUT "

$key

\n"; + $retVal .= "

$key

\n"; # Get the entity data. my $entityData = $entityList->{$key}; # If there's descriptive text, display it. if (my $notes = $entityData->{Notes}) { - print HTMLOUT "

" . _HTMLNote($notes->{content}) . "

\n"; + $retVal .= "

" . _HTMLNote($notes->{content}) . "

\n"; } # Now we want a list of the entity's relationships. First, we set up the relationship subsection. - print HTMLOUT "

Relationships for $key

\n\n"; # Get the entity's relations. my $relationList = $entityData->{Relations}; # Create a header for the relation subsection. - print HTMLOUT "

Relations for $key

\n"; + $retVal .= "

Relations for $key

\n"; # Loop through the relations, displaying them. for my $relation (sort keys %{$relationList}) { my $htmlString = _ShowRelationTable($relation, $relationList->{$relation}); - print HTMLOUT $htmlString; + $retVal .= $htmlString; } } # Denote we're starting the relationship section. - print HTMLOUT "

Relationships

\n"; + $retVal .= "

Relationships

\n"; # Loop through the relationships. for my $key (sort keys %{$relationshipList}) { Trace("Building MetaData entry for $key relationship.") if T(4); @@ -470,7 +535,7 @@ my $relationshipStructure = $relationshipList->{$key}; # Create the relationship header. my $headerText = _ComputeRelationshipHeading($key, $relationshipStructure); - print HTMLOUT "

$headerText

\n"; + $retVal .= "

$headerText

\n"; # Get the entity names. my $fromEntity = $relationshipStructure->{from}; my $toEntity = $relationshipStructure->{to}; @@ -480,27 +545,27 @@ # since both sentences will say the same thing. my $arity = $relationshipStructure->{arity}; if ($arity eq "11") { - print HTMLOUT "

Each $fromEntity relates to at most one $toEntity.\n"; + $retVal .= "

Each $fromEntity relates to at most one $toEntity.\n"; } else { - print HTMLOUT "

Each $fromEntity relates to multiple $toEntitys.\n"; + $retVal .= "

Each $fromEntity relates to multiple $toEntitys.\n"; if ($arity eq "MM" && $fromEntity ne $toEntity) { - print HTMLOUT "Each $toEntity relates to multiple $fromEntitys.\n"; + $retVal .= "Each $toEntity relates to multiple $fromEntitys.\n"; } } - print HTMLOUT "

\n"; + $retVal .= "

\n"; # If there are notes on this relationship, display them. if (my $notes = $relationshipStructure->{Notes}) { - print HTMLOUT "

" . _HTMLNote($notes->{content}) . "

\n"; + $retVal .= "

" . _HTMLNote($notes->{content}) . "

\n"; } # Generate the relationship's relation table. my $htmlString = _ShowRelationTable($key, $relationshipStructure->{Relations}->{$key}); - print HTMLOUT $htmlString; + $retVal .= $htmlString; } Trace("Building MetaData join table.") if T(4); # Denote we're starting the join table. - print HTMLOUT "

Join Table

\n"; + $retVal .= "

Join Table

\n"; # Create a table header. - print HTMLOUT _OpenTable("Join Table", "Source", "Target", "Join Condition"); + $retVal .= _OpenTable("Join Table", "Source", "Target", "Join Condition"); # Loop through the joins. my $joinTable = $metadata->{Joins}; my @joinKeys = keys %{$joinTable}; @@ -513,15 +578,13 @@ my $target = $self->ComputeObjectSentence($targetRelation); my $clause = $joinTable->{$joinKey}; # Display them in a table row. - print HTMLOUT "$source$target$clause\n"; + $retVal .= "$source$target$clause\n"; } # Close the table. - print HTMLOUT _CloseTable(); - # Close the document. - print HTMLOUT "\n\n"; - # Close the file. - close HTMLOUT; - Trace("Built MetaData web page.") if T(3); + $retVal .= _CloseTable(); + Trace("Built MetaData HTML.") if T(3); + # Return the HTML. + return $retVal; } =head3 DumpMetaData @@ -687,6 +750,79 @@ return $retVal; } +=head3 DigestFields + +C<< $erdb->DigestFields($relName, $fieldList); >> + +Digest the strings in the field list that correspond to data type C in the +specified relation. + +=over 4 + +=item relName + +Name of the relation to which the fields belong. + +=item fieldList + +List of field contents to be loaded into the relation. + +=back + +=cut +#: Return Type ; +sub DigestFields { + # Get the parameters. + my ($self, $relName, $fieldList) = @_; + # Get the relation definition. + my $relData = $self->_FindRelation($relName); + # Get the list of field descriptors. + my $fieldTypes = $relData->{Fields}; + my $fieldCount = scalar @{$fieldTypes}; + # Loop through the two lists. + for (my $i = 0; $i < $fieldCount; $i++) { + # Get the type of the current field. + my $fieldType = $fieldTypes->[$i]->{type}; + # If it's a hash string, digest it in place. + if ($fieldType eq 'hash-string') { + $fieldList->[$i] = $self->DigestKey($fieldList->[$i]); + } + } +} + +=head3 DigestKey + +C<< my $digested = $erdb->DigestKey($keyValue); >> + +Return the digested value of a symbolic key. The digested value can then be plugged into a +key-based search into a table with key-type hash-string. + +Currently the digesting process is independent of the database structure, but that may not +always be the case, so this is an instance method instead of a static method. + +=over 4 + +=item keyValue + +Key value to digest. + +=item RETURN + +Digested value of the key. + +=back + +=cut + +sub DigestKey { + # Get the parameters. + my ($self, $keyValue) = @_; + # Compute the digest. + my $retVal = md5_base64($keyValue); + # Return the result. + return $retVal; +} + =head3 CreateIndex C<< $erdb->CreateIndex($relationName); >> @@ -848,7 +984,7 @@ =head3 Get -C<< my $query = $erdb->Get(\@objectNames, $filterClause, $param1, $param2, ..., $paramN); >> +C<< my $query = $erdb->Get(\@objectNames, $filterClause, \@params); >> This method returns a query object for entities of a specified type using a specified filter. The filter is a standard WHERE/ORDER BY clause with question marks as parameter markers and each @@ -856,7 +992,7 @@ following call requests all B objects for the genus specified in the variable $genus. -C<< $query = $erdb->Get(['Genome'], "Genome(genus) = ?", $genus); >> +C<< $query = $erdb->Get(['Genome'], "Genome(genus) = ?", [$genus]); >> The WHERE clause contains a single question mark, so there is a single additional parameter representing the parameter value. It would also be possible to code @@ -873,17 +1009,22 @@ It is possible to specify multiple entity and relationship names in order to retrieve more than one object's data at the same time, which allows highly complex joined queries. For example, -C<< $query = $erdb->Get(['Genome', 'ComesFrom', 'Source'], "Genome(genus) = ?", $genus); >> +C<< $query = $erdb->Get(['Genome', 'ComesFrom', 'Source'], "Genome(genus) = ?", [$genus]); >> If multiple names are specified, then the query processor will automatically determine a join path between the entities and relationships. The algorithm used is very simplistic. -In particular, you can't specify any entity or relationship more than once, and if a -relationship is recursive, the path is determined by the order in which the entity -and the relationship appear. For example, consider a recursive relationship B -which relates B objects to other B objects. If the join path is +In particular, if a relationship is recursive, the path is determined by the order in which +the entity and the relationship appear. For example, consider a recursive relationship +B which relates B objects to other B objects. If the join path is coded as C<['People', 'IsParentOf']>, then the people returned will be parents. If, however, the join path is C<['IsParentOf', 'People']>, then the people returned will be children. +If an entity or relationship is mentioned twice, the name for the second occurrence will +be suffixed with C<2>, the third occurrence will be suffixed with C<3>, and so forth. So, +for example, if we have C<['Feature', 'HasContig', 'Contig', 'HasContig']>, then the +B field of the first B is specified as C, while +the B field of the second B is specified as C. + =over 4 =item objectNames @@ -913,9 +1054,21 @@ filter clause in general; however, odd things may happen if a sort field is from a secondary relation. -=item param1, param2, ..., paramN +Finally, you can limit the number of rows returned by adding a LIMIT clause. The LIMIT must +be the last thing in the filter clause, and it contains only the word "LIMIT" followed by +a positive number. So, for example + +C<< "Genome(genus) = ? ORDER BY Genome(species) LIMIT 10" >> + +will only return the first ten genomes for the specified genus. The ORDER BY clause is not +required. For example, to just get the first 10 genomes in the B table, you could +use + +C<< "LIMIT 10" >> -Parameter values to be substituted into the filter clause. +=item params + +Reference to a list of parameter values to be substituted into the filter clause. =item RETURN @@ -927,130 +1080,78 @@ sub Get { # Get the parameters. - my ($self, $objectNames, $filterClause, @params) = @_; - # Construct the SELECT statement. The general pattern is - # - # SELECT name1.*, name2.*, ... nameN.* FROM name1, name2, ... nameN - # - my $dbh = $self->{_dbh}; - my $command = "SELECT DISTINCT " . join('.*, ', @{$objectNames}) . ".* FROM " . - join(', ', @{$objectNames}); - # Check for a filter clause. - if ($filterClause) { - # Here we have one, so we convert its field names and add it to the query. First, - # We create a copy of the filter string we can work with. - my $filterString = $filterClause; - # Next, we sort the object names by length. This helps protect us from finding - # object names inside other object names when we're doing our search and replace. - my @sortedNames = sort { length($b) - length($a) } @{$objectNames}; - # We will also keep a list of conditions to add to the WHERE clause in order to link - # entities and relationships as well as primary relations to secondary ones. - my @joinWhere = (); - # The final preparatory step is to create a hash table of relation names. The - # table begins with the relation names already in the SELECT command. - my %fromNames = (); - for my $objectName (@sortedNames) { - $fromNames{$objectName} = 1; - } - # We are ready to begin. We loop through the object names, replacing each - # object name's field references by the corresponding SQL field reference. - # Along the way, if we find a secondary relation, we will need to add it - # to the FROM clause. - for my $objectName (@sortedNames) { - # Get the length of the object name plus 2. This is the value we add to the - # size of the field name to determine the size of the field reference as a - # whole. - my $nameLength = 2 + length $objectName; - # Get the object's field list. - my $fieldList = $self->GetFieldTable($objectName); - # Find the field references for this object. - while ($filterString =~ m/$objectName\(([^)]*)\)/g) { - # At this point, $1 contains the field name, and the current position - # is set immediately after the final parenthesis. We pull out the name of - # the field and the position and length of the field reference as a whole. - my $fieldName = $1; - my $len = $nameLength + length $fieldName; - my $pos = pos($filterString) - $len; - # Insure the field exists. - if (!exists $fieldList->{$fieldName}) { - Confess("Field $fieldName not found for object $objectName."); - } else { - # Get the field's relation. - my $relationName = $fieldList->{$fieldName}->{relation}; - # Insure the relation is in the FROM clause. - if (!exists $fromNames{$relationName}) { - # Add the relation to the FROM clause. - $command .= ", $relationName"; - # Create its join sub-clause. - push @joinWhere, "$objectName.id = $relationName.id"; - # Denote we have it available for future fields. - $fromNames{$relationName} = 1; - } - # Form an SQL field reference from the relation name and the field name. - my $sqlReference = "$relationName." . _FixName($fieldName); - # Put it into the filter string in place of the old value. - substr($filterString, $pos, $len) = $sqlReference; - # Reposition the search. - pos $filterString = $pos + length $sqlReference; - } - } - } - # The next step is to join the objects together. We only need to do this if there - # is more than one object in the object list. We start with the first object and - # run through the objects after it. Note also that we make a safety copy of the - # list before running through it. - my @objectList = @{$objectNames}; - my $lastObject = shift @objectList; - # Get the join table. - my $joinTable = $self->{_metaData}->{Joins}; - # Loop through the object list. - for my $thisObject (@objectList) { - # Look for a join. - my $joinKey = "$lastObject/$thisObject"; - if (!exists $joinTable->{$joinKey}) { - # Here there's no join, so we throw an error. - Confess("No join exists to connect from $lastObject to $thisObject."); - } else { - # Get the join clause and add it to the WHERE list. - push @joinWhere, $joinTable->{$joinKey}; - # Save this object as the last object for the next iteration. - $lastObject = $thisObject; - } - } - # Now we need to handle the whole ORDER BY / LIMIT thing. The important part - # here is we want the filter clause to be empty if there's no WHERE filter. - # We'll put the ORDER BY / LIMIT clauses in the following variable. - my $orderClause = ""; - # Locate the ORDER BY or LIMIT verbs (if any). We use a non-greedy - # operator so that we find the first occurrence of either verb. - if ($filterString =~ m/^(.*?)\s*(ORDER BY|LIMIT)/g) { - # Here we have an ORDER BY or LIMIT verb. Split it off of the filter string. - my $pos = pos $filterString; - $orderClause = $2 . substr($filterString, $pos); - $filterString = $1; - } - # Add the filter and the join clauses (if any) to the SELECT command. - if ($filterString) { - push @joinWhere, "($filterString)"; - } - if (@joinWhere) { - $command .= " WHERE " . join(' AND ', @joinWhere); - } - # Add the sort or limit clause (if any) to the SELECT command. - if ($orderClause) { - $command .= " $orderClause"; - } + my ($self, $objectNames, $filterClause, $params) = @_; + # Process the SQL stuff. + my ($suffix, $mappedNameListRef, $mappedNameHashRef) = + $self->_SetupSQL($objectNames, $filterClause); + # Create the query. + my $command = "SELECT DISTINCT " . join(".*, ", @{$mappedNameListRef}) . + ".* $suffix"; + my $sth = $self->_GetStatementHandle($command, $params); + # Now we create the relation map, which enables DBQuery to determine the order, name + # and mapped name for each object in the query. + my @relationMap = (); + for my $mappedName (@{$mappedNameListRef}) { + push @relationMap, [$mappedName, $mappedNameHashRef->{$mappedName}]; } - Trace("SQL query: $command") if T(SQL => 4); - Trace("PARMS: '" . (join "', '", @params) . "'") if (T(SQL => 4) && (@params > 0)); - my $sth = $dbh->prepare_command($command); - # Execute it with the parameters bound in. - $sth->execute(@params) || Confess("SELECT error" . $sth->errstr()); # Return the statement object. - my $retVal = DBQuery::_new($self, $sth, @{$objectNames}); + my $retVal = DBQuery::_new($self, $sth, \@relationMap); return $retVal; } +=head3 GetFlat + +C<< my @list = $erdb->GetFlat(\@objectNames, $filterClause, \@parameterList, $field); >> + +This is a variation of L that asks for only a single field per record and +returns a single flattened list. + +=over 4 + +=item objectNames + +List containing the names of the entity and relationship objects to be retrieved. + +=item filterClause + +WHERE/ORDER BY clause (without the WHERE) to be used to filter and sort the query. The WHERE clause can +be parameterized with parameter markers (C). Each field used must be specified in the standard form +B(I)>. Any parameters specified in the filter clause should be added to the +parameter list as additional parameters. The fields in a filter clause can come from primary +entity relations, relationship relations, or secondary entity relations; however, all of the +entities and relationships involved must be included in the list of object names. + +=item parameterList + +List of the parameters to be substituted in for the parameters marks in the filter clause. + +=item field + +Name of the field to be used to get the elements of the list returned. + +=item RETURN + +Returns a list of values. + +=back + +=cut +#: Return Type @; +sub GetFlat { + # Get the parameters. + my ($self, $objectNames, $filterClause, $parameterList, $field) = @_; + # Construct the query. + my $query = $self->Get($objectNames, $filterClause, $parameterList); + # Create the result list. + my @retVal = (); + # Loop through the records, adding the field values found to the result list. + while (my $row = $query->Fetch()) { + push @retVal, $row->Value($field); + } + # Return the list created. + return @retVal; +} + =head3 Delete C<< my $stats = $erdb->Delete($entityName, $objectID); >> @@ -1221,7 +1322,7 @@ =head3 GetList -C<< my @dbObjects = $erdb->GetList(\@objectNames, $filterClause, $param1, $param2, ..., $paramN); >> +C<< my @dbObjects = $erdb->GetList(\@objectNames, $filterClause, \@params); >> Return a list of object descriptors for the specified objects as determined by the specified filter clause. @@ -1255,9 +1356,9 @@ filter clause in general; however, odd things may happen if a sort field is from a secondary relation. -=item param1, param2, ..., paramN +=item params -Parameter values to be substituted into the filter clause. +Reference to a list of parameter values to be substituted into the filter clause. =item RETURN @@ -1269,11 +1370,11 @@ #: Return Type @% sub GetList { # Get the parameters. - my ($self, $objectNames, $filterClause, @params) = @_; + my ($self, $objectNames, $filterClause, $params) = @_; # Declare the return variable. my @retVal = (); # Perform the query. - my $query = $self->Get($objectNames, $filterClause, @params); + my $query = $self->Get($objectNames, $filterClause, $params); # Loop through the results. while (my $object = $query->Fetch) { push @retVal, $object; @@ -1282,6 +1383,99 @@ return @retVal; } +=head3 GetCount + +C<< my $count = $erdb->GetCount(\@objectNames, $filter, \@params); >> + +Return the number of rows found by a specified query. This method would +normally be used to count the records in a single table. For example, in a +genetics database + + my $count = $erdb->GetCount(['Genome'], 'Genome(genus-species) LIKE ?', ['homo %']); + +would return the number of genomes for the genus I. It is conceivable, however, +to use it to return records based on a join. For example, + + my $count = $erdb->GetCount(['HasFeature', 'Genome'], 'Genome(genus-species) LIKE ?', + ['homo %']); + +would return the number of features for genomes in the genus I. Note that +only the rows from the first table are counted. If the above command were + + my $count = $erdb->GetCount(['Genome', 'Feature'], 'Genome(genus-species) LIKE ?', + ['homo %']); + +it would return the number of genomes, not the number of genome/feature pairs. + +=over 4 + +=item objectNames + +Reference to a list of the objects (entities and relationships) included in the +query. + +=item filter + +A filter clause for restricting the query. The rules are the same as for the L +method. + +=item params + +Reference to a list of the parameter values to be substituted for the parameter marks +in the filter. + +=item RETURN + +Returns a count of the number of records in the first table that would satisfy +the query. + +=back + +=cut + +sub GetCount { + # Get the parameters. + my ($self, $objectNames, $filter, $params) = @_; + # Insure the params argument is an array reference if the caller left it off. + if (! defined($params)) { + $params = []; + } + # Declare the return variable. + my $retVal; + # Find out if we're counting an entity or a relationship. + my $countedField; + if ($self->IsEntity($objectNames->[0])) { + $countedField = "id"; + } else { + # For a relationship we count the to-link because it's usually more + # numerous. Note we're automatically converting to the SQL form + # of the field name (to_link vs. to-link). + $countedField = "to_link"; + } + # Create the SQL command suffix to get the desired records. + my ($suffix, $mappedNameListRef, $mappedNameHashRef) = $self->_SetupSQL($objectNames, + $filter); + # Prefix it with text telling it we want a record count. + my $firstObject = $mappedNameListRef->[0]; + my $command = "SELECT COUNT($firstObject.$countedField) $suffix"; + # Prepare and execute the command. + my $sth = $self->_GetStatementHandle($command, $params); + # Get the count value. + ($retVal) = $sth->fetchrow_array(); + # Check for a problem. + if (! defined($retVal)) { + if ($sth->err) { + # Here we had an SQL error. + Confess("Error retrieving row count: " . $sth->errstr()); + } else { + # Here we have no result. + Confess("No result attempting to retrieve row count."); + } + } + # Return the result. + return $retVal; +} + =head3 ComputeObjectSentence C<< my $sentence = $erdb->ComputeObjectSentence($objectName); >> @@ -1349,13 +1543,76 @@ my $relation = $relationList->{$relationName}; $self->_DumpRelation($outputDirectory, $relationName, $relation); } - } - # Next, we loop through the relationships. - my $relationships = $metaData->{Relationships}; - for my $relationshipName (keys %{$relationships}) { - my $relationshipStructure = $relationships->{$relationshipName}; - # Dump this relationship's relation. - $self->_DumpRelation($outputDirectory, $relationshipName, $relationshipStructure->{Relations}->{$relationshipName}); + } + # Next, we loop through the relationships. + my $relationships = $metaData->{Relationships}; + for my $relationshipName (keys %{$relationships}) { + my $relationshipStructure = $relationships->{$relationshipName}; + # Dump this relationship's relation. + $self->_DumpRelation($outputDirectory, $relationshipName, $relationshipStructure->{Relations}->{$relationshipName}); + } +} + +=head3 InsertValue + +C<< $erdb->InsertValue($entityID, $fieldName, $value); >> + +This method will insert a new value into the database. The value must be one +associated with a secondary relation, since primary values cannot be inserted: +they occur exactly once. Secondary values, on the other hand, can be missing +or multiply-occurring. + +=over 4 + +=item entityID + +ID of the object that is to receive the new value. + +=item fieldName + +Field name for the new value-- this includes the entity name, since +field names are of the format IC<(>IC<)>. + +=item value + +New value to be put in the field. + +=back + +=cut + +sub InsertValue { + # Get the parameters. + my ($self, $entityID, $fieldName, $value) = @_; + # Parse the entity name and the real field name. + if ($fieldName =~ /^([^(]+)\(([^)]+)\)/) { + my $entityName = $1; + my $fieldTitle = $2; + # Get its descriptor. + if (!$self->IsEntity($entityName)) { + Confess("$entityName is not a valid entity."); + } else { + my $entityData = $self->{_metaData}->{Entities}->{$entityName}; + # Find the relation containing this field. + my $fieldHash = $entityData->{Fields}; + if (! exists $fieldHash->{$fieldTitle}) { + Confess("$fieldTitle not found in $entityName."); + } else { + my $relation = $fieldHash->{$fieldTitle}->{relation}; + if ($relation eq $entityName) { + Confess("Cannot do InsertValue on primary field $fieldTitle of $entityName."); + } else { + # Now we can create an INSERT statement. + my $dbh = $self->{_dbh}; + my $fixedName = _FixName($fieldTitle); + my $statement = "INSERT INTO $relation (id, $fixedName) VALUES(?, ?)"; + # Execute the command. + $dbh->SQL($statement, 0, $entityID, $value); + } + } + } + } else { + Confess("$fieldName is not a valid field name."); } } @@ -1375,7 +1632,7 @@ The next statement inserts a C relationship between feature C and property C<4> with an evidence URL of C. -C<< $erdb->InsertObject('HasProperty', { 'from-link' => 'fig|158879.1.peg.1', 'to-link' => 4, evidence = 'http://seedu.uchicago.edu/query.cgi?article_id=142'}); >> +C<< $erdb->InsertObject('HasProperty', { 'from-link' => 'fig|158879.1.peg.1', 'to-link' => 4, evidence => 'http://seedu.uchicago.edu/query.cgi?article_id=142'}); >> =over 4 @@ -1585,7 +1842,9 @@ } } # Analyze the table to improve performance. + Trace("Analyzing and compacting $relationName.") if T(3); $dbh->vacuum_it($relationName); + Trace("$relationName load completed.") if T(3); # Return the statistics. return $retVal; } @@ -1677,7 +1936,7 @@ # Get the parameters. my ($self, $entityType, $ID) = @_; # Create a query. - my $query = $self->Get([$entityType], "$entityType(id) = ?", $ID); + my $query = $self->Get([$entityType], "$entityType(id) = ?", [$ID]); # Get the first (and only) object. my $retVal = $query->Fetch(); # Return the result. @@ -1790,8 +2049,10 @@ # list is a scalar we convert it into a singleton list. my @parmList = (); if (ref $parameterList eq "ARRAY") { + Trace("GetAll parm list is an array.") if T(4); @parmList = @{$parameterList}; } else { + Trace("GetAll parm list is a scalar: $parameterList.") if T(4); push @parmList, $parameterList; } # Insure the counter has a value. @@ -1803,7 +2064,7 @@ $filterClause .= " LIMIT $count"; } # Create the query. - my $query = $self->Get($objectNames, $filterClause, @parmList); + my $query = $self->Get($objectNames, $filterClause, \@parmList); # Set up a counter of the number of records read. my $fetched = 0; # Loop through the records returned, extracting the fields. Note that if the @@ -1814,10 +2075,46 @@ push @retVal, \@rowData; $fetched++; } + Trace("$fetched rows returned in GetAll.") if T(SQL => 4); # Return the resulting list. return @retVal; } +=head3 Exists + +C<< my $found = $sprout->Exists($entityName, $entityID); >> + +Return TRUE if an entity exists, else FALSE. + +=over 4 + +=item entityName + +Name of the entity type (e.g. C) relevant to the existence check. + +=item entityID + +ID of the entity instance whose existence is to be checked. + +=item RETURN + +Returns TRUE if the entity instance exists, else FALSE. + +=back + +=cut +#: Return Type $; +sub Exists { + # Get the parameters. + my ($self, $entityName, $entityID) = @_; + # Check for the entity instance. + Trace("Checking existence of $entityName with ID=$entityID.") if T(4); + my $testInstance = $self->GetEntity($entityName, $entityID); + # Return an existence indicator. + my $retVal = ($testInstance ? 1 : 0); + return $retVal; +} + =head3 EstimateRowSize C<< my $rowSize = $erdb->EstimateRowSize($relName); >> @@ -1885,6 +2182,8 @@ return $objectData->{Fields}; } +=head2 Data Mining Methods + =head3 GetUsefulCrossValues C<< my @attrNames = $sprout->GetUsefulCrossValues($sourceEntity, $relationship); >> @@ -1946,8 +2245,357 @@ return @retVal; } +=head3 FindColumn + +C<< my $colIndex = ERDB::FindColumn($headerLine, $columnIdentifier); >> + +Return the location a desired column in a data mining header line. The data +mining header line is a tab-separated list of column names. The column +identifier is either the numerical index of a column or the actual column +name. + +=over 4 + +=item headerLine + +The header line from a data mining command, which consists of a tab-separated +list of column names. + +=item columnIdentifier + +Either the ordinal number of the desired column (1-based), or the name of the +desired column. + +=item RETURN + +Returns the array index (0-based) of the desired column. + +=back + +=cut + +sub FindColumn { + # Get the parameters. + my ($headerLine, $columnIdentifier) = @_; + # Declare the return variable. + my $retVal; + # Split the header line into column names. + my @headers = ParseColumns($headerLine); + # Determine whether we have a number or a name. + if ($columnIdentifier =~ /^\d+$/) { + # Here we have a number. Subtract 1 and validate the result. + $retVal = $columnIdentifier - 1; + if ($retVal < 0 || $retVal > $#headers) { + Confess("Invalid column identifer \"$columnIdentifier\": value out of range."); + } + } else { + # Here we have a name. We need to find it in the list. + for (my $i = 0; $i <= $#headers && ! defined($retVal); $i++) { + if ($headers[$i] eq $columnIdentifier) { + $retVal = $i; + } + } + if (! defined($retVal)) { + Confess("Invalid column identifier \"$columnIdentifier\": value not found."); + } + } + # Return the result. + return $retVal; +} + +=head3 ParseColumns + +C<< my @columns = ERDB::ParseColumns($line); >> + +Convert the specified data line to a list of columns. + +=over 4 + +=item line + +A data mining input, consisting of a tab-separated list of columns terminated by a +new-line. + +=item RETURN + +Returns a list consisting of the column values. + +=back + +=cut + +sub ParseColumns { + # Get the parameters. + my ($line) = @_; + # Chop off the line-end. + chomp $line; + # Split it into a list. + my @retVal = split(/\t/, $line); + # Return the result. + return @retVal; +} + =head2 Internal Utility Methods +=head3 SetupSQL + +Process a list of object names and a filter clause so that they can be used to +build an SQL statement. This method takes in a reference to a list of object names +and a filter clause. It will return a corrected filter clause, a list of mapped +names and the mapped name hash. + +This is an instance method. + +=over 4 + +=item objectNames + +Reference to a list of the object names to be included in the query. + +=item filterClause + +A string containing the WHERE clause for the query (without the C) and also +optionally the C and C clauses. + +=item RETURN + +Returns a three-element list. The first element is the SQL statement suffix, beginning +with the FROM clause. The second element is a reference to a list of the names to be +used in retrieving the fields. The third element is a hash mapping the names to the +objects they represent. + +=back + +=cut + +sub _SetupSQL { + my ($self, $objectNames, $filterClause) = @_; + # Adjust the list of object names to account for multiple occurrences of the + # same object. We start with a hash table keyed on object name that will + # return the object suffix. The first time an object is encountered it will + # not be found in the hash. The next time the hash will map the object name + # to 2, then 3, and so forth. + my %objectHash = (); + # This list will contain the object names as they are to appear in the + # FROM list. + my @fromList = (); + # This list contains the suffixed object name for each object. It is exactly + # parallel to the list in the $objectNames parameter. + my @mappedNameList = (); + # Finally, this hash translates from a mapped name to its original object name. + my %mappedNameHash = (); + # Now we create the lists. Note that for every single name we push something into + # @fromList and @mappedNameList. This insures that those two arrays are exactly + # parallel to $objectNames. + for my $objectName (@{$objectNames}) { + # Get the next suffix for this object. + my $suffix = $objectHash{$objectName}; + if (! $suffix) { + # Here we are seeing the object for the first time. The object name + # is used as is. + push @mappedNameList, $objectName; + push @fromList, $objectName; + $mappedNameHash{$objectName} = $objectName; + # Denote the next suffix will be 2. + $objectHash{$objectName} = 2; + } else { + # Here we've seen the object before. We construct a new name using + # the suffix from the hash and update the hash. + my $mappedName = "$objectName$suffix"; + $objectHash{$objectName} = $suffix + 1; + # The FROM list has the object name followed by the mapped name. This + # tells SQL it's still the same table, but we're using a different name + # for it to avoid confusion. + push @fromList, "$objectName $mappedName"; + # The mapped-name list contains the real mapped name. + push @mappedNameList, $mappedName; + # Finally, enable us to get back from the mapped name to the object name. + $mappedNameHash{$mappedName} = $objectName; + } + } + # Begin the SELECT suffix. It starts with + # + # FROM name1, name2, ... nameN + # + my $suffix = "FROM " . join(', ', @fromList); + # Check for a filter clause. + if ($filterClause) { + # Here we have one, so we convert its field names and add it to the query. First, + # We create a copy of the filter string we can work with. + my $filterString = $filterClause; + # Next, we sort the object names by length. This helps protect us from finding + # object names inside other object names when we're doing our search and replace. + my @sortedNames = sort { length($b) - length($a) } @mappedNameList; + # We will also keep a list of conditions to add to the WHERE clause in order to link + # entities and relationships as well as primary relations to secondary ones. + my @joinWhere = (); + # The final preparatory step is to create a hash table of relation names. The + # table begins with the relation names already in the SELECT command. We may + # need to add relations later if there is filtering on a field in a secondary + # relation. The secondary relations are the ones that contain multiply- + # occurring or optional fields. + my %fromNames = map { $_ => 1 } @sortedNames; + # We are ready to begin. We loop through the object names, replacing each + # object name's field references by the corresponding SQL field reference. + # Along the way, if we find a secondary relation, we will need to add it + # to the FROM clause. + for my $mappedName (@sortedNames) { + # Get the length of the object name plus 2. This is the value we add to the + # size of the field name to determine the size of the field reference as a + # whole. + my $nameLength = 2 + length $mappedName; + # Get the real object name for this mapped name. + my $objectName = $mappedNameHash{$mappedName}; + Trace("Processing $mappedName for object $objectName.") if T(4); + # Get the object's field list. + my $fieldList = $self->GetFieldTable($objectName); + # Find the field references for this object. + while ($filterString =~ m/$mappedName\(([^)]*)\)/g) { + # At this point, $1 contains the field name, and the current position + # is set immediately after the final parenthesis. We pull out the name of + # the field and the position and length of the field reference as a whole. + my $fieldName = $1; + my $len = $nameLength + length $fieldName; + my $pos = pos($filterString) - $len; + # Insure the field exists. + if (!exists $fieldList->{$fieldName}) { + Confess("Field $fieldName not found for object $objectName."); + } else { + Trace("Processing $fieldName at position $pos.") if T(4); + # Get the field's relation. + my $relationName = $fieldList->{$fieldName}->{relation}; + # Now we have a secondary relation. We need to insure it matches the + # mapped name of the primary relation. First we peel off the suffix + # from the mapped name. + my $mappingSuffix = substr $mappedName, length($objectName); + # Put the mapping suffix onto the relation name to get the + # mapped relation name. + my $mappedRelationName = "$relationName$mappingSuffix"; + # Insure the relation is in the FROM clause. + if (!exists $fromNames{$mappedRelationName}) { + # Add the relation to the FROM clause. + if ($mappedRelationName eq $relationName) { + # The name is un-mapped, so we add it without + # any frills. + $suffix .= ", $relationName"; + push @joinWhere, "$objectName.id = $relationName.id"; + } else { + # Here we have a mapping situation. + $suffix .= ", $relationName $mappedRelationName"; + push @joinWhere, "$mappedRelationName.id = $mappedName.id"; + } + # Denote we have this relation available for future fields. + $fromNames{$mappedRelationName} = 1; + } + # Form an SQL field reference from the relation name and the field name. + my $sqlReference = "$mappedRelationName." . _FixName($fieldName); + # Put it into the filter string in place of the old value. + substr($filterString, $pos, $len) = $sqlReference; + # Reposition the search. + pos $filterString = $pos + length $sqlReference; + } + } + } + # The next step is to join the objects together. We only need to do this if there + # is more than one object in the object list. We start with the first object and + # run through the objects after it. Note also that we make a safety copy of the + # list before running through it. + my @mappedObjectList = @mappedNameList; + my $lastMappedObject = shift @mappedObjectList; + # Get the join table. + my $joinTable = $self->{_metaData}->{Joins}; + # Loop through the object list. + for my $thisMappedObject (@mappedObjectList) { + # Look for a join using the real object names. + my $lastObject = $mappedNameHash{$lastMappedObject}; + my $thisObject = $mappedNameHash{$thisMappedObject}; + my $joinKey = "$lastObject/$thisObject"; + if (!exists $joinTable->{$joinKey}) { + # Here there's no join, so we throw an error. + Confess("No join exists to connect from $lastMappedObject to $thisMappedObject."); + } else { + # Get the join clause. + my $unMappedJoin = $joinTable->{$joinKey}; + # Fix the names. + $unMappedJoin =~ s/$lastObject/$lastMappedObject/; + $unMappedJoin =~ s/$thisObject/$thisMappedObject/; + push @joinWhere, $unMappedJoin; + # Save this object as the last object for the next iteration. + $lastMappedObject = $thisMappedObject; + } + } + # Now we need to handle the whole ORDER BY / LIMIT thing. The important part + # here is we want the filter clause to be empty if there's no WHERE filter. + # We'll put the ORDER BY / LIMIT clauses in the following variable. + my $orderClause = ""; + # Locate the ORDER BY or LIMIT verbs (if any). We use a non-greedy + # operator so that we find the first occurrence of either verb. + if ($filterString =~ m/^(.*?)\s*(ORDER BY|LIMIT)/g) { + # Here we have an ORDER BY or LIMIT verb. Split it off of the filter string. + my $pos = pos $filterString; + $orderClause = $2 . substr($filterString, $pos); + $filterString = $1; + } + # Add the filter and the join clauses (if any) to the SELECT command. + if ($filterString) { + Trace("Filter string is \"$filterString\".") if T(4); + push @joinWhere, "($filterString)"; + } + if (@joinWhere) { + $suffix .= " WHERE " . join(' AND ', @joinWhere); + } + # Add the sort or limit clause (if any) to the SELECT command. + if ($orderClause) { + $suffix .= " $orderClause"; + } + } + # Return the suffix, the mapped name list, and the mapped name hash. + return ($suffix, \@mappedNameList, \%mappedNameHash); +} + +=head3 GetStatementHandle + +This method will prepare and execute an SQL query, returning the statement handle. +The main reason for doing this here is so that everybody who does SQL queries gets +the benefit of tracing. + +This is an instance method. + +=over 4 + +=item command + +Command to prepare and execute. + +=item params + +Reference to a list of the values to be substituted in for the parameter marks. + +=item RETURN + +Returns a prepared and executed statement handle from which the caller can extract +results. + +=back + +=cut + +sub _GetStatementHandle { + # Get the parameters. + my ($self, $command, $params) = @_; + # Trace the query. + Trace("SQL query: $command") if T(SQL => 3); + Trace("PARMS: '" . (join "', '", @{$params}) . "'") if (T(SQL => 4) && (@{$params} > 0)); + # Get the database handle. + my $dbh = $self->{_dbh}; + # Prepare the command. + my $sth = $dbh->prepare_command($command); + # Execute it with the parameters bound in. + $sth->execute(@{$params}) || Confess("SELECT error" . $sth->errstr()); + # Return the statement handle. + return $sth; +} + =head3 GetLoadStats Return a blank statistics object for use by the load methods. @@ -2505,7 +3153,7 @@ # Determine if this relationship has our entity in one of its link fields. my $fromEntity = $relationship->{from}; my $toEntity = $relationship->{to}; - Trace("Join check for relationship $relationshipName from $fromEntity to $toEntity.") if T(4); + Trace("Join check for relationship $relationshipName from $fromEntity to $toEntity.") if T(Joins => 4); if ($fromEntity eq $entityName) { if ($toEntity eq $entityName) { # Here the relationship is recursive. @@ -2594,6 +3242,104 @@ return $metadata; } +=head3 SortNeeded + +C<< my $parms = $erdb->SortNeeded($relationName); >> + +Return the pipe command for the sort that should be applied to the specified +relation when creating the load file. + +For example, if the load file should be sorted ascending by the first +field, this method would return + + sort -k1 -t"\t" + +If the first field is numeric, the method would return + + sort -k1n -t"\t" + +Unfortunately, due to a bug in the C command, we cannot eliminate duplicate +keys using a sort. + +=over 4 + +=item relationName + +Name of the relation to be examined. + +=item + +Returns the sort command to use for sorting the relation, suitable for piping. + +=back + +=cut +#: Return Type $; +sub SortNeeded { + # Get the parameters. + my ($self, $relationName) = @_; + # Declare a descriptor to hold the names of the key fields. + my @keyNames = (); + # Get the relation structure. + my $relationData = $self->_FindRelation($relationName); + # Find out if the relation is a primary entity relation, + # a relationship relation, or a secondary entity relation. + my $entityTable = $self->{_metaData}->{Entities}; + my $relationshipTable = $self->{_metaData}->{Relationships}; + if (exists $entityTable->{$relationName}) { + # Here we have a primary entity relation. + push @keyNames, "id"; + } elsif (exists $relationshipTable->{$relationName}) { + # Here we have a relationship. We sort using the FROM index. + my $relationshipData = $relationshipTable->{$relationName}; + my $index = $relationData->{Indexes}->{"idx${relationName}From"}; + push @keyNames, @{$index->{IndexFields}}; + } else { + # Here we have a secondary entity relation, so we have a sort on the ID field. + push @keyNames, "id"; + } + # Now we parse the key names into sort parameters. First, we prime the return + # string. + my $retVal = "sort -t\"\t\" "; + # Get the relation's field list. + my @fields = @{$relationData->{Fields}}; + # Loop through the keys. + for my $keyData (@keyNames) { + # Get the key and the ordering. + my ($keyName, $ordering); + if ($keyData =~ /^([^ ]+) DESC/) { + ($keyName, $ordering) = ($1, "descending"); + } else { + ($keyName, $ordering) = ($keyData, "ascending"); + } + # Find the key's position and type. + my $fieldSpec; + for (my $i = 0; $i <= $#fields && ! $fieldSpec; $i++) { + my $thisField = $fields[$i]; + if ($thisField->{name} eq $keyName) { + # Get the sort modifier for this field type. The modifier + # decides whether we're using a character, numeric, or + # floating-point sort. + my $modifier = $TypeTable{$thisField->{type}}->{sort}; + # If the index is descending for this field, denote we want + # to reverse the sort order on this field. + if ($ordering eq 'descending') { + $modifier .= "r"; + } + # Store the position and modifier into the field spec, which + # will stop the inner loop. Note that the field number is + # 1-based in the sort command, so we have to increment the + # index. + $fieldSpec = ($i + 1) . $modifier; + } + } + # Add this field to the sort command. + $retVal .= " -k$fieldSpec"; + } + # Return the result. + return $retVal; +} + =head3 CreateRelationshipIndex Create an index for a relationship's relation.