--- ERDB.pm 2005/09/09 14:50:58 1.19 +++ ERDB.pm 2006/06/23 22:53:08 1.51 @@ -9,6 +9,7 @@ use DBObject; use Stats; use Time::HiRes qw(gettimeofday); + use Digest::MD5 qw(md5_base64); use FIG; =head1 Entity-Relationship Database Package @@ -109,6 +110,10 @@ compatability with certain database packages), but the only values supported are 0 and 1. +=item id-string + +variable-length string, maximum 25 characters + =item key-string variable-length string, maximum 40 characters @@ -125,8 +130,19 @@ variable-length string, maximum 255 characters +=item hash-string + +variable-length string, maximum 22 characters + =back +The hash-string data type has a special meaning. The actual key passed into the loader will +be a string, but it will be digested into a 22-character MD5 code to save space. Although the +MD5 algorithm is not perfect, it is extremely unlikely two strings will have the same +digest. Therefore, it is presumed the keys will be unique. When the database is actually +in use, the hashed keys will be presented rather than the original values. For this reason, +they should not be used for entities where the key is meaningful. + =head3 Global Tags The entire database definition must be inside a B tag. The display name of @@ -309,7 +325,11 @@ text => { sqlType => 'TEXT', maxLen => 1000000000, avgLen => 500, dataGen => "StringGen(IntGen(80,1000))" }, date => { sqlType => 'BIGINT', maxLen => 80, avgLen => 8, dataGen => "DateGen(-7, 7, IntGen(0,1400))" }, float => { sqlType => 'DOUBLE PRECISION', maxLen => 40, avgLen => 8, dataGen => "FloatGen(0.0, 100.0)" }, - boolean => { sqlType => 'SMALLINT', maxLen => 1, avgLen => 2, dataGen => "IntGen(0, 1)" }, + boolean => { sqlType => 'SMALLINT', maxLen => 1, avgLen => 1, dataGen => "IntGen(0, 1)" }, + 'hash-string' => + { sqlType => 'VARCHAR(22)', maxLen => 22, avgLen => 22, dataGen => "SringGen(22)" }, + 'id-string' => + { sqlType => 'VARCHAR(25)', maxLen => 25, avgLen => 25, dataGen => "SringGen(22)" }, 'key-string' => { sqlType => 'VARCHAR(40)', maxLen => 40, avgLen => 10, dataGen => "StringGen(IntGen(10,40))" }, 'name-string' => @@ -402,41 +422,81 @@ # Write the HTML heading stuff. print HTMLOUT "\n\n$title\n"; print HTMLOUT "\n\n"; + # Write the documentation. + print HTMLOUT $self->DisplayMetaData(); + # Close the document. + print HTMLOUT "\n\n"; + # Close the file. + close HTMLOUT; +} + +=head3 DisplayMetaData + +C<< my $html = $erdb->DisplayMetaData(); >> + +Return an HTML description of the database. This description can be used to help users create +the data to be loaded into the relations and form queries. The output is raw includable HTML +without any HEAD or BODY tags. + +=over 4 + +=item filename + +The name of the output file. + +=back + +=cut + +sub DisplayMetaData { + # Get the parameters. + my ($self) = @_; + # Get the metadata and the title string. + my $metadata = $self->{_metaData}; + # Get the title string. + my $title = $metadata->{Title}; + # Get the entity and relationship lists. + my $entityList = $metadata->{Entities}; + my $relationshipList = $metadata->{Relationships}; + # Declare the return variable. + my $retVal = ""; + # Open the output file. + Trace("Building MetaData table of contents.") if T(4); # Here we do the table of contents. It starts as an unordered list of section names. Each # section contains an ordered list of entity or relationship subsections. - print HTMLOUT "\n"; + $retVal .= "\n"; # Now we start with the actual data. Denote we're starting the entity section. - print HTMLOUT "

Entities

\n"; + $retVal .= "

Entities

\n"; # Loop through the entities. for my $key (sort keys %{$entityList}) { Trace("Building MetaData entry for $key entity.") if T(4); # Create the entity header. It contains a bookmark and the entity name. - print HTMLOUT "

$key

\n"; + $retVal .= "

$key

\n"; # Get the entity data. my $entityData = $entityList->{$key}; # If there's descriptive text, display it. if (my $notes = $entityData->{Notes}) { - print HTMLOUT "

" . _HTMLNote($notes->{content}) . "

\n"; + $retVal .= "

" . _HTMLNote($notes->{content}) . "

\n"; } # Now we want a list of the entity's relationships. First, we set up the relationship subsection. - print HTMLOUT "

Relationships for $key

\n\n"; # Get the entity's relations. my $relationList = $entityData->{Relations}; # Create a header for the relation subsection. - print HTMLOUT "

Relations for $key

\n"; + $retVal .= "

Relations for $key

\n"; # Loop through the relations, displaying them. for my $relation (sort keys %{$relationList}) { my $htmlString = _ShowRelationTable($relation, $relationList->{$relation}); - print HTMLOUT $htmlString; + $retVal .= $htmlString; } } # Denote we're starting the relationship section. - print HTMLOUT "

Relationships

\n"; + $retVal .= "

Relationships

\n"; # Loop through the relationships. for my $key (sort keys %{$relationshipList}) { Trace("Building MetaData entry for $key relationship.") if T(4); @@ -470,7 +530,7 @@ my $relationshipStructure = $relationshipList->{$key}; # Create the relationship header. my $headerText = _ComputeRelationshipHeading($key, $relationshipStructure); - print HTMLOUT "

$headerText

\n"; + $retVal .= "

$headerText

\n"; # Get the entity names. my $fromEntity = $relationshipStructure->{from}; my $toEntity = $relationshipStructure->{to}; @@ -480,27 +540,27 @@ # since both sentences will say the same thing. my $arity = $relationshipStructure->{arity}; if ($arity eq "11") { - print HTMLOUT "

Each $fromEntity relates to at most one $toEntity.\n"; + $retVal .= "

Each $fromEntity relates to at most one $toEntity.\n"; } else { - print HTMLOUT "

Each $fromEntity relates to multiple $toEntitys.\n"; + $retVal .= "

Each $fromEntity relates to multiple $toEntitys.\n"; if ($arity eq "MM" && $fromEntity ne $toEntity) { - print HTMLOUT "Each $toEntity relates to multiple $fromEntitys.\n"; + $retVal .= "Each $toEntity relates to multiple $fromEntitys.\n"; } } - print HTMLOUT "

\n"; + $retVal .= "

\n"; # If there are notes on this relationship, display them. if (my $notes = $relationshipStructure->{Notes}) { - print HTMLOUT "

" . _HTMLNote($notes->{content}) . "

\n"; + $retVal .= "

" . _HTMLNote($notes->{content}) . "

\n"; } # Generate the relationship's relation table. my $htmlString = _ShowRelationTable($key, $relationshipStructure->{Relations}->{$key}); - print HTMLOUT $htmlString; + $retVal .= $htmlString; } Trace("Building MetaData join table.") if T(4); # Denote we're starting the join table. - print HTMLOUT "

Join Table

\n"; + $retVal .= "

Join Table

\n"; # Create a table header. - print HTMLOUT _OpenTable("Join Table", "Source", "Target", "Join Condition"); + $retVal .= _OpenTable("Join Table", "Source", "Target", "Join Condition"); # Loop through the joins. my $joinTable = $metadata->{Joins}; my @joinKeys = keys %{$joinTable}; @@ -508,20 +568,18 @@ # Separate out the source, the target, and the join clause. $joinKey =~ m!^([^/]+)/(.+)$!; my ($sourceRelation, $targetRelation) = ($1, $2); - Trace("Join with key $joinKey is from $sourceRelation to $targetRelation.") if T(4); + Trace("Join with key $joinKey is from $sourceRelation to $targetRelation.") if T(Joins => 4); my $source = $self->ComputeObjectSentence($sourceRelation); my $target = $self->ComputeObjectSentence($targetRelation); my $clause = $joinTable->{$joinKey}; # Display them in a table row. - print HTMLOUT "$source$target$clause\n"; + $retVal .= "$source$target$clause\n"; } # Close the table. - print HTMLOUT _CloseTable(); - # Close the document. - print HTMLOUT "\n\n"; - # Close the file. - close HTMLOUT; - Trace("Built MetaData web page.") if T(3); + $retVal .= _CloseTable(); + Trace("Built MetaData HTML.") if T(3); + # Return the HTML. + return $retVal; } =head3 DumpMetaData @@ -553,27 +611,13 @@ sub CreateTables { # Get the parameters. my ($self) = @_; - my $metadata = $self->{_metaData}; - my $dbh = $self->{_dbh}; - # Loop through the entities. - my $entityHash = $metadata->{Entities}; - for my $entityName (keys %{$entityHash}) { - my $entityData = $entityHash->{$entityName}; - # Tell the user what we're doing. - Trace("Creating relations for entity $entityName.") if T(1); - # Loop through the entity's relations. - for my $relationName (keys %{$entityData->{Relations}}) { - # Create a table for this relation. - $self->CreateTable($relationName); - Trace("Relation $relationName created.") if T(1); - } - } - # Loop through the relationships. - my $relationshipTable = $metadata->{Relationships}; - for my $relationshipName (keys %{$metadata->{Relationships}}) { - # Create a table for this relationship. - Trace("Creating relationship $relationshipName.") if T(1); - $self->CreateTable($relationshipName); + # Get the relation names. + my @relNames = $self->GetTableNames(); + # Loop through the relations. + for my $relationName (@relNames) { + # Create a table for this relation. + $self->CreateTable($relationName); + Trace("Relation $relationName created.") if T(2); } } @@ -646,6 +690,134 @@ } } +=head3 VerifyFields + +C<< my $count = $erdb->VerifyFields($relName, \@fieldList); >> + +Run through the list of proposed field values, insuring that all the character fields are +below the maximum length. If any fields are too long, they will be truncated in place. + +=over 4 + +=item relName + +Name of the relation for which the specified fields are destined. + +=item fieldList + +Reference to a list, in order, of the fields to be put into the relation. + +=item RETURN + +Returns the number of fields truncated. + +=back + +=cut + +sub VerifyFields { + # Get the parameters. + my ($self, $relName, $fieldList) = @_; + # Initialize the return value. + my $retVal = 0; + # Get the relation definition. + my $relData = $self->_FindRelation($relName); + # Get the list of field descriptors. + my $fieldTypes = $relData->{Fields}; + my $fieldCount = scalar @{$fieldTypes}; + # Loop through the two lists. + for (my $i = 0; $i < $fieldCount; $i++) { + # Get the type of the current field. + my $fieldType = $fieldTypes->[$i]->{type}; + # If it's a character field, verify the length. + if ($fieldType =~ /string/) { + my $maxLen = $TypeTable{$fieldType}->{maxLen}; + my $oldString = $fieldList->[$i]; + if (length($oldString) > $maxLen) { + # Here it's too big, so we truncate it. + Trace("Truncating field $i in relation $relName to $maxLen characters from \"$oldString\".") if T(1); + $fieldList->[$i] = substr $oldString, 0, $maxLen; + $retVal++; + } + } + } + # Return the truncation count. + return $retVal; +} + +=head3 DigestFields + +C<< $erdb->DigestFields($relName, $fieldList); >> + +Digest the strings in the field list that correspond to data type C in the +specified relation. + +=over 4 + +=item relName + +Name of the relation to which the fields belong. + +=item fieldList + +List of field contents to be loaded into the relation. + +=back + +=cut +#: Return Type ; +sub DigestFields { + # Get the parameters. + my ($self, $relName, $fieldList) = @_; + # Get the relation definition. + my $relData = $self->_FindRelation($relName); + # Get the list of field descriptors. + my $fieldTypes = $relData->{Fields}; + my $fieldCount = scalar @{$fieldTypes}; + # Loop through the two lists. + for (my $i = 0; $i < $fieldCount; $i++) { + # Get the type of the current field. + my $fieldType = $fieldTypes->[$i]->{type}; + # If it's a hash string, digest it in place. + if ($fieldType eq 'hash-string') { + $fieldList->[$i] = $self->DigestKey($fieldList->[$i]); + } + } +} + +=head3 DigestKey + +C<< my $digested = $erdb->DigestKey($keyValue); >> + +Return the digested value of a symbolic key. The digested value can then be plugged into a +key-based search into a table with key-type hash-string. + +Currently the digesting process is independent of the database structure, but that may not +always be the case, so this is an instance method instead of a static method. + +=over 4 + +=item keyValue + +Key value to digest. + +=item RETURN + +Digested value ofthe key. + +=back + +=cut + +sub DigestKey { + # Get the parameters. + my ($self, $keyValue) = @_; + # Compute the digest. + my $retVal = md5_base64($keyValue); + # Return the result. + return $retVal; +} + =head3 CreateIndex C<< $erdb->CreateIndex($relationName); >> @@ -674,8 +846,13 @@ # Get the index's uniqueness flag. my $unique = (exists $indexData->{Unique} ? $indexData->{Unique} : 'false'); # Create the index. - $dbh->create_index(idx => $indexName, tbl => $relationName, flds => $flds, unique => $unique); - Trace("Index created: $indexName for $relationName ($flds)") if T(1); + my $rv = $dbh->create_index(idx => $indexName, tbl => $relationName, + flds => $flds, unique => $unique); + if ($rv) { + Trace("Index created: $indexName for $relationName ($flds)") if T(1); + } else { + Confess("Error creating index $indexName for $relationName using ($flds): " . $dbh->error_message()); + } } } @@ -724,22 +901,11 @@ $directoryName =~ s!/\\$!!; # Declare the return variable. my $retVal = Stats->new(); - # Get the metadata structure. - my $metaData = $self->{_metaData}; - # Loop through the entities. - for my $entity (values %{$metaData->{Entities}}) { - # Loop through the entity's relations. - for my $relationName (keys %{$entity->{Relations}}) { - # Try to load this relation. - my $result = $self->_LoadRelation($directoryName, $relationName, $rebuild); - # Accumulate the statistics. - $retVal->Accumulate($result); - } - } - # Loop through the relationships. - for my $relationshipName (keys %{$metaData->{Relationships}}) { - # Try to load this relationship's relation. - my $result = $self->_LoadRelation($directoryName, $relationshipName, $rebuild); + # Get the relation names. + my @relNames = $self->GetTableNames(); + for my $relationName (@relNames) { + # Try to load this relation. + my $result = $self->_LoadRelation($directoryName, $relationName, $rebuild); # Accumulate the statistics. $retVal->Accumulate($result); } @@ -749,6 +915,7 @@ return $retVal; } + =head3 GetTableNames C<< my @names = $erdb->GetTableNames; >> @@ -783,9 +950,36 @@ return sort keys %{$entityList}; } +=head3 IsEntity + +C<< my $flag = $erdb->IsEntity($entityName); >> + +Return TRUE if the parameter is an entity name, else FALSE. + +=over 4 + +=item entityName + +Object name to be tested. + +=item RETURN + +Returns TRUE if the specified string is an entity name, else FALSE. + +=back + +=cut + +sub IsEntity { + # Get the parameters. + my ($self, $entityName) = @_; + # Test to see if it's an entity. + return exists $self->{_metaData}->{Entities}->{$entityName}; +} + =head3 Get -C<< my $query = $erdb->Get(\@objectNames, $filterClause, $param1, $param2, ..., $paramN); >> +C<< my $query = $erdb->Get(\@objectNames, $filterClause, \@params); >> This method returns a query object for entities of a specified type using a specified filter. The filter is a standard WHERE/ORDER BY clause with question marks as parameter markers and each @@ -793,7 +987,7 @@ following call requests all B objects for the genus specified in the variable $genus. -C<< $query = $erdb->Get(['Genome'], "Genome(genus) = ?", $genus); >> +C<< $query = $erdb->Get(['Genome'], "Genome(genus) = ?", [$genus]); >> The WHERE clause contains a single question mark, so there is a single additional parameter representing the parameter value. It would also be possible to code @@ -810,17 +1004,22 @@ It is possible to specify multiple entity and relationship names in order to retrieve more than one object's data at the same time, which allows highly complex joined queries. For example, -C<< $query = $erdb->Get(['Genome', 'ComesFrom', 'Source'], "Genome(genus) = ?", $genus); >> +C<< $query = $erdb->Get(['Genome', 'ComesFrom', 'Source'], "Genome(genus) = ?", [$genus]); >> If multiple names are specified, then the query processor will automatically determine a join path between the entities and relationships. The algorithm used is very simplistic. -In particular, you can't specify any entity or relationship more than once, and if a -relationship is recursive, the path is determined by the order in which the entity -and the relationship appear. For example, consider a recursive relationship B -which relates B objects to other B objects. If the join path is +In particular, if a relationship is recursive, the path is determined by the order in which +the entity and the relationship appear. For example, consider a recursive relationship +B which relates B objects to other B objects. If the join path is coded as C<['People', 'IsParentOf']>, then the people returned will be parents. If, however, the join path is C<['IsParentOf', 'People']>, then the people returned will be children. +If an entity or relationship is mentioned twice, the name for the second occurrence will +be suffixed with C<2>, the third occurrence will be suffixed with C<3>, and so forth. So, +for example, if we have C<['Feature', 'HasContig', 'Contig', 'HasContig']>, then the +B field of the first B is specified as C, while +the B field of the second B is specified as C. + =over 4 =item objectNames @@ -843,13 +1042,28 @@ C<< "Genome(genus) = ? ORDER BY Genome(species)" >> +Note that the case is important. Only an uppercase "ORDER BY" with a single space will +be processed. The idea is to make it less likely to find the verb by accident. + The rules for field references in a sort order are the same as those for field references in the filter clause in general; however, odd things may happen if a sort field is from a secondary relation. -=item param1, param2, ..., paramN +Finally, you can limit the number of rows returned by adding a LIMIT clause. The LIMIT must +be the last thing in the filter clause, and it contains only the word "LIMIT" followed by +a positive number. So, for example + +C<< "Genome(genus) = ? ORDER BY Genome(species) LIMIT 10" >> -Parameter values to be substituted into the filter clause. +will only return the first ten genomes for the specified genus. The ORDER BY clause is not +required. For example, to just get the first 10 genomes in the B table, you could +use + +C<< "LIMIT 10" >> + +=item params + +Reference to a list of parameter values to be substituted into the filter clause. =item RETURN @@ -861,137 +1075,31 @@ sub Get { # Get the parameters. - my ($self, $objectNames, $filterClause, @params) = @_; - # Construct the SELECT statement. The general pattern is - # - # SELECT name1.*, name2.*, ... nameN.* FROM name1, name2, ... nameN - # - my $dbh = $self->{_dbh}; - my $command = "SELECT DISTINCT " . join('.*, ', @{$objectNames}) . ".* FROM " . - join(', ', @{$objectNames}); - # Check for a filter clause. - if ($filterClause) { - # Here we have one, so we convert its field names and add it to the query. First, - # We create a copy of the filter string we can work with. - my $filterString = $filterClause; - # Next, we sort the object names by length. This helps protect us from finding - # object names inside other object names when we're doing our search and replace. - my @sortedNames = sort { length($b) - length($a) } @{$objectNames}; - # We will also keep a list of conditions to add to the WHERE clause in order to link - # entities and relationships as well as primary relations to secondary ones. - my @joinWhere = (); - # The final preparatory step is to create a hash table of relation names. The - # table begins with the relation names already in the SELECT command. - my %fromNames = (); - for my $objectName (@sortedNames) { - $fromNames{$objectName} = 1; - } - # We are ready to begin. We loop through the object names, replacing each - # object name's field references by the corresponding SQL field reference. - # Along the way, if we find a secondary relation, we will need to add it - # to the FROM clause. - for my $objectName (@sortedNames) { - # Get the length of the object name plus 2. This is the value we add to the - # size of the field name to determine the size of the field reference as a - # whole. - my $nameLength = 2 + length $objectName; - # Get the object's field list. - my $fieldList = $self->_GetFieldTable($objectName); - # Find the field references for this object. - while ($filterString =~ m/$objectName\(([^)]*)\)/g) { - # At this point, $1 contains the field name, and the current position - # is set immediately after the final parenthesis. We pull out the name of - # the field and the position and length of the field reference as a whole. - my $fieldName = $1; - my $len = $nameLength + length $fieldName; - my $pos = pos($filterString) - $len; - # Insure the field exists. - if (!exists $fieldList->{$fieldName}) { - Confess("Field $fieldName not found for object $objectName."); - } else { - # Get the field's relation. - my $relationName = $fieldList->{$fieldName}->{relation}; - # Insure the relation is in the FROM clause. - if (!exists $fromNames{$relationName}) { - # Add the relation to the FROM clause. - $command .= ", $relationName"; - # Create its join sub-clause. - push @joinWhere, "$objectName.id = $relationName.id"; - # Denote we have it available for future fields. - $fromNames{$relationName} = 1; - } - # Form an SQL field reference from the relation name and the field name. - my $sqlReference = "$relationName." . _FixName($fieldName); - # Put it into the filter string in place of the old value. - substr($filterString, $pos, $len) = $sqlReference; - # Reposition the search. - pos $filterString = $pos + length $sqlReference; - } - } - } - # The next step is to join the objects together. We only need to do this if there - # is more than one object in the object list. We start with the first object and - # run through the objects after it. Note also that we make a safety copy of the - # list before running through it. - my @objectList = @{$objectNames}; - my $lastObject = shift @objectList; - # Get the join table. - my $joinTable = $self->{_metaData}->{Joins}; - # Loop through the object list. - for my $thisObject (@objectList) { - # Look for a join. - my $joinKey = "$lastObject/$thisObject"; - if (!exists $joinTable->{$joinKey}) { - # Here there's no join, so we throw an error. - Confess("No join exists to connect from $lastObject to $thisObject."); - } else { - # Get the join clause and add it to the WHERE list. - push @joinWhere, $joinTable->{$joinKey}; - # Save this object as the last object for the next iteration. - $lastObject = $thisObject; - } - } - # Now we need to handle the whole ORDER BY thing. We'll put the order by clause - # in the following variable. - my $orderClause = ""; - # Locate the ORDER BY verb (if any). - if ($filterString =~ m/^(.*)ORDER BY/g) { - # Here we have an ORDER BY verb. Split it off of the filter string. - my $pos = pos $filterString; - $orderClause = substr($filterString, $pos); - $filterString = $1; - } - # Add the filter and the join clauses (if any) to the SELECT command. - if ($filterString) { - push @joinWhere, "($filterString)"; - } - if (@joinWhere) { - $command .= " WHERE " . join(' AND ', @joinWhere); - } - # Add the sort clause (if any) to the SELECT command. - if ($orderClause) { - $command .= " ORDER BY $orderClause"; - } + my ($self, $objectNames, $filterClause, $params) = @_; + # Process the SQL stuff. + my ($suffix, $mappedNameListRef, $mappedNameHashRef) = + $self->_SetupSQL($objectNames, $filterClause); + # Create the query. + my $command = "SELECT DISTINCT " . join(".*, ", @{$mappedNameListRef}) . + ".* $suffix"; + my $sth = $self->_GetStatementHandle($command, $params); + # Now we create the relation map, which enables DBQuery to determine the order, name + # and mapped name for each object in the query. + my @relationMap = (); + for my $mappedName (@{$mappedNameListRef}) { + push @relationMap, [$mappedName, $mappedNameHashRef->{$mappedName}]; } - Trace("SQL query: $command") if T(3); - Trace("PARMS: '" . (join "', '", @params) . "'") if (T(4) && (@params > 0)); - my $sth = $dbh->prepare_command($command); - # Execute it with the parameters bound in. - $sth->execute(@params) || Confess("SELECT error" . $sth->errstr()); # Return the statement object. - my $retVal = DBQuery::_new($self, $sth, @{$objectNames}); + my $retVal = DBQuery::_new($self, $sth, \@relationMap); return $retVal; } -=head3 GetList - -C<< my @dbObjects = $erdb->GetList(\@objectNames, $filterClause, $param1, $param2, ..., $paramN); >> +=head3 GetFlat -Return a list of object descriptors for the specified objects as determined by the -specified filter clause. +C<< my @list = $erdb->GetFlat(\@objectNames, $filterClause, \@parameterList, $field); >> -This method is essentially the same as L except it returns a list of objects rather -than a query object that can be used to get the results one record at a time. +This is a variation of L that asks for only a single field per record and +returns a single flattened list. =over 4 @@ -1001,51 +1109,364 @@ =item filterClause -WHERE clause (without the WHERE) to be used to filter and sort the query. The WHERE clause can -be parameterized with parameter markers (C). Each field used in the WHERE clause must be -specified in the standard form B(I)>. Any parameters specified -in the filter clause should be added to the parameter list as additional parameters. The -fields in a filter clause can come from primary entity relations, relationship relations, -or secondary entity relations; however, all of the entities and relationships involved must -be included in the list of object names. - -The filter clause can also specify a sort order. To do this, simply follow the filter string -with an ORDER BY clause. For example, the following filter string gets all genomes for a -particular genus and sorts them by species name. +WHERE/ORDER BY clause (without the WHERE) to be used to filter and sort the query. The WHERE clause can +be parameterized with parameter markers (C). Each field used must be specified in the standard form +B(I)>. Any parameters specified in the filter clause should be added to the +parameter list as additional parameters. The fields in a filter clause can come from primary +entity relations, relationship relations, or secondary entity relations; however, all of the +entities and relationships involved must be included in the list of object names. -C<< "Genome(genus) = ? ORDER BY Genome(species)" >> +=item parameterList -The rules for field references in a sort order are the same as those for field references in the -filter clause in general; however, odd things may happen if a sort field is from a secondary -relation. +List of the parameters to be substituted in for the parameters marks in the filter clause. -=item param1, param2, ..., paramN +=item field -Parameter values to be substituted into the filter clause. +Name of the field to be used to get the elements of the list returned. =item RETURN -Returns a list of Bs that satisfy the query conditions. +Returns a list of values. =back =cut -#: Return Type @% -sub GetList { +#: Return Type @; +sub GetFlat { # Get the parameters. - my ($self, $objectNames, $filterClause, @params) = @_; - # Declare the return variable. + my ($self, $objectNames, $filterClause, $parameterList, $field) = @_; + # Construct the query. + my $query = $self->Get($objectNames, $filterClause, $parameterList); + # Create the result list. my @retVal = (); - # Perform the query. - my $query = $self->Get($objectNames, $filterClause, @params); - # Loop through the results. - while (my $object = $query->Fetch) { + # Loop through the records, adding the field values found to the result list. + while (my $row = $query->Fetch()) { + push @retVal, $row->Value($field); + } + # Return the list created. + return @retVal; +} + +=head3 Delete + +C<< my $stats = $erdb->Delete($entityName, $objectID); >> + +Delete an entity instance from the database. The instance is deleted along with all entity and +relationship instances dependent on it. The idea of dependence here is recursive. An object is +always dependent on itself. An object is dependent if it is a 1-to-many or many-to-many +relationship connected to a dependent entity or the "to" entity connected to a 1-to-many +dependent relationship. + +=over 4 + +=item entityName + +Name of the entity type for the instance being deleted. + +=item objectID + +ID of the entity instance to be deleted. If the ID contains a wild card character (C<%>), +then it is presumed to by a LIKE pattern. + +=item testFlag + +If TRUE, the delete statements will be traced without being executed. + +=item RETURN + +Returns a statistics object indicating how many records of each particular table were +deleted. + +=back + +=cut +#: Return Type $%; +sub Delete { + # Get the parameters. + my ($self, $entityName, $objectID, $testFlag) = @_; + # Declare the return variable. + my $retVal = Stats->new(); + # Get the DBKernel object. + my $db = $self->{_dbh}; + # We're going to generate all the paths branching out from the starting entity. One of + # the things we have to be careful about is preventing loops. We'll use a hash to + # determine if we've hit a loop. + my %alreadyFound = (); + # These next lists will serve as our result stack. We start by pushing object lists onto + # the stack, and then popping them off to do the deletes. This means the deletes will + # start with the longer paths before getting to the shorter ones. That, in turn, makes + # sure we don't delete records that might be needed to forge relationships back to the + # original item. We have two lists-- one for TO-relationships, and one for + # FROM-relationships and entities. + my @fromPathList = (); + my @toPathList = (); + # This final hash is used to remember what work still needs to be done. We push paths + # onto the list, then pop them off to extend the paths. We prime it with the starting + # point. Note that we will work hard to insure that the last item on a path in the + # TODO list is always an entity. + my @todoList = ([$entityName]); + while (@todoList) { + # Get the current path. + my $current = pop @todoList; + # Copy it into a list. + my @stackedPath = @{$current}; + # Pull off the last item on the path. It will always be an entity. + my $entityName = pop @stackedPath; + # Add it to the alreadyFound list. + $alreadyFound{$entityName} = 1; + # Get the entity data. + my $entityData = $self->_GetStructure($entityName); + # The first task is to loop through the entity's relation. A DELETE command will + # be needed for each of them. + my $relations = $entityData->{Relations}; + for my $relation (keys %{$relations}) { + my @augmentedList = (@stackedPath, $relation); + push @fromPathList, \@augmentedList; + } + # Now we need to look for relationships connected to this entity. + my $relationshipList = $self->{_metaData}->{Relationships}; + for my $relationshipName (keys %{$relationshipList}) { + my $relationship = $relationshipList->{$relationshipName}; + # Check the FROM field. We're only interested if it's us. + if ($relationship->{from} eq $entityName) { + # Add the path to this relationship. + my @augmentedList = (@stackedPath, $entityName, $relationshipName); + push @fromPathList, \@augmentedList; + # Check the arity. If it's MM we're done. If it's 1M + # and the target hasn't been seen yet, we want to + # stack the entity for future processing. + if ($relationship->{arity} eq '1M') { + my $toEntity = $relationship->{to}; + if (! exists $alreadyFound{$toEntity}) { + # Here we have a new entity that's dependent on + # the current entity, so we need to stack it. + my @stackList = (@augmentedList, $toEntity); + push @fromPathList, \@stackList; + } else { + Trace("$toEntity ignored because it occurred previously.") if T(4); + } + } + } + # Now check the TO field. In this case only the relationship needs + # deletion. + if ($relationship->{to} eq $entityName) { + my @augmentedList = (@stackedPath, $entityName, $relationshipName); + push @toPathList, \@augmentedList; + } + } + } + # Create the first qualifier for the WHERE clause. This selects the + # keys of the primary entity records to be deleted. When we're deleting + # from a dependent table, we construct a join page from the first qualifier + # to the table containing the dependent records to delete. + my $qualifier = ($objectID =~ /%/ ? "LIKE ?" : "= ?"); + # We need to make two passes. The first is through the to-list, and + # the second through the from-list. The from-list is second because + # the to-list may need to pass through some of the entities the + # from-list would delete. + my %stackList = ( from_link => \@fromPathList, to_link => \@toPathList ); + # Now it's time to do the deletes. We do it in two passes. + for my $keyName ('to_link', 'from_link') { + # Get the list for this key. + my @pathList = @{$stackList{$keyName}}; + Trace(scalar(@pathList) . " entries in path list for $keyName.") if T(3); + # Loop through this list. + while (my $path = pop @pathList) { + # Get the table whose rows are to be deleted. + my @pathTables = @{$path}; + # Start the DELETE statement. We need to call DBKernel because the + # syntax of a DELETE-USING varies among DBMSs. + my $target = $pathTables[$#pathTables]; + my $stmt = $db->SetUsing(@pathTables); + # Now start the WHERE. The first thing is the ID field from the starting table. That + # starting table will either be the entity relation or one of the entity's + # sub-relations. + $stmt .= " WHERE $pathTables[0].id $qualifier"; + # Now we run through the remaining entities in the path, connecting them up. + for (my $i = 1; $i <= $#pathTables; $i += 2) { + # Connect the current relationship to the preceding entity. + my ($entity, $rel) = @pathTables[$i-1,$i]; + # The style of connection depends on the direction of the relationship. + $stmt .= " AND $entity.id = $rel.$keyName"; + if ($i + 1 <= $#pathTables) { + # Here there's a next entity, so connect that to the relationship's + # to-link. + my $entity2 = $pathTables[$i+1]; + $stmt .= " AND $rel.to_link = $entity2.id"; + } + } + # Now we have our desired DELETE statement. + if ($testFlag) { + # Here the user wants to trace without executing. + Trace($stmt) if T(0); + } else { + # Here we can delete. Note that the SQL method dies with a confessing + # if an error occurs, so we just go ahead and do it. + Trace("Executing delete from $target using '$objectID'.") if T(3); + my $rv = $db->SQL($stmt, 0, $objectID); + # Accumulate the statistics for this delete. The only rows deleted + # are from the target table, so we use its name to record the + # statistic. + $retVal->Add($target, $rv); + } + } + } + # Return the result. + return $retVal; +} + +=head3 GetList + +C<< my @dbObjects = $erdb->GetList(\@objectNames, $filterClause, \@params); >> + +Return a list of object descriptors for the specified objects as determined by the +specified filter clause. + +This method is essentially the same as L except it returns a list of objects rather +than a query object that can be used to get the results one record at a time. + +=over 4 + +=item objectNames + +List containing the names of the entity and relationship objects to be retrieved. + +=item filterClause + +WHERE clause (without the WHERE) to be used to filter and sort the query. The WHERE clause can +be parameterized with parameter markers (C). Each field used in the WHERE clause must be +specified in the standard form B(I)>. Any parameters specified +in the filter clause should be added to the parameter list as additional parameters. The +fields in a filter clause can come from primary entity relations, relationship relations, +or secondary entity relations; however, all of the entities and relationships involved must +be included in the list of object names. + +The filter clause can also specify a sort order. To do this, simply follow the filter string +with an ORDER BY clause. For example, the following filter string gets all genomes for a +particular genus and sorts them by species name. + +C<< "Genome(genus) = ? ORDER BY Genome(species)" >> + +The rules for field references in a sort order are the same as those for field references in the +filter clause in general; however, odd things may happen if a sort field is from a secondary +relation. + +=item params + +Reference to a list of parameter values to be substituted into the filter clause. + +=item RETURN + +Returns a list of Bs that satisfy the query conditions. + +=back + +=cut +#: Return Type @% +sub GetList { + # Get the parameters. + my ($self, $objectNames, $filterClause, $params) = @_; + # Declare the return variable. + my @retVal = (); + # Perform the query. + my $query = $self->Get($objectNames, $filterClause, $params); + # Loop through the results. + while (my $object = $query->Fetch) { push @retVal, $object; } # Return the result. return @retVal; } +=head3 GetCount + +C<< my $count = $erdb->GetCount(\@objectNames, $filter, \@params); >> + +Return the number of rows found by a specified query. This method would +normally be used to count the records in a single table. For example, in a +genetics database + + my $count = $erdb->GetCount(['Genome'], 'Genome(genus-species) LIKE ?', ['homo %']); + +would return the number of genomes for the genus I. It is conceivable, however, +to use it to return records based on a join. For example, + + my $count = $erdb->GetCount(['HasFeature', 'Genome'], 'Genome(genus-species) LIKE ?', + ['homo %']); + +would return the number of features for genomes in the genus I. Note that +only the rows from the first table are counted. If the above command were + + my $count = $erdb->GetCount(['Genome', 'Feature'], 'Genome(genus-species) LIKE ?', + ['homo %']); + +it would return the number of genomes, not the number of genome/feature pairs. + +=over 4 + +=item objectNames + +Reference to a list of the objects (entities and relationships) included in the +query. + +=item filter + +A filter clause for restricting the query. The rules are the same as for the L +method. + +=item params + +Reference to a list of the parameter values to be substituted for the parameter marks +in the filter. + +=item RETURN + +Returns a count of the number of records in the first table that would satisfy +the query. + +=back + +=cut + +sub GetCount { + # Get the parameters. + my ($self, $objectNames, $filter, $params) = @_; + # Declare the return variable. + my $retVal; + # Find out if we're counting an entity or a relationship. + my $countedField; + if ($self->IsEntity($objectNames->[0])) { + $countedField = "id"; + } else { + # For a relationship we count the to-link because it's usually more + # numerous. Note we're automatically converting to the SQL form + # of the field name (to_link vs. to-link). + $countedField = "to_link"; + } + # Create the SQL command suffix to get the desired records. + my ($suffix, $mappedNameListRef, $mappedNameHashRef) = $self->_SetupSQL($objectNames, + $filter); + # Prefix it with text telling it we want a record count. + my $firstObject = $mappedNameListRef->[0]; + my $command = "SELECT COUNT($firstObject.$countedField) $suffix"; + # Prepare and execute the command. + my $sth = $self->_GetStatementHandle($command, $params); + # Get the count value. + ($retVal) = $sth->fetchrow_array(); + # Check for a problem. + if (! defined($retVal)) { + if ($sth->err) { + # Here we had an SQL error. + Confess("Error retrieving row count: " . $sth->errstr()); + } else { + # Here we have no result. + Confess("No result attempting to retrieve row count."); + } + } + # Return the result. + return $retVal; +} + =head3 ComputeObjectSentence C<< my $sentence = $erdb->ComputeObjectSentence($objectName); >> @@ -1123,6 +1544,69 @@ } } +=head3 InsertValue + +C<< $erdb->InsertValue($entityID, $fieldName, $value); >> + +This method will insert a new value into the database. The value must be one +associated with a secondary relation, since primary values cannot be inserted: +they occur exactly once. Secondary values, on the other hand, can be missing +or multiply-occurring. + +=over 4 + +=item entityID + +ID of the object that is to receive the new value. + +=item fieldName + +Field name for the new value-- this includes the entity name, since +field names are of the format IC<(>IC<)>. + +=item value + +New value to be put in the field. + +=back + +=cut + +sub InsertValue { + # Get the parameters. + my ($self, $entityID, $fieldName, $value) = @_; + # Parse the entity name and the real field name. + if ($fieldName =~ /^([^(]+)\(([^)]+)\)/) { + my $entityName = $1; + my $fieldTitle = $2; + # Get its descriptor. + if (!$self->IsEntity($entityName)) { + Confess("$entityName is not a valid entity."); + } else { + my $entityData = $self->{_metaData}->{Entities}->{$entityName}; + # Find the relation containing this field. + my $fieldHash = $entityData->{Fields}; + if (! exist $fieldHash->{$fieldTitle}) { + Confess("$fieldTitle not found in $entityName."); + } else { + my $relation = $fieldHash->{$fieldTitle}->{relation}; + if ($relation eq $entityName) { + Confess("Cannot do InsertValue on primary field $fieldTitle of $entityName."); + } else { + # Now we can create an INSERT statement. + my $dbh = $self->{_dbh}; + my $fixedName = _FixName($fieldTitle); + my $statement = "INSERT INTO $relation ($fixedName) VALUES(?, ?)"; + # Execute the command. + $dbh->SQL($statement, 0, $entityID, $value); + } + } + } + } else { + Confess("$fieldName is not a valid field name."); + } +} + =head3 InsertObject C<< my $ok = $erdb->InsertObject($objectType, \%fieldHash); >> @@ -1285,8 +1769,7 @@ =item RETURN -Returns a statistical object containing the number of records read and a list of -the error messages. +Returns a statistical object containing a list of the error messages. =back @@ -1300,6 +1783,8 @@ Trace("Loading table $relationName from $fileName") if T(2); # Get the database handle. my $dbh = $self->{_dbh}; + # Get the input file size. + my $fileSize = -s $fileName; # Get the relation data. my $relation = $self->_FindRelation($relationName); # Check the truncation flag. @@ -1310,7 +1795,6 @@ # leave extra room. We postulate a minimum row count of 1000 to # prevent problems with incoming empty load files. my $rowSize = $self->EstimateRowSize($relationName); - my $fileSize = -s $fileName; my $estimate = FIG::max($fileSize * 1.5 / $rowSize, 1000); # Re-create the table without its index. $self->CreateTable($relationName, 0, $estimate); @@ -1324,73 +1808,20 @@ } } } - # Determine whether or not this is a primary relation. Primary relations have an extra - # field indicating whether or not a given object is new or was loaded from the flat files. - my $primary = $self->_IsPrimary($relationName); - # Get the number of fields in this relation. - my @fieldList = @{$relation->{Fields}}; - my $fieldCount = @fieldList; - # Start a database transaction. - $dbh->begin_tran; - # Open the relation file. We need to create a cleaned-up copy before loading. - open TABLEIN, '<', $fileName; - my $tempName = "$fileName.tbl"; - open TABLEOUT, '>', $tempName; - my $inputCount = 0; - # Loop through the file. - while () { - $inputCount++; - # Chop off the new-line character. - my $record = Tracer::Strip($_); - # Only proceed if the record is non-blank. - if ($record) { - # Escape all the backslashes found in the line. - $record =~ s/\\/\\\\/g; - # Insure the number of fields is correct. - my @fields = split /\t/, $record; - while (@fields > $fieldCount) { - my $extraField = $fields[$#fields]; - delete $fields[$#fields]; - if ($extraField) { - Trace("Nonblank extra field value \"$extraField\" deleted from record $inputCount of $fileName.") if T(1); - } - } - while (@fields < $fieldCount) { - push @fields, ""; - } - # If this is a primary relation, add a 0 for the new-record flag (indicating that - # this record is not new, but part of the original load). - if ($primary) { - push @fields, "0"; - } - # Write the record. - $record = join "\t", @fields; - print TABLEOUT "$record\n"; - # Count the record written. - my $count = $retVal->Add('records'); - my $len = length $record; - Trace("Record $count written with $len characters.") if T(4); - } else { - # Here we have a blank record. - $retVal->Add('skipped'); - } - } - # Close the files. - close TABLEIN; - close TABLEOUT; - Trace("Temporary file $tempName created.") if T(2); # Load the table. my $rv; eval { - $rv = $dbh->load_table(file => $tempName, tbl => $relationName); + $rv = $dbh->load_table(file => $fileName, tbl => $relationName); }; if (!defined $rv) { $retVal->AddMessage($@) if ($@); - $retVal->AddMessage("Table load failed for $relationName using $tempName."); + $retVal->AddMessage("Table load failed for $relationName using $fileName."); Trace("Table load failed for $relationName.") if T(1); } else { - # Here we successfully loaded the table. Trace the number of records loaded. - Trace("$retVal->{records} records read for $relationName.") if T(2); + # Here we successfully loaded the table. + $retVal->Add("tables"); + my $size = -s $fileName; + Trace("$size bytes loaded into $relationName.") if T(2); # If we're rebuilding, we need to create the table indexes. if ($truncateFlag && ! $dbh->{_preIndex}) { eval { @@ -1400,13 +1831,9 @@ $retVal->AddMessage($@); } } - # Analyze the table to help optimize tables. } - # Commit the database changes. - $dbh->commit_tran; + # Analyze the table to improve performance. $dbh->vacuum_it($relationName); - # Delete the temporary file. - unlink $tempName; # Return the statistics. return $retVal; } @@ -1498,7 +1925,7 @@ # Get the parameters. my ($self, $entityType, $ID) = @_; # Create a query. - my $query = $self->Get([$entityType], "$entityType(id) = ?", $ID); + my $query = $self->Get([$entityType], "$entityType(id) = ?", [$ID]); # Get the first (and only) object. my $retVal = $query->Fetch(); # Return the result. @@ -1611,18 +2038,24 @@ # list is a scalar we convert it into a singleton list. my @parmList = (); if (ref $parameterList eq "ARRAY") { + Trace("GetAll parm list is an array.") if T(4); @parmList = @{$parameterList}; } else { + Trace("GetAll parm list is a scalar: $parameterList.") if T(4); push @parmList, $parameterList; } - # Create the query. - my $query = $self->Get($objectNames, $filterClause, @parmList); - # Set up a counter of the number of records read. - my $fetched = 0; # Insure the counter has a value. if (!defined $count) { $count = 0; } + # Add the row limit to the filter clause. + if ($count > 0) { + $filterClause .= " LIMIT $count"; + } + # Create the query. + my $query = $self->Get($objectNames, $filterClause, \@parmList); + # Set up a counter of the number of records read. + my $fetched = 0; # Loop through the records returned, extracting the fields. Note that if the # counter is non-zero, we stop when the number of records read hits the count. my @retVal = (); @@ -1673,8 +2106,449 @@ return $retVal; } +=head3 GetFieldTable + +C<< my $fieldHash = $self->GetFieldTable($objectnName); >> + +Get the field structure for a specified entity or relationship. + +=over 4 + +=item objectName + +Name of the desired entity or relationship. + +=item RETURN + +The table containing the field descriptors for the specified object. + +=back + +=cut + +sub GetFieldTable { + # Get the parameters. + my ($self, $objectName) = @_; + # Get the descriptor from the metadata. + my $objectData = $self->_GetStructure($objectName); + # Return the object's field table. + return $objectData->{Fields}; +} + +=head2 Data Mining Methods + +=head3 GetUsefulCrossValues + +C<< my @attrNames = $sprout->GetUsefulCrossValues($sourceEntity, $relationship); >> + +Return a list of the useful attributes that would be returned by a B call +from an entity of the source entity type through the specified relationship. This +means it will return the fields of the target entity type and the intersection data +fields in the relationship. Only primary table fields are returned. In other words, +the field names returned will be for fields where there is always one and only one +value. + +=over 4 + +=item sourceEntity + +Name of the entity from which the relationship crossing will start. + +=item relationship + +Name of the relationship being crossed. + +=item RETURN + +Returns a list of field names in Sprout field format (IC<(>IC<)>. + +=back + +=cut +#: Return Type @; +sub GetUsefulCrossValues { + # Get the parameters. + my ($self, $sourceEntity, $relationship) = @_; + # Declare the return variable. + my @retVal = (); + # Determine the target entity for the relationship. This is whichever entity is not + # the source entity. So, if the source entity is the FROM, we'll get the name of + # the TO, and vice versa. + my $relStructure = $self->_GetStructure($relationship); + my $targetEntityType = ($relStructure->{from} eq $sourceEntity ? "to" : "from"); + my $targetEntity = $relStructure->{$targetEntityType}; + # Get the field table for the entity. + my $entityFields = $self->GetFieldTable($targetEntity); + # The field table is a hash. The hash key is the field name. The hash value is a structure. + # For the entity fields, the key aspect of the target structure is that the {relation} value + # must match the entity name. + my @fieldList = map { "$targetEntity($_)" } grep { $entityFields->{$_}->{relation} eq $targetEntity } + keys %{$entityFields}; + # Push the fields found onto the return variable. + push @retVal, sort @fieldList; + # Get the field table for the relationship. + my $relationshipFields = $self->GetFieldTable($relationship); + # Here we have a different rule. We want all the fields other than "from-link" and "to-link". + # This may end up being an empty set. + my @fieldList2 = map { "$relationship($_)" } grep { $_ ne "from-link" && $_ ne "to-link" } + keys %{$relationshipFields}; + # Push these onto the return list. + push @retVal, sort @fieldList2; + # Return the result. + return @retVal; +} + +=head3 FindColumn + +C<< my $colIndex = ERDB::FindColumn($headerLine, $columnIdentifier); >> + +Return the location a desired column in a data mining header line. The data +mining header line is a tab-separated list of column names. The column +identifier is either the numerical index of a column or the actual column +name. + +=over 4 + +=item headerLine + +The header line from a data mining command, which consists of a tab-separated +list of column names. + +=item columnIdentifier + +Either the ordinal number of the desired column (1-based), or the name of the +desired column. + +=item RETURN + +Returns the array index (0-based) of the desired column. + +=back + +=cut + +sub FindColumn { + # Get the parameters. + my ($headerLine, $columnIdentifier) = @_; + # Declare the return variable. + my $retVal; + # Split the header line into column names. + my @headers = ParseColumns($headerLine); + # Determine whether we have a number or a name. + if ($columnIdentifier =~ /^\d+$/) { + # Here we have a number. Subtract 1 and validate the result. + $retVal = $columnIdentifier - 1; + if ($retVal < 0 || $retVal > $#headers) { + Confess("Invalid column identifer \"$columnIdentifier\": value out of range."); + } + } else { + # Here we have a name. We need to find it in the list. + for (my $i = 0; $i <= $#headers && ! defined($retVal); $i++) { + if ($headers[$i] eq $columnIdentifier) { + $retVal = $i; + } + } + if (! defined($retVal)) { + Confess("Invalid column identifier \"$columnIdentifier\": value not found."); + } + } + # Return the result. + return $retVal; +} + +=head3 ParseColumns + +C<< my @columns = ERDB::ParseColumns($line); >> + +Convert the specified data line to a list of columns. + +=over 4 + +=item line + +A data mining input, consisting of a tab-separated list of columns terminated by a +new-line. + +=item RETURN + +Returns a list consisting of the column values. + +=back + +=cut + +sub ParseColumns { + # Get the parameters. + my ($line) = @_; + # Chop off the line-end. + chomp $line; + # Split it into a list. + my @retVal = split(/\t/, $line); + # Return the result. + return @retVal; +} + =head2 Internal Utility Methods +=head3 SetupSQL + +Process a list of object names and a filter clause so that they can be used to +build an SQL statement. This method takes in a reference to a list of object names +and a filter clause. It will return a corrected filter clause, a list of mapped +names and the mapped name hash. + +This is an instance method. + +=over 4 + +=item objectNames + +Reference to a list of the object names to be included in the query. + +=item filterClause + +A string containing the WHERE clause for the query (without the C) and also +optionally the C and C clauses. + +=item RETURN + +Returns a three-element list. The first element is the SQL statement suffix, beginning +with the FROM clause. The second element is a reference to a list of the names to be +used in retrieving the fields. The third element is a hash mapping the names to the +objects they represent. + +=back + +=cut + +sub _SetupSQL { + my ($self, $objectNames, $filterClause) = @_; + # Adjust the list of object names to account for multiple occurrences of the + # same object. We start with a hash table keyed on object name that will + # return the object suffix. The first time an object is encountered it will + # not be found in the hash. The next time the hash will map the object name + # to 2, then 3, and so forth. + my %objectHash = (); + # This list will contain the object names as they are to appear in the + # FROM list. + my @fromList = (); + # This list contains the suffixed object name for each object. It is exactly + # parallel to the list in the $objectNames parameter. + my @mappedNameList = (); + # Finally, this hash translates from a mapped name to its original object name. + my %mappedNameHash = (); + # Now we create the lists. Note that for every single name we push something into + # @fromList and @mappedNameList. This insures that those two arrays are exactly + # parallel to $objectNames. + for my $objectName (@{$objectNames}) { + # Get the next suffix for this object. + my $suffix = $objectHash{$objectName}; + if (! $suffix) { + # Here we are seeing the object for the first time. The object name + # is used as is. + push @mappedNameList, $objectName; + push @fromList, $objectName; + $mappedNameHash{$objectName} = $objectName; + # Denote the next suffix will be 2. + $objectHash{$objectName} = 2; + } else { + # Here we've seen the object before. We construct a new name using + # the suffix from the hash and update the hash. + my $mappedName = "$objectName$suffix"; + $objectHash{$objectName} = $suffix + 1; + # The FROM list has the object name followed by the mapped name. This + # tells SQL it's still the same table, but we're using a different name + # for it to avoid confusion. + push @fromList, "$objectName $mappedName"; + # The mapped-name list contains the real mapped name. + push @mappedNameList, $mappedName; + # Finally, enable us to get back from the mapped name to the object name. + $mappedNameHash{$mappedName} = $objectName; + } + } + # Begin the SELECT suffix. It starts with + # + # FROM name1, name2, ... nameN + # + my $suffix = "FROM " . join(', ', @fromList); + # Check for a filter clause. + if ($filterClause) { + # Here we have one, so we convert its field names and add it to the query. First, + # We create a copy of the filter string we can work with. + my $filterString = $filterClause; + # Next, we sort the object names by length. This helps protect us from finding + # object names inside other object names when we're doing our search and replace. + my @sortedNames = sort { length($b) - length($a) } @mappedNameList; + # We will also keep a list of conditions to add to the WHERE clause in order to link + # entities and relationships as well as primary relations to secondary ones. + my @joinWhere = (); + # The final preparatory step is to create a hash table of relation names. The + # table begins with the relation names already in the SELECT command. We may + # need to add relations later if there is filtering on a field in a secondary + # relation. The secondary relations are the ones that contain multiply- + # occurring or optional fields. + my %fromNames = map { $_ => 1 } @sortedNames; + # We are ready to begin. We loop through the object names, replacing each + # object name's field references by the corresponding SQL field reference. + # Along the way, if we find a secondary relation, we will need to add it + # to the FROM clause. + for my $mappedName (@sortedNames) { + # Get the length of the object name plus 2. This is the value we add to the + # size of the field name to determine the size of the field reference as a + # whole. + my $nameLength = 2 + length $mappedName; + # Get the real object name for this mapped name. + my $objectName = $mappedNameHash{$mappedName}; + Trace("Processing $mappedName for object $objectName.") if T(4); + # Get the object's field list. + my $fieldList = $self->GetFieldTable($objectName); + # Find the field references for this object. + while ($filterString =~ m/$mappedName\(([^)]*)\)/g) { + # At this point, $1 contains the field name, and the current position + # is set immediately after the final parenthesis. We pull out the name of + # the field and the position and length of the field reference as a whole. + my $fieldName = $1; + my $len = $nameLength + length $fieldName; + my $pos = pos($filterString) - $len; + # Insure the field exists. + if (!exists $fieldList->{$fieldName}) { + Confess("Field $fieldName not found for object $objectName."); + } else { + Trace("Processing $fieldName at position $pos.") if T(4); + # Get the field's relation. + my $relationName = $fieldList->{$fieldName}->{relation}; + # Now we have a secondary relation. We need to insure it matches the + # mapped name of the primary relation. First we peel off the suffix + # from the mapped name. + my $mappingSuffix = substr $mappedName, length($objectName); + # Put the mapping suffix onto the relation name to get the + # mapped relation name. + my $mappedRelationName = "$relationName$mappingSuffix"; + # Insure the relation is in the FROM clause. + if (!exists $fromNames{$mappedRelationName}) { + # Add the relation to the FROM clause. + if ($mappedRelationName eq $relationName) { + # The name is un-mapped, so we add it without + # any frills. + $suffix .= ", $relationName"; + push @joinWhere, "$objectName.id = $relationName.id"; + } else { + # Here we have a mapping situation. + $suffix .= ", $relationName $mappedRelationName"; + push @joinWhere, "$mappedRelationName.id = $mappedName.id"; + } + # Denote we have this relation available for future fields. + $fromNames{$mappedRelationName} = 1; + } + # Form an SQL field reference from the relation name and the field name. + my $sqlReference = "$mappedRelationName." . _FixName($fieldName); + # Put it into the filter string in place of the old value. + substr($filterString, $pos, $len) = $sqlReference; + # Reposition the search. + pos $filterString = $pos + length $sqlReference; + } + } + } + # The next step is to join the objects together. We only need to do this if there + # is more than one object in the object list. We start with the first object and + # run through the objects after it. Note also that we make a safety copy of the + # list before running through it. + my @mappedObjectList = @mappedNameList; + my $lastMappedObject = shift @mappedObjectList; + # Get the join table. + my $joinTable = $self->{_metaData}->{Joins}; + # Loop through the object list. + for my $thisMappedObject (@mappedObjectList) { + # Look for a join using the real object names. + my $lastObject = $mappedNameHash{$lastMappedObject}; + my $thisObject = $mappedNameHash{$thisMappedObject}; + my $joinKey = "$lastObject/$thisObject"; + if (!exists $joinTable->{$joinKey}) { + # Here there's no join, so we throw an error. + Confess("No join exists to connect from $lastMappedObject to $thisMappedObject."); + } else { + # Get the join clause. + my $unMappedJoin = $joinTable->{$joinKey}; + # Fix the names. + $unMappedJoin =~ s/$lastObject/$lastMappedObject/; + $unMappedJoin =~ s/$thisObject/$thisMappedObject/; + push @joinWhere, $unMappedJoin; + # Save this object as the last object for the next iteration. + $lastMappedObject = $thisMappedObject; + } + } + # Now we need to handle the whole ORDER BY / LIMIT thing. The important part + # here is we want the filter clause to be empty if there's no WHERE filter. + # We'll put the ORDER BY / LIMIT clauses in the following variable. + my $orderClause = ""; + # Locate the ORDER BY or LIMIT verbs (if any). We use a non-greedy + # operator so that we find the first occurrence of either verb. + if ($filterString =~ m/^(.*?)\s*(ORDER BY|LIMIT)/g) { + # Here we have an ORDER BY or LIMIT verb. Split it off of the filter string. + my $pos = pos $filterString; + $orderClause = $2 . substr($filterString, $pos); + $filterString = $1; + } + # Add the filter and the join clauses (if any) to the SELECT command. + if ($filterString) { + Trace("Filter string is \"$filterString\".") if T(4); + push @joinWhere, "($filterString)"; + } + if (@joinWhere) { + $suffix .= " WHERE " . join(' AND ', @joinWhere); + } + # Add the sort or limit clause (if any) to the SELECT command. + if ($orderClause) { + $suffix .= " $orderClause"; + } + } + # Return the suffix, the mapped name list, and the mapped name hash. + return ($suffix, \@mappedNameList, \%mappedNameHash); +} + +=head3 GetStatementHandle + +This method will prepare and execute an SQL query, returning the statement handle. +The main reason for doing this here is so that everybody who does SQL queries gets +the benefit of tracing. + +This is an instance method. + +=over 4 + +=item command + +Command to prepare and execute. + +=item params + +Reference to a list of the values to be substituted in for the parameter marks. + +=item RETURN + +Returns a prepared and executed statement handle from which the caller can extract +results. + +=back + +=cut + +sub _GetStatementHandle { + # Get the parameters. + my ($self, $command, $params) = @_; + # Trace the query. + Trace("SQL query: $command") if T(SQL => 3); + Trace("PARMS: '" . (join "', '", @{$params}) . "'") if (T(SQL => 4) && (@{$params} > 0)); + # Get the database handle. + my $dbh = $self->{_dbh}; + # Prepare the command. + my $sth = $dbh->prepare_command($command); + # Execute it with the parameters bound in. + $sth->execute(@{$params}) || Confess("SELECT error" . $sth->errstr()); + # Return the statement handle. + return $sth; +} + =head3 GetLoadStats Return a blank statistics object for use by the load methods. @@ -1683,8 +2557,8 @@ =cut -sub _GetLoadStats { - return Stats->new('records'); +sub _GetLoadStats{ + return Stats->new(); } =head3 GenerateFields @@ -1879,35 +2753,6 @@ return $objectData->{Relations}; } -=head3 GetFieldTable - -Get the field structure for a specified entity or relationship. - -This is an instance method. - -=over 4 - -=item objectName - -Name of the desired entity or relationship. - -=item RETURN - -The table containing the field descriptors for the specified object. - -=back - -=cut - -sub _GetFieldTable { - # Get the parameters. - my ($self, $objectName) = @_; - # Get the descriptor from the metadata. - my $objectData = $self->_GetStructure($objectName); - # Return the object's field table. - return $objectData->{Fields}; -} - =head3 ValidateFieldNames Determine whether or not the field names are valid. A description of the problems with the names @@ -2255,27 +3100,27 @@ my @fromList = (); my @toList = (); my @bothList = (); - Trace("Join table build for $entityName.") if T(4); + Trace("Join table build for $entityName.") if T(metadata => 4); for my $relationshipName (keys %{$relationshipList}) { my $relationship = $relationshipList->{$relationshipName}; # Determine if this relationship has our entity in one of its link fields. my $fromEntity = $relationship->{from}; my $toEntity = $relationship->{to}; - Trace("Join check for relationship $relationshipName from $fromEntity to $toEntity.") if T(4); + Trace("Join check for relationship $relationshipName from $fromEntity to $toEntity.") if T(Joins => 4); if ($fromEntity eq $entityName) { if ($toEntity eq $entityName) { # Here the relationship is recursive. push @bothList, $relationshipName; - Trace("Relationship $relationshipName put in both-list.") if T(4); + Trace("Relationship $relationshipName put in both-list.") if T(metadata => 4); } else { # Here the relationship comes from the entity. push @fromList, $relationshipName; - Trace("Relationship $relationshipName put in from-list.") if T(4); + Trace("Relationship $relationshipName put in from-list.") if T(metadata => 4); } } elsif ($toEntity eq $entityName) { # Here the relationship goes to the entity. push @toList, $relationshipName; - Trace("Relationship $relationshipName put in to-list.") if T(4); + Trace("Relationship $relationshipName put in to-list.") if T(metadata => 4); } } # Create the nonrecursive joins. Note that we build two hashes for running @@ -2291,7 +3136,7 @@ # Create joins between the entity and this relationship. my $linkField = "$relationshipName.${linkType}_link"; my $joinClause = "$entityName.id = $linkField"; - Trace("Entity join clause is $joinClause for $entityName and $relationshipName.") if T(4); + Trace("Entity join clause is $joinClause for $entityName and $relationshipName.") if T(metadata => 4); $joinTable{"$entityName/$relationshipName"} = $joinClause; $joinTable{"$relationshipName/$entityName"} = $joinClause; # Create joins between this relationship and the other relationships. @@ -2312,7 +3157,7 @@ # relationship and itself are prohibited. my $relJoinClause = "$otherName.${otherType}_link = $linkField"; $joinTable{$joinKey} = $relJoinClause; - Trace("Relationship join clause is $relJoinClause for $joinKey.") if T(4); + Trace("Relationship join clause is $relJoinClause for $joinKey.") if T(metadata => 4); } } } @@ -2321,7 +3166,7 @@ # relationship can only be ambiguous with another recursive relationship, # and the incoming relationship from the outer loop is never recursive. for my $otherName (@bothList) { - Trace("Setting up relationship joins to recursive relationship $otherName with $relationshipName.") if T(4); + Trace("Setting up relationship joins to recursive relationship $otherName with $relationshipName.") if T(metadata => 4); # Join from the left. $joinTable{"$relationshipName/$otherName"} = "$linkField = $otherName.from_link"; @@ -2336,7 +3181,7 @@ # rise to situations where we can't create the path we want; however, it is always # possible to get the same effect using multiple queries. for my $relationshipName (@bothList) { - Trace("Setting up entity joins to recursive relationship $relationshipName with $entityName.") if T(4); + Trace("Setting up entity joins to recursive relationship $relationshipName with $entityName.") if T(metadata => 4); # Join to the entity from each direction. $joinTable{"$entityName/$relationshipName"} = "$entityName.id = $relationshipName.from_link"; @@ -2350,6 +3195,46 @@ return $metadata; } +=head3 SortNeeded + +C<< my $flag = $erdb->SortNeeded($relationName); >> + +Return TRUE if the specified relation should be sorted during loading to remove duplicate keys, +else FALSE. + +=over 4 + +=item relationName + +Name of the relation to be examined. + +=item RETURN + +Returns TRUE if the relation needs a sort, else FALSE. + +=back + +=cut +#: Return Type $; +sub SortNeeded { + # Get the parameters. + my ($self, $relationName) = @_; + # Declare the return variable. + my $retVal = 0; + # Find out if the relation is a primary entity relation. + my $entityTable = $self->{_metaData}->{Entities}; + if (exists $entityTable->{$relationName}) { + my $keyType = $entityTable->{$relationName}->{keyType}; + Trace("Relation $relationName found in entity table with key type $keyType.") if T(3); + # If the key is not a hash string, we must do the sort. + if ($keyType ne 'hash-string') { + $retVal = 1; + } + } + # Return the result. + return $retVal; +} + =head3 CreateRelationshipIndex Create an index for a relationship's relation.