[Bio] / Sprout / ERDB.pm Repository:
ViewVC logotype

Diff of /Sprout/ERDB.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.38, Fri Mar 17 22:02:03 2006 UTC revision 1.44, Sat May 27 02:02:28 2006 UTC
# Line 9  Line 9 
9      use DBObject;      use DBObject;
10      use Stats;      use Stats;
11      use Time::HiRes qw(gettimeofday);      use Time::HiRes qw(gettimeofday);
12        use Digest::MD5 qw(md5_base64);
13      use FIG;      use FIG;
14    
15  =head1 Entity-Relationship Database Package  =head1 Entity-Relationship Database Package
# Line 109  Line 110 
110  compatability with certain database packages), but the only values supported are  compatability with certain database packages), but the only values supported are
111  0 and 1.  0 and 1.
112    
113    =item id-string
114    
115    variable-length string, maximum 25 characters
116    
117  =item key-string  =item key-string
118    
119  variable-length string, maximum 40 characters  variable-length string, maximum 40 characters
# Line 125  Line 130 
130    
131  variable-length string, maximum 255 characters  variable-length string, maximum 255 characters
132    
133    =item hash-string
134    
135    variable-length string, maximum 22 characters
136    
137  =back  =back
138    
139    The hash-string data type has a special meaning. The actual key passed into the loader will
140    be a string, but it will be digested into a 22-character MD5 code to save space. Although the
141    MD5 algorithm is not perfect, it is extremely unlikely two strings will have the same
142    digest. Therefore, it is presumed the keys will be unique. When the database is actually
143    in use, the hashed keys will be presented rather than the original values. For this reason,
144    they should not be used for entities where the key is meaningful.
145    
146  =head3 Global Tags  =head3 Global Tags
147    
148  The entire database definition must be inside a B<Database> tag. The display name of  The entire database definition must be inside a B<Database> tag. The display name of
# Line 310  Line 326 
326                    date =>    { sqlType => 'BIGINT',             maxLen => 80,           avgLen =>   8, dataGen => "DateGen(-7, 7, IntGen(0,1400))" },                    date =>    { sqlType => 'BIGINT',             maxLen => 80,           avgLen =>   8, dataGen => "DateGen(-7, 7, IntGen(0,1400))" },
327                    float =>   { sqlType => 'DOUBLE PRECISION',   maxLen => 40,           avgLen =>   8, dataGen => "FloatGen(0.0, 100.0)" },                    float =>   { sqlType => 'DOUBLE PRECISION',   maxLen => 40,           avgLen =>   8, dataGen => "FloatGen(0.0, 100.0)" },
328                    boolean => { sqlType => 'SMALLINT',           maxLen => 1,            avgLen =>   1, dataGen => "IntGen(0, 1)" },                    boolean => { sqlType => 'SMALLINT',           maxLen => 1,            avgLen =>   1, dataGen => "IntGen(0, 1)" },
329                     'hash-string' =>
330                                 { sqlType => 'VARCHAR(22)',        maxLen => 22,           avgLen =>  22, dataGen => "SringGen(22)" },
331                     'id-string' =>
332                                 { sqlType => 'VARCHAR(25)',        maxLen => 25,           avgLen =>  25, dataGen => "SringGen(22)" },
333                   'key-string' =>                   'key-string' =>
334                               { sqlType => 'VARCHAR(40)',        maxLen => 40,           avgLen =>  10, dataGen => "StringGen(IntGen(10,40))" },                               { sqlType => 'VARCHAR(40)',        maxLen => 40,           avgLen =>  10, dataGen => "StringGen(IntGen(10,40))" },
335                   'name-string' =>                   'name-string' =>
# Line 687  Line 707 
707      return $retVal;      return $retVal;
708  }  }
709    
710    =head3 DigestFields
711    
712    C<< $erdb->DigestFields($relName, $fieldList); >>
713    
714    Digest the strings in the field list that correspond to data type C<hash-string> in the
715    specified relation.
716    
717    =over 4
718    
719    =item relName
720    
721    Name of the relation to which the fields belong.
722    
723    =item fieldList
724    
725    List of field contents to be loaded into the relation.
726    
727    =back
728    
729    =cut
730    #: Return Type ;
731    sub DigestFields {
732        # Get the parameters.
733        my ($self, $relName, $fieldList) = @_;
734        # Get the relation definition.
735        my $relData = $self->_FindRelation($relName);
736        # Get the list of field descriptors.
737        my $fieldTypes = $relData->{Fields};
738        my $fieldCount = scalar @{$fieldTypes};
739        # Loop through the two lists.
740        for (my $i = 0; $i < $fieldCount; $i++) {
741            # Get the type of the current field.
742            my $fieldType = $fieldTypes->[$i]->{type};
743            # If it's a hash string, digest it in place.
744            if ($fieldType eq 'hash-string') {
745                $fieldList->[$i] = md5_base64($fieldList->[$i]);
746            }
747        }
748    }
749    
750  =head3 CreateIndex  =head3 CreateIndex
751    
752  C<< $erdb->CreateIndex($relationName); >>  C<< $erdb->CreateIndex($relationName); >>
# Line 877  Line 937 
937    
938  If multiple names are specified, then the query processor will automatically determine a  If multiple names are specified, then the query processor will automatically determine a
939  join path between the entities and relationships. The algorithm used is very simplistic.  join path between the entities and relationships. The algorithm used is very simplistic.
940  In particular, you can't specify any entity or relationship more than once, and if a  In particular, if a relationship is recursive, the path is determined by the order in which
941  relationship is recursive, the path is determined by the order in which the entity  the entity and the relationship appear. For example, consider a recursive relationship
942  and the relationship appear. For example, consider a recursive relationship B<IsParentOf>  B<IsParentOf> which relates B<People> objects to other B<People> objects. If the join path is
 which relates B<People> objects to other B<People> objects. If the join path is  
943  coded as C<['People', 'IsParentOf']>, then the people returned will be parents. If, however,  coded as C<['People', 'IsParentOf']>, then the people returned will be parents. If, however,
944  the join path is C<['IsParentOf', 'People']>, then the people returned will be children.  the join path is C<['IsParentOf', 'People']>, then the people returned will be children.
945    
946    If an entity or relationship is mentioned twice, the name for the second occurrence will
947    be suffixed with C<2>, the third occurrence will be suffixed with C<3>, and so forth. So,
948    for example, if we have C<['Feature', 'HasContig', 'Contig', 'HasContig']>, then the
949    B<to-link> field of the first B<HasContig> is specified as C<HasContig(to-link)>, while
950    the B<to-link> field of the second B<HasContig> is specified as C<HasContig2(to-link)>.
951    
952  =over 4  =over 4
953    
954  =item objectNames  =item objectNames
# Line 913  Line 978 
978  filter clause in general; however, odd things may happen if a sort field is from a secondary  filter clause in general; however, odd things may happen if a sort field is from a secondary
979  relation.  relation.
980    
981    Finally, you can limit the number of rows returned by adding a LIMIT clause. The LIMIT must
982    be the last thing in the filter clause, and it contains only the word "LIMIT" followed by
983    a positive number. So, for example
984    
985    C<< "Genome(genus) = ? ORDER BY Genome(species) LIMIT 10" >>
986    
987    will only return the first ten genomes for the specified genus. The ORDER BY clause is not
988    required. For example, to just get the first 10 genomes in the B<Genome> table, you could
989    use
990    
991    C<< "LIMIT 10" >>
992    
993  =item param1, param2, ..., paramN  =item param1, param2, ..., paramN
994    
995  Parameter values to be substituted into the filter clause.  Parameter values to be substituted into the filter clause.
# Line 928  Line 1005 
1005  sub Get {  sub Get {
1006      # Get the parameters.      # Get the parameters.
1007      my ($self, $objectNames, $filterClause, @params) = @_;      my ($self, $objectNames, $filterClause, @params) = @_;
1008        # Adjust the list of object names to account for multiple occurrences of the
1009        # same object. We start with a hash table keyed on object name that will
1010        # return the object suffix. The first time an object is encountered it will
1011        # not be found in the hash. The next time the hash will map the object name
1012        # to 2, then 3, and so forth.
1013        my %objectHash = ();
1014        # This list will contain the object names as they are to appear in the
1015        # FROM list.
1016        my @fromList = ();
1017        # This list contains the suffixed object name for each object. It is exactly
1018        # parallel to the list in the $objectNames parameter.
1019        my @mappedNameList = ();
1020        # Finally, this hash translates from a mapped name to its original object name.
1021        my %mappedNameHash = ();
1022        # Now we create the lists. Note that for every single name we push something into
1023        # @fromList and @mappedNameList. This insures that those two arrays are exactly
1024        # parallel to $objectNames.
1025        for my $objectName (@{$objectNames}) {
1026            # Get the next suffix for this object.
1027            my $suffix = $objectHash{$objectName};
1028            if (! $suffix) {
1029                # Here we are seeing the object for the first time. The object name
1030                # is used as is.
1031                push @mappedNameList, $objectName;
1032                push @fromList, $objectName;
1033                $mappedNameHash{$objectName} = $objectName;
1034                # Denote the next suffix will be 2.
1035                $objectHash{$objectName} = 2;
1036            } else {
1037                # Here we've seen the object before. We construct a new name using
1038                # the suffix from the hash and update the hash.
1039                my $mappedName = "$objectName$suffix";
1040                $objectHash{$objectName} = $suffix + 1;
1041                # The FROM list has the object name followed by the mapped name. This
1042                # tells SQL it's still the same table, but we're using a different name
1043                # for it to avoid confusion.
1044                push @fromList, "$objectName $mappedName";
1045                # The mapped-name list contains the real mapped name.
1046                push @mappedNameList, $mappedName;
1047                # Finally, enable us to get back from the mapped name to the object name.
1048                $mappedNameHash{$mappedName} = $objectName;
1049            }
1050        }
1051      # Construct the SELECT statement. The general pattern is      # Construct the SELECT statement. The general pattern is
1052      #      #
1053      # SELECT name1.*, name2.*, ... nameN.* FROM name1, name2, ... nameN      # SELECT name1.*, name2.*, ... nameN.* FROM name1, name2, ... nameN
1054      #      #
1055      my $dbh = $self->{_dbh};      my $dbh = $self->{_dbh};
1056      my $command = "SELECT DISTINCT " . join('.*, ', @{$objectNames}) . ".* FROM " .      my $command = "SELECT DISTINCT " . join('.*, ', @mappedNameList) . ".* FROM " .
1057                  join(', ', @{$objectNames});                  join(', ', @fromList);
1058      # Check for a filter clause.      # Check for a filter clause.
1059      if ($filterClause) {      if ($filterClause) {
1060          # Here we have one, so we convert its field names and add it to the query. First,          # Here we have one, so we convert its field names and add it to the query. First,
# Line 942  Line 1062 
1062          my $filterString = $filterClause;          my $filterString = $filterClause;
1063          # Next, we sort the object names by length. This helps protect us from finding          # Next, we sort the object names by length. This helps protect us from finding
1064          # object names inside other object names when we're doing our search and replace.          # object names inside other object names when we're doing our search and replace.
1065          my @sortedNames = sort { length($b) - length($a) } @{$objectNames};          my @sortedNames = sort { length($b) - length($a) } @mappedNameList;
1066          # We will also keep a list of conditions to add to the WHERE clause in order to link          # We will also keep a list of conditions to add to the WHERE clause in order to link
1067          # entities and relationships as well as primary relations to secondary ones.          # entities and relationships as well as primary relations to secondary ones.
1068          my @joinWhere = ();          my @joinWhere = ();
1069          # The final preparatory step is to create a hash table of relation names. The          # The final preparatory step is to create a hash table of relation names. The
1070          # table begins with the relation names already in the SELECT command.          # table begins with the relation names already in the SELECT command. We may
1071          my %fromNames = ();          # need to add relations later if there is filtering on a field in a secondary
1072          for my $objectName (@sortedNames) {          # relation. The secondary relations are the ones that contain multiply-
1073              $fromNames{$objectName} = 1;          # occurring or optional fields.
1074          }          my %fromNames = map { $_ => 1 } @sortedNames;
1075          # We are ready to begin. We loop through the object names, replacing each          # We are ready to begin. We loop through the object names, replacing each
1076          # object name's field references by the corresponding SQL field reference.          # object name's field references by the corresponding SQL field reference.
1077          # Along the way, if we find a secondary relation, we will need to add it          # Along the way, if we find a secondary relation, we will need to add it
1078          # to the FROM clause.          # to the FROM clause.
1079          for my $objectName (@sortedNames) {          for my $mappedName (@sortedNames) {
1080              # Get the length of the object name plus 2. This is the value we add to the              # Get the length of the object name plus 2. This is the value we add to the
1081              # size of the field name to determine the size of the field reference as a              # size of the field name to determine the size of the field reference as a
1082              # whole.              # whole.
1083              my $nameLength = 2 + length $objectName;              my $nameLength = 2 + length $mappedName;
1084                # Get the real object name for this mapped name.
1085                my $objectName = $mappedNameHash{$mappedName};
1086                Trace("Processing $mappedName for object $objectName.") if T(4);
1087              # Get the object's field list.              # Get the object's field list.
1088              my $fieldList = $self->GetFieldTable($objectName);              my $fieldList = $self->GetFieldTable($objectName);
1089              # Find the field references for this object.              # Find the field references for this object.
1090              while ($filterString =~ m/$objectName\(([^)]*)\)/g) {              while ($filterString =~ m/$mappedName\(([^)]*)\)/g) {
1091                  # At this point, $1 contains the field name, and the current position                  # At this point, $1 contains the field name, and the current position
1092                  # is set immediately after the final parenthesis. We pull out the name of                  # is set immediately after the final parenthesis. We pull out the name of
1093                  # the field and the position and length of the field reference as a whole.                  # the field and the position and length of the field reference as a whole.
# Line 975  Line 1098 
1098                  if (!exists $fieldList->{$fieldName}) {                  if (!exists $fieldList->{$fieldName}) {
1099                      Confess("Field $fieldName not found for object $objectName.");                      Confess("Field $fieldName not found for object $objectName.");
1100                  } else {                  } else {
1101                        Trace("Processing $fieldName at position $pos.") if T(4);
1102                      # Get the field's relation.                      # Get the field's relation.
1103                      my $relationName = $fieldList->{$fieldName}->{relation};                      my $relationName = $fieldList->{$fieldName}->{relation};
1104                        # Now we have a secondary relation. We need to insure it matches the
1105                        # mapped name of the primary relation. First we peel off the suffix
1106                        # from the mapped name.
1107                        my $mappingSuffix = substr $mappedName, length($objectName);
1108                        # Put the mapping suffix onto the relation name to get the
1109                        # mapped relation name.
1110                        my $mappedRelationName = "$relationName$mappingSuffix";
1111                      # Insure the relation is in the FROM clause.                      # Insure the relation is in the FROM clause.
1112                      if (!exists $fromNames{$relationName}) {                      if (!exists $fromNames{$mappedRelationName}) {
1113                          # Add the relation to the FROM clause.                          # Add the relation to the FROM clause.
1114                            if ($mappedRelationName eq $relationName) {
1115                                # The name is un-mapped, so we add it without
1116                                # any frills.
1117                          $command .= ", $relationName";                          $command .= ", $relationName";
                         # Create its join sub-clause.  
1118                          push @joinWhere, "$objectName.id = $relationName.id";                          push @joinWhere, "$objectName.id = $relationName.id";
1119                          # Denote we have it available for future fields.                          } else {
1120                          $fromNames{$relationName} = 1;                              # Here we have a mapping situation.
1121                                $command .= ", $relationName $mappedRelationName";
1122                                push @joinWhere, "$mappedRelationName.id = $mappedName.id";
1123                            }
1124                            # Denote we have this relation available for future fields.
1125                            $fromNames{$mappedRelationName} = 1;
1126                      }                      }
1127                      # Form an SQL field reference from the relation name and the field name.                      # Form an SQL field reference from the relation name and the field name.
1128                      my $sqlReference = "$relationName." . _FixName($fieldName);                      my $sqlReference = "$mappedRelationName." . _FixName($fieldName);
1129                      # Put it into the filter string in place of the old value.                      # Put it into the filter string in place of the old value.
1130                      substr($filterString, $pos, $len) = $sqlReference;                      substr($filterString, $pos, $len) = $sqlReference;
1131                      # Reposition the search.                      # Reposition the search.
# Line 999  Line 1137 
1137          # is more than one object in the object list. We start with the first object and          # is more than one object in the object list. We start with the first object and
1138          # run through the objects after it. Note also that we make a safety copy of the          # run through the objects after it. Note also that we make a safety copy of the
1139          # list before running through it.          # list before running through it.
1140          my @objectList = @{$objectNames};          my @mappedObjectList = @mappedNameList;
1141          my $lastObject = shift @objectList;          my $lastMappedObject = shift @mappedObjectList;
1142          # Get the join table.          # Get the join table.
1143          my $joinTable = $self->{_metaData}->{Joins};          my $joinTable = $self->{_metaData}->{Joins};
1144          # Loop through the object list.          # Loop through the object list.
1145          for my $thisObject (@objectList) {          for my $thisMappedObject (@mappedObjectList) {
1146              # Look for a join.              # Look for a join using the real object names.
1147                my $lastObject = $mappedNameHash{$lastMappedObject};
1148                my $thisObject = $mappedNameHash{$thisMappedObject};
1149              my $joinKey = "$lastObject/$thisObject";              my $joinKey = "$lastObject/$thisObject";
1150              if (!exists $joinTable->{$joinKey}) {              if (!exists $joinTable->{$joinKey}) {
1151                  # Here there's no join, so we throw an error.                  # Here there's no join, so we throw an error.
1152                  Confess("No join exists to connect from $lastObject to $thisObject.");                  Confess("No join exists to connect from $lastMappedObject to $thisMappedObject.");
1153              } else {              } else {
1154                  # Get the join clause and add it to the WHERE list.                  # Get the join clause.
1155                  push @joinWhere, $joinTable->{$joinKey};                  my $unMappedJoin = $joinTable->{$joinKey};
1156                    # Fix the names.
1157                    $unMappedJoin =~ s/$lastObject/$lastMappedObject/;
1158                    $unMappedJoin =~ s/$thisObject/$thisMappedObject/;
1159                    push @joinWhere, $unMappedJoin;
1160                  # Save this object as the last object for the next iteration.                  # Save this object as the last object for the next iteration.
1161                  $lastObject = $thisObject;                  $lastMappedObject = $thisMappedObject;
1162              }              }
1163          }          }
1164          # Now we need to handle the whole ORDER BY / LIMIT thing. The important part          # Now we need to handle the whole ORDER BY / LIMIT thing. The important part
# Line 1031  Line 1175 
1175          }          }
1176          # Add the filter and the join clauses (if any) to the SELECT command.          # Add the filter and the join clauses (if any) to the SELECT command.
1177          if ($filterString) {          if ($filterString) {
1178                Trace("Filter string is \"$filterString\".") if T(4);
1179              push @joinWhere, "($filterString)";              push @joinWhere, "($filterString)";
1180          }          }
1181          if (@joinWhere) {          if (@joinWhere) {
# Line 1041  Line 1186 
1186              $command .= " $orderClause";              $command .= " $orderClause";
1187          }          }
1188      }      }
1189      Trace("SQL query: $command") if T(SQL => 4);      Trace("SQL query: $command") if T(SQL => 3);
1190      Trace("PARMS: '" . (join "', '", @params) . "'") if (T(SQL => 4) && (@params > 0));      Trace("PARMS: '" . (join "', '", @params) . "'") if (T(SQL => 4) && (@params > 0));
1191      my $sth = $dbh->prepare_command($command);      my $sth = $dbh->prepare_command($command);
1192      # Execute it with the parameters bound in.      # Execute it with the parameters bound in.
1193      $sth->execute(@params) || Confess("SELECT error" . $sth->errstr());      $sth->execute(@params) || Confess("SELECT error" . $sth->errstr());
1194        # Now we create the relation map, which enables DBQuery to determine the order, name
1195        # and mapped name for each object in the query.
1196        my @relationMap = ();
1197        for my $mappedName (@mappedNameList) {
1198            push @relationMap, [$mappedName, $mappedNameHash{$mappedName}];
1199        }
1200      # Return the statement object.      # Return the statement object.
1201      my $retVal = DBQuery::_new($self, $sth, @{$objectNames});      my $retVal = DBQuery::_new($self, $sth, \@relationMap);
1202      return $retVal;      return $retVal;
1203  }  }
1204    
# Line 2505  Line 2656 
2656              # Determine if this relationship has our entity in one of its link fields.              # Determine if this relationship has our entity in one of its link fields.
2657              my $fromEntity = $relationship->{from};              my $fromEntity = $relationship->{from};
2658              my $toEntity = $relationship->{to};              my $toEntity = $relationship->{to};
2659              Trace("Join check for relationship $relationshipName from $fromEntity to $toEntity.") if T(4);              Trace("Join check for relationship $relationshipName from $fromEntity to $toEntity.") if T(Joins => 4);
2660              if ($fromEntity eq $entityName) {              if ($fromEntity eq $entityName) {
2661                  if ($toEntity eq $entityName) {                  if ($toEntity eq $entityName) {
2662                      # Here the relationship is recursive.                      # Here the relationship is recursive.
# Line 2594  Line 2745 
2745      return $metadata;      return $metadata;
2746  }  }
2747    
2748    =head3 SortNeeded
2749    
2750    C<< my $flag = $erdb->SortNeeded($relationName); >>
2751    
2752    Return TRUE if the specified relation should be sorted during loading to remove duplicate keys,
2753    else FALSE.
2754    
2755    =over 4
2756    
2757    =item relationName
2758    
2759    Name of the relation to be examined.
2760    
2761    =item RETURN
2762    
2763    Returns TRUE if the relation needs a sort, else FALSE.
2764    
2765    =back
2766    
2767    =cut
2768    #: Return Type $;
2769    sub SortNeeded {
2770        # Get the parameters.
2771        my ($self, $relationName) = @_;
2772        # Declare the return variable.
2773        my $retVal = 0;
2774        # Find out if the relation is a primary entity relation.
2775        my $entityTable = $self->{_metaData}->{Entities};
2776        if (exists $entityTable->{$relationName}) {
2777            my $keyType = $entityTable->{$relationName}->{keyType};
2778            Trace("Relation $relationName found in entity table with key type $keyType.") if T(3);
2779            # If the key is not a hash string, we must do the sort.
2780            if ($keyType ne 'hash-string') {
2781                $retVal = 1;
2782            }
2783        }
2784        # Return the result.
2785        return $retVal;
2786    }
2787    
2788  =head3 CreateRelationshipIndex  =head3 CreateRelationshipIndex
2789    
2790  Create an index for a relationship's relation.  Create an index for a relationship's relation.

Legend:
Removed from v.1.38  
changed lines
  Added in v.1.44

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3