[Bio] / Sprout / ERDB.pm Repository:
ViewVC logotype

Diff of /Sprout/ERDB.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.7, Thu May 5 03:14:03 2005 UTC revision 1.19, Fri Sep 9 14:50:58 2005 UTC
# Line 2  Line 2 
2    
3          use strict;          use strict;
4          use Tracer;          use Tracer;
5          use DBKernel;      use DBrtns;
6          use Data::Dumper;          use Data::Dumper;
7          use XML::Simple;          use XML::Simple;
8          use DBQuery;          use DBQuery;
9          use DBObject;          use DBObject;
10          use Stats;          use Stats;
11          use Time::HiRes qw(gettimeofday);          use Time::HiRes qw(gettimeofday);
12        use FIG;
13    
14  =head1 Entity-Relationship Database Package  =head1 Entity-Relationship Database Package
15    
# Line 32  Line 33 
33  relation that contains two fields-- the feature ID (C<id>) and the alias name (C<alias>).  relation that contains two fields-- the feature ID (C<id>) and the alias name (C<alias>).
34  The B<FEATURE> entity also contains an optional virulence number. This is implemented  The B<FEATURE> entity also contains an optional virulence number. This is implemented
35  as a separate relation C<FeatureVirulence> which contains an ID (C<id>) and a virulence number  as a separate relation C<FeatureVirulence> which contains an ID (C<id>) and a virulence number
36  (C<virulence>). If the virulence of a feature I<ABC> is known to be 6, there will be one row in the  (C<virulence>). If the virulence of a feature I<ABC> is known to be 6, there will be one row in
37  C<FeatureVirulence> relation possessing the value I<ABC> as its ID and 6 as its virulence number.  the C<FeatureVirulence> relation possessing the value I<ABC> as its ID and 6 as its virulence
38  If the virulence of I<ABC> is not known, there will not be any rows for it in C<FeatureVirulence>.  number. If the virulence of I<ABC> is not known, there will not be any rows for it in
39    C<FeatureVirulence>.
40    
41  Entities are connected by binary relationships implemented using single relations possessing the  Entities are connected by binary relationships implemented using single relations possessing the
42  same name as the relationship itself and that has an I<arity> of 1-to-1 (C<11>), 1-to-many (C<1M>),  same name as the relationship itself and that has an I<arity> of 1-to-1 (C<11>), 1-to-many (C<1M>),
# Line 69  Line 71 
71  is described in the L</GenerateEntity> and L</GenerateConnection> methods, though it is not yet  is described in the L</GenerateEntity> and L</GenerateConnection> methods, though it is not yet
72  fully implemented.  fully implemented.
73    
74    =head2 XML Database Description
75    
76    =head3 Data Types
77    
78    The ERDB system supports the following data types. Note that there are numerous string
79    types depending on the maximum length. Some database packages limit the total number of
80    characters you have in an index key; to insure the database works in all environments,
81    the type of string should be the shortest one possible that supports all the known values.
82    
83    =over 4
84    
85    =item char
86    
87    single ASCII character
88    
89    =item int
90    
91    32-bit signed integer
92    
93    =item date
94    
95    64-bit unsigned integer, representing a PERL date/time value
96    
97    =item text
98    
99    long string; Text fields cannot be used in indexes or sorting and do not support the
100    normal syntax of filter clauses, but can be up to a billion character in length
101    
102    =item float
103    
104    double-precision floating-point number
105    
106    =item boolean
107    
108    single-bit numeric value; The value is stored as a 16-bit signed integer (for
109    compatability with certain database packages), but the only values supported are
110    0 and 1.
111    
112    =item key-string
113    
114    variable-length string, maximum 40 characters
115    
116    =item name-string
117    
118    variable-length string, maximum 80 characters
119    
120    =item medium-string
121    
122    variable-length string, maximum 160 characters
123    
124    =item string
125    
126    variable-length string, maximum 255 characters
127    
128    =back
129    
130    =head3 Global Tags
131    
132    The entire database definition must be inside a B<Database> tag. The display name of
133    the database is given by the text associated with the B<Title> tag. The display name
134    is only used in the automated documentation. It has no other effect. The entities and
135    relationships are listed inside the B<Entities> and B<Relationships> tags,
136    respectively. None of these tags have attributes.
137    
138        <Database>
139            <Title>... display title here...</Title>
140            <Entities>
141                ... entity definitions here ...
142            </Entities>
143            <Relationships>
144                ... relationship definitions here...
145            </Relationships>
146        </Database>
147    
148    Entities, relationships, indexes, and fields all allow a text tag called B<Notes>.
149    The text inside the B<Notes> tag contains comments that will appear when the database
150    documentation is generated. Within a B<Notes> tag, you may use C<[i]> and C<[/i]> for
151    italics, C<[b]> and C<[/b]> for bold, and C<[p]> for a new paragraph.
152    
153    =head3 Fields
154    
155    Both entities and relationships have fields described by B<Field> tags. A B<Field>
156    tag can have B<Notes> associated with it. The complete set of B<Field> tags for an
157    object mus be inside B<Fields> tags.
158    
159        <Entity ... >
160            <Fields>
161                ... Field tags ...
162            </Fields>
163        </Entity>
164    
165    The attributes for the B<Field> tag are as follows.
166    
167    =over 4
168    
169    =item name
170    
171    Name of the field. The field name should contain only letters, digits, and hyphens (C<->),
172    and the first character should be a letter. Most underlying databases are case-insensitive
173    with the respect to field names, so a best practice is to use lower-case letters only.
174    
175    =item type
176    
177    Data type of the field. The legal data types are given above.
178    
179    =item relation
180    
181    Name of the relation containing the field. This should only be specified for entity
182    fields. The ERDB system does not support optional fields or multi-occurring fields
183    in the primary relation of an entity. Instead, they are put into secondary relations.
184    So, for example, in the C<Genome> entity, the C<group-name> field indicates a special
185    grouping used to select a subset of the genomes. A given genome may not be in any
186    groups or may be in multiple groups. Therefore, C<group-name> specifies a relation
187    value. The relation name specified must be a valid table name. By convention, it is
188    usually the entity name followed by a qualifying word (e.g. C<GenomeGroup>). In an
189    entity, the fields without a relation attribute are said to belong to the
190    I<primary relation>. This relation has the same name as the entity itself.
191    
192    =back
193    
194    =head3 Indexes
195    
196    An entity can have multiple alternate indexes associated with it. The fields must
197    be from the primary relation. The alternate indexes assist in ordering results
198    from a query. A relationship can have up to two indexes-- a I<to-index> and a
199    I<from-index>. These order the results when crossing the relationship. For
200    example, in the relationship C<HasContig> from C<Genome> to C<Contig>, the
201    from-index would order the contigs of a ganome, and the to-index would order
202    the genomes of a contig. A relationship's index must specify only fields in
203    the relationship.
204    
205    The indexes for an entity must be listed inside the B<Indexes> tag. The from-index
206    of a relationship is specified using the B<FromIndex> tag; the to-index is specified
207    using the B<ToIndex> tag.
208    
209    Each index can contain a B<Notes> tag. In addition, it will have an B<IndexFields>
210    tag containing the B<IndexField> tags. These specify, in order, the fields used in
211    the index. The attributes of an B<IndexField> tag are as follows.
212    
213    =over 4
214    
215    =item name
216    
217    Name of the field.
218    
219    =item order
220    
221    Sort order of the field-- C<ascending> or C<descending>.
222    
223    =back
224    
225    The B<Index>, B<FromIndex>, and B<ToIndex> tags themselves have no attributes.
226    
227    =head3 Object and Field Names
228    
229    By convention entity and relationship names use capital casing (e.g. C<Genome> or
230    C<HasRegionsIn>. Most underlying databases, however, are aggressively case-insensitive
231    with respect to relation names, converting them internally to all-upper case or
232    all-lower case.
233    
234    If syntax or parsing errors occur when you try to load or use an ERDB database, the
235    most likely reason is that one of your objects has an SQL reserved word as its name.
236    The list of SQL reserved words keeps increasing; however, most are unlikely to show
237    up as a noun or declarative verb phrase. The exceptions are C<Group>, C<User>,
238    C<Table>, C<Index>, C<Object>, C<Date>, C<Number>, C<Update>, C<Time>, C<Percent>,
239    C<Memo>, C<Order>, and C<Sum>. This problem can crop up in field names as well.
240    
241    Every entity has a field called C<id> that acts as its primary key. Every relationship
242    has fields called C<from-link> and C<to-link> that contain copies of the relevant
243    entity IDs. These are essentially ERDB's reserved words, and should not be used
244    for user-defined field names.
245    
246    =head3 Entities
247    
248    An entity is described by the B<Entity> tag. The entity can contain B<Notes>, an
249    B<Indexes> tag containing one or more secondary indexes, and a B<Fields> tag
250    containing one or more fields. The attributes of the B<Entity> tag are as follows.
251    
252    =over 4
253    
254    =item name
255    
256    Name of the entity. The entity name, by convention, uses capital casing (e.g. C<Genome>
257    or C<GroupBlock>) and should be a noun or noun phrase.
258    
259    =item keyType
260    
261    Data type of the primary key. The primary key is always named C<id>.
262    
263    =back
264    
265    =head3 Relationships
266    
267    A relationship is described by the C<Relationship> tag. Within a relationship,
268    there can be a C<Notes> tag, a C<Fields> tag containing the intersection data
269    fields, a C<FromIndex> tag containing the from-index, and a C<ToIndex> tag containing
270    the to-index.
271    
272    The C<Relationship> tag has the following attributes.
273    
274    =over 4
275    
276    =item name
277    
278    Name of the relationship. The relationship name, by convention, uses capital casing
279    (e.g. C<ContainsRegionIn> or C<HasContig>), and should be a declarative verb
280    phrase, designed to fit between the from-entity and the to-entity (e.g.
281    Block C<ContainsRegionIn> Genome).
282    
283    =item from
284    
285    Name of the entity from which the relationship starts.
286    
287    =item to
288    
289    Name of the entity to which the relationship proceeds.
290    
291    =item arity
292    
293    Relationship type: C<1M> for one-to-many and C<MM> for many-to-many.
294    
295    =back
296    
297  =cut  =cut
298    
299  # GLOBALS  # GLOBALS
# Line 76  Line 301 
301  # Table of information about our datatypes. "sqlType" is the corresponding SQL datatype string.  # Table of information about our datatypes. "sqlType" is the corresponding SQL datatype string.
302  # "maxLen" is the maximum permissible length of the incoming string data used to populate a field  # "maxLen" is the maximum permissible length of the incoming string data used to populate a field
303  # of the specified type. "dataGen" is PERL string that will be evaluated if no test data generation  # of the specified type. "dataGen" is PERL string that will be evaluated if no test data generation
304   #string is specified in the field definition.  # string is specified in the field definition. "avgLen" is the average byte length for estimating
305  my %TypeTable = ( char =>        { sqlType => 'CHAR(1)',                        maxLen => 1,                    dataGen => "StringGen('A')" },  # record sizes.
306                                    int =>         { sqlType => 'INTEGER',                        maxLen => 20,                   dataGen => "IntGen(0, 99999999)" },  my %TypeTable = ( char =>    { sqlType => 'CHAR(1)',            maxLen => 1,            avgLen =>   1, dataGen => "StringGen('A')" },
307                                    string =>  { sqlType => 'VARCHAR(255)',               maxLen => 255,                  dataGen => "StringGen(IntGen(10,250))" },                    int =>     { sqlType => 'INTEGER',            maxLen => 20,           avgLen =>   4, dataGen => "IntGen(0, 99999999)" },
308                                    text =>        { sqlType => 'TEXT',                           maxLen => 1000000000,   dataGen => "StringGen(IntGen(80,1000))" },                    string =>  { sqlType => 'VARCHAR(255)',       maxLen => 255,          avgLen => 100, dataGen => "StringGen(IntGen(10,250))" },
309                                    date =>        { sqlType => 'BIGINT',                         maxLen => 80,                   dataGen => "DateGen(-7, 7, IntGen(0,1400))" },                    text =>    { sqlType => 'TEXT',               maxLen => 1000000000,   avgLen => 500, dataGen => "StringGen(IntGen(80,1000))" },
310                                    float =>       { sqlType => 'DOUBLE PRECISION',       maxLen => 40,                   dataGen => "FloatGen(0.0, 100.0)" },                    date =>    { sqlType => 'BIGINT',             maxLen => 80,           avgLen =>   8, dataGen => "DateGen(-7, 7, IntGen(0,1400))" },
311                                    boolean => { sqlType => 'SMALLINT',                   maxLen => 1,                    dataGen => "IntGen(0, 1)" },                    float =>   { sqlType => 'DOUBLE PRECISION',   maxLen => 40,           avgLen =>   8, dataGen => "FloatGen(0.0, 100.0)" },
312                      boolean => { sqlType => 'SMALLINT',           maxLen => 1,            avgLen =>   2, dataGen => "IntGen(0, 1)" },
313                               'key-string' =>                               'key-string' =>
314                                                           { sqlType => 'VARCHAR(40)',            maxLen => 40,                   dataGen => "StringGen(IntGen(10,40))" },                               { sqlType => 'VARCHAR(40)',        maxLen => 40,           avgLen =>  10, dataGen => "StringGen(IntGen(10,40))" },
315                                   'name-string' =>                                   'name-string' =>
316                                                           { sqlType => 'VARCHAR(80)',            maxLen => 80,                   dataGen => "StringGen(IntGen(10,80))" },                               { sqlType => 'VARCHAR(80)',        maxLen => 80,           avgLen =>  40, dataGen => "StringGen(IntGen(10,80))" },
317                                   'medium-string' =>                                   'medium-string' =>
318                                                           { sqlType => 'VARCHAR(160)',           maxLen => 160,                  dataGen => "StringGen(IntGen(10,160))" },                               { sqlType => 'VARCHAR(160)',       maxLen => 160,          avgLen =>  40, dataGen => "StringGen(IntGen(10,160))" },
319                                  );                                  );
320    
321  # Table translating arities into natural language.  # Table translating arities into natural language.
# Line 145  Line 371 
371    
372  =head3 ShowMetaData  =head3 ShowMetaData
373    
374  C<< $database->ShowMetaData($fileName); >>  C<< $erdb->ShowMetaData($fileName); >>
375    
376  This method outputs a description of the database. This description can be used to help users create  This method outputs a description of the database. This description can be used to help users create
377  the data to be loaded into the relations.  the data to be loaded into the relations.
# Line 300  Line 526 
526    
527  =head3 DumpMetaData  =head3 DumpMetaData
528    
529  C<< $database->DumpMetaData(); >>  C<< $erdb->DumpMetaData(); >>
530    
531  Return a dump of the metadata structure.  Return a dump of the metadata structure.
532    
# Line 315  Line 541 
541    
542  =head3 CreateTables  =head3 CreateTables
543    
544  C<< $datanase->CreateTables(); >>  C<< $erdb->CreateTables(); >>
545    
546  This method creates the tables for the database from the metadata structure loaded by the  This method creates the tables for the database from the metadata structure loaded by the
547  constructor. It is expected this function will only be used on rare occasions, when the  constructor. It is expected this function will only be used on rare occasions, when the
# Line 353  Line 579 
579    
580  =head3 CreateTable  =head3 CreateTable
581    
582  C<< $database->CreateTable($tableName, $indexFlag); >>  C<< $erdb->CreateTable($tableName, $indexFlag, $estimatedRows); >>
583    
584  Create the table for a relation and optionally create its indexes.  Create the table for a relation and optionally create its indexes.
585    
# Line 363  Line 589 
589    
590  Name of the relation (which will also be the table name).  Name of the relation (which will also be the table name).
591    
592  =item $indexFlag  =item indexFlag
593    
594  TRUE if the indexes for the relation should be created, else FALSE. If FALSE,  TRUE if the indexes for the relation should be created, else FALSE. If FALSE,
595  L</CreateIndexes> must be called later to bring the indexes into existence.  L</CreateIndexes> must be called later to bring the indexes into existence.
596    
597    =item estimatedRows (optional)
598    
599    If specified, the estimated maximum number of rows for the relation. This
600    information allows the creation of tables using storage engines that are
601    faster but require size estimates, such as MyISAM.
602    
603  =back  =back
604    
605  =cut  =cut
606    
607  sub CreateTable {  sub CreateTable {
608          # Get the parameters.          # Get the parameters.
609          my ($self, $relationName, $indexFlag) = @_;      my ($self, $relationName, $indexFlag, $estimatedRows) = @_;
610          # Get the database handle.          # Get the database handle.
611          my $dbh = $self->{_dbh};          my $dbh = $self->{_dbh};
612          # Get the relation data and determine whether or not the relation is primary.          # Get the relation data and determine whether or not the relation is primary.
# Line 398  Line 630 
630          # Insure the table is not already there.          # Insure the table is not already there.
631          $dbh->drop_table(tbl => $relationName);          $dbh->drop_table(tbl => $relationName);
632          Trace("Table $relationName dropped.") if T(2);          Trace("Table $relationName dropped.") if T(2);
633        # If there are estimated rows, create an estimate so we can take advantage of
634        # faster DB technologies.
635        my $estimation = undef;
636        if ($estimatedRows) {
637            $estimation = [$self->EstimateRowSize($relationName), $estimatedRows];
638        }
639          # Create the table.          # Create the table.
640          Trace("Creating table $relationName: $fieldThing") if T(2);          Trace("Creating table $relationName: $fieldThing") if T(2);
641          $dbh->create_table(tbl => $relationName, flds => $fieldThing);      $dbh->create_table(tbl => $relationName, flds => $fieldThing, estimates => $estimation);
642          Trace("Relation $relationName created in database.") if T(2);          Trace("Relation $relationName created in database.") if T(2);
643          # If we want to build the indexes, we do it here.          # If we want to build the indexes, we do it here.
644          if ($indexFlag) {          if ($indexFlag) {
# Line 410  Line 648 
648    
649  =head3 CreateIndex  =head3 CreateIndex
650    
651  C<< $database->CreateIndex($relationName); >>  C<< $erdb->CreateIndex($relationName); >>
652    
653  Create the indexes for a relation. If a table is being loaded from a large source file (as  Create the indexes for a relation. If a table is being loaded from a large source file (as
654  is the case in L</LoadTable>), it is best to create the indexes after the load. If that is  is the case in L</LoadTable>), it is sometimes best to create the indexes after the load.
655  the case, then L</CreateTable> should be called with the index flag set to FALSE, and this  If that is the case, then L</CreateTable> should be called with the index flag set to
656  method used after the load to create the indexes for the table.  FALSE, and this method used after the load to create the indexes for the table.
657    
658  =cut  =cut
659    
# Line 443  Line 681 
681    
682  =head3 LoadTables  =head3 LoadTables
683    
684  C<< my $stats = $database->LoadTables($directoryName, $rebuild); >>  C<< my $stats = $erdb->LoadTables($directoryName, $rebuild); >>
685    
686  This method will load the database tables from a directory. The tables must already have been created  This method will load the database tables from a directory. The tables must already have been created
687  in the database. (This can be done by calling L</CreateTables>.) The caller passes in a directory name;  in the database. (This can be done by calling L</CreateTables>.) The caller passes in a directory name;
# Line 513  Line 751 
751    
752  =head3 GetTableNames  =head3 GetTableNames
753    
754  C<< my @names = $database->GetTableNames; >>  C<< my @names = $erdb->GetTableNames; >>
755    
756  Return a list of the relations required to implement this database.  Return a list of the relations required to implement this database.
757    
# Line 530  Line 768 
768    
769  =head3 GetEntityTypes  =head3 GetEntityTypes
770    
771  C<< my @names = $database->GetEntityTypes; >>  C<< my @names = $erdb->GetEntityTypes; >>
772    
773  Return a list of the entity type names.  Return a list of the entity type names.
774    
# Line 547  Line 785 
785    
786  =head3 Get  =head3 Get
787    
788  C<< my $query = $database->Get(\@objectNames, $filterClause, $param1, $param2, ..., $paramN); >>  C<< my $query = $erdb->Get(\@objectNames, $filterClause, $param1, $param2, ..., $paramN); >>
789    
790  This method returns a query object for entities of a specified type using a specified filter.  This method returns a query object for entities of a specified type using a specified filter.
791  The filter is a standard WHERE/ORDER BY clause with question marks as parameter markers and each  The filter is a standard WHERE/ORDER BY clause with question marks as parameter markers and each
# Line 555  Line 793 
793  following call requests all B<Genome> objects for the genus specified in the variable  following call requests all B<Genome> objects for the genus specified in the variable
794  $genus.  $genus.
795    
796  C<< $query = $sprout->Get(['Genome'], "Genome(genus) = ?", $genus); >>  C<< $query = $erdb->Get(['Genome'], "Genome(genus) = ?", $genus); >>
797    
798  The WHERE clause contains a single question mark, so there is a single additional  The WHERE clause contains a single question mark, so there is a single additional
799  parameter representing the parameter value. It would also be possible to code  parameter representing the parameter value. It would also be possible to code
800    
801  C<< $query = $sprout->Get(['Genome'], "Genome(genus) = \'$genus\'"); >>  C<< $query = $erdb->Get(['Genome'], "Genome(genus) = \'$genus\'"); >>
802    
803  however, this version of the call would generate a syntax error if there were any quote  however, this version of the call would generate a syntax error if there were any quote
804  characters inside the variable C<$genus>.  characters inside the variable C<$genus>.
# Line 572  Line 810 
810  It is possible to specify multiple entity and relationship names in order to retrieve more than  It is possible to specify multiple entity and relationship names in order to retrieve more than
811  one object's data at the same time, which allows highly complex joined queries. For example,  one object's data at the same time, which allows highly complex joined queries. For example,
812    
813  C<< $query = $sprout->Get(['Genome', 'ComesFrom', 'Source'], "Genome(genus) = ?", $genus); >>  C<< $query = $erdb->Get(['Genome', 'ComesFrom', 'Source'], "Genome(genus) = ?", $genus); >>
814    
815  If multiple names are specified, then the query processor will automatically determine a  If multiple names are specified, then the query processor will automatically determine a
816  join path between the entities and relationships. The algorithm used is very simplistic.  join path between the entities and relationships. The algorithm used is very simplistic.
# Line 735  Line 973 
973                          $command .= " ORDER BY $orderClause";                          $command .= " ORDER BY $orderClause";
974                  }                  }
975          }          }
976          Trace("SQL query: $command") if T(2);      Trace("SQL query: $command") if T(3);
977          Trace("PARMS: '" . (join "', '", @params) . "'") if (T(3) && (@params > 0));      Trace("PARMS: '" . (join "', '", @params) . "'") if (T(4) && (@params > 0));
978          my $sth = $dbh->prepare_command($command);          my $sth = $dbh->prepare_command($command);
979          # Execute it with the parameters bound in.          # Execute it with the parameters bound in.
980          $sth->execute(@params) || Confess("SELECT error" . $sth->errstr());          $sth->execute(@params) || Confess("SELECT error" . $sth->errstr());
# Line 747  Line 985 
985    
986  =head3 GetList  =head3 GetList
987    
988  C<< my @dbObjects = $database->GetList(\@objectNames, $filterClause, $param1, $param2, ..., $paramN); >>  C<< my @dbObjects = $erdb->GetList(\@objectNames, $filterClause, $param1, $param2, ..., $paramN); >>
989    
990  Return a list of object descriptors for the specified objects as determined by the  Return a list of object descriptors for the specified objects as determined by the
991  specified filter clause.  specified filter clause.
# Line 810  Line 1048 
1048    
1049  =head3 ComputeObjectSentence  =head3 ComputeObjectSentence
1050    
1051  C<< my $sentence = $database->ComputeObjectSentence($objectName); >>  C<< my $sentence = $erdb->ComputeObjectSentence($objectName); >>
1052    
1053  Check an object name, and if it is a relationship convert it to a relationship sentence.  Check an object name, and if it is a relationship convert it to a relationship sentence.
1054    
# Line 845  Line 1083 
1083    
1084  =head3 DumpRelations  =head3 DumpRelations
1085    
1086  C<< $database->DumpRelations($outputDirectory); >>  C<< $erdb->DumpRelations($outputDirectory); >>
1087    
1088  Write the contents of all the relations to tab-delimited files in the specified directory.  Write the contents of all the relations to tab-delimited files in the specified directory.
1089  Each file will have the same name as the relation dumped, with an extension of DTX.  Each file will have the same name as the relation dumped, with an extension of DTX.
# Line 887  Line 1125 
1125    
1126  =head3 InsertObject  =head3 InsertObject
1127    
1128  C<< my $ok = $database->InsertObject($objectType, \%fieldHash); >>  C<< my $ok = $erdb->InsertObject($objectType, \%fieldHash); >>
1129    
1130  Insert an object into the database. The object is defined by a type name and then a hash  Insert an object into the database. The object is defined by a type name and then a hash
1131  of field names to values. Field values in the primary relation are represented by scalars.  of field names to values. Field values in the primary relation are represented by scalars.
# Line 896  Line 1134 
1134  example, the following line inserts an inactive PEG feature named C<fig|188.1.peg.1> with aliases  example, the following line inserts an inactive PEG feature named C<fig|188.1.peg.1> with aliases
1135  C<ZP_00210270.1> and C<gi|46206278>.  C<ZP_00210270.1> and C<gi|46206278>.
1136    
1137  C<< $database->InsertObject('Feature', { id => 'fig|188.1.peg.1', active => 0, feature-type => 'peg', alias => ['ZP_00210270.1', 'gi|46206278']}); >>  C<< $erdb->InsertObject('Feature', { id => 'fig|188.1.peg.1', active => 0, feature-type => 'peg', alias => ['ZP_00210270.1', 'gi|46206278']}); >>
1138    
1139  The next statement inserts a C<HasProperty> relationship between feature C<fig|158879.1.peg.1> and  The next statement inserts a C<HasProperty> relationship between feature C<fig|158879.1.peg.1> and
1140  property C<4> with an evidence URL of C<http://seedu.uchicago.edu/query.cgi?article_id=142>.  property C<4> with an evidence URL of C<http://seedu.uchicago.edu/query.cgi?article_id=142>.
1141    
1142  C<< $database->InsertObject('HasProperty', { 'from-link' => 'fig|158879.1.peg.1', 'to-link' => 4, evidence = 'http://seedu.uchicago.edu/query.cgi?article_id=142'}); >>  C<< $erdb->InsertObject('HasProperty', { 'from-link' => 'fig|158879.1.peg.1', 'to-link' => 4, evidence = 'http://seedu.uchicago.edu/query.cgi?article_id=142'}); >>
1143    
1144  =over 4  =over 4
1145    
# Line 1026  Line 1264 
1264    
1265  =head3 LoadTable  =head3 LoadTable
1266    
1267  C<< my %results = $database->LoadTable($fileName, $relationName, $truncateFlag); >>  C<< my %results = $erdb->LoadTable($fileName, $relationName, $truncateFlag); >>
1268    
1269  Load data from a tab-delimited file into a specified table, optionally re-creating the table first.  Load data from a tab-delimited file into a specified table, optionally re-creating the table
1270    first.
1271    
1272  =over 4  =over 4
1273    
# Line 1046  Line 1285 
1285    
1286  =item RETURN  =item RETURN
1287    
1288  Returns a statistical object containing the number of records read and a list of the error messages.  Returns a statistical object containing the number of records read and a list of
1289    the error messages.
1290    
1291  =back  =back
1292    
# Line 1057  Line 1297 
1297          # Create the statistical return object.          # Create the statistical return object.
1298          my $retVal = _GetLoadStats();          my $retVal = _GetLoadStats();
1299          # Trace the fact of the load.          # Trace the fact of the load.
1300          Trace("Loading table $relationName from $fileName") if T(1);      Trace("Loading table $relationName from $fileName") if T(2);
1301          # Get the database handle.          # Get the database handle.
1302          my $dbh = $self->{_dbh};          my $dbh = $self->{_dbh};
1303          # Get the relation data.          # Get the relation data.
1304          my $relation = $self->_FindRelation($relationName);          my $relation = $self->_FindRelation($relationName);
1305          # Check the truncation flag.          # Check the truncation flag.
1306          if ($truncateFlag) {          if ($truncateFlag) {
1307                  Trace("Creating table $relationName") if T(1);          Trace("Creating table $relationName") if T(2);
1308            # Compute the row count estimate. We take the size of the load file,
1309            # divide it by the estimated row size, and then multiply by 1.5 to
1310            # leave extra room. We postulate a minimum row count of 1000 to
1311            # prevent problems with incoming empty load files.
1312            my $rowSize = $self->EstimateRowSize($relationName);
1313            my $fileSize = -s $fileName;
1314            my $estimate = FIG::max($fileSize * 1.5 / $rowSize, 1000);
1315                  # Re-create the table without its index.                  # Re-create the table without its index.
1316                  $self->CreateTable($relationName, 0);          $self->CreateTable($relationName, 0, $estimate);
1317            # If this is a pre-index DBMS, create the index here.
1318            if ($dbh->{_preIndex}) {
1319                eval {
1320                    $self->CreateIndex($relationName);
1321                };
1322                if ($@) {
1323                    $retVal->AddMessage($@);
1324                }
1325            }
1326          }          }
1327          # Determine whether or not this is a primary relation. Primary relations have an extra          # Determine whether or not this is a primary relation. Primary relations have an extra
1328          # field indicating whether or not a given object is new or was loaded from the flat files.          # field indicating whether or not a given object is new or was loaded from the flat files.
# Line 1074  Line 1330 
1330          # Get the number of fields in this relation.          # Get the number of fields in this relation.
1331          my @fieldList = @{$relation->{Fields}};          my @fieldList = @{$relation->{Fields}};
1332          my $fieldCount = @fieldList;          my $fieldCount = @fieldList;
         # Record the number of expected fields.  
         my $expectedFields = $fieldCount + ($primary ? 1 : 0);  
1333          # Start a database transaction.          # Start a database transaction.
1334          $dbh->begin_tran;          $dbh->begin_tran;
1335          # Open the relation file. We need to create a cleaned-up copy before loading.          # Open the relation file. We need to create a cleaned-up copy before loading.
1336          open TABLEIN, '<', $fileName;          open TABLEIN, '<', $fileName;
1337          my $tempName = "$fileName.tbl";          my $tempName = "$fileName.tbl";
1338          open TABLEOUT, '>', $tempName;          open TABLEOUT, '>', $tempName;
1339        my $inputCount = 0;
1340          # Loop through the file.          # Loop through the file.
1341          while (<TABLEIN>) {          while (<TABLEIN>) {
1342            $inputCount++;
1343                  # Chop off the new-line character.                  # Chop off the new-line character.
1344                  my $record = $_;          my $record = Tracer::Strip($_);
                 chomp $record;  
1345          # Only proceed if the record is non-blank.          # Only proceed if the record is non-blank.
1346          if ($record) {          if ($record) {
1347              # Escape all the backslashes found in the line.              # Escape all the backslashes found in the line.
1348              $record =~ s/\\/\\\\/g;              $record =~ s/\\/\\\\/g;
1349              # Eliminate any trailing tabs.              # Insure the number of fields is correct.
1350              chop $record while substr($record, -1) eq "\t";              my @fields = split /\t/, $record;
1351                while (@fields > $fieldCount) {
1352                    my $extraField = $fields[$#fields];
1353                    delete $fields[$#fields];
1354                    if ($extraField) {
1355                        Trace("Nonblank extra field value \"$extraField\" deleted from record $inputCount of $fileName.") if T(1);
1356                    }
1357                }
1358                while (@fields < $fieldCount) {
1359                    push @fields, "";
1360                }
1361              # If this is a primary relation, add a 0 for the new-record flag (indicating that              # If this is a primary relation, add a 0 for the new-record flag (indicating that
1362              # this record is not new, but part of the original load).              # this record is not new, but part of the original load).
1363              if ($primary) {              if ($primary) {
1364                  $record .= "\t0";                  push @fields, "0";
1365              }              }
1366              # Write the record.              # Write the record.
1367                $record = join "\t", @fields;
1368              print TABLEOUT "$record\n";              print TABLEOUT "$record\n";
1369              # Count the record read.              # Count the record written.
1370              my $count = $retVal->Add('records');              my $count = $retVal->Add('records');
1371              my $len = length $record;              my $len = length $record;
1372              Trace("Record $count written with $len characters.") if T(4);              Trace("Record $count written with $len characters.") if T(4);
1373            } else {
1374                # Here we have a blank record.
1375                $retVal->Add('skipped');
1376          }          }
1377          }          }
1378          # Close the files.          # Close the files.
1379          close TABLEIN;          close TABLEIN;
1380          close TABLEOUT;          close TABLEOUT;
1381      Trace("Temporary file $tempName created.") if T(4);      Trace("Temporary file $tempName created.") if T(2);
1382      # Load the table.      # Load the table.
1383          my $rv;          my $rv;
1384          eval {          eval {
# Line 1121  Line 1390 
1390                  Trace("Table load failed for $relationName.") if T(1);                  Trace("Table load failed for $relationName.") if T(1);
1391          } else {          } else {
1392                  # Here we successfully loaded the table. Trace the number of records loaded.                  # Here we successfully loaded the table. Trace the number of records loaded.
1393                  Trace("$retVal->{records} records read for $relationName.") if T(1);          Trace("$retVal->{records} records read for $relationName.") if T(2);
1394                  # If we're rebuilding, we need to create the table indexes.                  # If we're rebuilding, we need to create the table indexes.
1395                  if ($truncateFlag) {          if ($truncateFlag && ! $dbh->{_preIndex}) {
1396                          eval {                          eval {
1397                                  $self->CreateIndex($relationName);                                  $self->CreateIndex($relationName);
1398                          };                          };
# Line 1131  Line 1400 
1400                                  $retVal->AddMessage($@);                                  $retVal->AddMessage($@);
1401                          }                          }
1402                  }                  }
1403            # Analyze the table to help optimize tables.
1404          }          }
1405          # Commit the database changes.          # Commit the database changes.
1406          $dbh->commit_tran;          $dbh->commit_tran;
1407        $dbh->vacuum_it($relationName);
1408          # Delete the temporary file.          # Delete the temporary file.
1409          unlink $tempName;          unlink $tempName;
1410          # Return the statistics.          # Return the statistics.
# Line 1142  Line 1413 
1413    
1414  =head3 GenerateEntity  =head3 GenerateEntity
1415    
1416  C<< my $fieldHash = $database->GenerateEntity($id, $type, \%values); >>  C<< my $fieldHash = $erdb->GenerateEntity($id, $type, \%values); >>
1417    
1418  Generate the data for a new entity instance. This method creates a field hash suitable for  Generate the data for a new entity instance. This method creates a field hash suitable for
1419  passing as a parameter to L</InsertObject>. The ID is specified by the callr, but the rest  passing as a parameter to L</InsertObject>. The ID is specified by the callr, but the rest
# Line 1200  Line 1471 
1471    
1472  =head3 GetEntity  =head3 GetEntity
1473    
1474  C<< my $entityObject = $sprout->GetEntity($entityType, $ID); >>  C<< my $entityObject = $erdb->GetEntity($entityType, $ID); >>
1475    
1476  Return an object describing the entity instance with a specified ID.  Return an object describing the entity instance with a specified ID.
1477    
# Line 1236  Line 1507 
1507    
1508  =head3 GetEntityValues  =head3 GetEntityValues
1509    
1510  C<< my @values = GetEntityValues($entityType, $ID, \@fields); >>  C<< my @values = $erdb->GetEntityValues($entityType, $ID, \@fields); >>
1511    
1512  Return a list of values from a specified entity instance.  Return a list of values from a specified entity instance.
1513    
# Line 1279  Line 1550 
1550    
1551  =head3 GetAll  =head3 GetAll
1552    
1553  C<< my @list = $sprout->GetAll(\@objectNames, $filterClause, \@parameters, \@fields, $count); >>  C<< my @list = $erdb->GetAll(\@objectNames, $filterClause, \@parameters, \@fields, $count); >>
1554    
1555  Return a list of values taken from the objects returned by a query. The first three  Return a list of values taken from the objects returned by a query. The first three
1556  parameters correspond to the parameters of the L</Get> method. The final parameter is  parameters correspond to the parameters of the L</Get> method. The final parameter is
# Line 1295  Line 1566 
1566  spreadsheet cell, and each feature will be represented by a list containing the  spreadsheet cell, and each feature will be represented by a list containing the
1567  feature ID followed by all of its aliases.  feature ID followed by all of its aliases.
1568    
1569  C<< $query = $sprout->Get(['ContainsFeature', 'Feature'], "ContainsFeature(from-link) = ?", [$ssCellID], ['Feature(id)', 'Feature(alias)']); >>  C<< $query = $erdb->Get(['ContainsFeature', 'Feature'], "ContainsFeature(from-link) = ?", [$ssCellID], ['Feature(id)', 'Feature(alias)']); >>
1570    
1571  =over 4  =over 4
1572    
# Line 1364  Line 1635 
1635          return @retVal;          return @retVal;
1636  }  }
1637    
1638    =head3 EstimateRowSize
1639    
1640    C<< my $rowSize = $erdb->EstimateRowSize($relName); >>
1641    
1642    Estimate the row size of the specified relation. The estimated row size is computed by adding
1643    up the average length for each data type.
1644    
1645    =over 4
1646    
1647    =item relName
1648    
1649    Name of the relation whose estimated row size is desired.
1650    
1651    =item RETURN
1652    
1653    Returns an estimate of the row size for the specified relation.
1654    
1655    =back
1656    
1657    =cut
1658    #: Return Type $;
1659    sub EstimateRowSize {
1660        # Get the parameters.
1661        my ($self, $relName) = @_;
1662        # Declare the return variable.
1663        my $retVal = 0;
1664        # Find the relation descriptor.
1665        my $relation = $self->_FindRelation($relName);
1666        # Get the list of fields.
1667        for my $fieldData (@{$relation->{Fields}}) {
1668            # Get the field type and add its length.
1669            my $fieldLen = $TypeTable{$fieldData->{type}}->{avgLen};
1670            $retVal += $fieldLen;
1671        }
1672        # Return the result.
1673        return $retVal;
1674    }
1675    
1676  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1677    
1678  =head3 GetLoadStats  =head3 GetLoadStats
# Line 1739  Line 2048 
2048  sub _LoadMetaData {  sub _LoadMetaData {
2049          # Get the parameters.          # Get the parameters.
2050          my ($filename) = @_;          my ($filename) = @_;
2051        Trace("Reading Sprout DBD from $filename.") if T(2);
2052          # Slurp the XML file into a variable. Extensive use of options is used to insure we          # Slurp the XML file into a variable. Extensive use of options is used to insure we
2053          # get the exact structure we want.          # get the exact structure we want.
2054          my $metadata = XML::Simple::XMLin($filename,          my $metadata = XML::Simple::XMLin($filename,
# Line 1766  Line 2076 
2076          for my $entityName (keys %{$entityList}) {          for my $entityName (keys %{$entityList}) {
2077                  my $entityStructure = $entityList->{$entityName};                  my $entityStructure = $entityList->{$entityName};
2078                  #                  #
2079                  # The first step is to run creating all the entity's default values. For C<Field> elements,          # The first step is to create all the entity's default values. For C<Field> elements,
2080                  # the relation name must be added where it is not specified. For relationships,                  # the relation name must be added where it is not specified. For relationships,
2081                  # the B<from-link> and B<to-link> fields must be inserted, and for entities an B<id>                  # the B<from-link> and B<to-link> fields must be inserted, and for entities an B<id>
2082                  # field must be added to each relation. Finally, each field will have a C<PrettySort> attribute                  # field must be added to each relation. Finally, each field will have a C<PrettySort> attribute
# Line 1945  Line 2255 
2255                  my @fromList = ();                  my @fromList = ();
2256                  my @toList = ();                  my @toList = ();
2257                  my @bothList = ();                  my @bothList = ();
2258                  Trace("Join table build for $entityName.") if T(3);          Trace("Join table build for $entityName.") if T(4);
2259                  for my $relationshipName (keys %{$relationshipList}) {                  for my $relationshipName (keys %{$relationshipList}) {
2260                          my $relationship = $relationshipList->{$relationshipName};                          my $relationship = $relationshipList->{$relationshipName};
2261                          # Determine if this relationship has our entity in one of its link fields.                          # Determine if this relationship has our entity in one of its link fields.
2262                          my $fromEntity = $relationship->{from};                          my $fromEntity = $relationship->{from};
2263                          my $toEntity = $relationship->{to};                          my $toEntity = $relationship->{to};
2264                          Trace("Join check for relationship $relationshipName from $fromEntity to $toEntity.") if T(3);              Trace("Join check for relationship $relationshipName from $fromEntity to $toEntity.") if T(4);
2265                          if ($fromEntity eq $entityName) {                          if ($fromEntity eq $entityName) {
2266                                  if ($toEntity eq $entityName) {                                  if ($toEntity eq $entityName) {
2267                                          # Here the relationship is recursive.                                          # Here the relationship is recursive.
2268                                          push @bothList, $relationshipName;                                          push @bothList, $relationshipName;
2269                                          Trace("Relationship $relationshipName put in both-list.") if T(3);                      Trace("Relationship $relationshipName put in both-list.") if T(4);
2270                                  } else {                                  } else {
2271                                          # Here the relationship comes from the entity.                                          # Here the relationship comes from the entity.
2272                                          push @fromList, $relationshipName;                                          push @fromList, $relationshipName;
2273                                          Trace("Relationship $relationshipName put in from-list.") if T(3);                      Trace("Relationship $relationshipName put in from-list.") if T(4);
2274                                  }                                  }
2275                          } elsif ($toEntity eq $entityName) {                          } elsif ($toEntity eq $entityName) {
2276                                  # Here the relationship goes to the entity.                                  # Here the relationship goes to the entity.
2277                                  push @toList, $relationshipName;                                  push @toList, $relationshipName;
2278                                  Trace("Relationship $relationshipName put in to-list.") if T(3);                  Trace("Relationship $relationshipName put in to-list.") if T(4);
2279                          }                          }
2280                  }                  }
2281                  # Create the nonrecursive joins. Note that we build two hashes for running                  # Create the nonrecursive joins. Note that we build two hashes for running
# Line 2011  Line 2321 
2321                                  # relationship can only be ambiguous with another recursive relationship,                                  # relationship can only be ambiguous with another recursive relationship,
2322                                  # and the incoming relationship from the outer loop is never recursive.                                  # and the incoming relationship from the outer loop is never recursive.
2323                                  for my $otherName (@bothList) {                                  for my $otherName (@bothList) {
2324                                          Trace("Setting up relationship joins to recursive relationship $otherName with $relationshipName.") if T(3);                      Trace("Setting up relationship joins to recursive relationship $otherName with $relationshipName.") if T(4);
2325                                          # Join from the left.                                          # Join from the left.
2326                                          $joinTable{"$relationshipName/$otherName"} =                                          $joinTable{"$relationshipName/$otherName"} =
2327                                                  "$linkField = $otherName.from_link";                                                  "$linkField = $otherName.from_link";
# Line 2026  Line 2336 
2336                  # rise to situations where we can't create the path we want; however, it is always                  # rise to situations where we can't create the path we want; however, it is always
2337                  # possible to get the same effect using multiple queries.                  # possible to get the same effect using multiple queries.
2338                  for my $relationshipName (@bothList) {                  for my $relationshipName (@bothList) {
2339                          Trace("Setting up entity joins to recursive relationship $relationshipName with $entityName.") if T(3);              Trace("Setting up entity joins to recursive relationship $relationshipName with $entityName.") if T(4);
2340                          # Join to the entity from each direction.                          # Join to the entity from each direction.
2341                          $joinTable{"$entityName/$relationshipName"} =                          $joinTable{"$entityName/$relationshipName"} =
2342                                  "$entityName.id = $relationshipName.from_link";                                  "$entityName.id = $relationshipName.from_link";
# Line 2077  Line 2387 
2387          # index descriptor does not exist, it will be created automatically so we can add          # index descriptor does not exist, it will be created automatically so we can add
2388          # the field to it.          # the field to it.
2389          unshift @{$newIndex->{IndexFields}}, $firstField;          unshift @{$newIndex->{IndexFields}}, $firstField;
2390        # If this is a one-to-many relationship, the "To" index is unique.
2391        if ($relationshipStructure->{arity} eq "1M" && $indexKey eq "To") {
2392            $newIndex->{Unique} = 'true';
2393        }
2394          # Add the index to the relation.          # Add the index to the relation.
2395          _AddIndex("idx$relationshipName$indexKey", $relationStructure, $newIndex);          _AddIndex("idx$relationshipName$indexKey", $relationStructure, $newIndex);
2396  }  }
# Line 2168  Line 2482 
2482                  # Here we have a field list. Loop through its fields.                  # Here we have a field list. Loop through its fields.
2483                  my $fieldStructures = $structure->{Fields};                  my $fieldStructures = $structure->{Fields};
2484                  for my $fieldName (keys %{$fieldStructures}) {                  for my $fieldName (keys %{$fieldStructures}) {
2485                Trace("Processing field $fieldName of $defaultRelationName.") if T(4);
2486                          my $fieldData = $fieldStructures->{$fieldName};                          my $fieldData = $fieldStructures->{$fieldName};
2487                          # Get the field type.                          # Get the field type.
2488                          my $type = $fieldData->{type};                          my $type = $fieldData->{type};

Legend:
Removed from v.1.7  
changed lines
  Added in v.1.19

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3