[Bio] / Sprout / ERDBGenerate.pm Repository:
ViewVC logotype

Annotation of /Sprout/ERDBGenerate.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package ERDBGenerate;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use PageBuilder;
8 :     use ERDB;
9 :     use Stats;
10 :    
11 :     =head1 ERDB Table Data Generation Helper Object
12 :    
13 :     =head2 Introduction
14 :    
15 :     This object is designed to assist with creating the load files for an ERDB
16 :     data relation (also known as a I<table>).
17 :    
18 :     The generation process can be very long, so each table is loaded a section at a
19 :     time, with multiple sections running in parallel. After the load files for each
20 :     section are created, a separate process is used to collate the sections and load
21 :     them into the database tables.
22 :    
23 :     While the output file is being written, it is given the temporary extension C<dtz> to denote
24 :     it is currently being processed. When the L</Finish> method is called, the file is closed
25 :     and renamed to its final C<dtx> name. If the L</Finish> method is not called, it is presumed that
26 :     the load has failed. The C<dtz> file will remain in place so that the collator knows it is invalid.
27 :    
28 :     This object maintains the following data fields.
29 :    
30 :     =over 4
31 :    
32 :     =item directory
33 :    
34 :     Directory into which load files should be placed.
35 :    
36 :     =item dupHash
37 :    
38 :     Hash of IDs already seen for this table. This hash is only created if the table
39 :     is the primary relation of an entity. Removing duplicate IDs is not strictly necessary,
40 :     but it saves disk space and avoids unnecessary I/O, which makes us a bit faster.
41 :    
42 :     =item erdb
43 :    
44 :     [[ErdbPm]] object used to create and access the database. This will usually
45 :     be a subclass of a pure ERDB object created to manage a particular database.
46 :    
47 :     =item fh
48 :    
49 :     Open handle of the current output file (if any).
50 :    
51 :     =item fileName
52 :    
53 :     Name of the current output file (if any).
54 :    
55 :     =item relation
56 :    
57 :     Reference to the descriptor for this table's relation in the ERDB database
58 :     object.
59 :    
60 : parrello 1.2 =item stats
61 :    
62 :     Statistics object for recording events.
63 :    
64 : parrello 1.1 =item table
65 :    
66 :     Name of the relation table being loaded.
67 :    
68 :     =back
69 :    
70 :     =cut
71 :    
72 :     =head3 new
73 :    
74 : parrello 1.2 my $erdbload = ERDBGenerate->new($erdb, $directory, $table, $stats);
75 : parrello 1.1
76 :     Create an ERDB Table Load Utility object for a specified table. The section
77 :     ID is not passed to the constructor; when generating a table, it is supplied
78 :     to the L</Start> method instead.
79 :    
80 :     =over 4
81 :    
82 :     =item erdb
83 :    
84 :     [[ErdbPm]] object for the database being loaded.
85 :    
86 :     =item directory
87 :    
88 :     Name of the directory into which the load files are to be placed.
89 :    
90 :     =item table
91 :    
92 :     Name of the table being loaded.
93 :    
94 : parrello 1.2 =item stats
95 : parrello 1.1
96 : parrello 1.2 Statistics object for recording events.
97 : parrello 1.1
98 :     =back
99 :    
100 :     =cut
101 :    
sub new {
    # Unpack the constructor arguments.
    my ($class, $erdb, $directory, $table, $stats) = @_;
    # Look up the relation descriptor; an unknown table name is an error.
    my $descriptor = $erdb->FindRelation($table);
    if (! defined $descriptor) {
        Confess("Invalid table name \"$table\".");
    }
    # Assemble the object's fields. No output file is active until Start
    # is called, so the handle and file name begin undefined.
    my %self = (
        directory => $directory,
        erdb      => $erdb,
        fh        => undef,
        fileName  => undef,
        relation  => $descriptor,
        stats     => $stats,
        table     => $table,
    );
    # Only the primary relation of an entity gets a duplicate-check hash.
    # The key is deliberately absent otherwise, because the code that
    # performs the duplicate check tests for it with "exists".
    $self{dupHash} = {} if $erdb->IsEntity($table);
    # Bless the hash into the requested class and hand it back.
    return bless \%self, $class;
}
129 :    
130 :     =head2 Public Methods
131 :    
132 :     =head3 Start
133 :    
134 :     $erdbload->Start($section);
135 :    
136 :     Initialize for loading the specified section into this loader's table.
137 :     This deletes any existing load file for the section and creates a
138 :     temporary file into which new data can be placed using L</Put> calls.
139 :    
140 :     =over 4
141 :    
142 :     =item section
143 :    
144 :     ID of the section being loaded.
145 :    
146 :     =back
147 :    
148 :     =cut
149 :    
sub Start {
    # Unpack the arguments.
    my ($self, $section) = @_;
    # Work out the name of the finished data file for this table and section.
    my $target = CreateFileName($self->{table}, $section, 'data',
                                $self->{directory});
    # Remove any copy left over from a previous run.
    if (-e $target) {
        unlink $target;
    }
    # Output goes to the temporary twin of the data file.
    my $handle = Open(undef, ">" . TempOf($target));
    # Remember the handle and the final file name for later calls.
    $self->{fh} = $handle;
    $self->{fileName} = $target;
    Trace("Starting output to $target for section $section and table $self->{table}.") if T(4);
}
165 :    
166 :     =head3 Put
167 :    
168 :     my $length = $erdbload->Put(%putFields);
169 :    
170 :     Output the specified fields to the currently-active load file. The fields
171 :     come in as a hash mapping field names to field values. Fields whose
172 :     values are not specified will be set to their default value.
173 :    
174 :     =over 4
175 :    
176 :     =item putFields
177 :    
178 :     A hash mapping field names for this generator's target relation to
179 :     field values.
180 :    
181 :     =item RETURN
182 :    
183 :     Returns the number of characters output (excluding delimiters), or zero if
184 :     nothing is output (which usually indicates we're discarding a duplicate entity.)
185 :    
186 :     =back
187 :    
188 :     =cut
189 :    
sub Put {
    # Write one record to the active load file.
    #
    # Parameters: the object itself, followed by a hash mapping field names
    # to field values. A field name may be given with underscores in place
    # of hyphens. Fields with no value fall back to the default in the
    # relation's field descriptor; a field with neither is an error.
    #
    # Returns the total length of the field values written, or 0 if the
    # record was discarded as a duplicate.
    my ($self, %putFields) = @_;
    # We return the number of characters output.
    my $retVal = 0;
    # Get the database object, the relation descriptor, and the table name.
    my $erdb = $self->{erdb};
    my $relationTable = $self->{relation};
    my $table = $self->{table};
    # Insure we have an output file to which we can write.
    my $oh = $self->{fh};
    Confess("Put before Start for $table.") if ! defined $oh;
    # Before we try to output this record, see if it's a duplicate. This is
    # only possible if the duplicate-key hash exists and we have an ID. The
    # ID is tested with "defined" rather than for truth so that an ID of
    # "0" or the empty string is still checked for duplicates.
    my $duplicate;
    my $id = $putFields{id};
    if (exists $self->{dupHash} && defined $id) {
        # The post-increment yields the number of times this ID was seen
        # before this call: undefined the first time, nonzero afterward.
        $duplicate = $self->{dupHash}->{$id}++;
    }
    # Only proceed if we're NOT a duplicate.
    if (! $duplicate) {
        # We'll create an ordered list of field values in here.
        my @values;
        # Loop through the relation's fields in descriptor order.
        for my $field (@{$relationTable->{Fields}}) {
            # Get this field's value. We need to consider the possibility
            # the user used underscores instead of hyphens for convenience,
            # so we have to check twice.
            my $name = $field->{name};
            my $value = $putFields{$name};
            if (! defined $value) {
                (my $altName = $name) =~ tr/-/_/;
                $value = $putFields{$altName};
            }
            # Did we find a value?
            if (! defined $value) {
                # The field has no value, so check for a default.
                $value = $field->{default};
                # If there's no default, we have an error.
                Confess("Missing value for $name in Put for $table.")
                    if ! defined $value;
            }
            # Push the value into the result list.
            push @values, $value;
            Trace("Field $name in $table has value \"$value\".") if T(4);
            # Record its length. This is measured before the values are
            # passed to VerifyFields and DigestFields below.
            $retVal += length("$value");
        }
        # Verify and fix the field values.
        $erdb->VerifyFields($table, \@values);
        $erdb->DigestFields($table, \@values);
        # Write the record.
        Tracer::PutLine($oh, \@values);
    }
    # Return the record length.
    return $retVal;
}
250 :    
251 :     =head3 Finish
252 :    
253 :     $erdbload->Finish();
254 :    
255 :     Finish the load for this table, closing the output file and renaming it
256 :     to mark it finished.
257 :    
258 :     =cut
259 :    
sub Finish {
    # Close the current output file and rename it from its temporary name
    # to its final name so the collator will find it. Any failure is
    # reported through Confess, and the temporary file is left in place so
    # the section is not mistaken for a completed one.
    #
    # Returns the (true) result of the rename on success.
    my ($self) = @_;
    # Close the output handle here rather than leaving it to _Cleanup, so
    # that a write error surfacing at close time is noticed. Buffered output
    # may only reach the disk when the handle is closed.
    my $oh = $self->{fh};
    my $closeError;
    if (defined $oh) {
        if (! close $oh) {
            # Capture the reason now, before later calls can change $!.
            $closeError = "$!";
        }
        # The handle is finished with; clear it so _Cleanup does not try
        # to close it a second time.
        $self->{fh} = undef;
    }
    # Do standard cleanup. This returns the file name.
    my $fileName = $self->_Cleanup();
    Confess("Finish called before Start for $self->{table}") if ! defined $fileName;
    my $tempName = TempOf($fileName);
    # A failed close means the data may be incomplete, so do not publish it.
    Confess("Error closing $tempName for $self->{table}: $closeError")
        if defined $closeError;
    # Rename the output file so the collator will find it.
    my $renamed = rename $tempName, $fileName;
    Confess("Could not rename $tempName to $fileName: $!") if ! $renamed;
    return $renamed;
}
269 :    
270 :     =head3 Abort
271 :    
272 :     $erdbload->Abort();
273 :    
274 :     Terminate the load for this table as having failed. The output file is
275 :     closed and deleted.
276 :    
277 :     =cut
278 :    
sub Abort {
    # Unpack the arguments.
    my ($self) = @_;
    # Release the object's resources; this yields the data file name, or
    # an undefined value if no output file was active.
    my $dataName = $self->_Cleanup();
    if (defined $dataName) {
        # Discard the partially-written temporary file, if there is one.
        my $scratchName = TempOf($dataName);
        if (-e $scratchName) {
            unlink $scratchName;
        }
    }
}
290 :    
291 :    
292 :     =head2 File Naming Methods
293 :    
294 :     These methods are used to analyze and generate file names. There are many packages
295 :     involved in creating and managing load files. All the file names are generated by
296 :     methods in this group so that there is no breakdown of communication should the file
297 :     naming conventions change.
298 :    
299 :     Currently, a file name consists of a content name, an optional section
300 :     name preceded by a hyphen, and an extension of C<dtx> or C<dty>. A C<dtx> file
301 :     contains table data, and its content name will be the same as the relevant table
302 :     name. A C<dty> file contains control data. Files with control data
303 :     are considered transient, so during post-processing no attempt is made to insure they
304 :     are all present or absent. If a control data file is not table-related, the content
305 :     name should be in all lower case with underscores, so that it is guaranteed not to
306 :     conflict with a table name.
307 :    
308 :     =cut
309 :    
310 :     # This constant maps file name extensions to content types.
# Lookup table from file name extension to the type of content in the file.
use constant FILE_TYPES => {
    dtx => 'data',
    dty => 'control',
    dtz => 'temp',
};
# The inverse lookup table, from content type to file name extension.
use constant FILE_EXTS => {
    data    => 'dtx',
    control => 'dty',
    temp    => 'dtz',
};
314 :    
315 :     =head3 ParseFileName
316 :    
317 :     my ($content, $section, $type) = ERDBGenerate::ParseFileName($fileName);
318 :    
319 :     Parse a base file name to extract the content name, the section name, and the
320 :     file type. If the file is for an entire table (not partial), the section name will
321 :     be undefined. If the file does not appear to be a load-related file, all return
322 :     values will be undefined. If the file belongs to a particular table, the content
323 :     name will be the table name; otherwise, the content name will not correspond to the
324 :     name of any table.
325 :    
326 :     =over 4
327 :    
328 :     =item fileName
329 :    
330 :     File name to parse. This should be a base file name with no directory
331 :     information in it.
332 :    
333 :     =item RETURN
334 :    
335 :     Returns a three-element list. The first two elements are the content name (which
336 :     could be a table name) and the section name (which will be undefined if the
337 :     file does not belong to a specific section). The third element will be C<data>
338 :     if the file contains table data, C<control> if it contains control or status
339 :     data (such as, for example, a saved list of section names), or C<temp> if it is
340 :     a temporary file.
341 :    
342 :     =back
343 :    
344 :     =cut
345 :    
sub ParseFileName {
    # Split a base file name into content name, section name, and file type.
    #
    # Parameter: a base file name with no directory information.
    #
    # Returns a three-element list (content, section, type). The section is
    # undefined for a file that has no section part, and all three elements
    # are undefined if the name does not look like a load-related file. The
    # type is looked up in FILE_TYPES from the extension.
    my ($fileName) = @_;
    # Declare the return variables.
    my ($content, $section, $type);
    # Try to parse the file name. Both patterns are anchored at each end so
    # that trailing text after the extension is rejected, and both accept
    # all three load-file extensions, including the temporary one.
    if ($fileName =~ m#^(\w+)-(.+)\.(dt[xyz])$#) {
        # We have a content name and a section.
        ($content, $section, $type) = ($1, $2, FILE_TYPES->{$3});
    } elsif ($fileName =~ m#^(\w+)\.(dt[xyz])$#) {
        # Here it's just a content name.
        ($content, $type) = ($1, FILE_TYPES->{$2});
    }
    # Return the results.
    return ($content, $section, $type);
}
362 :    
363 :     =head3 CreateFileName
364 :    
365 :     my $fileName = ERDBGenerate::CreateFileName($content, $section, $type, $dir);
366 :    
367 :     Return a file name for the specified type of operation on the specified
368 :     content and optionally the specified section.
369 :    
370 :     =over 4
371 :    
372 :     =item content
373 :    
374 :     File content. This can be a table name or a lower-case phrase describing what's in
375 :     the file. In the latter case only letters, digits, and underscores are allowed.
376 :    
377 :     =item section
378 :    
379 :     The section of the data to which the file's content relates, or C<undef> if
380 :     the file is for all sections.
381 :    
382 :     =item type
383 :    
384 :     C<data> for a file containing table data, C<control> for a file containing
385 :     ancillary or control data, or C<temp> for a file containing temporary data.
386 :    
387 :     =item dir (optional)
388 :    
389 :     If specified, the name of a directory. The directory name will be prefixed to
390 :     the file name with an intervening slash.
391 :    
392 :     =item RETURN
393 :    
394 :     Returns a file name suitable for the specified purpose.
395 :    
396 :     =back
397 :    
398 :     =cut
399 :    
sub CreateFileName {
    # Unpack the arguments.
    my ($content, $section, $type, $dir) = @_;
    # A section contributes a hyphen-prefixed suffix; an undefined or empty
    # section contributes nothing.
    my $suffix = '';
    if (defined $section && $section ne '') {
        $suffix = "-$section";
    }
    # Put the base name together with the extension for this content type.
    my $baseName = join('', $content, $suffix, '.', FILE_EXTS->{$type});
    # Prefix the directory when one was supplied.
    return (defined $dir ? "$dir/$baseName" : $baseName);
}
414 :    
415 :     =head3 GetLoadFiles
416 :    
417 :     my @files = ERDBGenerate::GetLoadFiles($directory);
418 :    
419 :     Get a list of the names of the load-related files in the specified
420 :     directory. Only the base file names are returned, without any path
421 :     information. The base names can later be fed to L</ParseFileName> to
422 :     determine what is in the file.
423 :    
424 :     =over 4
425 :    
426 :     =item directory
427 :    
428 :     Load directory for the relevant database.
429 :    
430 :     =item RETURN
431 :    
432 :     Returns a list of base file names for load-related files in the specified
433 :     directory.
434 :    
435 :     =back
436 :    
437 :     =cut
438 :    
sub GetLoadFiles {
    # Unpack the arguments.
    my ($directory) = @_;
    # Walk the directory listing, keeping only the names that end in one
    # of the load-file extensions.
    my @loadFiles;
    for my $entry (Tracer::OpenDir($directory)) {
        push @loadFiles, $entry if $entry =~ /\w+(-.+)?\.dt(x|y|z)$/;
    }
    # Hand back the matching names.
    return @loadFiles;
}
447 :    
448 :     =head3 TempOf
449 :    
450 :     my $fileName = ERDBGenerate::TempOf($fileName);
451 :    
452 :     Return the temporary file name associated with the specified data file
453 :     name. There is a one-to-one mapping between the name of a file containing
454 :     table data and the corresponding temporary file used during file creation.
455 :    
456 :     =over 4
457 :    
458 :     =item fileName
459 :    
460 :     Name of the table data file to be converted.
461 :    
462 :     =item RETURN
463 :    
464 :     Returns the corresponding temporary file name.
465 :    
466 :     =back
467 :    
468 :     =cut
469 :    
sub TempOf {
    # Unpack the arguments.
    my ($fileName) = @_;
    # Work on a copy so the caller's string is left alone.
    my $tempName = $fileName;
    # Overwrite the final character with 'z' to form the temporary name.
    substr($tempName, -1, 1) = 'z';
    return $tempName;
}
480 :    
481 :     =head2 Internal Utility Methods
482 :    
483 :     =head3 _Cleanup
484 :    
485 :     my $fileName = $erdbload->_Cleanup();
486 :    
487 :     Release resources held by this object and return the name of the current
488 :     output file. This method contains operations common to both L</Abort> and
489 :     L</Finish>. If no output file is present, it will return an undefined
490 :     value.
491 :    
492 :     =cut
493 :    
sub _Cleanup {
    # Unpack the arguments.
    my ($self) = @_;
    # Pick up the current file name (our return value) and output handle.
    my ($currentName, $handle) = @{$self}{qw(fileName fh)};
    # Close the output file if one is open.
    if (defined $handle) {
        close $handle;
    }
    # Empty the duplicate-key hash, but only if this object already has
    # one. Whether the key exists is what decides if duplicate checking is
    # done at all, so it must never be created here by accident.
    $self->{dupHash} = {} if exists $self->{dupHash};
    # Mark the object as no longer being inside a section.
    @{$self}{qw(fh fileName section)} = (undef, undef, undef);
    # Hand back the name of the file that was active.
    return $currentName;
}
516 :    
517 :    
518 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3