[Bio] / Sprout / ERDBGenerate.pm Repository:
ViewVC logotype

Annotation of /Sprout/ERDBGenerate.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package ERDBGenerate;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use PageBuilder;
8 :     use ERDB;
9 :     use Stats;
10 :    
11 :     =head1 ERDB Table Data Generation Helper Object
12 :    
13 :     =head2 Introduction
14 :    
15 :     This object is designed to assist with creating the load files for an ERDB
16 :     data relation (also known as a I<table>).
17 :    
18 :     The generation process can be very long, so each table is loaded a section at a
19 :     time, with multiple sections running in parallel. After the load files for each
20 :     section are created, a separate process is used to collate the sections and load
21 :     them into the database tables.
22 :    
23 :     When the output file is being written, the last character of its extension is changed
24 :     to C<z> (see L</TempOf>) to denote it is currently being processed. When the L</Finish>
25 :     method is called, the file is closed and renamed to its final name. If the L</Finish>
26 :     method is not called, it is presumed that the load has failed. The temporary name will
26 :     remain in place so that the collater knows the file is invalid.
27 :    
28 :     This object maintains the following data fields.
29 :    
30 :     =over 4
31 :    
32 :     =item directory
33 :    
34 :     Directory into which load files should be placed.
35 :    
36 :     =item dupHash
37 :    
38 :     Hash of IDs already seen for this table. This hash is only created if the table
39 :     is the primary relation of an entity. Removing duplicate IDs is not strictly necessary,
40 :     but it saves disk space and avoids unnecessary I/O, which makes us a bit faster.
41 :    
42 :     =item erdb
43 :    
44 :     [[ErdbPm]] object used to create and access the database. This will usually
45 :     be a subclass of a pure ERDB object created to manage a particular database.
46 :    
47 :     =item fh
48 :    
49 :     Open handle of the current output file (if any).
50 :    
51 :     =item fileName
52 :    
53 :     Name of the current output file (if any).
54 :    
55 :     =item relation
56 :    
57 :     Reference to the descriptor for this table's relation in the ERDB database
58 :     object.
59 :    
60 : parrello 1.2 =item stats
61 :    
62 :     Statistics object for recording events.
63 :    
64 : parrello 1.1 =item table
65 :    
66 :     Name of the relation table being loaded.
67 :    
68 :     =back
69 :    
70 :     =cut
71 :    
72 :     =head3 new
73 :    
74 : parrello 1.2 my $erdbload = ERDBGenerate->new($erdb, $directory, $table, $stats);
75 : parrello 1.1
76 :     Create an ERDB Table Load Utility object for a specified table. Note that
77 :     when generating a table, the section ID is required, but for collating
78 :     and loading it can be omitted.
79 :    
80 :     =over 4
81 :    
82 :     =item erdb
83 :    
84 :     [[ErdbPm]] object for the database being loaded.
85 :    
86 :     =item directory
87 :    
88 :     Name of the directory into which the load files are to be placed.
89 :    
90 :     =item table
91 :    
92 :     Name of the table being loaded.
93 :    
94 : parrello 1.2 =item stats
95 : parrello 1.1
96 : parrello 1.2 Statistics object for recording events.
97 : parrello 1.1
98 :     =back
99 :    
100 :     =cut
101 :    
sub new {
    # Unpack the constructor arguments.
    my ($class, $erdb, $directory, $table, $stats) = @_;
    # Look up the descriptor of the target relation. An unknown table name
    # is a fatal error.
    my $relation = $erdb->FindRelation($table);
    if (! defined $relation) {
        Confess("Invalid table name \"$table\".");
    }
    # Assemble the data fields. No output file is active until Start is
    # called, so the handle and file name begin undefined.
    my %fields = (
        table     => $table,
        relation  => $relation,
        erdb      => $erdb,
        stats     => $stats,
        directory => $directory,
        fileName  => undef,
        fh        => undef,
    );
    # The duplicate-check hash is only attached when the table is the
    # primary relation of an entity. Its mere existence is what later
    # switches the duplicate check on, so it must stay absent otherwise.
    if ($erdb->IsEntity($table)) {
        $fields{dupHash} = {};
    }
    # Turn the field hash into an object of the requested class.
    return bless \%fields, $class;
}
129 :    
130 :     =head2 Public Methods
131 :    
132 :     =head3 Start
133 :    
134 :     $erdbload->Start($section);
135 :    
136 :     Initialize for loading the specified section into this loader's table.
137 :     This deletes any existing load file for the section and creates a
138 :     temporary file into which new data can be placed using L</Put> calls.
139 :    
140 :     =over 4
141 :    
142 :     =item section
143 :    
144 :     ID of the section being loaded.
145 :    
146 :     =back
147 :    
148 :     =cut
149 :    
sub Start {
    # Unpack the arguments.
    my ($self, $section) = @_;
    # Work out the name of the finished data file for this table and section.
    my $fileName = CreateFileName($self->{table}, $section, 'data',
                                  $self->{directory});
    # A finished file left over from a previous run must not survive.
    if (-e $fileName) {
        unlink $fileName;
    }
    # Output goes to the temporary twin of the data file until Finish
    # renames it.
    my $tempName = TempOf($fileName);
    my $handle = Open(undef, ">" . $tempName);
    # Remember where we are writing. The file name stored is the FINAL
    # name, not the temporary one.
    $self->{fileName} = $fileName;
    $self->{fh} = $handle;
    Trace("Starting output to $fileName for section $section and table $self->{table}.") if T(4);
}
165 :    
166 :     =head3 Put
167 :    
168 :     my $length = $erdbload->Put(%putFields);
169 :    
170 :     Output the specified fields to the currently-active load file. The fields
171 :     come in as a hash mapping field names to field values. Fields whose
172 :     values are not specified will be set to their default value.
173 :    
174 :     =over 4
175 :    
176 :     =item putFields
177 :    
178 :     A hash mapping field names for this generator's target relation to
179 :     field values.
180 :    
181 :     =item RETURN
182 :    
183 :     Returns the number of characters output (excluding delimiters), or zero if
184 :     nothing is output (which usually indicates we're discarding a duplicate entity.)
185 :    
186 :     =back
187 :    
188 :     =cut
189 :    
sub Put {
    # Get the parameters: the generator object and a hash mapping field
    # names to field values.
    my ($self, %putFields) = @_;
    # We return the number of characters output (0 if the record is skipped).
    my $retVal = 0;
    # Get the descriptor for this relation.
    my $relationTable = $self->{relation};
    # Insure we have an output file to which we can write.
    my $oh = $self->{fh};
    Confess("Put before Start for $self->{table}.") if ! defined $oh;
    # Before we try to output this record, see if it's a duplicate. This is only
    # possible if the duplicate-key hash exists and we have an ID. The ID is
    # tested with "defined" rather than for truth so that a legitimate ID of
    # "0" (or an empty string) still takes part in duplicate detection.
    my $duplicate;
    if (exists $self->{dupHash} && defined $putFields{id}) {
        # The post-increment yields the number of times the ID was seen
        # BEFORE this call: undefined (false) the first time, nonzero after.
        $duplicate = $self->{dupHash}->{$putFields{id}}++;
    }
    # Only proceed if we're NOT a duplicate.
    if (! $duplicate) {
        # We'll create an ordered list of field values in here.
        my @values;
        # Loop through the relation's fields in descriptor order, which is
        # the order in which the values are written.
        for my $field (@{$relationTable->{Fields}}) {
            # Get this field's value.
            my $value = $putFields{$field->{name}};
            if (! defined $value) {
                # The field has no value, so check for a default.
                $value = $field->{default};
                # If there's no default, we have an error.
                Confess("Missing value for $field->{name} in Put for $self->{table}.")
                    if ! defined $value;
            }
            # Push the value into the result list.
            push @values, $value;
            Trace("Field $field->{name} in $self->{table} has value \"$value\".") if T(4);
            # Record its length. Note this is measured before VerifyFields
            # gets a chance to alter the value.
            $retVal += length("$value");
        }
        # Fix and verify the field values. The list is passed by reference;
        # the result is treated as a count of truncation events.
        my $truncates = $self->{erdb}->VerifyFields($self->{table}, \@values);
        # Record any field truncation events.
        $self->{stats}->Add(truncations => $truncates) if $truncates;
        # Write the record.
        Tracer::PutLine($oh, \@values);
    }
    # Return the record length.
    return $retVal;
}
240 :    
241 :     =head3 Finish
242 :    
243 :     $erdbload->Finish();
244 :    
245 :     Finish the load for this table, closing the output file and renaming it
246 :     to mark it finished.
247 :    
248 :     =cut
249 :    
sub Finish {
    # Get the parameters.
    my ($self) = @_;
    # Do standard cleanup. This closes the output file and returns the
    # name of the finished data file (undefined if Start was never called).
    my $fileName = $self->_Cleanup();
    Confess("Finish called before Start for $self->{table}") if ! defined $fileName;
    # Rename the output file so the collator will find it. A failed rename
    # would otherwise go unnoticed and the section would silently look like
    # a failed load, so the failure is reported together with the system
    # error text.
    my $tempName = TempOf($fileName);
    rename($tempName, $fileName)
        or Confess("Could not rename $tempName to $fileName: $!");
    # Preserve the true result callers saw from a successful rename.
    return 1;
}
259 :    
260 :     =head3 Abort
261 :    
262 :     $erdbload->Abort();
263 :    
264 :     Terminate the load for this table as having failed. The output file is
265 :     closed and deleted.
266 :    
267 :     =cut
268 :    
sub Abort {
    # Unpack the arguments.
    my ($self) = @_;
    # Common cleanup closes the output handle and hands back the name of
    # the data file that was being built (undefined if none was active).
    my $fileName = $self->_Cleanup();
    if (defined $fileName) {
        # Only the temporary twin can exist at this point, so that is the
        # file to discard.
        my $tempName = TempOf($fileName);
        if (-e $tempName) {
            unlink $tempName;
        }
    }
}
280 :    
281 :    
282 :     =head2 File Naming Methods
283 :    
284 :     These methods are used to analyze and generate file names. There are many packages
285 :     involved in creating and managing load files. All the file names are generated by
286 :     methods in this group so that there is no breakdown of communication should the file
287 :     naming conventions change.
288 :    
289 :     Currently, a file name consists of a content name, an optional section
290 :     name preceded by a hyphen, and an extension of C<dtx> or C<dty>. A C<dtx> file
291 :     contains table data, and its content name will be the same as the relevant table
292 :     name. A C<dty> file contains control data. Files with control data
293 :     are considered transient, so during post-processing no attempt is made to insure they
294 :     are all present or absent. If a control data file is not table-related, the content
295 :     name should be in all lower case with underscores, so that it is guaranteed not to
296 :     conflict with a table name.
297 :    
298 :     =cut
299 :    
# FILE_TYPES maps each file name extension to the content type it denotes.
use constant FILE_TYPES => {
    dtx => 'data',
    dty => 'control',
    dtz => 'temp',
};
# FILE_EXTS is the inverse map, from content type back to extension. It is
# derived from FILE_TYPES so the two tables cannot drift apart.
use constant FILE_EXTS => { reverse %{ FILE_TYPES() } };
304 :    
305 :     =head3 ParseFileName
306 :    
307 :     my ($content, $section, $type) = ERDBGenerate::ParseFileName($fileName);
308 :    
309 :     Parse a base file name to extract the content name, the section name, and the
310 :     file type. If the file is for an entire table (not partial), the section name will
311 :     be undefined. If the file does not appear to be a load-related file, all return
312 :     values will be undefined. If the file belongs to a particular table, the content
313 :     name will be the table name; otherwise, the content name will not correspond to the
314 :     name of any table.
315 :    
316 :     =over 4
317 :    
318 :     =item fileName
319 :    
320 :     File name to parse. This should be a base file name with no directory
321 :     information in it.
322 :    
323 :     =item RETURN
324 :    
325 :     Returns a three-element list. The first two elements are the content name (which
326 :     could be a table name) and the section name (which will be undefined if the
327 :     file does not belong to a specific section). The third element will be C<data>
328 :     if the file contains table data, C<control> if it contains control or status
329 :     data (such as, for example, a saved list of section names), or C<temp> if it is
330 :     a temporary file.
331 :    
332 :     =back
333 :    
334 :     =cut
335 :    
sub ParseFileName {
    # Get the parameters: a base file name with no directory information.
    my ($fileName) = @_;
    # Declare the return variables. All three stay undefined when the name
    # is not recognized as a load file.
    my ($content, $section, $type);
    # Try to parse the file name. The shape is a content name made of word
    # characters, an optional "-section" part, and a "dtx" or "dty"
    # extension. The pattern is anchored at BOTH ends so that names which
    # merely contain a load-file name (for example "Table-sec.dtx.bak" or
    # "Table-sec.dtx~") are rejected instead of being mistaken for data
    # files. Because the content name cannot contain a hyphen, the first
    # hyphen always starts the section name.
    # NOTE(review): the "dtz" (temp) extension is not accepted here, so
    # temporary files come back as all-undefined -- confirm that is intended.
    if ($fileName =~ m#^(\w+)(?:-(.+))?\.(dtx|dty)$#) {
        # $2 is undefined when the file is for a whole table (no section).
        ($content, $section, $type) = ($1, $2, FILE_TYPES->{$3});
    }
    # Return the results.
    return ($content, $section, $type);
}
352 :    
353 :     =head3 CreateFileName
354 :    
355 :     my $fileName = ERDBGenerate::CreateFileName($content, $section, $type, $dir);
356 :    
357 :     Return a file name for the specified type of operation on the specified
358 :     content and optionally the specified section.
359 :    
360 :     =over 4
361 :    
362 :     =item content
363 :    
364 :     File content. This can be a table name or a lower-case phrase describing what's in
365 :     the file. In the latter case only letters, digits, and underscores are allowed.
366 :    
367 :     =item section
368 :    
369 :     The section of the data to which the file's content relates, or C<undef> if
370 :     the file is for all sections.
371 :    
372 :     =item type
373 :    
374 :     C<data> for a file containing table data, C<control> for a file containing
375 :     ancillary or control data, or C<temp> for a file containing temporary data.
376 :    
377 :     =item dir (optional)
378 :    
379 :     If specified, the name of a directory. The directory name will be prefixed to
380 :     the file name with an intervening slash.
381 :    
382 :     =item RETURN
383 :    
384 :     Returns a file name suitable for the specified purpose.
385 :    
386 :     =back
387 :    
388 :     =cut
389 :    
sub CreateFileName {
    # Unpack the arguments. The directory is optional.
    my ($content, $section, $type, $dir) = @_;
    # A section contributes "-section" to the name. A missing or empty
    # section contributes nothing.
    my $sectionPart = '';
    if (defined $section && $section ne '') {
        $sectionPart = "-$section";
    }
    # The extension is looked up from the content type.
    my $baseName = $content . $sectionPart . "." . FILE_EXTS->{$type};
    # Prefix the directory with an intervening slash when one was supplied.
    return (defined $dir ? "$dir/$baseName" : $baseName);
}
404 :    
405 :     =head3 GetLoadFiles
406 :    
407 :     my @files = ERDBGenerate::GetLoadFiles($directory);
408 :    
409 :     Get a list of the names of the load-related files in the specified
410 :     directory. Only the base file names are returned, without any path
411 :     information. The base names can later be fed to L</ParseFileName> to
412 :     determine what is in the file.
413 :    
414 :     =over 4
415 :    
416 :     =item directory
417 :    
418 :     Load directory for the relevant database.
419 :    
420 :     =item RETURN
421 :    
422 :     Returns a list of base file names for load-related files in the specified
423 :     directory.
424 :    
425 :     =back
426 :    
427 :     =cut
428 :    
sub GetLoadFiles {
    # Unpack the arguments.
    my ($directory) = @_;
    # Read the directory's entries.
    my @allFiles = Tracer::OpenDir($directory);
    # Keep only the names that end like a load file: a word-character
    # content name, an optional "-section" part, and a dtx/dty/dtz extension.
    my @loadFiles = grep(/\w+(-.+)?\.dt(x|y|z)$/, @allFiles);
    # Hand back the base names.
    return @loadFiles;
}
437 :    
438 :     =head3 TempOf
439 :    
440 :     my $fileName = ERDBGenerate::TempOf($fileName);
441 :    
442 :     Return the temporary file name associated with the specified data file
443 :     name. There is a one-to-one mapping between the name of a file containing
444 :     table data and the corresponding temporary file used during file creation.
445 :    
446 :     =over 4
447 :    
448 :     =item fileName
449 :    
450 :     Name of the table data file to be converted.
451 :    
452 :     =item RETURN
453 :    
454 :     Returns the corresponding temporary file name.
455 :    
456 :     =back
457 :    
458 :     =cut
459 :    
sub TempOf {
    # Unpack the arguments.
    my ($fileName) = @_;
    # Work on a copy so the caller's string is left untouched.
    my $tempName = $fileName;
    # Overwrite the final character with 'z', which turns a data file
    # extension such as "dtx" into the temporary extension "dtz".
    substr($tempName, -1, 1) = 'z';
    return $tempName;
}
470 :    
471 :     =head2 Internal Utility Methods
472 :    
473 :     =head3 _Cleanup
474 :    
475 :     my $fileName = $erdbload->_Cleanup();
476 :    
477 :     Release resources held by this object and return the name of the current
478 :     output file. This method contains operations common to both L</Abort> and
479 :     L</Finish>. If no output file is present, it will return an undefined
480 :     value.
481 :    
482 :     =cut
483 :    
sub _Cleanup {
    # Unpack the arguments.
    my ($self) = @_;
    # Capture the active file name before it is wiped. This is the value
    # handed back to Finish or Abort.
    my $fileName = $self->{fileName};
    # Close the output handle, if one is open.
    if (defined(my $handle = $self->{fh})) {
        close $handle;
    }
    # Empty the duplicate-key hash, but only when one is already attached.
    # The hash's existence is what enables the duplicate check, so one must
    # never be created here by accident.
    $self->{dupHash} = {} if exists $self->{dupHash};
    # Mark the object as being outside any section: assigning the empty
    # list to the slice sets every listed field to undef.
    @{$self}{qw(fh fileName section)} = ();
    return $fileName;
}
506 :    
507 :    
508 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3