[Bio] / Sprout / ERDBGenerate.pm Repository:
ViewVC logotype

Annotation of /Sprout/ERDBGenerate.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package ERDBGenerate;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use PageBuilder;
8 :     use ERDB;
9 :     use Stats;
10 :    
11 :     =head1 ERDB Table Data Generation Helper Object
12 :    
13 :     =head2 Introduction
14 :    
15 :     This object is designed to assist with creating the load files for an ERDB
16 :     data relation (also known as a I<table>).
17 :    
18 :     The generation process can be very long, so each table is loaded a section at a
19 :     time, with multiple sections running in parallel. After the load files for each
20 :     section are created, a separate process is used to collate the sections and load
21 :     them into the database tables.
22 :    
23 :     While the output file is being written, it is given a temporary name ending in C<dtz>
24 :     (see L</TempOf>) to denote it is currently being processed. When the L</Finish> method is
25 :     called, the file is closed and renamed to its final name. If the L</Finish> method is not
26 :     called, it is presumed that the load has failed. The temporary name will remain in place so that the collator knows the file is invalid.
27 :    
28 :     This object maintains the following data fields.
29 :    
30 :     =over 4
31 :    
32 :     =item directory
33 :    
34 :     Directory into which load files should be placed.
35 :    
36 :     =item dupHash
37 :    
38 :     Hash of IDs already seen for this table. This hash is only created if the table
39 :     is the primary relation of an entity. Removing duplicate IDs is not strictly necessary,
40 :     but it saves disk space and avoids unnecessary I/O, which makes us a bit faster.
41 :    
42 :     =item erdb
43 :    
44 :     [[ErdbPm]] object used to create and access the database. This will usually
45 :     be a subclass of a pure ERDB object created to manage a particular database.
46 :    
47 :     =item fh
48 :    
49 :     Open handle of the current output file (if any).
50 :    
51 :     =item fileName
52 :    
53 :     Name of the current output file (if any).
54 :    
55 :     =item relation
56 :    
57 :     Reference to the descriptor for this table's relation in the ERDB database
58 :     object.
59 :    
60 :     =item table
61 :    
62 :     Name of the relation table being loaded.
63 :    
64 :     =back
65 :    
66 :     =cut
67 :    
68 :     =head3 new
69 :    
70 :     my $erdbload = ERDBGenerate->new($erdb, $directory, $table, $section);
71 :    
72 :     Create an ERDB Table Load Utility object for a specified table. Note that
73 :     when generating a table, the section ID is required, but for collating
74 :     and loading it can be omitted.
75 :    
76 :     =over 4
77 :    
78 :     =item erdb
79 :    
80 :     [[ErdbPm]] object for the database being loaded.
81 :    
82 :     =item directory
83 :    
84 :     Name of the directory into which the load files are to be placed.
85 :    
86 :     =item table
87 :    
88 :     Name of the table being loaded.
89 :    
90 :     =item section
91 :    
92 :     ID of the current section.
93 :    
94 :     =back
95 :    
96 :     =cut
97 :    
sub new {
    my ($class, $db, $directory, $table) = @_;
    # Look up the descriptor for the table's relation. A table the database
    # does not know about is a fatal error.
    my $relation = $db->FindRelation($table);
    if (! defined $relation) {
        Confess("Invalid table name \"$table\".");
    }
    # Assemble the object's fields. No output file is active until Start
    # is called, so the handle and file name begin undefined.
    my %fields = (
        directory => $directory,
        erdb      => $db,
        fh        => undef,
        fileName  => undef,
        relation  => $relation,
        table     => $table,
    );
    # Only the primary relation of an entity gets a duplicate-check hash.
    # The key is deliberately left absent for every other table, because
    # the duplicate check tests for the hash with "exists".
    $fields{dupHash} = {} if $db->IsEntity($table);
    # Bless the field hash into the requested class and hand it back.
    return bless \%fields, $class;
}
124 :    
125 :     =head2 Public Methods
126 :    
127 :     =head3 Start
128 :    
129 :     $erdbload->Start($section);
130 :    
131 :     Initialize for loading the specified section into this loader's table.
132 :     This deletes any existing load file for the section and creates a
133 :     temporary file into which new data can be placed using L</Put> calls.
134 :    
135 :     =over 4
136 :    
137 :     =item section
138 :    
139 :     ID of the section being loaded.
140 :    
141 :     =back
142 :    
143 :     =cut
144 :    
sub Start {
    my ($self, $section) = @_;
    # Work out where the finished data file for this table and section
    # will live.
    my $fileName = CreateFileName($self->{table}, $section, 'data',
                                  $self->{directory});
    # Remove any finished file left over from a previous run of this section.
    if (-e $fileName) {
        unlink $fileName;
    }
    # Output goes to the temporary twin of the data file; the real name is
    # only assumed when the load is finished.
    my $handle = Open(undef, ">" . TempOf($fileName));
    # Remember the open handle and the final file name for later calls.
    @{$self}{qw(fh fileName)} = ($handle, $fileName);
    if (T(4)) {
        Trace("Starting output to $fileName for section $section and table $self->{table}.");
    }
}
160 :    
161 :     =head3 Put
162 :    
163 :     my $length = $erdbload->Put(%putFields);
164 :    
165 :     Output the specified fields to the currently-active load file. The fields
166 :     come in as a hash mapping field names to field values. Fields whose
167 :     values are not specified will be set to their default value.
168 :    
169 :     =over 4
170 :    
171 :     =item putFields
172 :    
173 :     A hash mapping field names for this generator's target relation to
174 :     field values.
175 :    
176 :     =item RETURN
177 :    
178 :     Returns the number of characters output (excluding delimiters), or zero if
179 :     nothing is output (which usually indicates we're discarding a duplicate entity.)
180 :    
181 :     =back
182 :    
183 :     =cut
184 :    
sub Put {
    # Get the parameters: the generator object and a hash mapping field
    # names to field values.
    my ($self, %putFields) = @_;
    # We return the number of characters output (0 if nothing is written).
    my $retVal = 0;
    # Get the descriptor for this relation.
    my $relationTable = $self->{relation};
    # Insure we have an output file to which we can write.
    my $oh = $self->{fh};
    Confess("Put before Start for $self->{table}.") if ! defined $oh;
    # Before we try to output this record, see if it's a duplicate. This is only
    # possible if the duplicate-key hash exists and we have an ID. The ID is
    # tested with "defined" rather than for truth, so that a legitimate ID of
    # "0" still takes part in the duplicate check.
    my $duplicate;
    if (exists $self->{dupHash} && defined $putFields{id}) {
        # The post-increment yields the number of times this ID was seen
        # before this call: undefined (false) the first time, nonzero after.
        $duplicate = $self->{dupHash}->{$putFields{id}}++;
    }
    # Only proceed if we're NOT a duplicate.
    if (! $duplicate) {
        # We'll create an ordered list of field values in here.
        my @values;
        # Loop through the relation's fields in descriptor order.
        for my $field (@{$relationTable->{Fields}}) {
            # Get this field's value.
            my $value = $putFields{$field->{name}};
            if (! defined $value) {
                # The caller gave no value, so fall back to the default.
                $value = $field->{default};
                # If there's no default either, we have an error.
                Confess("Missing value for $field->{name} in Put for $self->{table}.")
                    if ! defined $value;
            }
            # Push the value into the result list.
            push @values, $value;
            # Record its length (delimiters are not counted).
            $retVal += length("$value");
        }
        # Write the record.
        Tracer::PutLine($oh, \@values);
    }
    # Return the record length.
    return $retVal;
}
230 :    
231 :     =head3 Finish
232 :    
233 :     $erdbload->Finish();
234 :    
235 :     Finish the load for this table, closing the output file and renaming it
236 :     to mark it finished.
237 :    
238 :     =cut
239 :    
sub Finish {
    # Get the parameters.
    my ($self) = @_;
    # Do standard cleanup. This closes the output handle and returns the
    # name of the data file (undefined if no load was started).
    my $fileName = $self->_Cleanup();
    Confess("Finish called before Start for $self->{table}") if ! defined $fileName;
    # Rename the temporary output file to its final name so the collator will
    # find it. A failed rename leaves the data under the temporary name, where
    # it looks like an unfinished load, so report it instead of ignoring it.
    my $tempName = TempOf($fileName);
    rename($tempName, $fileName)
        or Confess("Could not rename $tempName to $fileName for $self->{table}: $!");
}
249 :    
250 :     =head3 Abort
251 :    
252 :     $erdbload->Abort();
253 :    
254 :     Terminate the load for this table as having failed. The output file is
255 :     closed and deleted.
256 :    
257 :     =cut
258 :    
sub Abort {
    my ($self) = @_;
    # The shared cleanup closes the output handle and reports which data
    # file (if any) was being produced.
    my $fileName = $self->_Cleanup();
    # With no load in progress there is no temporary file to look for.
    my $tempName = (defined $fileName ? TempOf($fileName) : undef);
    # Throw away the partial output, if it was ever created.
    if (defined $tempName && -e $tempName) {
        unlink $tempName;
    }
}
270 :    
271 :    
272 :     =head2 File Naming Methods
273 :    
274 :     These methods are used to analyze and generate file names. There are many packages
275 :     involved in creating and managing load files. All the file names are generated by
276 :     methods in this group so that there is no breakdown of communication should the file
277 :     naming conventions change.
278 :    
279 :     Currently, a file name consists of a content name, an optional section
280 :     name preceded by a hyphen, and an extension of C<dtx>, C<dty>, or C<dtz>. A C<dtx>
281 :     file contains table data, and its content name will be the same as the relevant table
282 :     name. A C<dtz> file is a temporary file. A C<dty> file contains control data. Files with control data
283 :     are considered transient, so during post-processing no attempt is made to insure they
284 :     are all present or absent. If a control data file is not table-related, the content
285 :     name should be in all lower case with underscores, so that it is guaranteed not to
286 :     conflict with a table name.
287 :    
288 :     =cut
289 :    
# Lookup table from a load file's extension to the kind of content it holds.
use constant FILE_TYPES => {
    dtx => 'data',
    dty => 'control',
    dtz => 'temp',
};
# The inverse lookup table: from a content kind to the file extension used for it.
use constant FILE_EXTS => {
    data    => 'dtx',
    control => 'dty',
    temp    => 'dtz',
};
294 :    
295 :     =head3 ParseFileName
296 :    
297 :     my ($content, $section, $type) = ERDBGenerate::ParseFileName($fileName);
298 :    
299 :     Parse a base file name to extract the content name, the section name, and the
300 :     file type. If the file is for an entire table (not partial), the section name will
301 :     be undefined. If the file does not appear to be a load-related file, all return
302 :     values will be undefined. If the file belongs to a particular table, the content
303 :     name will be the table name; otherwise, the content name will not correspond to the
304 :     name of any table.
305 :    
306 :     =over 4
307 :    
308 :     =item fileName
309 :    
310 :     File name to parse. This should be a base file name with no directory
311 :     information in it.
312 :    
313 :     =item RETURN
314 :    
315 :     Returns a three-element list. The first two elements are the content name (which
316 :     could be a table name) and the section name (which will be undefined if the
317 :     file does not belong to a specific section). The third element will be C<data>
318 :     if the file contains table data, C<control> if it contains control or status
319 :     data (such as, for example, a saved list of section names), or C<temp> if it is
320 :     a temporary file.
321 :    
322 :     =back
323 :    
324 :     =cut
325 :    
sub ParseFileName {
    # Get the parameters: a base file name with no directory information.
    my ($fileName) = @_;
    # Declare the return variables. They all stay undefined if the name
    # does not look like a load-related file.
    my ($content, $section, $type);
    # Try to parse the file name. Both patterns are anchored at each end so
    # that a name with trailing text after the extension (for example a
    # backup copy) is not mistaken for a load file, and both accept every
    # extension known to FILE_TYPES, including the temporary-file extension.
    if ($fileName =~ m#^(\w+)-(.+)\.(dt[xyz])$#) {
        # We have a content name and a section.
        ($content, $section, $type) = ($1, $2, FILE_TYPES->{$3});
    } elsif ($fileName =~ m#^(\w+)\.(dt[xyz])$#) {
        # Here it's just a content name; the section stays undefined.
        ($content, $type) = ($1, FILE_TYPES->{$2});
    }
    # Return the results as a (content, section, type) list.
    return ($content, $section, $type);
}
342 :    
343 :     =head3 CreateFileName
344 :    
345 :     my $fileName = ERDBGenerate::CreateFileName($content, $section, $type, $dir);
346 :    
347 :     Return a file name for the specified type of operation on the specified
348 :     content and optionally the specified section.
349 :    
350 :     =over 4
351 :    
352 :     =item content
353 :    
354 :     File content. This can be a table name or a lower-case phrase describing what's in
355 :     the file. In the latter case only letters, digits, and underscores are allowed.
356 :    
357 :     =item section
358 :    
359 :     The section of the data to which the file's content relates, or C<undef> if
360 :     the file is for all sections.
361 :    
362 :     =item type
363 :    
364 :     C<data> for a file containing table data, C<control> for a file containing
365 :     ancillary or control data, or C<temp> for a file containing temporary data.
366 :    
367 :     =item dir (optional)
368 :    
369 :     If specified, the name of a directory. The directory name will be prefixed to
370 :     the file name with an intervening slash.
371 :    
372 :     =item RETURN
373 :    
374 :     Returns a file name suitable for the specified purpose.
375 :    
376 :     =back
377 :    
378 :     =cut
379 :    
sub CreateFileName {
    my ($content, $section, $type, $dir) = @_;
    # A section contributes a hyphenated suffix; an undefined or empty
    # section contributes nothing.
    my $suffix = '';
    if (defined $section && $section ne '') {
        $suffix = "-$section";
    }
    # The base name is the content plus suffix, a period, and the extension
    # that belongs to the requested file type.
    my $baseName = join('.', "$content$suffix", FILE_EXTS->{$type});
    # Prefix the directory only when the caller supplied one.
    return (defined $dir ? "$dir/$baseName" : $baseName);
}
394 :    
395 :     =head3 GetLoadFiles
396 :    
397 :     my @files = ERDBGenerate::GetLoadFiles($directory);
398 :    
399 :     Get a list of the names of the load-related files in the specified
400 :     directory. Only the base file names are returned, without any path
401 :     information. The base names can later be fed to L</ParseFileName> to
402 :     determine what is in the file.
403 :    
404 :     =over 4
405 :    
406 :     =item directory
407 :    
408 :     Load directory for the relevant database.
409 :    
410 :     =item RETURN
411 :    
412 :     Returns a list of base file names for load-related files in the specified
413 :     directory.
414 :    
415 :     =back
416 :    
417 :     =cut
418 :    
sub GetLoadFiles {
    my ($directory) = @_;
    # Collect the names that carry one of the load-file extensions.
    my @retVal;
    for my $name (Tracer::OpenDir($directory)) {
        # Keep names ending in .dtx, .dty or .dtz, with or without a
        # hyphenated section part.
        if ($name =~ /\w+(-.+)?\.dt(x|y|z)$/) {
            push @retVal, $name;
        }
    }
    return @retVal;
}
427 :    
428 :     =head3 TempOf
429 :    
430 :     my $fileName = ERDBGenerate::TempOf($fileName);
431 :    
432 :     Return the temporary file name associated with the specified data file
433 :     name. There is a one-to-one mapping between the name of a file containing
434 :     table data and the corresponding temporary file used during file creation.
435 :    
436 :     =over 4
437 :    
438 :     =item fileName
439 :    
440 :     Name of the table data file to be converted File name to be converted
441 :    
442 :     =item RETURN
443 :    
444 :     Returns the corresponding temporary file name.
445 :    
446 :     =back
447 :    
448 :     =cut
449 :    
sub TempOf {
    my ($fileName) = @_;
    # Work on a private copy so the caller's string is left alone.
    my $tempName = $fileName;
    # The temporary name differs from the data file name only in its final
    # character, which becomes 'z'.
    substr($tempName, -1, 1) = 'z';
    return $tempName;
}
460 :    
461 :     =head2 Internal Utility Methods
462 :    
463 :     =head3 _Cleanup
464 :    
465 :     my $fileName = $erdbload->_Cleanup();
466 :    
467 :     Release resources held by this object and return the name of the current
468 :     output file. This method contains operations common to both L</Abort> and
469 :     L</Finish>. If no output file is present, it will return an undefined
470 :     value.
471 :    
472 :     =cut
473 :    
sub _Cleanup {
    my ($self) = @_;
    # Capture the current file name (the return value) and the output handle
    # before the object's state is wiped.
    my ($fileName, $handle) = @{$self}{qw(fileName fh)};
    # Close the output file if one is open.
    if (defined $handle) {
        close $handle;
    }
    # Empty the duplicate-key hash, but only if this object has one: the
    # mere presence of the key is what switches duplicate checking on, so
    # it must never be created here by accident.
    $self->{dupHash} = {} if exists $self->{dupHash};
    # Mark the object as no longer being inside a section.
    @{$self}{qw(fh fileName section)} = (undef) x 3;
    return $fileName;
}
496 :    
497 :    
498 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3