[Bio] / Sprout / ERDBGenerate.pm Repository:
ViewVC logotype

Annotation of /Sprout/ERDBGenerate.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package ERDBGenerate;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use PageBuilder;
8 :     use ERDB;
9 :     use Stats;
10 :    
11 :     =head1 ERDB Table Data Generation Helper Object
12 :    
13 :     =head2 Introduction
14 :    
15 :     This object is designed to assist with creating the load files for an ERDB
16 :     data relation (also known as a I<table>).
17 :    
18 :     The generation process can be very long, so each table is loaded a section at a
19 :     time, with multiple sections running in parallel. After the load files for each
20 :     section are created, a separate process is used to collate the sections and load
21 :     them into the database tables.
22 :    
23 :     While the output file is being written, it is given a temporary name (see L</TempOf>) to
24 :     denote it is currently being processed. When the L</Finish> method is called, the file is closed
25 :     and renamed. If the L</Finish> method is not called, it is presumed that the load has
26 :     failed. The temporary name will remain in place so that the collater knows the file is invalid.
27 :    
28 :     This object maintains the following data fields.
29 :    
30 :     =over 4
31 :    
32 :     =item directory
33 :    
34 :     Directory into which load files should be placed.
35 :    
36 :     =item erdb
37 :    
38 : parrello 1.6 L<ERDB> object used to create and access the database. This will usually
39 : parrello 1.1 be a subclass of a pure ERDB object created to manage a particular database.
40 :    
41 :     =item fh
42 :    
43 :     Open handle of the current output file (if any).
44 :    
45 :     =item fileName
46 :    
47 :     Name of the current output file (if any).
48 :    
49 :     =item relation
50 :    
51 :     Reference to the descriptor for this table's relation in the ERDB database
52 :     object.
53 :    
54 : parrello 1.2 =item stats
55 :    
56 :     Statistics object for recording events.
57 :    
58 : parrello 1.1 =item table
59 :    
60 :     Name of the relation table being loaded.
61 :    
62 :     =back
63 :    
64 :     =cut
65 :    
66 :     =head3 new
67 :    
68 : parrello 1.2 my $erdbload = ERDBGenerate->new($erdb, $directory, $table, $stats);
69 : parrello 1.1
70 :     Create an ERDB Table Load Utility object for a specified table. Note that
71 :     when generating a table, the section ID is required, but for collating
72 :     and loading it can be omitted.
73 :    
74 :     =over 4
75 :    
76 :     =item erdb
77 :    
78 : parrello 1.6 L<ERDB> object for the database being loaded.
79 : parrello 1.1
80 :     =item directory
81 :    
82 :     Name of the directory into which the load files are to be placed.
83 :    
84 :     =item table
85 :    
86 :     Name of the table being loaded.
87 :    
88 : parrello 1.2 =item stats
89 : parrello 1.1
90 : parrello 1.2 Statistics object for recording events.
91 : parrello 1.1
92 :     =back
93 :    
94 :     =cut
95 :    
sub new {
    # Unpack the constructor arguments.
    my ($class, $erdb, $directory, $table, $stats) = @_;
    # Look up the descriptor for the target relation. A table the database
    # does not know about is reported through Confess.
    my $relation = $erdb->FindRelation($table);
    if (! defined $relation) {
        Confess("Invalid table name \"$table\".");
    }
    # Assemble the object's fields. The output handle and file name stay
    # undefined until Start is called.
    my %fields = (
        directory => $directory,
        erdb      => $erdb,
        fh        => undef,
        fileName  => undef,
        relation  => $relation,
        stats     => $stats,
        table     => $table,
    );
    # Bless the field hash into the requested class and hand it back.
    return bless \%fields, $class;
}
116 :    
117 :     =head2 Public Methods
118 :    
119 :     =head3 Start
120 :    
121 :     $erdbload->Start($section);
122 :    
123 :     Initialize for loading the specified section into this loader's table.
124 :     This deletes any existing load file for the section and creates a
125 :     temporary file into which new data can be placed using L</Put> calls.
126 :    
127 :     =over 4
128 :    
129 :     =item section
130 :    
131 :     ID of the section being loaded.
132 :    
133 :     =back
134 :    
135 :     =cut
136 :    
sub Start {
    # Unpack the arguments.
    my ($self, $section) = @_;
    # Work out the name of the finished data file for this table and section.
    my $fileName = CreateFileName($self->{table}, $section, 'data',
                                  $self->{directory});
    # Remove any finished file left over from a previous run.
    if (-e $fileName) {
        unlink $fileName;
    }
    # Output goes to the temporary counterpart of the data file. Remember
    # the handle and the final name for the Put and Finish calls.
    $self->{fh} = Open(undef, ">" . TempOf($fileName));
    $self->{fileName} = $fileName;
    Trace("Starting output to $fileName for section $section and table $self->{table}.") if T(4);
}
152 :    
153 :     =head3 Put
154 :    
155 :     my $length = $erdbload->Put(%putFields);
156 :    
157 :     Output the specified fields to the currently-active load file. The fields
158 :     come in as a hash mapping field names to field values. Fields whose
159 :     values are not specified will be set to their default value.
160 :    
161 :     =over 4
162 :    
163 :     =item putFields
164 :    
165 :     A hash mapping field names for this generator's target relation to
166 :     field values.
167 :    
168 :     =item RETURN
169 :    
170 :     Returns the number of characters output (excluding delimiters), or zero if
171 :     nothing is output (which usually indicates we're discarding a duplicate entity.)
172 :    
173 :     =back
174 :    
175 :     =cut
176 :    
sub Put {
    # Unpack the arguments: the object and a hash of field values.
    my ($self, %putFields) = @_;
    # Pull out the pieces of the object we need.
    my $erdb = $self->{erdb};
    my $relationTable = $self->{relation};
    my $table = $self->{table};
    # We cannot write unless Start has opened an output file.
    my $oh = $self->{fh};
    if (! defined $oh) {
        Confess("Put before Start for $self->{table}.");
    }
    # The field values are collected here in relation order.
    my @values;
    for my $field (@{$relationTable->{Fields}}) {
        my $name = $field->{name};
        # First try the field name exactly as declared.
        my $value = $putFields{$name};
        unless (defined $value) {
            # The user may have used underscores instead of hyphens for
            # convenience, so try that spelling as well.
            (my $altName = $name) =~ tr/-/_/;
            $value = $putFields{$altName};
            unless (defined $value) {
                # Nothing was supplied, so fall back to the default. A field
                # with neither a value nor a default is an error.
                $value = $field->{default};
                defined $value
                    or Confess("Missing value for $field->{name} in Put for $self->{table}.");
            }
        }
        # Add the value to the output record.
        push @values, $value;
        Trace("Field $name in $self->{table} has value \"$value\".") if T(4);
    }
    # Let the database object verify and fix the field values.
    $erdb->VerifyFields($table, \@values);
    $erdb->DigestFields($table, \@values);
    # Total the lengths of the values; this is what we return.
    my $retVal = 0;
    $retVal += length("$_") for @values;
    # Write the record to the load file.
    Tracer::PutLine($oh, \@values);
    return $retVal;
}
227 :    
228 :     =head3 Finish
229 :    
230 :     $erdbload->Finish();
231 :    
232 :     Finish the load for this table, closing the output file and renaming it
233 :     to mark it finished.
234 :    
235 :     =cut
236 :    
sub Finish {
    # Get the parameters.
    my ($self) = @_;
    # Close the output handle here rather than in _Cleanup so that a failed
    # close can be detected. Buffered write errors only surface at close
    # time, and a file that was not completely written must not be renamed
    # to its final name.
    my $closeError;
    my $oh = $self->{fh};
    if (defined $oh) {
        $closeError = "$!" if ! close $oh;
        # The handle is dealt with; keep _Cleanup from closing it again.
        $self->{fh} = undef;
    }
    # Do standard cleanup. This returns the file name and resets the object.
    my $fileName = $self->_Cleanup();
    Confess("Finish called before Start for $self->{table}") if ! defined $fileName;
    my $tempName = TempOf($fileName);
    # If the close failed, report it and leave the temporary file in place so
    # the data file is never marked as finished.
    Confess("Error closing $tempName for $self->{table}: $closeError")
        if defined $closeError;
    # Rename the output file so the collator will find it. A failed rename
    # would otherwise go unnoticed and the section would silently be missing.
    rename($tempName, $fileName)
        or Confess("Could not rename $tempName to $fileName: $!");
}
246 :    
247 :     =head3 Abort
248 :    
249 :     $erdbload->Abort();
250 :    
251 :     Terminate the load for this table as having failed. The output file is
252 :     closed and deleted.
253 :    
254 :     =cut
255 :    
sub Abort {
    # Unpack the arguments.
    my ($self) = @_;
    # Standard cleanup closes the output file and tells us its name, or
    # returns an undefined value if no output file was active.
    my $fileName = $self->_Cleanup();
    # With no active file there is nothing to delete.
    my $tempName = (defined $fileName ? TempOf($fileName) : undef);
    if (defined $tempName && -e $tempName) {
        # Discard the partially-written temporary file.
        unlink $tempName;
    }
}
267 :    
268 :    
269 :     =head2 File Naming Methods
270 :    
271 :     These methods are used to analyze and generate file names. There are many packages
272 :     involved in creating and managing load files. All the file names are generated by
273 :     methods in this group so that there is no breakdown of communication should the file
274 :     naming conventions change.
275 :    
276 :     Currently, a file name consists of a content name, an optional section
277 :     name preceded by a hyphen, and an extension of C<dtx> or C<dty>. A C<dtx> file
278 :     contains table data, and its content name will be the same as the relevant table
279 :     name. A C<dty> file contains control data. Files with control data
280 :     are considered transient, so during post-processing no attempt is made to insure they
281 :     are all present or absent. If a control data file is not table-related, the content
282 :     name should be in all lower case with underscores, so that it is guaranteed not to
283 :     conflict with a table name.
284 :    
285 :     =cut
286 :    
# Lookup table from file name extension to content type.
use constant FILE_TYPES => {
    dtx => 'data',
    dty => 'control',
    dtz => 'temp',
};
# Lookup table from content type to file name extension (the inverse of
# FILE_TYPES).
use constant FILE_EXTS => {
    data    => 'dtx',
    control => 'dty',
    temp    => 'dtz',
};
291 :    
292 :     =head3 ParseFileName
293 :    
294 :     my ($content, $section, $type) = ERDBGenerate::ParseFileName($fileName);
295 :    
296 :     Parse a base file name to extract the content name, the section name, and the
297 :     file type. If the file is for an entire table (not partial), the section name will
298 :     be undefined. If the file does not appear to be a load-related file, all return
299 :     values will be undefined. If the file belongs to a particular table, the content
300 :     name will be the table name; otherwise, the content name will not correspond to the
301 :     name of any table.
302 :    
303 :     =over 4
304 :    
305 :     =item fileName
306 :    
307 :     File name to parse. This should be a base file name with no directory
308 :     information in it.
309 :    
310 :     =item RETURN
311 :    
312 :     Returns a three-element list. The first two elements are the content name (which
313 :     could be a table name) and the section name (which will be undefined if the
314 :     file does not belong to a specific section). The third element will be C<data>
315 :     if the file contains table data, C<control> if it contains control or status
316 :     data (such as, for example, a saved list of section names), or C<temp> if it is
317 :     a temporary file.
318 :    
319 :     =back
320 :    
321 :     =cut
322 :    
sub ParseFileName {
    # Get the parameters.
    my ($fileName) = @_;
    # Declare the return variables. They stay undefined if the name is not
    # that of a load-related file.
    my ($content, $section, $type);
    # An undefined name cannot be a load file; skip the matching.
    if (defined $fileName) {
        # All three load-related extensions are accepted (dtx, dty, and the
        # temporary-file extension dtz), and both patterns are anchored at
        # the end so that a name such as "Foo-1.dtxjunk" is rejected.
        if ($fileName =~ m#^(\w+)-(.+)\.(dt[xyz])$#) {
            # We have a content name and a section.
            ($content, $section, $type) = ($1, $2, FILE_TYPES->{$3});
        } elsif ($fileName =~ m#^(\w+)\.(dt[xyz])$#) {
            # Here it's just a content name, with no section.
            ($content, $type) = ($1, FILE_TYPES->{$2});
        }
    }
    # Return the results.
    return ($content, $section, $type);
}
339 :    
340 :     =head3 CreateFileName
341 :    
342 :     my $fileName = ERDBGenerate::CreateFileName($content, $section, $type, $dir);
343 :    
344 :     Return a file name for the specified type of operation on the specified
345 :     content and optionally the specified section.
346 :    
347 :     =over 4
348 :    
349 :     =item content
350 :    
351 :     File content. This can be a table name or a lower-case phrase describing what's in
352 :     the file. In the latter case only letters, digits, and underscores are allowed.
353 :    
354 :     =item section
355 :    
356 :     The section of the data to which the file's content relates, or C<undef> if
357 :     the file is for all sections.
358 :    
359 :     =item type
360 :    
361 :     C<data> for a file containing table data, C<control> for a file containing
362 :     ancillary or control data, or C<temp> for a file containing temporary data.
363 :    
364 :     =item dir (optional)
365 :    
366 :     If specified, the name of a directory. The directory name will be prefixed to
367 :     the file name with an intervening slash.
368 :    
369 :     =item RETURN
370 :    
371 :     Returns a file name suitable for the specified purpose.
372 :    
373 :     =back
374 :    
375 :     =cut
376 :    
sub CreateFileName {
    # Get the parameters.
    my ($content, $section, $type, $dir) = @_;
    # Look up the extension for the requested content type. An unknown or
    # missing type would otherwise produce a name ending in a bare period,
    # which no other method in this group would recognize as a load file.
    my $ext = (defined $type ? FILE_EXTS->{$type} : undef);
    if (! defined $ext) {
        my $typeName = (defined $type ? "\"$type\"" : "(undefined)");
        Confess("Invalid file type $typeName in CreateFileName.");
    }
    # Format the section portion of the file name. It is omitted when the
    # file is for all sections.
    my $sectionData = (defined $section && $section ne '' ? "-$section" : '');
    # Assemble it into the file name.
    my $retVal = "$content$sectionData.$ext";
    # Add the directory, if necessary.
    if (defined $dir) {
        $retVal = "$dir/$retVal";
    }
    # Return the result.
    return $retVal;
}
391 :    
392 :     =head3 GetLoadFiles
393 :    
394 :     my @files = ERDBGenerate::GetLoadFiles($directory);
395 :    
396 :     Get a list of the names of the load-related files in the specified
397 :     directory. Only the base file names are returned, without any path
398 :     information. The base names can later be fed to L</ParseFileName> to
399 :     determine what is in the file.
400 :    
401 :     =over 4
402 :    
403 :     =item directory
404 :    
405 :     Load directory for the relevant database.
406 :    
407 :     =item RETURN
408 :    
409 :     Returns a list of base file names for load-related files in the specified
410 :     directory.
411 :    
412 :     =back
413 :    
414 :     =cut
415 :    
sub GetLoadFiles {
    # Unpack the arguments.
    my ($directory) = @_;
    Trace("GetLoadFiles called for $directory.") if T(3);
    # Walk the directory's entries, keeping only the names that carry one of
    # the load-file extensions.
    my @loadFiles;
    for my $entry (Tracer::OpenDir($directory)) {
        if ($entry =~ /\.dt[xyz]$/) {
            push @loadFiles, $entry;
        }
    }
    # Hand back the matching base names.
    return @loadFiles;
}
425 :    
426 :     =head3 TempOf
427 :    
428 :     my $fileName = ERDBGenerate::TempOf($fileName);
429 :    
430 :     Return the temporary file name associated with the specified data file
431 :     name. There is a one-to-one mapping between the name of a file containing
432 :     table data and the corresponding temporary file used during file creation.
433 :    
434 :     =over 4
435 :    
436 :     =item fileName
437 :    
438 :     Name of the table data file to be converted.
439 :    
440 :     =item RETURN
441 :    
442 :     Returns the corresponding temporary file name.
443 :    
444 :     =back
445 :    
446 :     =cut
447 :    
sub TempOf {
    # Unpack the arguments.
    my ($fileName) = @_;
    # Work on a copy so the caller's string is left alone.
    my $tempName = $fileName;
    # The temporary name differs from the data file name only in its final
    # character, which is always 'z'.
    substr($tempName, -1, 1) = 'z';
    return $tempName;
}
458 :    
459 :     =head2 Internal Utility Methods
460 :    
461 :     =head3 _Cleanup
462 :    
463 :     my $fileName = $erdbload->_Cleanup();
464 :    
465 :     Release resources held by this object and return the name of the current
466 :     output file. This method contains operations common to both L</Abort> and
467 :     L</Finish>. If no output file is present, it will return an undefined
468 :     value.
469 :    
470 :     =cut
471 :    
sub _Cleanup {
    # Unpack the arguments.
    my ($self) = @_;
    # Remember the current output file name; it is our return value.
    my $fileName = $self->{fileName};
    # Close the output handle if there is one.
    if (defined $self->{fh}) {
        close($self->{fh});
    }
    # Forget the handle and file name to show we are no longer inside a
    # section.
    @{$self}{qw(fh fileName)} = (undef, undef);
    # Return the saved file name (undefined if no output file was active).
    return $fileName;
}
487 :    
488 :    
489 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3