[Bio] / Sprout / ERDBGenerate.pm Repository:
ViewVC logotype

Annotation of /Sprout/ERDBGenerate.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package ERDBGenerate;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use PageBuilder;
8 :     use ERDB;
9 :     use Stats;
10 :    
11 :     =head1 ERDB Table Data Generation Helper Object
12 :    
13 :     =head2 Introduction
14 :    
15 :     This object is designed to assist with creating the load files for an ERDB
16 :     data relation (also known as a I<table>).
17 :    
18 :     The generation process can be very long, so each table is loaded a section at a
19 :     time, with multiple sections running in parallel. After the load files for each
20 :     section are created, a separate process is used to collate the sections and load
21 :     them into the database tables.
22 :    
23 :     While the output file is being written, it is kept under a temporary name (the data
24 :     file name with its last character changed to C<z>; see L</TempOf>) to denote that it
25 :     is currently being processed. When the L</Finish> method is called, the file is closed
26 :     and renamed to its real name. If the L</Finish> method is not called, it is presumed
27 :     that the load has failed. The temporary name will remain in place so that the collater
28 :     knows the file is invalid.
27 :    
28 :     This object maintains the following data fields.
29 :    
30 :     =over 4
31 :    
32 :     =item directory
33 :    
34 :     Directory into which load files should be placed.
35 :    
36 :     =item erdb
37 :    
38 :     [[ErdbPm]] object used to create and access the database. This will usually
39 :     be a subclass of a pure ERDB object created to manage a particular database.
40 :    
41 :     =item fh
42 :    
43 :     Open handle of the current output file (if any).
44 :    
45 :     =item fileName
46 :    
47 :     Name of the current output file (if any).
48 :    
49 :     =item relation
50 :    
51 :     Reference to the descriptor for this table's relation in the ERDB database
52 :     object.
53 :    
54 : parrello 1.2 =item stats
55 :    
56 :     Statistics object for recording events.
57 :    
58 : parrello 1.1 =item table
59 :    
60 :     Name of the relation table being loaded.
61 :    
62 :     =back
63 :    
64 :     =cut
65 :    
66 :     =head3 new
67 :    
68 : parrello 1.2 my $erdbload = ERDBGenerate->new($erdb, $directory, $table, $stats);
69 : parrello 1.1
70 :     Create an ERDB Table Load Utility object for a specified table. Note that
71 :     the section ID is not specified here: when generating a table, it is passed
72 :     to the L</Start> method, and for collating and loading it is not needed.
73 :    
74 :     =over 4
75 :    
76 :     =item erdb
77 :    
78 :     [[ErdbPm]] object for the database being loaded.
79 :    
80 :     =item directory
81 :    
82 :     Name of the directory into which the load files are to be placed.
83 :    
84 :     =item table
85 :    
86 :     Name of the table being loaded.
87 :    
88 : parrello 1.2 =item stats
89 : parrello 1.1
90 : parrello 1.2 Statistics object for recording events.
91 : parrello 1.1
92 :     =back
93 :    
94 :     =cut
95 :    
sub new {
    # Constructor: builds the generator object for a single table.
    my ($class, $erdb, $directory, $table, $stats) = @_;
    # The table must correspond to a relation known to the database object.
    my $relation = $erdb->FindRelation($table);
    if (! defined $relation) {
        Confess("Invalid table name \"$table\".");
    }
    # Assemble the object's fields. The file handle and file name stay
    # undefined until Start opens an output file.
    my %fields = (
        table     => $table,
        relation  => $relation,
        erdb      => $erdb,
        stats     => $stats,
        directory => $directory,
        fileName  => undef,
        fh        => undef,
    );
    # Turn the field hash into an object of the requested class.
    return bless \%fields, $class;
}
116 :    
117 :     =head2 Public Methods
118 :    
119 :     =head3 Start
120 :    
121 :     $erdbload->Start($section);
122 :    
123 :     Initialize for loading the specified section into this loader's table.
124 :     This deletes any existing load file for the section and creates a
125 :     temporary file into which new data can be placed using L</Put> calls.
126 :    
127 :     =over 4
128 :    
129 :     =item section
130 :    
131 :     ID of the section being loaded.
132 :    
133 :     =back
134 :    
135 :     =cut
136 :    
sub Start {
    # Begin generating the load file for one section of this table.
    my ($self, $section) = @_;
    # Work out the name the finished data file will have.
    my $fileName = CreateFileName($self->{table}, $section, 'data',
                                  $self->{directory});
    # A finished file left over from an earlier run must not survive.
    if (-e $fileName) {
        unlink $fileName;
    }
    # All output goes to the temporary twin of the data file.
    my $tempName = TempOf($fileName);
    $self->{fh} = Open(undef, ">$tempName");
    # Remember the real name.
    $self->{fileName} = $fileName;
    Trace("Starting output to $fileName for section $section and table $self->{table}.") if T(4);
}
152 :    
153 :     =head3 Put
154 :    
155 :     my $length = $erdbload->Put(%putFields);
156 :    
157 :     Output the specified fields to the currently-active load file. The fields
158 :     come in as a hash mapping field names to field values. Fields whose
159 :     values are not specified will be set to their default value.
160 :    
161 :     =over 4
162 :    
163 :     =item putFields
164 :    
165 :     A hash mapping field names for this generator's target relation to
166 :     field values.
167 :    
168 :     =item RETURN
169 :    
170 :     Returns the number of characters output (excluding delimiters), or zero if
171 :     nothing is output (which usually indicates we're discarding a duplicate entity.)
172 :    
173 :     =back
174 :    
175 :     =cut
176 :    
sub Put {
    # Write one record, built from the named field values, to the active
    # load file and return the total length of the values.
    my ($self, %putFields) = @_;
    # We cannot write unless Start has opened an output file.
    my $handle = $self->{fh};
    Confess("Put before Start for $self->{table}.") if ! defined $handle;
    # The database object verifies and digests the values later on.
    my $erdb = $self->{erdb};
    # Running total of the characters in the field values.
    my $charCount = 0;
    # The field values, in the order the relation declares its fields.
    my @record;
    for my $field (@{$self->{relation}{Fields}}) {
        my $name = $field->{name};
        # The user may have used underscores instead of hyphens for
        # convenience, so the name is tried both ways, real name first.
        (my $altName = $name) =~ tr/-/_/;
        my $value;
        for my $key ($name, $altName) {
            $value = $putFields{$key};
            last if defined $value;
        }
        # With no value supplied, fall back on the field's default. A field
        # with neither a value nor a default is an error.
        unless (defined $value) {
            $value = $field->{default};
            Confess("Missing value for $field->{name} in Put for $self->{table}.")
                if ! defined $value;
        }
        # Add the value to the record and count its characters.
        push @record, $value;
        Trace("Field $name in $self->{table} has value \"$value\".") if T(4);
        $charCount += length("$value");
    }
    # Verify and fix the field values.
    $erdb->VerifyFields($self->{table}, \@record);
    $erdb->DigestFields($self->{table}, \@record);
    # Write the record to the load file.
    Tracer::PutLine($handle, \@record);
    # The caller gets the record length.
    return $charCount;
}
225 :    
226 :     =head3 Finish
227 :    
228 :     $erdbload->Finish();
229 :    
230 :     Finish the load for this table, closing the output file and renaming it
231 :     to mark it finished.
232 :    
233 :     =cut
234 :    
sub Finish {
    # Complete the current section: close the output file and give it its
    # real name so the collator will find it.
    my ($self) = @_;
    # Do standard cleanup. This closes the file and returns its name, which
    # is undefined if no section was started.
    my $fileName = $self->_Cleanup();
    Confess("Finish called before Start for $self->{table}") if ! defined $fileName;
    # Rename the temporary file to the real data file name. A failed rename
    # must not pass silently: the data would stay under its temporary name
    # while the caller believed the load had finished.
    my $tempName = TempOf($fileName);
    rename($tempName, $fileName)
        or Confess("Could not rename $tempName to $fileName: $!");
}
244 :    
245 :     =head3 Abort
246 :    
247 :     $erdbload->Abort();
248 :    
249 :     Terminate the load for this table as having failed. The output file is
250 :     closed and deleted.
251 :    
252 :     =cut
253 :    
sub Abort {
    # Give up on the current section: close the output file and delete it.
    my ($self) = @_;
    # The shared cleanup closes the file and returns its name, which is
    # undefined when no section is in progress.
    my $fileName = $self->_Cleanup();
    if (defined $fileName) {
        # Only the temporary file can exist at this point, so that is the
        # one to remove.
        my $tempName = TempOf($fileName);
        if (-e $tempName) {
            unlink $tempName;
        }
    }
}
265 :    
266 :    
267 :     =head2 File Naming Methods
268 :    
269 :     These methods are used to analyze and generate file names. There are many packages
270 :     involved in creating and managing load files. All the file names are generated by
271 :     methods in this group so that there is no breakdown of communication should the file
272 :     naming conventions change.
273 :    
274 :     Currently, a file name consists of a content name, an optional section
275 :     name preceded by a hyphen, and an extension of C<dtx> or C<dty>. A C<dtx> file
276 :     contains table data, and its content name will be the same as the relevant table
277 :     name. A C<dty> file contains control data. Files with control data
278 :     are considered transient, so during post-processing no attempt is made to insure they
279 :     are all present or absent. If a control data file is not table-related, the content
280 :     name should be in all lower case with underscores, so that it is guaranteed not to
281 :     conflict with a table name.
282 :    
283 :     =cut
284 :    
# File name extension => content type.
use constant FILE_TYPES => {
    dtx => 'data',
    dty => 'control',
    dtz => 'temp',
};
# Content type => file name extension (the inverse of FILE_TYPES).
use constant FILE_EXTS => {
    data    => 'dtx',
    control => 'dty',
    temp    => 'dtz',
};
289 :    
290 :     =head3 ParseFileName
291 :    
292 :     my ($content, $section, $type) = ERDBGenerate::ParseFileName($fileName);
293 :    
294 :     Parse a base file name to extract the content name, the section name, and the
295 :     file type. If the file is for an entire table (not partial), the section name will
296 :     be undefined. If the file does not appear to be a load-related file, all return
297 :     values will be undefined. If the file belongs to a particular table, the content
298 :     name will be the table name; otherwise, the content name will not correspond to the
299 :     name of any table.
300 :    
301 :     =over 4
302 :    
303 :     =item fileName
304 :    
305 :     File name to parse. This should be a base file name with no directory
306 :     information in it.
307 :    
308 :     =item RETURN
309 :    
310 :     Returns a three-element list. The first two elements are the content name (which
311 :     could be a table name) and the section name (which will be undefined if the
312 :     file does not belong to a specific section). The third element will be C<data>
313 :     if the file contains table data, C<control> if it contains control or status
314 :     data (such as, for example, a saved list of section names), or C<temp> if it is
315 :     a temporary file.
316 :    
317 :     =back
318 :    
319 :     =cut
320 :    
sub ParseFileName {
    # Split a base file name into its content name, section name, and
    # content type. All three come back undefined if the name is not that
    # of a load-related file.
    my ($fileName) = @_;
    # Declare the return variables. They stay undefined unless a pattern
    # below matches.
    my ($content, $section, $type);
    # Both patterns are anchored at the end so that a name which merely
    # contains a load-file extension (for example, "Genome-1.dtx.bak") is
    # rejected. All three extensions in FILE_TYPES are accepted, so a
    # temporary file is reported with a type of "temp".
    if ($fileName =~ m#^(\w+)-(.+)\.(dt[xyz])$#) {
        # We have a content name and a section.
        ($content, $section, $type) = ($1, $2, FILE_TYPES->{$3});
    } elsif ($fileName =~ m#^(\w+)\.(dt[xyz])$#) {
        # Here there is no section, just the content name.
        ($content, $type) = ($1, FILE_TYPES->{$2});
    }
    # Return the results.
    return ($content, $section, $type);
}
337 :    
338 :     =head3 CreateFileName
339 :    
340 :     my $fileName = ERDBGenerate::CreateFileName($content, $section, $type, $dir);
341 :    
342 :     Return a file name for the specified type of operation on the specified
343 :     content and optionally the specified section.
344 :    
345 :     =over 4
346 :    
347 :     =item content
348 :    
349 :     File content. This can be a table name or a lower-case phrase describing what's in
350 :     the file. In the latter case only letters, digits, and underscores are allowed.
351 :    
352 :     =item section
353 :    
354 :     The section of the data to which the file's content relates, or C<undef> if
355 :     the file is for all sections.
356 :    
357 :     =item type
358 :    
359 :     C<data> for a file containing table data, C<control> for a file containing
360 :     ancillary or control data, or C<temp> for a file containing temporary data.
361 :    
362 :     =item dir (optional)
363 :    
364 :     If specified, the name of a directory. The directory name will be prefixed to
365 :     the file name with an intervening slash.
366 :    
367 :     =item RETURN
368 :    
369 :     Returns a file name suitable for the specified purpose.
370 :    
371 :     =back
372 :    
373 :     =cut
374 :    
sub CreateFileName {
    # Build the load file name for the given content, section, and type.
    my ($content, $section, $type, $dir) = @_;
    # Start with the content name.
    my $baseName = $content;
    # A non-empty section is appended after a hyphen.
    if (defined $section && $section ne '') {
        $baseName .= "-$section";
    }
    # The extension is determined by the content type.
    $baseName .= '.' . FILE_EXTS->{$type};
    # Prefix the directory when one was supplied.
    return (defined $dir ? "$dir/$baseName" : $baseName);
}
389 :    
390 :     =head3 GetLoadFiles
391 :    
392 :     my @files = ERDBGenerate::GetLoadFiles($directory);
393 :    
394 :     Get a list of the names of the load-related files in the specified
395 :     directory. Only the base file names are returned, without any path
396 :     information. The base names can later be fed to L</ParseFileName> to
397 :     determine what is in the file.
398 :    
399 :     =over 4
400 :    
401 :     =item directory
402 :    
403 :     Load directory for the relevant database.
404 :    
405 :     =item RETURN
406 :    
407 :     Returns a list of base file names for load-related files in the specified
408 :     directory.
409 :    
410 :     =back
411 :    
412 :     =cut
413 :    
sub GetLoadFiles {
    # List the base names of the load-related files in a directory.
    my ($directory) = @_;
    Trace("GetLoadFiles called for $directory.") if T(3);
    # Keep only the directory entries that end in a load-file extension.
    my @loadFiles;
    for my $baseName (Tracer::OpenDir($directory)) {
        push @loadFiles, $baseName if $baseName =~ /\.dt[xyz]$/;
    }
    # Hand back the matching names.
    return @loadFiles;
}
423 :    
424 :     =head3 TempOf
425 :    
426 :     my $fileName = ERDBGenerate::TempOf($fileName);
427 :    
428 :     Return the temporary file name associated with the specified data file
429 :     name. There is a one-to-one mapping between the name of a file containing
430 :     table data and the corresponding temporary file used during file creation.
431 :    
432 :     =over 4
433 :    
434 :     =item fileName
435 :    
436 :     Name of the table data file to be converted.
437 :    
438 :     =item RETURN
439 :    
440 :     Returns the corresponding temporary file name.
441 :    
442 :     =back
443 :    
444 :     =cut
445 :    
sub TempOf {
    # Compute the temporary file name that goes with a data file name.
    my ($fileName) = @_;
    # Work on a copy so that the name is not altered in place.
    my $tempName = $fileName;
    # The temporary name differs only in its final character, which
    # becomes 'z'.
    substr($tempName, -1, 1) = 'z';
    return $tempName;
}
456 :    
457 :     =head2 Internal Utility Methods
458 :    
459 :     =head3 _Cleanup
460 :    
461 :     my $fileName = $erdbload->_Cleanup();
462 :    
463 :     Release resources held by this object and return the name of the current
464 :     output file. This method contains operations common to both L</Abort> and
465 :     L</Finish>. If no output file is present, it will return an undefined
466 :     value.
467 :    
468 :     =cut
469 :    
sub _Cleanup {
    # Shared tail of Finish and Abort: close the output file, forget it,
    # and report what it was called.
    my ($self) = @_;
    # Pull out the current file name and handle together.
    my ($fileName, $handle) = @{$self}{qw(fileName fh)};
    # Only an open section has a handle to close.
    if (defined $handle) {
        close $handle;
    }
    # Clear both fields to denote we're no longer inside a section.
    @{$self}{qw(fh fileName)} = (undef, undef);
    # The name is undefined if no output file was active.
    return $fileName;
}
485 :    
486 :    
487 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3