[Bio] / Sprout / ERDBGenerator.pl Repository:
ViewVC logotype

Annotation of /Sprout/ERDBGenerator.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     use strict;
21 :     use Tracer;
22 :     use ERDB;
23 :     use ERDBGenerate;
24 :     use ERDBLoadGroup;
25 : parrello 1.8 use ERDBExtras;
26 : parrello 1.1
27 :     =head1 ERDBGenerator Script
28 :    
29 :     ERDBGenerator [options] database group1 group2 ...
30 :    
31 :     Generate ERDB table load files
32 :    
33 :     =head2 Introduction
34 :    
35 :     This script manages the generation of load files for an ERDB database. It can
36 :     either function as a worker process that reads section IDs from the standard
37 :     input and generates the load files for each, or it can function as a management
38 :     process that starts a bunch of workers and gives them work.
39 :    
40 :     The positional parameters include a list of load groups to process and the name
41 :     of the database.
42 :    
43 :     =head2 Positional Parameters
44 :    
45 :     =over 4
46 :    
47 :     =item database
48 :    
49 : parrello 1.9 Name of the ERDB database. This should be the class name for the L<ERDB>
50 : parrello 1.1 subclass used to access the database.
51 :    
52 :     =item groups
53 :    
54 : parrello 1.2 List of the table groups to load. A C<+> at the end of the list indicates that
55 :     all groups that follow the last-named group in the standard order should
56 :     be loaded. A C<+> by itself loads all groups in standard order.
57 : parrello 1.1
58 :     =back
59 :    
60 :     =head2 Command-Line Options
61 :    
62 :     =over 4
63 :    
64 :     =item background
65 :    
66 :     Save the standard and error output to files. The files will be created
67 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
68 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
69 :     B<user> option below.
70 :    
71 :     =item clean
72 :    
73 :     Remove temporary files from the load directory. Use this option with care,
74 :     since it will crash if a worker process is still running.
75 :    
76 :     =item clear
77 :    
78 :     If specified, all generated files in the load directory with a C<dt>X suffix
79 :     will be erased. This restores the load directory to a pristine, pre-loading state.
80 :    
81 :     =item clearGroups
82 :    
83 :     If specified, all generated files related to each specified group will be
84 :     erased prior to any further processing. This is useful if a single group
85 :     needs to be reloaded and we don't want to be confused by files leftover
86 :     from previous loads.
87 :    
88 :     =item forked
89 :    
90 :     If specified, then the trace file will not be erased during initialization.
91 :     This prevents the worker processes from stepping on each other's trace output.
92 :    
93 :     =item help
94 :    
95 :     Display this command's parameters and options.
96 :    
97 :     =item maxErrors
98 :    
99 :     If specified, then this process will terminate after the specified number of
100 :     section load errors; otherwise, the process will keep going after a section
101 : parrello 1.2 error. A value of C<0> means the process will ignore all errors. A value of
102 : parrello 1.5 C<1> means it will stop after the first error. The default is C<1>.
103 : parrello 1.1
104 :     =item phone
105 :    
106 :     Phone number to message when the script is complete.
107 :    
108 :     =item sections
109 :    
110 :     Name of a file containing a list of sections to process. If C<*> is specified (the
111 :     default), all sections are processed. This option is ignored if C<workers> is
112 :     C<0>. In that case, the list of sections is taken from the standard input. When
113 :     a file name is specified, if it is not an absolute file name, it is presumed to
114 : parrello 1.11 be in the database's load directory.
115 : parrello 1.1
116 :     =item sql
117 :    
118 :     If specified, turns on tracing of SQL activity.
119 :    
120 :     =item trace
121 :    
122 :     Specifies the tracing level. The higher the tracing level, the more messages
123 :     will appear in the trace log. Use E to specify emergency tracing.
124 :    
125 :     =item user
126 :    
127 :     Name suffix to be used for log files. If omitted, the PID is used.
128 :    
129 :     =item warn
130 :    
131 :     Create an event in the RSS feed when an error occurs.
132 :    
133 : parrello 1.3 =item label
134 :    
135 :     Name of this process, for display during tracing.
136 :    
137 : parrello 1.5 =item resume
138 :    
139 :     If specified, load files that already exist will not be regenerated.
140 :    
141 : parrello 1.1 =item workers
142 :    
143 :     If C<0>, then this is considered to be a worker process and the sections in the
144 :     standard input are processed. If C<1>, then all sections are processed without
145 :     any parallelism and the standard input is ignored. If it is any other number,
146 :     then the appropriate number of worker processes are generated and the sections
147 :     are assigned to them in a round-robin fashion.
148 :    
149 : parrello 1.7 =item memTrace
150 :    
151 :     Trace memory usage at the end of each section.
152 :    
153 : parrello 1.5 =item DBD
154 :    
155 : parrello 1.8 Fully-qualified name of the DBD file. This option allows the use of an alternate
156 : parrello 1.5 DBD during load, so that access to the database by other processes is not
157 :     compromised.
158 :    
159 : parrello 1.1 =back
160 :    
161 :     =cut
162 :    
163 :     # Get the command-line options and parameters. Each option entry appears to be a [default value, help text] pair, and the first argument presumably names the tracing categories to enable -- confirm against Tracer::StandardSetup.
164 :     my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDBGenerate ERDB Stats) ],
165 :     {
166 :     clear => ["", "if specified, the entire load directory will be cleared"],
167 :     clean => ["", "if specified, temporary files in the load directory will be deleted"],
168 :     clearGroups => ["", "if specified, pre-exising load files from the groups processed will be deleted"],
169 : parrello 1.5 maxErrors => ["1", "if non-zero, the maximum allowed number of section failures"],
170 : parrello 1.2 phone => ["", "phone number (international format) to call when load finishes"],
171 : parrello 1.5 trace => ["3", "tracing level"],
172 : parrello 1.1 workers => ["1", "number of worker processes"],
173 : parrello 1.3 label => ["Main", "name of this process"],
174 : parrello 1.5 resume => ["", "if specified, only groups and sections that do not already have load files will be processed"],
175 : parrello 1.1 sections => ["*", "name of a file in the database's load directory containing a list of sections to process"],
176 : parrello 1.5 DBD => ["", "if specified, the name of a DBD file in the FIG directory"],
177 : parrello 1.7 memTrace => ["", "if specified, memory usage will be traced at the end of each section"],
178 : parrello 1.1 },
179 :     "<database> <group1> <group2> ...",
180 :     @ARGV);
181 :     # This is a list of the options that are for manager scripts only; they are removed from the option set handed to workers. NOTE(review): "clean" is declared and listed here, but nothing in the visible code ever reads it -- confirm whether it is handled elsewhere.
182 : parrello 1.5 my @managerOptions = qw(clear clean clearGroups sections);
183 : parrello 1.1 # We're doing heavy pipe stuff, so we need to throw an error on a broken-pipe signal instead of letting the signal end the process silently.
184 :     local $SIG{PIPE} = sub { Confess("Broken pipe.") };
185 :     # Ensure we catch errors: all the real work runs inside this eval so that a failure is traced below instead of ending the script abruptly.
186 :     eval {
187 :     # Get the parameters: the database name comes first, and everything after it is a load group.
188 :     my ($database, @groups) = @parameters;
189 : parrello 1.5 # Check for an alternate DBD (an empty option value is converted to undef).
190 : parrello 1.8 my $altDBD = $options->{DBD} || undef;
191 : parrello 1.2 # Connect to the database and get its load directory.
192 : parrello 1.10 my $erdb = ERDB::GetDatabase($database, $altDBD, externalDBD => 1);
193 : parrello 1.2 my $directory = $erdb->LoadDirectory();
194 : parrello 1.11 Trace("Load directory is $directory.") if T(3);
195 : parrello 1.7 my $source = $erdb->GetSourceObject();
196 : parrello 1.1 # Fix the group list (ComputeGroups presumably expands the "+" notation described in the POD -- confirm).
197 : parrello 1.2 my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
198 : parrello 1.1 # Are we a worker or a manager? A workers value of 0 marks a worker process.
199 :     if ($options->{workers} == 0) {
200 :     # Yes, we're a worker, so the section IDs come from the standard input.
201 : parrello 1.5 Trace("Worker process $options->{label} started.") if T(2);
202 :     LoadFromInput(\*STDIN, $erdb, \@realGroups, $options);
203 : parrello 1.1 } else {
204 :     # Here we're a manager. If the user wants us to clear the directory,
205 :     # do that first. Note that "clear" takes precedence: "clearGroups" is only examined when "clear" is off.
206 :     if ($options->{clear}) {
207 :     # Count the number of files deleted.
208 :     my $deleteCount = 0;
209 :     # Get a list of the applicable file names.
210 :     my @files = ERDBGenerate::GetLoadFiles($directory);
211 :     # It's worth noting if we didn't find any.
212 :     if (! @files) {
213 : parrello 1.5 Trace("Load directory is already clear.") if T(2);
214 : parrello 1.1 } else {
215 :     # Delete the files we found. NOTE(review): the result of unlink is not checked, so the count below also includes files that could not be deleted.
216 :     for my $file (@files) {
217 :     unlink "$directory/$file";
218 :     $deleteCount++;
219 :     }
220 : parrello 1.5 Trace("$deleteCount files deleted from load directory during Clear.") if T(2);
221 : parrello 1.1 }
222 :     } elsif ($options->{clearGroups}) {
223 :     # Here the user only wants to clear the load files for the specified
224 :     # groups. This operation requires significantly greater care. Get
225 :     # the hash of groups to table names.
226 :     my $groupHash = ERDBLoadGroup::GetGroupHash($erdb);
227 :     # Get a list of the load files in this directory (the order returned is up to GetLoadFiles).
228 :     my @files = ERDBGenerate::GetLoadFiles($directory);
229 :     # Get a hash of all the tables to be deleted: every table belonging to any of the groups being processed.
230 :     my %tables = map { $_ => 1 } map { @{$groupHash->{$_}} } @realGroups;
231 :     # We'll count the number of files deleted in here.
232 :     my $deleteCount = 0;
233 :     # Loop through all the files in the directory.
234 :     for my $file (@files) {
235 :     # Extract the relevant table name from the file; only the first value returned by ParseFileName is used.
236 :     my ($table) = ERDBGenerate::ParseFileName($file);
237 :     if ($tables{$table}) {
238 :     # This is one of our tables, so delete the file. A progress message is traced every 100 files. NOTE(review): unlink is unchecked here too.
239 :     unlink "$directory/$file";
240 :     $deleteCount++;
241 : parrello 1.4 Trace("$deleteCount files deleted.") if T(3) && $deleteCount % 100 == 0;
242 : parrello 1.1 }
243 :     }
244 :     Trace("$deleteCount files deleted from load directory during ClearGroups.") if T(2);
245 :     }
246 : parrello 1.2 # Delete any leftover kill file if it exists; LoadFromInput stops processing sections as soon as it sees this file.
247 :     my $killFileName = ERDBLoadGroup::KillFileName($erdb, $directory);
248 :     if (-f $killFileName) {
249 :     Trace("Deleting kill file $killFileName.") if T(2);
250 :     unlink $killFileName;
251 :     }
252 : parrello 1.1 # Now we need to get our list of sections. Check to see if the user
253 :     # supplied a section file ("*" is the default and means none was given).
254 :     my $sectionFile = $options->{sections};
255 :     if ($sectionFile eq "*") {
256 :     # No, so we must create one, named with our PID and listing every section the database reports for the source. NOTE(review): nothing in the visible code removes this file afterwards.
257 :     $sectionFile = "$directory/Sections$$.txt";
258 :     Open(\*SECTIONS, ">$sectionFile");
259 : parrello 1.7 for my $section ($erdb->SectionList($source)) {
260 : parrello 1.1 print SECTIONS "$section\n";
261 :     }
262 :     close SECTIONS;
263 :     } elsif ($sectionFile =~ m#^\w#) {
264 :     # Yes, but it doesn't have a directory name, so add one. The test is only that the name starts with a word character, so names beginning with "/" or "." are used exactly as given.
265 :     $sectionFile = "$directory/$sectionFile";
266 :     }
267 :     # Compute the options to be used for worker processes (or ourselves if
268 :     # we're sequential). We work on a copy so the manager's own options are untouched.
269 :     my %workerOptions = %{$options};
270 :     # Get rid of the manager-only options.
271 :     for my $optionID (@managerOptions) {
272 :     delete $workerOptions{$optionID};
273 :     }
274 :     # Ensure the worker knows what it is: workers=0 marks a worker, and forked/background are switched on. These same settings are also passed to the sequential in-process load below.
275 :     $workerOptions{workers} = 0;
276 :     $workerOptions{forked} = 1;
277 :     $workerOptions{background} = 1;
278 :     # Prepare to read the section file.
279 :     my $ih = Open(undef, "<$sectionFile");
280 :     # Are we a sequential load or a multi-worker manager?
281 :     my $numWorkers = $options->{workers};
282 :     if ($numWorkers == 1) {
283 : parrello 1.2 # We're sequential, so we do all the work ourselves.
284 : parrello 1.1 Trace("Sequential load started.") if T(2);
285 : parrello 1.5 LoadFromInput($ih, $erdb, \@realGroups, \%workerOptions);
286 : parrello 1.1 } else {
287 : parrello 1.5 # Here we need to create the workers. The following array will contain
288 :     # a descriptor for each worker: its section-file handle, its label, and the command that starts it.
289 : parrello 1.1 my @workers = ();
290 : parrello 1.5 # Compute the positional parameters to use for the workers. The worker command is this same script ($0).
291 : parrello 1.3 my $commandParms = join(" ", $database, @realGroups);
292 :     my $command = $0;
293 : parrello 1.1 # Create the workers. Each one gets its own section file in the temp directory, which is later fed to it as standard input. NOTE(review): the command redirects output to ">null", which writes to a file named "null" in the current directory rather than discarding the output -- confirm whether /dev/null was intended.
294 :     for (my $i = 0; $i < $numWorkers; $i++) {
295 : parrello 1.3 my $label = "$options->{label}$i";
296 :     $workerOptions{label} = $label;
297 :     my $commandOptions = Tracer::UnparseOptions(\%workerOptions);
298 : parrello 1.8 my $inFile = "$ERDBExtras::temp/Pipe-$label.tbl";
299 : parrello 1.5 my $oh = Open(undef, ">$inFile");
300 :     my $command = "$command $commandOptions $commandParms <$inFile >null &";
301 :     push @workers, { handle => $oh, label => $label, command => $command };
302 : parrello 1.1 }
303 :     # Now we assign sections to the workers, one line at a time in round-robin order.
304 : parrello 1.3 my $w = 0;
305 : parrello 1.1 while (! eof $ih) {
306 :     # Get the name of the next section (the line terminator is kept so the line can be written out as-is).
307 :     my $line = <$ih>;
308 : parrello 1.5 # Get the output handle for the next worker in rotation.
309 :     my $wh = $workers[$w]->{handle};
310 : parrello 1.1 # Send this section to it.
311 : parrello 1.5 print $wh $line;
312 :     Trace(Tracer::Strip($line) . " sent to $workers[$w]->{label}") if T(3);
313 : parrello 1.3 # Position on the next worker, wrapping back to the first after the last.
314 :     $w = ($w + 1) % $numWorkers;
315 : parrello 1.1 }
316 : parrello 1.5 # All sections are assigned; close the files before any worker is started so each worker reads a complete list.
317 :     for my $worker (@workers) {
318 :     close $worker->{handle};
319 :     }
320 :     # Now start the workers. Each command ends in "&", so the manager launches them and does not wait for them to finish.
321 : parrello 1.1 for my $worker (@workers) {
322 : parrello 1.5 my $cmd = $worker->{command};
323 :     Trace("Starting: $cmd") if T(3);
324 :     system($worker->{command});
325 : parrello 1.1 }
326 :     }
327 : parrello 1.4 Trace("Load manager completed.") if T(2);
328 : parrello 1.1 }
329 :     };
330 :     if ($@) {
331 :     Trace("Script failed with error: $@") if T(0);
332 :     } else {
333 :     Trace("Script complete.") if T(2);
334 :     }
335 :     if ($options->{phone}) { # Optional completion notice; it is sent whether the run succeeded or failed.
336 :     my $msgID = Tracer::SendSMS($options->{phone}, "ERDBGenerator has ended.");
337 :     if ($msgID) {
338 :     Trace("Phone message sent with ID $msgID.") if T(2);
339 :     } else {
340 :     Trace("Phone message not sent.") if T(2);
341 :     }
342 :     }
343 :    
344 : parrello 1.6 =head2 Internal Methods
345 :    
346 : parrello 1.1 =head3 LoadFromInput
347 :    
348 :     LoadFromInput($ih, $erdb, \@groups, \%options);
349 :    
350 :     Load one or more sections of data for the specified table groups. The IDs
351 :     of the data sections will be read from the input stream I<ih>. The groups
352 :     will be loaded in the order specified, each one across all sections in turn.
353 :    
354 :     =over 4
355 :    
356 :     =item ih
357 :    
358 :     File handle for the input stream containing the list of sections to process.
359 :    
360 :     =item erdb
361 :    
362 :     Database object containing information about the tables being loaded.
363 :    
364 :     =item groups
365 :    
366 :     Reference to a list of the names for the load groups to process.
367 :    
368 :     =item options
369 :    
370 :     Reference to a hash of the options passed in from the command line.
371 :    
372 :     =back
373 :    
374 :     =cut
375 :    
376 :     sub LoadFromInput {
377 :     # Get the parameters: the section-list input handle, the database object, a reference to the list of group names, and a reference to the options hash.
378 : parrello 1.5 my ($ih, $erdb, $groups, $options) = @_;
379 : parrello 1.1 # We'll count our errors in here. A maxErrors of 0 disables the error limit.
380 :     my $errorCount = 0;
381 :     my $maxErrors = $options->{maxErrors};
382 : parrello 1.8 # Create the master statistics object, which accumulates the statistics from every group's loader.
383 :     my $stats = Stats->new();
384 : parrello 1.2 # Compute the kill file name. The presence of this file is the signal to stop processing.
385 :     my $killFileName = ERDBLoadGroup::KillFileName($erdb, $erdb->LoadDirectory());
386 :     my $killed = 0;
387 : parrello 1.5 # Slurp in the sections up front, since the whole list is traversed once for every group.
388 :     my @sections = ();
389 :     while (! eof $ih) {
390 :     push @sections, Tracer::GetLine($ih);
391 :     }
392 :     # Loop through the groups.
393 :     for my $group (@$groups) {
394 :     # Create a loader for this group. NOTE(review): this happens for every remaining group even after a kill, although no sections are then processed for it -- confirm that is intended.
395 :     my $loader = $erdb->Loader($group, $options);
396 :     # Loop through the sections.
397 :     for my $section (@sections) {
398 : parrello 1.2 # Only proceed if we haven't been killed. Once the flag is set, every remaining section is skipped without leaving the loops.
399 :     if (! $killed) {
400 :     # Check for a kill file. The check is repeated before each section.
401 :     if (-f $killFileName) {
402 :     # Found one, so kill ourselves.
403 : parrello 1.3 Trace("$options->{label} terminated by kill file.") if T(2);
404 : parrello 1.2 $killed = 1;
405 :     } else {
406 :     # No kill file, so we process the section.
407 : parrello 1.5 Trace("Processing section $section for group $group in $options->{label}.") if T(3);
408 : parrello 1.7 # Record the memory footprint before the section runs (this is sampled even when memTrace is off).
409 :     my $memory0 = Tracer::GetMemorySize();
410 : parrello 1.5 my $ok = $loader->ProcessSection($section);
411 : parrello 1.7 # Do memory tracing: report the footprint before and after the section.
412 :     if ($options->{memTrace}) {
413 :     my $memory1 = Tracer::GetMemorySize();
414 :     Trace("Memory usage by $options->{label} for $group $section was $memory0 to $memory1.") if T(2);
415 :     }
416 : parrello 1.2 # Check to see if we've reached the maximum error count. We only care
417 :     # if maxErrors is nonzero; the error counter is only incremented in that case.
418 :     if (! $ok && $maxErrors && ++$errorCount >= $maxErrors) {
419 :     Trace("Error limit exceeded in database loader.") if T(0);
420 :     $killed = 1;
421 :     }
422 :     }
423 : parrello 1.1 }
424 :     }
425 : parrello 1.8 # Display the statistics for this group's loader.
426 : parrello 1.5 Trace("Statistics for $group in $options->{label}:\n" . $loader->DisplayStats()) if T(2);
427 : parrello 1.8 # Add them to the master statistics.
428 :     $loader->AccumulateStats($stats);
429 : parrello 1.1 }
430 : parrello 1.8 # Tell the user we're done, and show the statistics accumulated across all groups.
431 : parrello 1.5 Trace("Processing finished for worker $options->{label}.") if T(2);
432 : parrello 1.8 Trace("Statistics for this worker:\n" . $stats->Show()) if T(2);
433 : parrello 1.1 }
434 :    
435 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3