[Bio] / Sprout / ERDBLoader.pl Repository:
ViewVC logotype

Annotation of /Sprout/ERDBLoader.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     use strict;
21 :     use Tracer;
22 :     use ERDB;
23 :     use ERDBLoadGroup;
24 :     use ERDBGenerate;
25 :     use Stats;
26 : parrello 1.2 use Time::HiRes;
27 : parrello 1.1
28 :    
29 :     =head1 ERDBLoader Script
30 :    
31 :     ERDBLoader [options] <database> <group1> <group2> ...
32 :    
33 :     ERDB Database Load Finisher
34 :    
35 :     =head2 Introduction
36 :    
37 :     This script finishes the database load process begun by [[ERDBGeneratorPl]].
38 :    
39 :     [[ERDBGeneratorPl]] divides the source data into sections, and generates a
40 :     partial load file for each section of each table. To finish the load process, we
41 :     need to combine the partial files into single files and load the resulting
42 :     single files into the database tables.
43 :    
44 :     Like [[ERDBGeneratorPl]], this script acts on load groups-- sets of related
45 :     tables that are loaded at the same time. For each table in a named group that
46 :     does not exist in the database, the script first attempts to find a completed
47 :     data file. If one does not exist, it attempts to create one by collating section
48 : parrello 1.2 files. Once the collated section files for a load group are finished, they are
49 :     loaded into the database.
50 : parrello 1.1
51 :     =head2 Positional Parameters
52 :    
53 :     =over 4
54 :    
55 :     =item database
56 :    
57 :     Name of the ERDB database. This should be the class name for the subclass used
58 :     to access the database.
59 :    
60 :     =back
61 :    
62 :     =head2 Command-Line Options
63 :    
64 :     =over 4
65 :    
66 :     =item trace
67 :    
68 :     Specifies the tracing level. The higher the tracing level, the more messages
69 :     will appear in the trace log. Use E to specify emergency tracing.
70 :    
71 :     =item user
72 :    
73 :     Name suffix to be used for log files. If omitted, the PID is used.
74 :    
75 :     =item sql
76 :    
77 :     If specified, turns on tracing of SQL activity.
78 :    
79 : parrello 1.7 =item clear
80 :    
81 :     If specified, existing load files will be recreated from sections if the sections
82 :     are present.
83 :    
84 : parrello 1.1 =item background
85 :    
86 :     Save the standard and error output to files. The files will be created
87 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
88 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
89 :     B<user> option above.
90 :    
91 :     =item help
92 :    
93 :     Display this command's parameters and options.
94 :    
95 : parrello 1.3 =item keepSections
96 :    
97 :     If specified, section files (the fragments of data load files created by
98 :     [[ERDBGeneratorPl]]) will not be deleted after they are collated.
99 :    
100 : parrello 1.4 =item sanityCheck
101 :    
102 :     If specified, no tables will be loaded. Instead, the first I<N> records from the
103 :     assembled load files will be displayed so that the file contents can be
104 :     visually matched against the column names.
105 :    
106 : parrello 1.1 =item warn
107 :    
108 :     Create an event in the RSS feed when an error occurs.
109 :    
110 :     =item phone
111 :    
112 :     Phone number to message when the script is complete.
113 :    
114 : parrello 1.6 =item DBD
115 :    
116 : parrello 1.10 Fully-qualified name of the DBD file. This option allows the use of an alternate
117 : parrello 1.9 DBD during load so that access to the database by other processes is not
118 : parrello 1.6 compromised.
119 :    
120 : parrello 1.8 =item loadDirectory
121 :    
122 :     Directory containing the load files. This option allows you to request that
123 :     load files from another version of the NMPDR be used, which is useful when
124 :     creating a new NMPDR: we can yank in the data from the previous database while
125 :     waiting for the new load files to be generated.
126 :    
127 : parrello 1.10 =item dbName
128 :    
129 :     SQL name of the target database. If not specified, the default name is used.
130 :     This option allows you to specify a backup or alternate database that can
131 :     be loaded without compromising the main database.
132 :    
133 : parrello 1.1 =back
134 :    
135 :     =cut
136 :    
# Main script: parse the command line, then for each requested load group
# make sure every table has a collated data file (building it from section
# files if necessary) and either load the files into the database or run a
# sanity check on them. All real work is done inside an eval so that a
# failure is traced and the optional completion phone message is still sent.

# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],
        {
            dbName => ["", "if specified, the SQL name of the target database"],
            clear => ["", "overwrite existing load files if sections are present"],
            sanityCheck => ["", "don't load, trace contents of first N load file records instead"],
            trace => ["2", "tracing level"],
            keepSections => ["", "if specified, section files will not be deleted after being collated"],
            phone => ["", "phone number (international format) to call when load finishes"],
            DBD => ["", "if specified, the name of a DBD file in the FIG directory"],
            loadDirectory => ["", "if specified, an alternate directory containing the load files"],
        },
        "<database> <group1> <group2> ...",
        @ARGV);
# Ensure we catch errors.
eval {
    # Get the parameters.
    my ($database, @groups) = @parameters;
    # Connect to the database.
    my $erdb = ERDB::GetDatabase($database, undef, %$options);
    # Fix the group list.
    my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
    # Get the source object and load directory for this database.
    my $source = $erdb->GetSourceObject();
    my $directory = $options->{loadDirectory} || $erdb->LoadDirectory();
    # Get the list of sections.
    my @sectionList = $erdb->SectionList($source);
    # Create a statistics object to track our progress.
    my $stats = Stats->new();
    # Find out if we're doing a sanity check. An empty string means "real
    # load". We test for definedness rather than truth so that an explicit
    # value of 0 is preserved: 0 means "suppress the load step but display
    # nothing", and must not be mistaken for "option not specified".
    my $sanityCheck = (defined $options->{sanityCheck} ? $options->{sanityCheck} : "");
    # Start a timer.
    my $totalStart = time();
    # Loop through the groups.
    for my $group (@realGroups) {
        # Get the list of tables for this group.
        my @tableList = ERDBLoadGroup::GetTables($erdb, $group);
        # We need to ensure there is a data file for every table. If we fail
        # to find one, we count it here, which prevents us from loading the
        # group into the database.
        my $missingTable = 0;
        # Loop through the tables in this group.
        for my $table (@tableList) {
            Trace("Processing table $table for assembly.") if T(2);
            # Get the section file names.
            my @sectionFiles =
                map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;
            # Get the data file name.
            my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
            # This will be set to TRUE only when a complete data file exists
            # for the table. The section files may only be deleted in that case.
            my $haveDataFile = 0;
            # Do we have it?
            if (-f $dataFile && ! $options->{clear}) {
                # Yes. This is good news.
                $stats->Add('tables-found' => 1);
                Trace("Table file found for $table.") if T(3);
                $haveDataFile = 1;
            } else {
                # No, we must build it. Verify that we have all the sections.
                my @missingFiles = grep { ! -f $_ } @sectionFiles;
                # Did we find everything?
                if (scalar @missingFiles) {
                    # No! Denote that we have a missing table.
                    $missingTable++;
                    $stats->Add('tables-skipped' => 1);
                    # If the user wants a sanity check, we want to give him some
                    # data anyway.
                    if ($sanityCheck) {
                        # Get some data lines in the sections. Note we stop when we've
                        # exceeded the number of lines expected by the sanity check.
                        my @lines;
                        for my $sectionFile (@sectionFiles) {
                            if (-s $sectionFile && scalar(@lines) < $sanityCheck) {
                                Trace("Reading from $sectionFile for $table.") if T(3);
                                push @lines, Tracer::GetFile($sectionFile);
                            }
                        }
                        # Create a new temporary file.
                        my $tmpFile = "$FIG_Config::temp/$table$$.temp.dtx";
                        my $oh = Open(undef, ">$tmpFile");
                        # Put all the data into it.
                        Trace(scalar(@lines) . " data lines found.") if T(3);
                        print $oh join("\n", @lines);
                        close $oh;
                        # Sanity check the temp file.
                        CheckLoadFile($erdb, $table, $tmpFile, $sanityCheck);
                        # Clean it up.
                        unlink $tmpFile;
                    } else {
                        # Otherwise tell the user about all the missing files.
                        for my $missingFile (@missingFiles) {
                            $stats->Add('sections-missing' => 1);
                            $stats->AddMessage("Data file $missingFile not found for table $table.");
                        }
                    }
                } else {
                    # We have all the sections. Try to assemble them into a data file.
                    my $sortStart = time();
                    my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
                    Trace("Sort command: $sortCommand") if T(3);
                    # Pipe to the sort command. Note that we turn on autoflush
                    # so there's no buffering, and then restore whatever output
                    # handle was selected before.
                    my $oh = Open(undef, "| $sortCommand");
                    my $oldHandle = select $oh;
                    $| = 1;
                    select $oldHandle;
                    # Loop through the sections.
                    for my $sectionFile (@sectionFiles) {
                        Trace("Collating $sectionFile.") if T(3);
                        $stats->Add("$table-sections" => 1);
                        # Loop through the section file.
                        my $ih = Open(undef, "<$sectionFile");
                        while (defined (my $line = <$ih>)) {
                            print $oh $line;
                            $stats->Add("$table-collations" => 1);
                        }
                        close $ih;
                    }
                    # Finish the sort step. Closing a pipe waits for the command
                    # and fails if it exited with a non-zero status. We must not
                    # go on to delete the sections or load a partial file in
                    # that case.
                    Trace("Finishing collate for $table.") if T(3);
                    if (! close $oh) {
                        Confess("Sort command failed for $table (status $?): $sortCommand");
                    }
                    $stats->Add('tables-collated' => 1);
                    $stats->Add('collate-time' => time() - $sortStart);
                    $haveDataFile = 1;
                }
            }
            # If we have a full data file, we can delete the section files to
            # make room in the data directory. We never delete the sections of
            # a table whose data file could not be built, since they would be
            # needed to build it later. The user can turn the deletion off
            # with the keepSections option.
            if ($haveDataFile && ! $options->{keepSections}) {
                for my $sectionFile (@sectionFiles) {
                    if (-e $sectionFile) {
                        unlink $sectionFile;
                        $stats->Add('files-deleted' => 1);
                    }
                }
                Trace("Section files for $table deleted.") if T(3);
            }
        }
        # Were any tables missing?
        if ($missingTable) {
            # Yes, skip this group.
            $stats->Add('groups-skipped' => 1);
            Trace("Skipping $group group: $missingTable missing tables.") if T(2);
        } else {
            # No! Process this group's files.
            if ($sanityCheck eq "") {
                Trace("Loading group $group into database.") if T(2);
            } else {
                Trace("Sanity check for group $group.") if T(2);
            }
            my $loadStart = time();
            for my $table (@tableList) {
                my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
                # Do we want a real load or a sanity check?
                if ($sanityCheck eq "") {
                    # Real load.
                    my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
                    $stats->Accumulate($newStats);
                    Trace("$fileName loaded into $table.") if T(3);
                } elsif ($sanityCheck > 0) {
                    # Here we want a sanity check. Note that if the check value is 0,
                    # we don't bother. The user just wants to suppress the load step.
                    CheckLoadFile($erdb, $table, $fileName, $sanityCheck);
                }
            }
            $stats->Add("groups-loaded" => 1);
            # Record the elapsed time for this group's load step.
            $stats->Add('load-time' => time() - $loadStart);
        }
    }
    $stats->Add('total-time' => time() - $totalStart);
    # Display the statistics from this run.
    Trace("Statistics for load:\n" . $stats->Show()) if T(2);
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
} else {
    Trace("Script complete.") if T(2);
}
# Tell the user we're done, if a phone number was supplied.
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "ERDBLoader completed.");
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}
318 :    
319 : parrello 1.8 =head2 Internal Methods
320 :    
321 : parrello 1.4 =head3 CheckLoadFile
322 :    
323 :     CheckLoadFile($erdb, $table, $fileName, $count);
324 :    
325 :     Read the first few records of a load file and trace the contents at level
326 :     2. This allows the user to visually compare the load file contents with
327 :     the database definition.
328 :    
329 :     =over 4
330 :    
331 :     =item erdb
332 :    
333 :     [[ErdbPm]] object describing the database.
334 :    
335 :     =item table
336 :    
337 :     Name of the table to check.
338 :    
339 :     =item fileName
340 :    
341 :     Name of the load file to check.
342 :    
343 :     =item count
344 :    
345 :     Number of records to check.
346 :    
347 :     =back
348 :    
349 :     =cut
350 :    
# Trace the first $count records of the load file $fileName, one multi-line
# message per field of the relation $table, so the file contents can be
# compared by eye against the field names and types from $erdb. Overlong
# values are cut to 40 characters. Dies (via Confess) if the relation is
# not found in the database. Returns nothing.
sub CheckLoadFile {
    # Get the parameters.
    my ($erdb, $table, $fileName, $count) = @_;
    # Open the file for input.
    my $ih = Open(undef, "<$fileName");
    # Slurp the first N records. Each record is stored as a list of fields.
    my @records;
    while (! eof $ih && scalar(@records) < $count) {
        push @records, [ Tracer::GetLine($ih) ];
    }
    # We have everything we need from the file, so release the handle now
    # rather than keeping it open for the rest of the run.
    close $ih;
    my $found = scalar(@records);
    Trace("$found records for $table found in sanity check using $fileName.") if T(2);
    # Do we have any data at all?
    if ($found) {
        # Yes. Get the table's descriptor. We use this to determine the field names.
        my $relationData = $erdb->FindRelation($table);
        Confess("Relation $table not found in database.") if (! defined $relationData);
        my @fields = @{$relationData->{Fields}};
        # If this is a relationship, we need the FROM and TO data.
        my %ends;
        ($ends{from}, $ends{to}) = $erdb->GetRelationshipEntities($table);
        # Loop through the fields. We generate one message per field.
        for my $i (0 .. $#fields) {
            # Get this field's information.
            my $fieldInfo = $fields[$i];
            my $type = $fieldInfo->{type};
            my $name = $fieldInfo->{name};
            if ($name =~ /^(from|to)-link$/) {
                # Here it's a relationship link, so add the name of the target table to
                # the type.
                $type .= " ($ends{$1})";
            }
            # This is going to be a multi-line trace message. We start with the
            # field name and type.
            my @lines = ("Values for $table($name), type $type:\n");
            # Loop through the records. We generate one line of data per record.
            for my $j (0 .. $found - 1) {
                # Get the field value.
                my $field = $records[$j]->[$i];
                # Compute the record label.
                my $line = "Record $j";
                # Check for unusual cases.
                if (! defined $field || $field eq '') {
                    $line .= "= <empty>";
                } else {
                    # Make sure we don't trace something ungodly.
                    my $excess = (length $field) - 40;
                    if ($excess > 0) {
                        $field = substr($field, 0, 40) . " >> + $excess characters";
                    }
                    $line .= ": $field";
                }
                # Save this line. We indent a little for readability.
                push @lines, " $line";
            }
            # Trace this field.
            Trace(join("\n", @lines)) if T(2);
        }
    }
    return;
}
409 :    
410 :    
411 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3