[Bio] / Sprout / ERDBLoader.pl Repository:
ViewVC logotype

Diff of /Sprout/ERDBLoader.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Tue Sep 30 15:23:55 2008 UTC revision 1.9, Mon Mar 23 19:34:35 2009 UTC
# Line 23  Line 23 
23  use ERDBLoadGroup;  use ERDBLoadGroup;
24  use ERDBGenerate;  use ERDBGenerate;
25  use Stats;  use Stats;
26    use Time::HiRes;
27    
28    
29  =head1 ERDBLoader Script  =head1 ERDBLoader Script
# Line 44  Line 45 
45  tables that are loaded at the same time. For each table in a named group that  tables that are loaded at the same time. For each table in a named group that
46  does not exist in the database, the script first attempts to find a completed  does not exist in the database, the script first attempts to find a completed
47  data file. If one does not exist, it attempts to create one by collating section  data file. If one does not exist, it attempts to create one by collating section
48  files. Once the collated section file is finished, it is loaded into the  files. Once the collated section files for a load group are finished, they are
49  database.  loaded into the database.
50    
51  =head2 Positional Parameters  =head2 Positional Parameters
52    
# Line 56  Line 57 
57  Name of the ERDB database. This should be the class name for the subclass used  Name of the ERDB database. This should be the class name for the subclass used
58  to access the database.  to access the database.
59    
   
60  =back  =back
61    
62  =head2 Command-Line Options  =head2 Command-Line Options
# Line 68  Line 68 
68  Specifies the tracing level. The higher the tracing level, the more messages  Specifies the tracing level. The higher the tracing level, the more messages
69  will appear in the trace log. Use E to specify emergency tracing.  will appear in the trace log. Use E to specify emergency tracing.
70    
 =item replace  
   
 Normally, if a table exists in the database, it will not be loaded. If this  
 option is specified, however, existing tables will be dropped and recreated.  
   
 =item resume  
   
 If specified, then the group list must contain a single group. The specified  
 group and all groups after it in the group list will be processed.  
   
71  =item user  =item user
72    
73  Name suffix to be used for log files. If omitted, the PID is used.  Name suffix to be used for log files. If omitted, the PID is used.
# Line 86  Line 76 
76    
77  If specified, turns on tracing of SQL activity.  If specified, turns on tracing of SQL activity.
78    
79    =item clear
80    
81    If specified, existing load files will be recreated from sections if the sections
82    are present.
83    
84  =item background  =item background
85    
86  Save the standard and error output to files. The files will be created  Save the standard and error output to files. The files will be created
# Line 97  Line 92 
92    
93  Display this command's parameters and options.  Display this command's parameters and options.
94    
95    =item keepSections
96    
97    If specified, section files (the fragments of data load files created by
98    [[ERDBGeneratorPl]]) will not be deleted after they are collated.
99    
100    =item sanityCheck
101    
102    If specified, no tables will be loaded. Instead, the first I<N> records from the
103    assembled load files will be displayed so that the file contents can be
104    visually matched against the column names.
105    
106  =item warn  =item warn
107    
108  Create an event in the RSS feed when an error occurs.  Create an event in the RSS feed when an error occurs.
# Line 105  Line 111 
111    
112  Phone number to message when the script is complete.  Phone number to message when the script is complete.
113    
114    =item DBD
115    
116    Name of the DBD file. If specified, the DBD must be in the main FIG directory
117    (specified in C<$FIG_Config::fig>). This option allows the use of an alternate
118    DBD during load so that access to the database by other processes is not
119    compromised.
120    
121    =item loadDirectory
122    
123    Directory containing the load files. This option allows you to request that
124    load files from another version of the NMPDR be used, which is useful when
125    creating a new NMPDR: we can yank in the data from the previous database while
126    waiting for the new load files to be generated.
127    
128  =back  =back
129    
130  =cut  =cut
# Line 112  Line 132 
132  # Get the command-line options and parameters.  # Get the command-line options and parameters.
133  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],
134                                             {                                             {
135                                                trace => ["", "tracing level"],                                                clear => ["", "overwrite existing load files if sections are present"],
136                                                replace => ["", "if specified, existing tables will be overwritten"],                                                sanityCheck => ["", "don't load, trace contents of first N load file records instead"],
137                                                resume => ["", "if specified, the specified group and all groups that normally come after it will be processed"],                                                trace => ["2", "tracing level"],
138                                                phone => ["", "phone number (international format) to call when load finishes"]                                                keepSections => ["", "if specified, section files will not be deleted after being collated"],
139                                                  phone => ["", "phone number (international format) to call when load finishes"],
140                                                  DBD => ["", "if specified, the name of a DBD file in the FIG directory"],
141                                                  loadDirectory => ["", "if specified, an alternate directory containing the load files"],
142                                             },                                             },
143                                             "<database> <group1> <group2> ...",                                             "<database> <group1> <group2> ...",
144                                             @ARGV);                                             @ARGV);
# Line 125  Line 148 
148  eval {  eval {
149      # Get the parameters.      # Get the parameters.
150      my ($database, @groups) = @parameters;      my ($database, @groups) = @parameters;
151      # Connect to the database.      # Check for an alternate DBD.
152      my $erdb = ERDB::GetDatabase($database);      my $altDBD = ($options->{DBD} ? "$FIG_Config::fig/$options->{DBD}" : undef);
153        # Connect to the database and get its load directory.
154        my $erdb = ERDB::GetDatabase($database, $altDBD);
155      # Fix the group list.      # Fix the group list.
156      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, $options, \@groups);      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
157      # Get the source object and load directory for this database.      # Get the source object and load directory for this database.
158      my $source = $erdb->GetSourceObject();      my $source = $erdb->GetSourceObject();
159      my $directory = $erdb->LoadDirectory();      my $directory = $options->{loadDirectory} || $erdb->LoadDirectory();
160      # Get the list of sections.      # Get the list of sections.
161      my @sectionList = $erdb->SectionList($source);      my @sectionList = $erdb->SectionList($source);
162      # Create a statistics object to track our progress.      # Create a statistics object to track our progress.
163      my $stats = Stats->new();      my $stats = Stats->new();
164      # Get the hash of group names to table names.      # Find out if we're doing a sanity check.
165      my $groupHash = ERDBLoadGroup::GetGroupHash($erdb);      my $sanityCheck = $options->{sanityCheck} || "";
166        # Start a timer.
167        my $totalStart = time();
168      # Loop through the groups.      # Loop through the groups.
169      for my $group (@realGroups) {      for my $group (@realGroups) {
170          # Get the list of tables for this group.          # Get the list of tables for this group.
171          my $tableList = $groupHash->{$group};          my @tableList = ERDBLoadGroup::GetTables($erdb, $group);
172            # We need to ensure there is a data file for every table. If we fail to find one,
173            # we set the following error flag, which prevents us from loading the database.
174            my $missingTable = 0;
175            # Loop through the tables in this group.
176            for my $table (@tableList) {
177                Trace("Processing table $table for assembly.") if T(2);
178                # Get the section file names.
179                my @sectionFiles =
180                    map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;
181                # Get the data file name.
182                my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
183                # Do we have it?
184                if (-f $dataFile && ! $options->{clear}) {
185                    # Yes. This is good news.
186                    $stats->Add('tables-found' => 1);
187                    Trace("Table file found for $table.") if T(3);
188                } else {
189                    # No, we must build it. Verify that we have all the sections.
190                    my @missingFiles = grep { ! -f $_ } @sectionFiles;
191                    # Did we find everything?
192                    if (scalar @missingFiles) {
193                        # No! Denote that we have a missing table.
194                        $missingTable++;
195                        $stats->Add('tables-skipped' => 1);
196                        # If the user wants a sanity check, we want to give him some
197                        # data anyway.
198                        if ($sanityCheck) {
199                            # Get some data lines in the sections. Note we stop when we've exceeded
200                            # the number of lines expected by the sanity check.
201                            my @lines;
202                            for my $sectionFile (@sectionFiles) {
203                                if (-s $sectionFile && scalar(@lines) < $sanityCheck) {
204                                    Trace("Reading from $sectionFile for $table.") if T(3);
205                                    push @lines, Tracer::GetFile($sectionFile);
206                                }
207                            }
208                            # Create a new temporary file.
209                            my $tmpFile = "$FIG_Config::temp/$table$$.temp.dtx";
210                            my $oh = Open(undef, ">$tmpFile");
211                            # Put all the data into it.
212                            Trace(scalar(@lines) . " data lines found.") if T(3);
213                            print $oh join("\n", @lines);
214                            close $oh;
215                            # Sanity check the temp file.
216                            CheckLoadFile($erdb, $table, $tmpFile, $sanityCheck);
217                            # Clean it up.
218                            unlink $tmpFile;
219                        } else {
220                            # Otherwise tell the user about all the missing files.
221                            for my $missingFile (@missingFiles) {
222                                $stats->Add('sections-missing' => 1);
223                                $stats->AddMessage("Data file $missingFile not found for table $table.");
224                            }
225                        }
226                    } else {
227                        # We have all the sections. Try to assemble them into a data file.
228                        my $sortStart = time();
229                        my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
230                        Trace("Sort command: $sortCommand") if T(3);
231                        # Pipe to the sort command. Note that we turn on autoflush
232                        # so there's no buffering.
233                        my $oh = Open(undef, "| $sortCommand");
234                        select $oh; $| = 1; select STDOUT;
235                        # Loop through the sections.
236                        for my $sectionFile (@sectionFiles) {
237                            Trace("Collating $sectionFile.") if T(3);
238                            $stats->Add("$table-sections" => 1);
239                            # Loop through the section file.
240                            my $ih = Open(undef, "<$sectionFile");
241                            while (defined (my $line = <$ih>)) {
242                                print $oh $line;
243                                $stats->Add("$table-collations" => 1);
244                            }
245                        }
246                        # Finish the sort step.
247                        Trace("Finishing collate for $table.") if T(3);
248                        close $oh;
249                        $stats->Add('tables-collated' => 1);
250                        $stats->Add('collate-time' => time() - $sortStart);
251                    }
252                }
253                # Now that we know we have a full data file, we can delete the
254                # section files to make room in the data directory. The user can
255                # turn this behavior off with the keepSections option.
256                if (! $options->{keepSections}) {
257                    for my $sectionFile (@sectionFiles) {
258                        if (-e $sectionFile) {
259                            unlink $sectionFile;
260                            $stats->Add('files-deleted' => 1);
261                        }
262                    }
263                    Trace("Section files for $table deleted.") if T(3);
264                }
265            }
266            # Were any tables missing?
267            if ($missingTable) {
268                # Yes, skip this group.
269                $stats->Add('groups-skipped' => 1);
270                Trace("Skipping $group group: $missingTable missing tables.") if T(2);
271            } else {
272                # No! Process this group's files.
273                if ($sanityCheck eq "") {
274                    Trace("Loading group $group into database.") if T(2);
275                } else {
276                    Trace("Sanity check for group $group.") if T(2);
277                }
278                my $loadStart = time();
279                for my $table (@tableList) {
280                    my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
281                    # Do we want a real load or a sanity check?
282                    if ($sanityCheck eq "") {
283                        # Real load.
284                        my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
285                        $stats->Accumulate($newStats);
286                        Trace("$fileName loaded into $table.") if T(3);
287                    } elsif ($sanityCheck > 0) {
288                        # Here we want a sanity check. Note that if the check value is 0,
289                        # we don't bother. The user just wants to suppress the load step.
290                        CheckLoadFile($erdb, $table, $fileName, $sanityCheck);
291                    }
292      }      }
293                $stats->Add("groups-loaded" => 1);
294                $stats->Add('load-time' => 1);
295            }
296        }
297        $stats->Add('total-time' => time() - $totalStart);
298        # Display the statistics from this run.
299        Trace("Statistics for load:\n" . $stats->Show()) if T(2);
300  };  };
301  if ($@) {  if ($@) {
302      Trace("Script failed with error: $@") if T(0);      Trace("Script failed with error: $@") if T(0);
     $rtype = "error";  
303  } else {  } else {
304      Trace("Script complete.") if T(2);      Trace("Script complete.") if T(2);
     $rtype = "no error";  
305  }  }
306  if ($options->{phone}) {  if ($options->{phone}) {
307      my $msgID = Tracer::SendSMS($options->{phone}, "ERDBLoader terminated with $rtype.");      my $msgID = Tracer::SendSMS($options->{phone}, "ERDBLoader completed.");
308      if ($msgID) {      if ($msgID) {
309          Trace("Phone message sent with ID $msgID.") if T(2);          Trace("Phone message sent with ID $msgID.") if T(2);
310      } else {      } else {
# Line 160  Line 312 
312      }      }
313  }  }
314    
315    =head2 Internal Methods
316    
317    =head3 CheckLoadFile
318    
319        CheckLoadFile($erdb, $table, $fileName, $count);
320    
321    Read the first few records of a load file and trace the contents at level
322    2. This allows the user to visually compare the load file contents with
323    the database definition.
324    
325    =over 4
326    
327    =item erdb
328    
329    [[ErdbPm]] object describing the database.
330    
331    =item table
332    
333    Name of the table to check.
334    
335    =item fileName
336    
337    Name of the load file to check.
338    
339    =item count
340    
341    Number of records to check.
342    
343    =back
344    
345    =cut
346    
347    sub CheckLoadFile {
348        # Get the parameters.
349        my ($erdb, $table, $fileName, $count) = @_;
350        # Open the file for input.
351        my $ih = Open(undef, "<$fileName");
352        # Slurp the first N records.
353        my @records;
354        while (! eof $ih && scalar(@records) < $count) {
355            push @records, [ Tracer::GetLine($ih) ];
356        }
357        my $found = scalar(@records);
358        Trace("$found records for $table found in sanity check using $fileName.") if T(2);
359        # Do we have any data at all?
360        if ($found) {
361            # Yes. Get the table's descriptor. We use this to determine the field names.
362            my $relationData = $erdb->FindRelation($table);
363            Confess("Relation $table not found in database.") if (! defined $relationData);
364            my @fields = @{$relationData->{Fields}};
365            # If this is a relationship, we need the FROM and TO data.
366            my %ends; ($ends{from}, $ends{to}) = $erdb->GetRelationshipEntities($table);
367            # Loop through the fields. We generate one message per field.
368            for (my $i = 0; $i <= $#fields; $i++) {
369                # Get this field's information.
370                my $fieldInfo = $fields[$i];
371                my $type = $fieldInfo->{type};
372                my $name = $fieldInfo->{name};
373                if ($name =~ /^(from|to)-link$/) {
374                    # Here it's a relationship link, so add the name of the target table to
375                    # the type.
376                    $type .= " ($ends{$1})";
377                }
378                # This is going to be a multi-line trace message. We start with the field name and type.
379                my @lines = ("Values for $table($name), type $type:\n");
380                # Loop through the records. We generate one line of data per record.
381                for (my $j = 0; $j < $found; $j++) {
382                    # Get the field value.
383                    my $field = $records[$j]->[$i];
384                    # Compute the record label.
385                    my $line = "Record $j";
386                    # Check for unusual cases.
387                    if (! defined $field || $field eq '') {
388                        $line .= "= <empty>";
389                    } else {
390                        # Make sure we don't trace something ungodly.
391                        my $excess = (length $field) - 40;
392                        if ($excess > 0) {
393                            $field = substr($field, 0, 40) . " >> + $excess characters";
394                        }
395                        $line .= ": $field";
396                    }
397                    # Save this line. We indent a little for readability.
398                    push @lines, "   $line";
399                }
400                # Trace this field.
401                Trace(join("\n", @lines)) if T(2);
402            }
403        }
404    }
405    
406    
407  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.9

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3