[Bio] / Sprout / ERDBLoader.pl Repository:
ViewVC logotype

Diff of /Sprout/ERDBLoader.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Thu Oct 2 16:32:42 2008 UTC revision 1.10, Mon May 4 18:49:49 2009 UTC
# Line 57  Line 57 
57  Name of the ERDB database. This should be the class name for the subclass used  Name of the ERDB database. This should be the class name for the subclass used
58  to access the database.  to access the database.
59    
   
60  =back  =back
61    
62  =head2 Command-Line Options  =head2 Command-Line Options
# Line 77  Line 76 
76    
77  If specified, turns on tracing of SQL activity.  If specified, turns on tracing of SQL activity.
78    
79    =item clear
80    
81    If specified, existing load files will be recreated from sections if the sections
82    are present.
83    
84  =item background  =item background
85    
86  Save the standard and error output to files. The files will be created  Save the standard and error output to files. The files will be created
# Line 93  Line 97 
97  If specified, section files (the fragments of data load files created by  If specified, section files (the fragments of data load files created by
98  [[ERDBGeneratorPl]]) will not be deleted after they are collated.  [[ERDBGeneratorPl]]) will not be deleted after they are collated.
99    
100    =item sanityCheck
101    
102    If specified, no tables will be loaded. Instead, the first I<N> records from the
103    assembled load files will be displayed so that the file contents can be
104    visually matched against the column names.
105    
106  =item warn  =item warn
107    
108  Create an event in the RSS feed when an error occurs.  Create an event in the RSS feed when an error occurs.
# Line 101  Line 111 
111    
112  Phone number to message when the script is complete.  Phone number to message when the script is complete.
113    
114    =item DBD
115    
116    Fully-qualified name of the DBD file. This option allows the use of an alternate
117    DBD during load so that access to the database by other processes is not
118    compromised.
119    
120    =item loadDirectory
121    
122    Directory containing the load files. This option allows you to request that
123    load files from another version of the NMPDR be used, which is useful when
124    creating a new NMPDR: we can yank in the data from the previous database while
125    waiting for the new load files to be generated.
126    
127    =item dbName
128    
129    SQL name of the target database. If not specified, the default name is used.
130    This option allows you to specify a backup or alternate database that can
131    be loaded without compromising the main database.
132    
133  =back  =back
134    
135  =cut  =cut
# Line 108  Line 137 
137  # Get the command-line options and parameters.  # Get the command-line options and parameters.
138  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],
139                                             {                                             {
140                                                trace => ["", "tracing level"],                                                dbName => ["", "if specified, the SQL name of the target database"],
141                                                  clear => ["", "overwrite existing load files if sections are present"],
142                                                  sanityCheck => ["", "don't load, trace contents of first N load file records instead"],
143                                                  trace => ["2", "tracing level"],
144                                                keepSections => ["", "if specified, section files will not be deleted after being collated"],                                                keepSections => ["", "if specified, section files will not be deleted after being collated"],
145                                                phone => ["", "phone number (international format) to call when load finishes"]                                                phone => ["", "phone number (international format) to call when load finishes"],
146                                                  DBD => ["", "if specified, the name of a DBD file in the FIG directory"],
147                                                  loadDirectory => ["", "if specified, an alternate directory containing the load files"],
148                                             },                                             },
149                                             "<database> <group1> <group2> ...",                                             "<database> <group1> <group2> ...",
150                                             @ARGV);                                             @ARGV);
# Line 120  Line 154 
154  eval {  eval {
155      # Get the parameters.      # Get the parameters.
156      my ($database, @groups) = @parameters;      my ($database, @groups) = @parameters;
157      # Connect to the database.      # Connect to the database and get its load directory.
158      my $erdb = ERDB::GetDatabase($database);      my $erdb = ERDB::GetDatabase($database, undef, %$options);
159      # Fix the group list.      # Fix the group list.
160      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
     Trace("Real groups are: " . join(" ", @realGroups)) if T(3);  
161      # Get the source object and load directory for this database.      # Get the source object and load directory for this database.
162      my $source = $erdb->GetSourceObject();      my $source = $erdb->GetSourceObject();
163      my $directory = $erdb->LoadDirectory();      my $directory = $options->{loadDirectory} || $erdb->LoadDirectory();
164      # Get the list of sections.      # Get the list of sections.
165      my @sectionList = $erdb->SectionList($source);      my @sectionList = $erdb->SectionList($source);
166      # Create a statistics object to track our progress.      # Create a statistics object to track our progress.
167      my $stats = Stats->new();      my $stats = Stats->new();
168        # Find out if we're doing a sanity check.
169        my $sanityCheck = $options->{sanityCheck} || "";
170      # Start a timer.      # Start a timer.
171      my $totalStart = time();      my $totalStart = time();
172      # Loop through the groups.      # Loop through the groups.
# Line 143  Line 178 
178          my $missingTable = 0;          my $missingTable = 0;
179          # Loop through the tables in this group.          # Loop through the tables in this group.
180          for my $table (@tableList) {          for my $table (@tableList) {
181              Trace("Processing table $table.") if T(2);              Trace("Processing table $table for assembly.") if T(2);
182                # Get the section file names.
183                my @sectionFiles =
184                    map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;
185              # Get the data file name.              # Get the data file name.
186              my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);              my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
187              # Do we have it?              # Do we have it?
188              if (-f $dataFile) {              if (-f $dataFile && ! $options->{clear}) {
189                  # Yes. This is good news.                  # Yes. This is good news.
190                  $stats->Add('tables-found' => 1);                  $stats->Add('tables-found' => 1);
191                    Trace("Table file found for $table.") if T(3);
192              } else {              } else {
193                  # No, we must build it. Verify that we have all the sections.                  # No, we must build it. Verify that we have all the sections.
                 my @sectionFiles =  
                     map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;  
194                  my @missingFiles = grep { ! -f $_ } @sectionFiles;                  my @missingFiles = grep { ! -f $_ } @sectionFiles;
                 # Tell the user about all the missing files.  
                 for my $missingFile (@missingFiles) {  
                     $stats->Add('sections-missing' => 1);  
                     $stats->AddMessage("Data file $missingFile not found for table $table.");  
                 }  
195                  # Did we find everything?                  # Did we find everything?
196                  if (scalar @missingFiles) {                  if (scalar @missingFiles) {
197                      # No! Denote that we have a missing table.                      # No! Denote that we have a missing table.
198                      $missingTable++;                      $missingTable++;
199                      $stats->Add('tables-skipped' => 1);                      $stats->Add('tables-skipped' => 1);
200                        # If the user wants a sanity check, we want to give him some
201                        # data anyway.
202                        if ($sanityCheck) {
203                            # Get some data lines in the sections. Note we stop when we've exceeded
204                            # the number of lines expected by the sanity check.
205                            my @lines;
206                            for my $sectionFile (@sectionFiles) {
207                                if (-s $sectionFile && scalar(@lines) < $sanityCheck) {
208                                    Trace("Reading from $sectionFile for $table.") if T(3);
209                                    push @lines, Tracer::GetFile($sectionFile);
210                                }
211                            }
212                            # Create a new temporary file.
213                            my $tmpFile = "$FIG_Config::temp/$table$$.temp.dtx";
214                            my $oh = Open(undef, ">$tmpFile");
215                            # Put all the data into it.
216                            Trace(scalar(@lines) . " data lines found.") if T(3);
217                            print $oh join("\n", @lines);
218                            close $oh;
219                            # Sanity check the temp file.
220                            CheckLoadFile($erdb, $table, $tmpFile, $sanityCheck);
221                            # Clean it up.
222                            unlink $tmpFile;
223                  } else {                  } else {
224                      # Yes! Try to assemble the sections into a data file.                          # Otherwise tell the user about all the missing files.
225                            for my $missingFile (@missingFiles) {
226                                $stats->Add('sections-missing' => 1);
227                                $stats->AddMessage("Data file $missingFile not found for table $table.");
228                            }
229                        }
230                    } else {
231                        # We have all the sections. Try to assemble them into a data file.
232                      my $sortStart = time();                      my $sortStart = time();
233                      my $sortCommand = $erdb->SortNeeded($table);                      my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
234                      my $oh = Open(undef, "| $sortCommand >$dataFile");                      Trace("Sort command: $sortCommand") if T(3);
235                        # Pipe to the sort command. Note that we turn on autoflush
236                        # so there's no buffering.
237                        my $oh = Open(undef, "| $sortCommand");
238                        select $oh; $| = 1; select STDOUT;
239                        # Loop through the sections.
240                      for my $sectionFile (@sectionFiles) {                      for my $sectionFile (@sectionFiles) {
241                          Trace("Collating $sectionFile.") if T(4);                          Trace("Collating $sectionFile.") if T(3);
242                          $stats->Add('sections-loaded' => 1);                          $stats->Add("$table-sections" => 1);
243                          for my $line (Tracer::GetFile($sectionFile)) {                          # Loop through the section file.
244                              print $oh "$line\n";                          my $ih = Open(undef, "<$sectionFile");
245                              $stats->Add('lines-collated' => 1);                          while (defined (my $line = <$ih>)) {
246                                print $oh $line;
247                                $stats->Add("$table-collations" => 1);
248                          }                          }
249                      }                      }
250                      # Finish the sort step.                      # Finish the sort step.
251                        Trace("Finishing collate for $table.") if T(3);
252                      close $oh;                      close $oh;
253                      $stats->Add('tables-collated' => 1);                      $stats->Add('tables-collated' => 1);
254                      # Now that we've collated the section files, we can delete them                      $stats->Add('collate-time' => time() - $sortStart);
255                      # to make room in the data directory. The user can turn this                  }
256                      # behavior off with the keepSections option.              }
257                # Now that we know we have a full data file, we can delete the
258                # section files to make room in the data directory. The user can
259                # turn this behavior off with the keepSections option.
260                      if (! $options->{keepSections}) {                      if (! $options->{keepSections}) {
261                          for my $sectionFile (@sectionFiles) {                          for my $sectionFile (@sectionFiles) {
262                        if (-e $sectionFile) {
263                              unlink $sectionFile;                              unlink $sectionFile;
264                              $stats->Add('files-deleted' => 1);                              $stats->Add('files-deleted' => 1);
265                          }                          }
                         Trace("Section files for $table deleted.") if T(3);  
                     }  
                     $stats->Add('collate-time' => time() - $sortStart);  
266                  }                  }
267                    Trace("Section files for $table deleted.") if T(3);
268              }              }
269          }          }
270          # Were any tables missing?          # Were any tables missing?
271          if ($missingTable) {          if ($missingTable) {
272              # Yes, skip this group.              # Yes, skip this group.
273              $stats->Add('groups-skipped' => 1);              $stats->Add('groups-skipped' => 1);
274              Trace("Skipping $group group: $missingTable missing tables.") if T(3);              Trace("Skipping $group group: $missingTable missing tables.") if T(2);
275            } else {
276                # No! Process this group's files.
277                if ($sanityCheck eq "") {
278                    Trace("Loading group $group into database.") if T(2);
279          } else {          } else {
280              # No! Load this group into the database.                  Trace("Sanity check for group $group.") if T(2);
281              Trace("Loading $group group into database.") if T(2);              }
282              my $loadStart = time();              my $loadStart = time();
283              for my $table (@tableList) {              for my $table (@tableList) {
284                  my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);                  my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
285                    # Do we want a real load or a sanity check?
286                    if ($sanityCheck eq "") {
287                        # Real load.
288                  my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);                  my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
289                  $stats->Accumulate($newStats);                  $stats->Accumulate($newStats);
290                  Trace("$fileName loaded into $table.") if T(3);                  Trace("$fileName loaded into $table.") if T(3);
291                    } elsif ($sanityCheck > 0) {
292                        # Here we want a sanity check. Note that if the check value is 0,
293                        # we don't bother. The user just wants to suppress the load step.
294                        CheckLoadFile($erdb, $table, $fileName, $sanityCheck);
295                    }
296              }              }
297              $stats->Add("groups-loaded" => 1);              $stats->Add("groups-loaded" => 1);
298              $stats->Add('load-time' => 1);              $stats->Add('load-time' => 1);
# Line 232  Line 316 
316      }      }
317  }  }
318    
319    =head2 Internal Methods
320    
321    =head3 CheckLoadFile
322    
323        CheckLoadFile($erdb, $table, $fileName, $count);
324    
325    Read the first few records of a load file and trace the contents at level
326    2. This allows the user to visually compare the load file contents with
327    the database definition.
328    
329    =over 4
330    
331    =item erdb
332    
333    [[ErdbPm]] object describing the database.
334    
335    =item table
336    
337    Name of the table to check.
338    
339    =item fileName
340    
341    Name of the load file to check.
342    
343    =item count
344    
345    Number of records to check.
346    
347    =back
348    
349    =cut
350    
351    sub CheckLoadFile {
352        # Get the parameters.
353        my ($erdb, $table, $fileName, $count) = @_;
354        # Open the file for input.
355        my $ih = Open(undef, "<$fileName");
356        # Slurp the first N records.
357        my @records;
358        while (! eof $ih && scalar(@records) < $count) {
359            push @records, [ Tracer::GetLine($ih) ];
360        }
361        my $found = scalar(@records);
362        Trace("$found records for $table found in sanity check using $fileName.") if T(2);
363        # Do we have any data at all?
364        if ($found) {
365            # Yes. Get the table's descriptor. We use this to determine the field names.
366            my $relationData = $erdb->FindRelation($table);
367            Confess("Relation $table not found in database.") if (! defined $relationData);
368            my @fields = @{$relationData->{Fields}};
369            # If this is a relationship, we need the FROM and TO data.
370            my %ends; ($ends{from}, $ends{to}) = $erdb->GetRelationshipEntities($table);
371            # Loop through the fields. We generate one message per field.
372            for (my $i = 0; $i <= $#fields; $i++) {
373                # Get this field's information.
374                my $fieldInfo = $fields[$i];
375                my $type = $fieldInfo->{type};
376                my $name = $fieldInfo->{name};
377                if ($name =~ /^(from|to)-link$/) {
378                    # Here it's a relationship link, so add the name of the target table to
379                    # the type.
380                    $type .= " ($ends{$1})";
381                }
382                # This is going to be a multi-line trace message. We start with the field name and type.
383                my @lines = ("Values for $table($name), type $type:\n");
384                # Loop through the records. We generate one line of data per record.
385                for (my $j = 0; $j < $found; $j++) {
386                    # Get the field value.
387                    my $field = $records[$j]->[$i];
388                    # Compute the record label.
389                    my $line = "Record $j";
390                    # Check for unusual cases.
391                    if (! defined $field || $field eq '') {
392                        $line .= "= <empty>";
393                    } else {
394                        # Make sure we don't trace something ungodly.
395                        my $excess = (length $field) - 40;
396                        if ($excess > 0) {
397                            $field = substr($field, 0, 40) . " >> + $excess characters";
398                        }
399                        $line .= ": $field";
400                    }
401                    # Save this line. We indent a little for readability.
402                    push @lines, "   $line";
403                }
404                # Trace this field.
405                Trace(join("\n", @lines)) if T(2);
406            }
407        }
408    }
409    
410    
411  1;  1;

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.10

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3