[Bio] / Sprout / ERDBLoader.pl Repository:
ViewVC logotype

Diff of /Sprout/ERDBLoader.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4, Thu Oct 9 17:23:54 2008 UTC revision 1.17, Mon Jan 3 18:16:47 2011 UTC
# Line 22  Line 22 
22  use ERDB;  use ERDB;
23  use ERDBLoadGroup;  use ERDBLoadGroup;
24  use ERDBGenerate;  use ERDBGenerate;
25    use ERDBExtras;
26  use Stats;  use Stats;
27  use Time::HiRes;  use Time::HiRes;
28    
# Line 34  Line 35 
35    
36  =head2 Introduction  =head2 Introduction
37    
38  This script finishes the database load process begun by [[ERDBGeneratorPl]].  This script finishes the database load process begun by L<ERDBGenerator.pl>.
39    
40  [[ERDBGeneratorPl]] divides the source data into sections, and generates a  L<ERDBGenerator.pl> divides the source data into sections, and generates a
41  partial load file for each section of each table. To finish the load process, we  partial load file for each section of each table. To finish the load process, we
42  need to combine the partial files into single files and load the resulting  need to combine the partial files into single files and load the resulting
43  single files into the database tables.  single files into the database tables.
44    
45  Like [[ERDBGeneratorPl]], this script acts on load groups-- sets of related  Like L<ERDBGenerator.pl>, this script acts on load groups-- sets of related
46  tables that are loaded at the same time. For each table in a named group that  tables that are loaded at the same time. For each table in a named group that
47  does not exist in the database, the script first attempts to find a completed  does not exist in the database, the script first attempts to find a complete set
48  data file. If one does not exist, it attempts to create one by collating section  of section files that it will collate into a data file. If there are no sections,
49  files. Once the collated section files for a load group are finished, they are  then it will look for a data file that is already collated. Once the collated
50  loaded into the database.  section files for a load group are all verified, they are loaded into the database.
51    
52  =head2 Positional Parameters  =head2 Positional Parameters
53    
# Line 57  Line 58 
58  Name of the ERDB database. This should be the class name for the subclass used  Name of the ERDB database. This should be the class name for the subclass used
59  to access the database.  to access the database.
60    
   
61  =back  =back
62    
63  =head2 Command-Line Options  =head2 Command-Line Options
# Line 91  Line 91 
91  =item keepSections  =item keepSections
92    
93  If specified, section files (the fragments of data load files created by  If specified, section files (the fragments of data load files created by
94  [[ERDBGeneratorPl]]), will not be deleted after they are collated.  L<ERDBGenerator.pl>), will not be deleted after they are collated.
   
 =item sanityCheck  
   
 If specified, no tables will be loaded. Instead, the first I<N> records from the  
 assembled load files will be displayed so that the file contents can be  
 visually matched against the column names.  
95    
96  =item warn  =item warn
97    
# Line 107  Line 101 
101    
102  Phone number to message when the script is complete.  Phone number to message when the script is complete.
103    
104    =item DBD
105    
106    Fully-qualified name of the DBD file. This option allows the use of an alternate
107    DBD during load so that access to the database by other processes is not
108    compromised.
109    
110    =item loadDirectory
111    
112    Directory containing the load files. This option allows you to request that
113    load files from another version of the NMPDR be used, which is useful when
114    creating a new NMPDR: we can yank in the data from the previous database while
115    waiting for the new load files to be generated.
116    
117    =item dbName
118    
119    SQL name of the target database. If not specified, the default name is used.
120    This option allows you to specify a backup or alternate database that can
121    be loaded without compromising the main database.
122    
123  =back  =back
124    
125  =cut  =cut
# Line 114  Line 127 
127  # Get the command-line options and parameters.  # Get the command-line options and parameters.
128  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],
129                                             {                                             {
130                                                sanityCheck => ["", "don't load, trace contents of first N load file records instead"],                                                dbName => ["", "if specified, the SQL name of the target database"],
131                                                trace => ["", "tracing level"],                                                trace => ["2", "tracing level"],
132                                                keepSections => ["", "if specified, section files will not be deleted after being collated"],                                                keepSections => ["", "if specified, section files will not be deleted after being collated"],
133                                                phone => ["", "phone number (international format) to call when load finishes"]                                                phone => ["", "phone number (international format) to call when load finishes"],
134                                                  DBD => ["", "if specified, the name of a DBD file in the FIG directory"],
135                                                  loadDirectory => ["", "if specified, an alternate directory containing the load files"],
136                                             },                                             },
137                                             "<database> <group1> <group2> ...",                                             "<database> <group1> <group2> ...",
138                                             @ARGV);                                             @ARGV);
# Line 127  Line 142 
142  eval {  eval {
143      # Get the parameters.      # Get the parameters.
144      my ($database, @groups) = @parameters;      my ($database, @groups) = @parameters;
145      # Connect to the database.      # Connect to the database and get its load directory.
146      my $erdb = ERDB::GetDatabase($database);      my $erdb = ERDB::GetDatabase($database, undef, %$options, externalDBD => 1);
147      # Fix the group list.      # Fix the group list.
148      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
149      # Get the source object and load directory for this database.      # Get the source object and load directory for this database.
150      my $source = $erdb->GetSourceObject();      my $source = $erdb->GetSourceObject();
151      my $directory = $erdb->LoadDirectory();      my $directory = $options->{loadDirectory} || $erdb->LoadDirectory();
152      # Get the list of sections.      # Get the list of sections.
153      my @sectionList = $erdb->SectionList($source);      my @sectionList = $erdb->SectionList($source);
154      # Create a statistics object to track our progress.      # Create a statistics object to track our progress.
155      my $stats = Stats->new();      my $stats = Stats->new();
156      # Find out if we're doing a sanity check.      # We make one pass to assemble all the tables in all the groups, and
157      my $sanityCheck = $options->{sanityCheck} || "";      # then another to do the actual loads. The groups that are ready to load
158        # in the second pass will go in this list.
159        my @goodGroups;
160      # Start a timer.      # Start a timer.
161      my $totalStart = time();      my $totalStart = time();
162      # Loop through the groups.      # Loop through the groups.
# Line 158  Line 175 
175              # Get the data file name.              # Get the data file name.
176              my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);              my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
177              # Do we have it?              # Do we have it?
178              if (-f $dataFile) {              my $haveFile = -f $dataFile;
179                  # Yes. This is good news.              # See if we can build it. Verify that we have all the sections.
                 $stats->Add('tables-found' => 1);  
                 Trace("Table file found for $table.") if T(3);  
             } else {  
                 # No, we must build it. Verify that we have all the sections.  
180                  my @missingFiles = grep { ! -f $_ } @sectionFiles;                  my @missingFiles = grep { ! -f $_ } @sectionFiles;
181                # Did we find everything?
182                if (scalar(@missingFiles) && ! $haveFile) {
183                    # No, and there's no main file! Denote that we have a missing table.
184                    $missingTable++;
185                    $stats->Add('tables-skipped' => 1);
186                  # Tell the user about all the missing files.                  # Tell the user about all the missing files.
187                  for my $missingFile (@missingFiles) {                  for my $missingFile (@missingFiles) {
188                      $stats->Add('sections-missing' => 1);                      $stats->Add('sections-missing' => 1);
189                      $stats->AddMessage("Data file $missingFile not found for table $table.");                      $stats->AddMessage("Data file $missingFile not found for table $table.");
190                  }                  }
191                  # Did we find everything?              } elsif (! scalar @missingFiles) {
192                  if (scalar @missingFiles) {                  # We have all the sections. Try to assemble them into a data file.
                     # No! Denote that we have a missing table.  
                     $missingTable++;  
                     $stats->Add('tables-skipped' => 1);  
                 } else {  
                     # Yes! Try to assemble the sections into a data file.  
193                      my $sortStart = time();                      my $sortStart = time();
194                      my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";                      my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
195                      Trace("Sort command: $sortCommand") if T(3);                      Trace("Sort command: $sortCommand") if T(3);
196                    # Pipe to the sort command. Note that we turn on autoflush
197                    # so there's no buffering.
198                      my $oh = Open(undef, "| $sortCommand");                      my $oh = Open(undef, "| $sortCommand");
199                    select $oh; $| = 1; select STDOUT;
200                    # Loop through the sections.
201                      for my $sectionFile (@sectionFiles) {                      for my $sectionFile (@sectionFiles) {
202                          Trace("Collating $sectionFile.") if T(3);                          Trace("Collating $sectionFile.") if T(3);
203                          $stats->Add("$table-sections" => 1);                          $stats->Add("$table-sections" => 1);
204                          for my $line (Tracer::GetFile($sectionFile)) {                      # Loop through the section file.
205                              print $oh "$line\n";                      my $ih = Open(undef, "<$sectionFile");
206                        while (defined (my $line = <$ih>)) {
207                            print $oh $line;
208                              $stats->Add("$table-collations" => 1);                              $stats->Add("$table-collations" => 1);
209                          }                          }
210                      }                      }
211                      # Finish the sort step.                      # Finish the sort step.
212                      Trace("Finishing collate for $table.") if T(3);                  Trace("Finishing collate for $table.") if T(2);
213                      close $oh;                      close $oh;
214                      $stats->Add('tables-collated' => 1);                      $stats->Add('tables-collated' => 1);
215                      $stats->Add('collate-time' => time() - $sortStart);                      $stats->Add('collate-time' => time() - $sortStart);
216                  }              } else {
217                    # We have a data file but not a complete set of sections, so we use the data file.
218                    $stats->Add('tables-found' => 1);
219              }              }
220              # Now that we know we have a full data file, we can delete the              # Now that we know we have a full data file, we can delete the
221              # section files to make room in the data directory. The user can              # section files to make room in the data directory. The user can
# Line 213  Line 234 
234          if ($missingTable) {          if ($missingTable) {
235              # Yes, skip this group.              # Yes, skip this group.
236              $stats->Add('groups-skipped' => 1);              $stats->Add('groups-skipped' => 1);
237              Trace("Skipping $group group: $missingTable missing tables.") if T(3);              Trace("Skipping $group group: $missingTable missing tables.") if T(2);
238          } else {          } else {
239              # No! Process this group's files.              # No! File this group for processing in the second pass.
240              if ($sanityCheck eq "") {              push @goodGroups, $group;
241                  Trace("Loading group $group into database.") if T(2);          }
242        }
243        # Now we loop through the good groups, doing the actual loads.
244        for my $group (@goodGroups) {
245            # Get a group object.
246            my $groupData = $erdb->Loader($group);
247            # Do the post-processing.
248            my $postStats = $groupData->PostProcess();
249            # Determine what happened.
250            if (! defined $postStats) {
251                Trace("Post-processing not required for $group.") if T(3);
252              } else {              } else {
253                  Trace("Sanity check for group $group.") if T(2);              $stats->Accumulate($postStats);
254                $stats->Add('post-processing' => 1);
255              }              }
256            # Process this group's files.
257            Trace("Loading group $group into database.") if T(2);
258            # Get the list of tables.
259            my @tableList = $groupData->GetTables();
260            # Start a timer.
261              my $loadStart = time();              my $loadStart = time();
262              for my $table (@tableList) {              for my $table (@tableList) {
263                # Compute the load file name.
264                  my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);                  my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
265                  # Do we want a real load or a sanity check?              # Do the actual load.
                 if ($sanityCheck eq "") {  
                     # Real load.  
266                      my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);                      my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
267                      $stats->Accumulate($newStats);                      $stats->Accumulate($newStats);
268                      Trace("$fileName loaded into $table.") if T(3);                      Trace("$fileName loaded into $table.") if T(3);
                 } elsif ($sanityCheck > 0) {  
                     # Here we want a sanity check. Note that if the check value is 0,  
                     # we don't bother. The user just wants to suppress the load step.  
                     CheckLoadFile($erdb, $table, $fileName, $sanityCheck);  
                 }  
269              }              }
270              $stats->Add("groups-loaded" => 1);              $stats->Add("groups-loaded" => 1);
271              $stats->Add('load-time' => 1);          $stats->Add('load-time' => (time() - $loadStart));
         }  
272      }      }
273        # Save the DBD.
274        Trace("Saving DBD.") if T(2);
275        $erdb->InternalizeDBD();
276      $stats->Add('total-time' => time() - $totalStart);      $stats->Add('total-time' => time() - $totalStart);
277      # Display the statistics from this run.      # Display the statistics from this run.
278      Trace("Statistics for load:\n" . $stats->Show()) if T(2);      Trace("Statistics for load:\n" . $stats->Show()) if T(2);
# Line 258  Line 291 
291      }      }
292  }  }
293    
 =head3 CheckLoadFile  
   
     CheckLoadFile($erdb, $table, $fileName, $count);  
   
 Read the first few records of a load file and trace the contents at level  
 2. This allows the user to visually compare the load file contents with  
 the database definition.  
   
 =over 4  
   
 =item erdb  
   
 [[ErdbPm]] object describing the database.  
   
 =item table  
   
 Name of the table to check.  
   
 =item fileName  
   
 Name of the load file to check.  
   
 =item count  
   
 Number of records to check.  
   
 =back  
   
 =cut  
   
 sub CheckLoadFile {  
     # Get the parameters.  
     my ($erdb, $table, $fileName, $count) = @_;  
     # Open the file for input.  
     my $ih = Open(undef, "<$fileName");  
     # Slurp the first N records.  
     my @records;  
     while (! eof $ih && scalar(@records) < $count) {  
         push @records, [ Tracer::GetLine($ih) ];  
     }  
     my $found = scalar(@records);  
     Trace("$found records for $table found in sanity check.") if T(3);  
     # Do we have any data at all?  
     if ($found) {  
         # Yes. Get the table's descriptor. We use this to determine the field names.  
         my $relationData = $erdb->FindRelation($table);  
         Confess("Relation $table not found in database.") if (! defined $relationData);  
         my @fields = @{$relationData->{Fields}};  
         # Loop through the fields. We generate one message per field.  
         for (my $i = 0; $i <= $#fields; $i++) {  
             # Get this field's information.  
             my $fieldInfo = $fields[$i];  
             my $type = $fieldInfo->{type};  
             # This is going to be a multi-line trace message. We start with the field name and type.  
             my @lines = ("Values for $fieldInfo->{name}, type $type:\n");  
             # Loop through the records. We generate one line of data per record.  
             for (my $j = 0; $j < $found; $j++) {  
                 # Get the field value.  
                 my $field = $records[$j]->[$i];  
                 # Compute the record label.  
                 my $line = "Record $j";  
                 # Check for unusual cases.  
                 if (! defined $field) {  
                     $line .= "= <null>";  
                 } elsif ($field eq '') {  
                     $line .= "= <empty>";  
                 } else {  
                     # Make sure we don't trace something ungodly.  
                     my $excess = (length $field) - 40;  
                     if ($excess > 0) {  
                         $field = substr($field, 0, 40) . " >> + $excess characters";  
                     }  
                     $line .= ": $field";  
                 }  
                 # Save this line. We indent a little for readability.  
                 push @lines, "   $line";  
             }  
             # Trace this field.  
             Trace(join("\n", @lines)) if T(2);  
         }  
     }  
 }  
   
294    
295  1;  1;

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.17

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3