[Bio] / Sprout / ERDBLoader.pl Repository:
ViewVC logotype

Diff of /Sprout/ERDBLoader.pl

Parent Directory | Revision Log | View Patch

revision 1.9, Mon Mar 23 19:34:35 2009 UTC revision 1.13, Tue Sep 8 21:30:33 2009 UTC
# Line 22  Line 22 
22  use ERDB;  use ERDB;
23  use ERDBLoadGroup;  use ERDBLoadGroup;
24  use ERDBGenerate;  use ERDBGenerate;
25    use ERDBExtras;
26  use Stats;  use Stats;
27  use Time::HiRes;  use Time::HiRes;
28    
# Line 34  Line 35 
35    
36  =head2 Introduction  =head2 Introduction
37    
38  This script finishes the database load process begun by [[ERDBGeneratorPl]].  This script finishes the database load process begun by L<ERDBGenerator.pl>.
39    
40  [[ERDBGeneratorPl]] divides the source data into sections, and generates a  L<ERDBGenerator.pl> divides the source data into sections, and generates a
41  partial load file for each section of each table. To finish the load process, we  partial load file for each section of each table. To finish the load process, we
42  need to combine the partial files into single files and load the resulting  need to combine the partial files into single files and load the resulting
43  single files into the database tables.  single files into the database tables.
44    
45  Like [[ERDBGeneratorPl]], this script acts on load groups-- sets of related  Like L<ERDBGenerator.pl>, this script acts on load groups-- sets of related
46  tables that are loaded at the same time. For each table in a named group that  tables that are loaded at the same time. For each table in a named group that
47  does not exist in the database, the script first attempts to find a completed  does not exist in the database, the script first attempts to find a completed
48  data file. If one does not exist, it attempts to create one by collating section  data file. If one does not exist, it attempts to create one by collating section
# Line 95  Line 96 
96  =item keepSections  =item keepSections
97    
98  If specified, section files (the fragments of data load files created by  If specified, section files (the fragments of data load files created by
99  [[ERDBGeneratorPl]], will not be deleted after they are collated.  L<ERDBGenerator.pl>, will not be deleted after they are collated.
   
 =item sanityCheck  
   
 If specified, no tables will be loaded. Instead, the first I<N> records from the  
 assembled load files will be displayed so that the file contents can be  
 visually matched against the column names.  
100    
101  =item warn  =item warn
102    
# Line 113  Line 108 
108    
109  =item DBD  =item DBD
110    
111  Name of the DBD file. If specified, the DBD must be in the main FIG directory  Fully-qualified name of the DBD file. This option allows the use of an alternate
 (specified in C<$FIG_Config::fig>). This option allows the use of an alternate  
112  DBD during load so that access to the database by other processes is not  DBD during load so that access to the database by other processes is not
113  compromised.  compromised.
114    
# Line 125  Line 119 
119  creating a new NMPDR: we can yank in the data from the previous database while  creating a new NMPDR: we can yank in the data from the previous database while
120  waiting for the new load files to be generated.  waiting for the new load files to be generated.
121    
122    =item dbName
123    
124    SQL name of the target database. If not specified, the default name is used.
125    This option allows you to specify a backup or alternate database that can
126    be loaded without compromising the main database.
127    
128  =back  =back
129    
130  =cut  =cut
# Line 132  Line 132 
132  # Get the command-line options and parameters.  # Get the command-line options and parameters.
133  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],
134                                             {                                             {
135                                                  dbName => ["", "if specified, the SQL name of the target database"],
136                                                clear => ["", "overwrite existing load files if sections are present"],                                                clear => ["", "overwrite existing load files if sections are present"],
                                               sanityCheck => ["", "don't load, trace contents of first N load file records instead"],  
137                                                trace => ["2", "tracing level"],                                                trace => ["2", "tracing level"],
138                                                keepSections => ["", "if specified, section files will not be deleted after being collated"],                                                keepSections => ["", "if specified, section files will not be deleted after being collated"],
139                                                phone => ["", "phone number (international format) to call when load finishes"],                                                phone => ["", "phone number (international format) to call when load finishes"],
# Line 148  Line 148 
148  eval {  eval {
149      # Get the parameters.      # Get the parameters.
150      my ($database, @groups) = @parameters;      my ($database, @groups) = @parameters;
     # Check for an alternate DBD.  
     my $altDBD = ($options->{DBD} ? "$FIG_Config::fig/$options->{DBD}" : undef);  
151      # Connect to the database and get its load directory.      # Connect to the database and get its load directory.
152      my $erdb = ERDB::GetDatabase($database, $altDBD);      my $erdb = ERDB::GetDatabase($database, undef, %$options, externalDBD => 1);
153      # Fix the group list.      # Fix the group list.
154      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
155      # Get the source object and load directory for this database.      # Get the source object and load directory for this database.
# Line 161  Line 159 
159      my @sectionList = $erdb->SectionList($source);      my @sectionList = $erdb->SectionList($source);
160      # Create a statistics object to track our progress.      # Create a statistics object to track our progress.
161      my $stats = Stats->new();      my $stats = Stats->new();
162      # Find out if we're doing a sanity check.      # We make one pass to assemble all the tables in all the groups, and
163      my $sanityCheck = $options->{sanityCheck} || "";      # then another to do the actual loads. The groups that are ready to load
164        # in the second pass will go in this list.
165        my @goodGroups;
166      # Start a timer.      # Start a timer.
167      my $totalStart = time();      my $totalStart = time();
168      # Loop through the groups.      # Loop through the groups.
# Line 193  Line 193 
193                      # No! Denote that we have a missing table.                      # No! Denote that we have a missing table.
194                      $missingTable++;                      $missingTable++;
195                      $stats->Add('tables-skipped' => 1);                      $stats->Add('tables-skipped' => 1);
196                      # If the user wants a sanity check, we want to give him some                      # Tell the user about all the missing files.
                     # data anyway.  
                     if ($sanityCheck) {  
                         # Get some data lines in the sections. Note we stop when we've exceeded  
                         # the number of lines expected by the sanity check.  
                         my @lines;  
                         for my $sectionFile (@sectionFiles) {  
                             if (-s $sectionFile && scalar(@lines) < $sanityCheck) {  
                                 Trace("Reading from $sectionFile for $table.") if T(3);  
                                 push @lines, Tracer::GetFile($sectionFile);  
                             }  
                         }  
                         # Create a new temporary file.  
                         my $tmpFile = "$FIG_Config::temp/$table$$.temp.dtx";  
                         my $oh = Open(undef, ">$tmpFile");  
                         # Put all the data into it.  
                         Trace(scalar(@lines) . " data lines found.") if T(3);  
                         print $oh join("\n", @lines);  
                         close $oh;  
                         # Sanity check the temp file.  
                         CheckLoadFile($erdb, $table, $tmpFile, $sanityCheck);  
                         # Clean it up.  
                         unlink $tmpFile;  
                     } else {  
                         # Otherwise tell the user about all the missing files.  
197                          for my $missingFile (@missingFiles) {                          for my $missingFile (@missingFiles) {
198                              $stats->Add('sections-missing' => 1);                              $stats->Add('sections-missing' => 1);
199                              $stats->AddMessage("Data file $missingFile not found for table $table.");                              $stats->AddMessage("Data file $missingFile not found for table $table.");
200                          }                          }
                     }  
201                  } else {                  } else {
202                      # We have all the sections. Try to assemble them into a data file.                      # We have all the sections. Try to assemble them into a data file.
203                      my $sortStart = time();                      my $sortStart = time();
# Line 269  Line 244 
244              $stats->Add('groups-skipped' => 1);              $stats->Add('groups-skipped' => 1);
245              Trace("Skipping $group group: $missingTable missing tables.") if T(2);              Trace("Skipping $group group: $missingTable missing tables.") if T(2);
246          } else {          } else {
247              # No! Process this group's files.              # No! File this group for processing in the second pass.
248              if ($sanityCheck eq "") {              push @goodGroups, $group;
249                  Trace("Loading group $group into database.") if T(2);          }
250        }
251        # Now we loop through the good groups, doing the actual loads.
252        for my $group (@goodGroups) {
253            # Get a group object.
254            my $groupData = $erdb->Loader($group);
255            # Do the post-processing.
256            my $postStats = $groupData->PostProcess();
257            # Determine what happened.
258            if (! defined $postStats) {
259                Trace("Post-processing not required for $group.") if T(3);
260              } else {              } else {
261                  Trace("Sanity check for group $group.") if T(2);              $stats->Accumulate($postStats);
262                $stats->Add('post-processing' => 1);
263              }              }
264            # Process this group's files.
265            Trace("Loading group $group into database.") if T(2);
266            # Get the list of tables.
267            my @tableList = $groupData->GetTables();
268            # Start a timer.
269              my $loadStart = time();              my $loadStart = time();
270              for my $table (@tableList) {              for my $table (@tableList) {
271                # Compute the load file name.
272                  my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);                  my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
273                  # Do we want a real load or a sanity check?              # Do the actual load.
                 if ($sanityCheck eq "") {  
                     # Real load.  
274                      my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);                      my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
275                      $stats->Accumulate($newStats);                      $stats->Accumulate($newStats);
276                      Trace("$fileName loaded into $table.") if T(3);                      Trace("$fileName loaded into $table.") if T(3);
                 } elsif ($sanityCheck > 0) {  
                     # Here we want a sanity check. Note that if the check value is 0,  
                     # we don't bother. The user just wants to suppress the load step.  
                     CheckLoadFile($erdb, $table, $fileName, $sanityCheck);  
                 }  
277              }              }
278              $stats->Add("groups-loaded" => 1);              $stats->Add("groups-loaded" => 1);
279              $stats->Add('load-time' => 1);              $stats->Add('load-time' => 1);
280          }          }
281      }      # Save the DBD.
282        Trace("Saving DBD.") if T(2);
283        $erdb->InternalizeDBD();
284      $stats->Add('total-time' => time() - $totalStart);      $stats->Add('total-time' => time() - $totalStart);
285      # Display the statistics from this run.      # Display the statistics from this run.
286      Trace("Statistics for load:\n" . $stats->Show()) if T(2);      Trace("Statistics for load:\n" . $stats->Show()) if T(2);
# Line 312  Line 299 
299      }      }
300  }  }
301    
 =head2 Internal Methods  
   
 =head3 CheckLoadFile  
   
     CheckLoadFile($erdb, $table, $fileName, $count);  
   
 Read the first few records of a load file and trace the contents at level  
 2. This allows the user to visually compare the load file contents with  
 the database definition.  
   
 =over 4  
   
 =item erdb  
   
 [[ErdbPm]] object describing the database.  
   
 =item table  
   
 Name of the table to check.  
   
 =item fileName  
   
 Name of the load file to check.  
   
 =item count  
   
 Number of records to check.  
   
 =back  
   
 =cut  
   
 sub CheckLoadFile {  
     # Get the parameters.  
     my ($erdb, $table, $fileName, $count) = @_;  
     # Open the file for input.  
     my $ih = Open(undef, "<$fileName");  
     # Slurp the first N records.  
     my @records;  
     while (! eof $ih && scalar(@records) < $count) {  
         push @records, [ Tracer::GetLine($ih) ];  
     }  
     my $found = scalar(@records);  
     Trace("$found records for $table found in sanity check using $fileName.") if T(2);  
     # Do we have any data at all?  
     if ($found) {  
         # Yes. Get the table's descriptor. We use this to determine the field names.  
         my $relationData = $erdb->FindRelation($table);  
         Confess("Relation $table not found in database.") if (! defined $relationData);  
         my @fields = @{$relationData->{Fields}};  
         # If this is a relationship, we need the FROM and TO data.  
         my %ends; ($ends{from}, $ends{to}) = $erdb->GetRelationshipEntities($table);  
         # Loop through the fields. We generate one message per field.  
         for (my $i = 0; $i <= $#fields; $i++) {  
             # Get this field's information.  
             my $fieldInfo = $fields[$i];  
             my $type = $fieldInfo->{type};  
             my $name = $fieldInfo->{name};  
             if ($name =~ /^(from|to)-link$/) {  
                 # Here it's a relationship link, so add the name of the target table to  
                 # the type.  
                 $type .= " ($ends{$1})";  
             }  
             # This is going to be a multi-line trace message. We start with the field name and type.  
             my @lines = ("Values for $table($name), type $type:\n");  
             # Loop through the records. We generate one line of data per record.  
             for (my $j = 0; $j < $found; $j++) {  
                 # Get the field value.  
                 my $field = $records[$j]->[$i];  
                 # Compute the record label.  
                 my $line = "Record $j";  
                 # Check for unusual cases.  
                 if (! defined $field || $field eq '') {  
                     $line .= "= <empty>";  
                 } else {  
                     # Make sure we don't trace something ungodly.  
                     my $excess = (length $field) - 40;  
                     if ($excess > 0) {  
                         $field = substr($field, 0, 40) . " >> + $excess characters";  
                     }  
                     $line .= ": $field";  
                 }  
                 # Save this line. We indent a little for readability.  
                 push @lines, "   $line";  
             }  
             # Trace this field.  
             Trace(join("\n", @lines)) if T(2);  
         }  
     }  
 }  
   
302    
303  1;  1;

Legend:
Removed from v.1.9  
changed lines
  Added in v.1.13

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3