--- ERDBLoader.pl 2009/03/02 22:16:27 1.8
+++ ERDBLoader.pl 2011/01/03 18:16:47 1.17
@@ -22,6 +22,7 @@
 use ERDB;
 use ERDBLoadGroup;
 use ERDBGenerate;
+use ERDBExtras;
 use Stats;
 use Time::HiRes;
 
@@ -34,19 +35,19 @@
 
 =head2 Introduction
 
-This script finishes the database load process begun by [[ERDBGeneratorPl]].
+This script finishes the database load process begun by L<ERDBGenerator.pl>.
 
-[[ERDBGeneratorPl]] divides the source data into sections, and generates a
+L<ERDBGenerator.pl> divides the source data into sections, and generates a
 partial load file for each section of each table. To finish the load process,
 we need to combine the partial files into single files and load the resulting
 single files into the database tables.
 
-Like [[ERDBGeneratorPl]], this script acts on load groups-- sets of related
+Like L<ERDBGenerator.pl>, this script acts on load groups-- sets of related
 tables that are loaded at the same time. For each table in a named group that
-does not exist in the database, the script first attempts to find a completed
-data file. If one does not exist, it attempts to create one by collating section
-files. Once the collated section files for a load group are finished, they are
-loaded into the database.
+does not exist in the database, the script first attempts to find a complete set
+of section files that it will collate into a data file. If there are no sections,
+then it will look for a data file that is already collated. Once the collated
+section files for a load group are all verified, they are loaded into the database.
 
 =head2 Positional Parameters
 
@@ -76,11 +77,6 @@
 
 If specified, turns on tracing of SQL activity.
 
-=item clear
-
-If specified, existing load files will be recreated from sections if the sections
-are present.
-
 =item background
 
 Save the standard and error output to files. The files will be created
@@ -95,13 +91,7 @@
 
 =item keepSections
 
 If specified, section files (the fragments of data load files created by
-[[ERDBGeneratorPl]], will not be deleted after they are collated.
-
-=item sanityCheck
-
-If specified, no tables will be loaded. Instead, the first I<N> records from the
-assembled load files will be displayed so that the file contents can be
-visually matched against the column names.
+L<ERDBGenerator.pl>, will not be deleted after they are collated.
 
 =item warn
@@ -113,9 +103,8 @@
 
 =item DBD
 
-Name of the DBD file. If specified, the DBD must be in the main FIG directory
-(specified in C<$FIG_Config::fig>). This option allows the use of an alternate
-DBD during load, so that access to the database by other processes is not
+Fully-qualified name of the DBD file. This option allows the use of an alternate
+DBD during load so that access to the database by other processes is not
 compromised.
 
 =item loadDirectory
@@ -125,6 +114,12 @@
 creating a new NMPDR: we can yank in the data from the previous database while
 waiting for the new load files to be generated.
 
+=item dbName
+
+SQL name of the target database. If not specified, the default name is used.
+This option allows you to specify a backup or alternate database that can
+be loaded without compromising the main database.
+
 =back
 
 =cut
@@ -132,8 +127,7 @@
 
 # Get the command-line options and parameters.
 my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ], {
-        clear => ["", "overwrite existing load files if sections are present"],
-        sanityCheck => ["", "don't load, trace contents of first N load file records instead"],
+        dbName => ["", "if specified, the SQL name of the target database"],
         trace => ["2", "tracing level"],
         keepSections => ["", "if specified, section files will not be deleted after being collated"],
         phone => ["", "phone number (international format) to call when load finishes"],
@@ -148,22 +142,21 @@
 eval {
     # Get the parameters.
     my ($database, @groups) = @parameters;
-    # Check for an alternate DBD.
-    my $altDBD = ($options->{DBD} ? "$FIG_Config::fig/$options->{DBD}" : undef);
     # Connect to the database and get its load directory.
-    my $erdb = ERDB::GetDatabase($database, $altDBD);
+    my $erdb = ERDB::GetDatabase($database, undef, %$options, externalDBD => 1);
     # Fix the group list.
     my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
     # Get the source object and load directory for this database.
     my $source = $erdb->GetSourceObject();
-    my $directory = $options->{loadDirectory} || $erdb->LoadDirectory();
     # Get the list of sections.
     my @sectionList = $erdb->SectionList($source);
     # Create a statistics object to track our progress.
     my $stats = Stats->new();
-    # Find out if we're doing a sanity check.
-    my $sanityCheck = $options->{sanityCheck} || "";
+    # We make one pass to assemble all the tables in all the groups, and
+    # then another to do the actual loads. The groups that are ready to load
+    # in the second pass will go in this list.
+    my @goodGroups;
     # Start a timer.
     my $totalStart = time();
     # Loop through the groups.
@@ -182,74 +175,47 @@
             # Get the data file name.
             my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
             # Do we have it?
-            if (-f $dataFile && ! $options->{clear}) {
-                # Yes. This is good news.
-                $stats->Add('tables-found' => 1);
-                Trace("Table file found for $table.") if T(3);
-            } else {
-                # No, we must build it. Verify that we have all the sections.
-                my @missingFiles = grep { ! -f $_ } @sectionFiles;
-                # Did we find everything?
-                if (scalar @missingFiles) {
-                    # No! Denote that we have a missing table.
-                    $missingTable++;
-                    $stats->Add('tables-skipped' => 1);
-                    # If the user wants a sanity check, we want to give him some
-                    # data anyway.
-                    if ($sanityCheck) {
-                        # Get some data lines in the sections. Note we stop when we've exceeded
-                        # the number of lines expected by the sanity check.
-                        my @lines;
-                        for my $sectionFile (@sectionFiles) {
-                            if (-s $sectionFile && scalar(@lines) < $sanityCheck) {
-                                Trace("Reading from $sectionFile for $table.") if T(3);
-                                push @lines, Tracer::GetFile($sectionFile);
-                            }
-                        }
-                        # Create a new temporary file.
-                        my $tmpFile = "$FIG_Config::temp/$table$$.temp.dtx";
-                        my $oh = Open(undef, ">$tmpFile");
-                        # Put all the data into it.
-                        Trace(scalar(@lines) . " data lines found.") if T(3);
-                        print $oh join("\n", @lines);
-                        close $oh;
-                        # Sanity check the temp file.
-                        CheckLoadFile($erdb, $table, $tmpFile, $sanityCheck);
-                        # Clean it up.
-                        unlink $tmpFile;
-                    } else {
-                        # Otherwise tell the user about all the missing files.
-                        for my $missingFile (@missingFiles) {
-                            $stats->Add('sections-missing' => 1);
-                            $stats->AddMessage("Data file $missingFile not found for table $table.");
-                        }
-                    }
-                } else {
-                    # We have all the sections. Try to assemble them into a data file.
-                    my $sortStart = time();
-                    my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
-                    Trace("Sort command: $sortCommand") if T(3);
-                    # Pipe to the sort command. Note that we turn on autoflush
-                    # so there's no buffering.
-                    my $oh = Open(undef, "| $sortCommand");
-                    select $oh; $| = 1; select STDOUT;
-                    # Loop through the sections.
-                    for my $sectionFile (@sectionFiles) {
-                        Trace("Collating $sectionFile.") if T(3);
-                        $stats->Add("$table-sections" => 1);
-                        # Loop through the section file.
-                        my $ih = Open(undef, "<$sectionFile");
-                        while (defined (my $line = <$ih>)) {
-                            print $oh $line;
-                            $stats->Add("$table-collations" => 1);
-                        }
+            my $haveFile = -f $dataFile;
+            # See if we can build it. Verify that we have all the sections.
+            my @missingFiles = grep { ! -f $_ } @sectionFiles;
+            # Did we find everything?
+            if (scalar(@missingFiles) && ! $haveFile) {
+                # No, and there's no main file! Denote that we have a missing table.
+                $missingTable++;
+                $stats->Add('tables-skipped' => 1);
+                # Tell the user about all the missing files.
+                for my $missingFile (@missingFiles) {
+                    $stats->Add('sections-missing' => 1);
+                    $stats->AddMessage("Data file $missingFile not found for table $table.");
+                }
+            } elsif (! scalar @missingFiles) {
+                # We have all the sections. Try to assemble them into a data file.
+                my $sortStart = time();
+                my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
+                Trace("Sort command: $sortCommand") if T(3);
+                # Pipe to the sort command. Note that we turn on autoflush
+                # so there's no buffering.
+                my $oh = Open(undef, "| $sortCommand");
+                select $oh; $| = 1; select STDOUT;
+                # Loop through the sections.
+                for my $sectionFile (@sectionFiles) {
+                    Trace("Collating $sectionFile.") if T(3);
+                    $stats->Add("$table-sections" => 1);
+                    # Loop through the section file.
+                    my $ih = Open(undef, "<$sectionFile");
+                    while (defined (my $line = <$ih>)) {
+                        print $oh $line;
+                        $stats->Add("$table-collations" => 1);
                     }
-                # Finish the sort step.
-                Trace("Finishing collate for $table.") if T(3);
-                close $oh;
-                $stats->Add('tables-collated' => 1);
-                $stats->Add('collate-time' => time() - $sortStart);
                 }
+                # Finish the sort step.
+                Trace("Finishing collate for $table.") if T(2);
+                close $oh;
+                $stats->Add('tables-collated' => 1);
+                $stats->Add('collate-time' => time() - $sortStart);
+            } else {
+                # We have a data file and no sections, so we use the data file.
+                $stats->Add('tables-found' => 1);
             }
             # Now that we know we have a full data file, we can delete the
             # section files to make room in the data directory. The user can
@@ -270,31 +236,43 @@
             $stats->Add('groups-skipped' => 1);
             Trace("Skipping $group group: $missingTable missing tables.") if T(2);
         } else {
-            # No! Process this group's files.
-            if ($sanityCheck eq "") {
-                Trace("Loading group $group into database.") if T(2);
-            } else {
-                Trace("Sanity check for group $group.") if T(2);
-            }
-            my $loadStart = time();
-            for my $table (@tableList) {
-                my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
-                # Do we want a real load or a sanity check?
-                if ($sanityCheck eq "") {
-                    # Real load.
-                    my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
-                    $stats->Accumulate($newStats);
-                    Trace("$fileName loaded into $table.") if T(3);
-                } elsif ($sanityCheck > 0) {
-                    # Here we want a sanity check. Note that if the check value is 0,
-                    # we don't bother. The user just wants to suppress the load step.
-                    CheckLoadFile($erdb, $table, $fileName, $sanityCheck);
-                }
-            }
-            $stats->Add("groups-loaded" => 1);
-            $stats->Add('load-time' => 1);
+            # No! File this group for processing in the second pass.
+            push @goodGroups, $group;
+        }
+    }
+    # Now we loop through the good groups, doing the actual loads.
+    for my $group (@goodGroups) {
+        # Get a group object.
+        my $groupData = $erdb->Loader($group);
+        # Do the post-processing.
+        my $postStats = $groupData->PostProcess();
+        # Determine what happened.
+        if (! defined $postStats) {
+            Trace("Post-processing not required for $group.") if T(3);
+        } else {
+            $stats->Accumulate($postStats);
+            $stats->Add('post-processing' => 1);
+        }
+        # Process this group's files.
+        Trace("Loading group $group into database.") if T(2);
+        # Get the list of tables.
+        my @tableList = $groupData->GetTables();
+        # Start a timer.
+        my $loadStart = time();
+        for my $table (@tableList) {
+            # Compute the load file name.
+            my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
+            # Do the actual load.
+            my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
+            $stats->Accumulate($newStats);
+            Trace("$fileName loaded into $table.") if T(3);
         }
+        $stats->Add("groups-loaded" => 1);
+        $stats->Add('load-time' => (time() - $loadStart));
     }
+    # Save the DBD.
+    Trace("Saving DBD.") if T(2);
+    $erdb->InternalizeDBD();
     $stats->Add('total-time' => time() - $totalStart);
     # Display the statistics from this run.
     Trace("Statistics for load:\n" . $stats->Show()) if T(2);
@@ -313,96 +291,5 @@
     }
 }
 
-=head2 Internal Methods
-
-=head3 CheckLoadFile
-
-    CheckLoadFile($erdb, $table, $fileName, $count);
-
-Read the first few records of a load file and trace the contents at level
-2. This allows the user to visually compare the load file contents with
-the database definition.
-
-=over 4
-
-=item erdb
-
-[[ErdbPm]] object describing the database.
-
-=item table
-
-Name of the table to check.
-
-=item fileName
-
-Name of the load file to check.
-
-=item count
-
-Number of records to check.
-
-=back
-
-=cut
-
-sub CheckLoadFile {
-    # Get the parameters.
-    my ($erdb, $table, $fileName, $count) = @_;
-    # Open the file for input.
-    my $ih = Open(undef, "<$fileName");
-    # Slurp the first N records.
-    my @records;
-    while (! eof $ih && scalar(@records) < $count) {
-        push @records, [ Tracer::GetLine($ih) ];
-    }
-    my $found = scalar(@records);
-    Trace("$found records for $table found in sanity check using $fileName.") if T(2);
-    # Do we have any data at all?
-    if ($found) {
-        # Yes. Get the table's descriptor. We use this to determine the field names.
-        my $relationData = $erdb->FindRelation($table);
-        Confess("Relation $table not found in database.") if (! defined $relationData);
-        my @fields = @{$relationData->{Fields}};
-        # If this is a relationship, we need the FROM and TO data.
-        my %ends; ($ends{from}, $ends{to}) = $erdb->GetRelationshipEntities($table);
-        # Loop through the fields. We generate one message per field.
-        for (my $i = 0; $i <= $#fields; $i++) {
-            # Get this field's information.
-            my $fieldInfo = $fields[$i];
-            my $type = $fieldInfo->{type};
-            my $name = $fieldInfo->{name};
-            if ($name =~ /^(from|to)-link$/) {
-                # Here it's a relationship link, so add the name of the target table to
-                # the type.
-                $type .= " ($ends{$1})";
-            }
-            # This is going to be a multi-line trace message. We start with the field name and type.
-            my @lines = ("Values for $table($name), type $type:\n");
-            # Loop through the records. We generate one line of data per record.
-            for (my $j = 0; $j < $found; $j++) {
-                # Get the field value.
-                my $field = $records[$j]->[$i];
-                # Compute the record label.
-                my $line = "Record $j";
-                # Check for unusual cases.
-                if (! defined $field || $field eq '') {
-                    $line .= "= ";
-                } else {
-                    # Make sure we don't trace something ungodly.
-                    my $excess = (length $field) - 40;
-                    if ($excess > 0) {
-                        $field = substr($field, 0, 40) . " >> + $excess characters";
-                    }
-                    $line .= ": $field";
-                }
-                # Save this line. We indent a little for readability.
-                push @lines, " $line";
-            }
-            # Trace this field.
-            Trace(join("\n", @lines)) if T(2);
-        }
-    }
-}
-
 1;
\ No newline at end of file
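
The heart of this revision is the simplified collate-and-load flow: when all of a table's section files are present, they are streamed through the table's sort command into a single data file, and that file is later loaded with LoadTable. The sketch below illustrates that flow for one table. It is not part of the patch: the subroutine name collate_and_load is hypothetical, plain open is used instead of the Tracer Open helper, and an ERDB object from ERDB::GetDatabase is assumed; only SortNeeded, LoadTable, and ERDBGenerate::CreateFileName come from the code above.

    # Hypothetical helper, for illustration only.
    sub collate_and_load {
        my ($erdb, $table, $directory, @sectionFiles) = @_;
        # Compute the collated data file name the same way the loader does.
        my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
        # Pipe every section file through the table's sort command into the data file.
        my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
        open(my $oh, "|-", $sortCommand) or die "Could not start sort: $!";
        for my $sectionFile (@sectionFiles) {
            open(my $ih, "<", $sectionFile) or die "Could not open $sectionFile: $!";
            while (defined(my $line = <$ih>)) {
                print $oh $line;
            }
            close $ih;
        }
        close $oh or die "Sort pipe failed for $table: $!";
        # Replace the table contents with the collated file.
        return $erdb->LoadTable($dataFile, $table, truncate => 1, failOnError => 1);
    }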