[Bio] / Sprout / ERDBLoader.pl Repository:
ViewVC logotype

Diff of /Sprout/ERDBLoader.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Thu Oct 2 16:32:42 2008 UTC revision 1.8, Mon Mar 2 22:16:27 2009 UTC
# Line 57  Line 57 
57  Name of the ERDB database. This should be the class name for the subclass used  Name of the ERDB database. This should be the class name for the subclass used
58  to access the database.  to access the database.
59    
   
60  =back  =back
61    
62  =head2 Command-Line Options  =head2 Command-Line Options
# Line 77  Line 76 
76    
77  If specified, turns on tracing of SQL activity.  If specified, turns on tracing of SQL activity.
78    
79    =item clear
80    
81    If specified, existing load files will be recreated from sections if the sections
82    are present.
83    
84  =item background  =item background
85    
86  Save the standard and error output to files. The files will be created  Save the standard and error output to files. The files will be created
# Line 93  Line 97 
97  If specified, section files (the fragments of data load files created by  If specified, section files (the fragments of data load files created by
98  [[ERDBGeneratorPl]]) will not be deleted after they are collated.  [[ERDBGeneratorPl]]) will not be deleted after they are collated.
99    
100    =item sanityCheck
101    
102    If specified, no tables will be loaded. Instead, the first I<N> records from the
103    assembled load files will be displayed so that the file contents can be
104    visually matched against the column names.
105    
106  =item warn  =item warn
107    
108  Create an event in the RSS feed when an error occurs.  Create an event in the RSS feed when an error occurs.
# Line 101  Line 111 
111    
112  Phone number to message when the script is complete.  Phone number to message when the script is complete.
113    
114    =item DBD
115    
116    Name of the DBD file. If specified, the DBD must be in the main FIG directory
117    (specified in C<$FIG_Config::fig>). This option allows the use of an alternate
118    DBD during load, so that access to the database by other processes is not
119    compromised.
120    
121    =item loadDirectory
122    
123    Directory containing the load files. This option allows you to request that
124    load files from another version of the NMPDR be used, which is useful when
125    creating a new NMPDR: we can yank in the data from the previous database while
126    waiting for the new load files to be generated.
127    
128  =back  =back
129    
130  =cut  =cut
# Line 108  Line 132 
132  # Get the command-line options and parameters.  # Get the command-line options and parameters.
133  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],
134                                             {                                             {
135                                                trace => ["", "tracing level"],                                                clear => ["", "overwrite existing load files if sections are present"],
136                                                  sanityCheck => ["", "don't load, trace contents of first N load file records instead"],
137                                                  trace => ["2", "tracing level"],
138                                                keepSections => ["", "if specified, section files will not be deleted after being collated"],                                                keepSections => ["", "if specified, section files will not be deleted after being collated"],
139                                                phone => ["", "phone number (international format) to call when load finishes"]                                                phone => ["", "phone number (international format) to call when load finishes"],
140                                                  DBD => ["", "if specified, the name of a DBD file in the FIG directory"],
141                                                  loadDirectory => ["", "if specified, an alternate directory containing the load files"],
142                                             },                                             },
143                                             "<database> <group1> <group2> ...",                                             "<database> <group1> <group2> ...",
144                                             @ARGV);                                             @ARGV);
# Line 120  Line 148 
148  eval {  eval {
149      # Get the parameters.      # Get the parameters.
150      my ($database, @groups) = @parameters;      my ($database, @groups) = @parameters;
151      # Connect to the database.      # Check for an alternate DBD.
152      my $erdb = ERDB::GetDatabase($database);      my $altDBD = ($options->{DBD} ? "$FIG_Config::fig/$options->{DBD}" : undef);
153        # Connect to the database and get its load directory.
154        my $erdb = ERDB::GetDatabase($database, $altDBD);
155      # Fix the group list.      # Fix the group list.
156      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
     Trace("Real groups are: " . join(" ", @realGroups)) if T(3);  
157      # Get the source object and load directory for this database.      # Get the source object and load directory for this database.
158      my $source = $erdb->GetSourceObject();      my $source = $erdb->GetSourceObject();
159      my $directory = $erdb->LoadDirectory();  
160        my $directory = $options->{loadDirectory} || $erdb->LoadDirectory();
161      # Get the list of sections.      # Get the list of sections.
162      my @sectionList = $erdb->SectionList($source);      my @sectionList = $erdb->SectionList($source);
163      # Create a statistics object to track our progress.      # Create a statistics object to track our progress.
164      my $stats = Stats->new();      my $stats = Stats->new();
165        # Find out if we're doing a sanity check.
166        my $sanityCheck = $options->{sanityCheck} || "";
167      # Start a timer.      # Start a timer.
168      my $totalStart = time();      my $totalStart = time();
169      # Loop through the groups.      # Loop through the groups.
# Line 143  Line 175 
175          my $missingTable = 0;          my $missingTable = 0;
176          # Loop through the tables in this group.          # Loop through the tables in this group.
177          for my $table (@tableList) {          for my $table (@tableList) {
178              Trace("Processing table $table.") if T(2);              Trace("Processing table $table for assembly.") if T(2);
179                # Get the section file names.
180                my @sectionFiles =
181                    map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;
182              # Get the data file name.              # Get the data file name.
183              my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);              my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
184              # Do we have it?              # Do we have it?
185              if (-f $dataFile) {              if (-f $dataFile && ! $options->{clear}) {
186                  # Yes. This is good news.                  # Yes. This is good news.
187                  $stats->Add('tables-found' => 1);                  $stats->Add('tables-found' => 1);
188                    Trace("Table file found for $table.") if T(3);
189              } else {              } else {
190                  # No, we must build it. Verify that we have all the sections.                  # No, we must build it. Verify that we have all the sections.
                 my @sectionFiles =  
                     map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;  
191                  my @missingFiles = grep { ! -f $_ } @sectionFiles;                  my @missingFiles = grep { ! -f $_ } @sectionFiles;
                 # Tell the user about all the missing files.  
                 for my $missingFile (@missingFiles) {  
                     $stats->Add('sections-missing' => 1);  
                     $stats->AddMessage("Data file $missingFile not found for table $table.");  
                 }  
192                  # Did we find everything?                  # Did we find everything?
193                  if (scalar @missingFiles) {                  if (scalar @missingFiles) {
194                      # No! Denote that we have a missing table.                      # No! Denote that we have a missing table.
195                      $missingTable++;                      $missingTable++;
196                      $stats->Add('tables-skipped' => 1);                      $stats->Add('tables-skipped' => 1);
197                        # If the user wants a sanity check, we want to give him some
198                        # data anyway.
199                        if ($sanityCheck) {
200                            # Get some data lines in the sections. Note we stop when we've exceeded
201                            # the number of lines expected by the sanity check.
202                            my @lines;
203                            for my $sectionFile (@sectionFiles) {
204                                if (-s $sectionFile && scalar(@lines) < $sanityCheck) {
205                                    Trace("Reading from $sectionFile for $table.") if T(3);
206                                    push @lines, Tracer::GetFile($sectionFile);
207                                }
208                            }
209                            # Create a new temporary file.
210                            my $tmpFile = "$FIG_Config::temp/$table$$.temp.dtx";
211                            my $oh = Open(undef, ">$tmpFile");
212                            # Put all the data into it.
213                            Trace(scalar(@lines) . " data lines found.") if T(3);
214                            print $oh join("\n", @lines);
215                            close $oh;
216                            # Sanity check the temp file.
217                            CheckLoadFile($erdb, $table, $tmpFile, $sanityCheck);
218                            # Clean it up.
219                            unlink $tmpFile;
220                        } else {
221                            # Otherwise tell the user about all the missing files.
222                            for my $missingFile (@missingFiles) {
223                                $stats->Add('sections-missing' => 1);
224                                $stats->AddMessage("Data file $missingFile not found for table $table.");
225                            }
226                        }
227                  } else {                  } else {
228                      # Yes! Try to assemble the sections into a data file.                      # We have all the sections. Try to assemble them into a data file.
229                      my $sortStart = time();                      my $sortStart = time();
230                      my $sortCommand = $erdb->SortNeeded($table);                      my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
231                      my $oh = Open(undef, "| $sortCommand >$dataFile");                      Trace("Sort command: $sortCommand") if T(3);
232                        # Pipe to the sort command. Note that we turn on autoflush
233                        # so there's no buffering.
234                        my $oh = Open(undef, "| $sortCommand");
235                        select $oh; $| = 1; select STDOUT;
236                        # Loop through the sections.
237                      for my $sectionFile (@sectionFiles) {                      for my $sectionFile (@sectionFiles) {
238                          Trace("Collating $sectionFile.") if T(4);                          Trace("Collating $sectionFile.") if T(3);
239                          $stats->Add('sections-loaded' => 1);                          $stats->Add("$table-sections" => 1);
240                          for my $line (Tracer::GetFile($sectionFile)) {                          # Loop through the section file.
241                              print $oh "$line\n";                          my $ih = Open(undef, "<$sectionFile");
242                              $stats->Add('lines-collated' => 1);                          while (defined (my $line = <$ih>)) {
243                                print $oh $line;
244                                $stats->Add("$table-collations" => 1);
245                          }                          }
246                      }                      }
247                      # Finish the sort step.                      # Finish the sort step.
248                        Trace("Finishing collate for $table.") if T(3);
249                      close $oh;                      close $oh;
250                      $stats->Add('tables-collated' => 1);                      $stats->Add('tables-collated' => 1);
251                      # Now that we've collated the section files, we can delete them                      $stats->Add('collate-time' => time() - $sortStart);
252                      # to make room in the data directory. The user can turn this                  }
253                      # behavior off with the keepSections option.              }
254                # Now that we know we have a full data file, we can delete the
255                # section files to make room in the data directory. The user can
256                # turn this behavior off with the keepSections option.
257                      if (! $options->{keepSections}) {                      if (! $options->{keepSections}) {
258                          for my $sectionFile (@sectionFiles) {                          for my $sectionFile (@sectionFiles) {
259                        if (-e $sectionFile) {
260                              unlink $sectionFile;                              unlink $sectionFile;
261                              $stats->Add('files-deleted' => 1);                              $stats->Add('files-deleted' => 1);
262                          }                          }
                         Trace("Section files for $table deleted.") if T(3);  
                     }  
                     $stats->Add('collate-time' => time() - $sortStart);  
263                  }                  }
264                    Trace("Section files for $table deleted.") if T(3);
265              }              }
266          }          }
267          # Were any tables missing?          # Were any tables missing?
268          if ($missingTable) {          if ($missingTable) {
269              # Yes, skip this group.              # Yes, skip this group.
270              $stats->Add('groups-skipped' => 1);              $stats->Add('groups-skipped' => 1);
271              Trace("Skipping $group group: $missingTable missing tables.") if T(3);              Trace("Skipping $group group: $missingTable missing tables.") if T(2);
272            } else {
273                # No! Process this group's files.
274                if ($sanityCheck eq "") {
275                    Trace("Loading group $group into database.") if T(2);
276          } else {          } else {
277              # No! Load this group into the database.                  Trace("Sanity check for group $group.") if T(2);
278              Trace("Loading $group group into database.") if T(2);              }
279              my $loadStart = time();              my $loadStart = time();
280              for my $table (@tableList) {              for my $table (@tableList) {
281                  my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);                  my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
282                    # Do we want a real load or a sanity check?
283                    if ($sanityCheck eq "") {
284                        # Real load.
285                  my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);                  my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
286                  $stats->Accumulate($newStats);                  $stats->Accumulate($newStats);
287                  Trace("$fileName loaded into $table.") if T(3);                  Trace("$fileName loaded into $table.") if T(3);
288                    } elsif ($sanityCheck > 0) {
289                        # Here we want a sanity check. Note that if the check value is 0,
290                        # we don't bother. The user just wants to suppress the load step.
291                        CheckLoadFile($erdb, $table, $fileName, $sanityCheck);
292                    }
293              }              }
294              $stats->Add("groups-loaded" => 1);              $stats->Add("groups-loaded" => 1);
295              $stats->Add('load-time' => 1);              $stats->Add('load-time' => 1);
# Line 232  Line 313 
313      }      }
314  }  }
315    
316    =head2 Internal Methods
317    
318    =head3 CheckLoadFile
319    
320        CheckLoadFile($erdb, $table, $fileName, $count);
321    
322    Read the first few records of a load file and trace the contents at level
323    2. This allows the user to visually compare the load file contents with
324    the database definition.
325    
326    =over 4
327    
328    =item erdb
329    
330    [[ErdbPm]] object describing the database.
331    
332    =item table
333    
334    Name of the table to check.
335    
336    =item fileName
337    
338    Name of the load file to check.
339    
340    =item count
341    
342    Number of records to check.
343    
344    =back
345    
346    =cut
347    
348    sub CheckLoadFile {
349        # Get the parameters.
350        my ($erdb, $table, $fileName, $count) = @_;
351        # Open the file for input.
352        my $ih = Open(undef, "<$fileName");
353        # Slurp the first N records.
354        my @records;
355        while (! eof $ih && scalar(@records) < $count) {
356            push @records, [ Tracer::GetLine($ih) ];
357        }
358        my $found = scalar(@records);
359        Trace("$found records for $table found in sanity check using $fileName.") if T(2);
360        # Do we have any data at all?
361        if ($found) {
362            # Yes. Get the table's descriptor. We use this to determine the field names.
363            my $relationData = $erdb->FindRelation($table);
364            Confess("Relation $table not found in database.") if (! defined $relationData);
365            my @fields = @{$relationData->{Fields}};
366            # If this is a relationship, we need the FROM and TO data.
367            my %ends; ($ends{from}, $ends{to}) = $erdb->GetRelationshipEntities($table);
368            # Loop through the fields. We generate one message per field.
369            for (my $i = 0; $i <= $#fields; $i++) {
370                # Get this field's information.
371                my $fieldInfo = $fields[$i];
372                my $type = $fieldInfo->{type};
373                my $name = $fieldInfo->{name};
374                if ($name =~ /^(from|to)-link$/) {
375                    # Here it's a relationship link, so add the name of the target table to
376                    # the type.
377                    $type .= " ($ends{$1})";
378                }
379                # This is going to be a multi-line trace message. We start with the field name and type.
380                my @lines = ("Values for $table($name), type $type:\n");
381                # Loop through the records. We generate one line of data per record.
382                for (my $j = 0; $j < $found; $j++) {
383                    # Get the field value.
384                    my $field = $records[$j]->[$i];
385                    # Compute the record label.
386                    my $line = "Record $j";
387                    # Check for unusual cases.
388                    if (! defined $field || $field eq '') {
389                        $line .= "= <empty>";
390                    } else {
391                        # Make sure we don't trace something ungodly.
392                        my $excess = (length $field) - 40;
393                        if ($excess > 0) {
394                            $field = substr($field, 0, 40) . " >> + $excess characters";
395                        }
396                        $line .= ": $field";
397                    }
398                    # Save this line. We indent a little for readability.
399                    push @lines, "   $line";
400                }
401                # Trace this field.
402                Trace(join("\n", @lines)) if T(2);
403            }
404        }
405    }
406    
407    
408  1;  1;

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.8

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3