[Bio] / Sprout / ERDBLoader.pl Repository:
ViewVC logotype

Diff of /Sprout/ERDBLoader.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Thu Oct 2 16:32:42 2008 UTC revision 1.6, Mon Jan 19 21:46:21 2009 UTC
# Line 57  Line 57 
57  Name of the ERDB database. This should be the class name for the subclass used  Name of the ERDB database. This should be the class name for the subclass used
58  to access the database.  to access the database.
59    
   
60  =back  =back
61    
62  =head2 Command-Line Options  =head2 Command-Line Options
# Line 93  Line 92 
92  If specified, section files (the fragments of data load files created by  If specified, section files (the fragments of data load files created by
93  [[ERDBGeneratorPl]]) will not be deleted after they are collated.  [[ERDBGeneratorPl]]) will not be deleted after they are collated.
94    
95    =item sanityCheck
96    
97    If specified, no tables will be loaded. Instead, the first I<N> records (where
98    I<N> is the value of this option) from the assembled load files will be traced
99    so that the file contents can be visually matched against the column names.
100    
101  =item warn  =item warn
102    
103  Create an event in the RSS feed when an error occurs.  Create an event in the RSS feed when an error occurs.
# Line 101  Line 106 
106    
107  Phone number to message when the script is complete.  Phone number to message when the script is complete.
108    
109    =item DBD
110    
111    Name of the DBD file. If specified, the DBD must be in the main FIG directory
112    (specified in C<$FIG_Config::fig>). This option allows the use of an alternate
113    DBD during load, so that access to the database by other processes is not
114    compromised.
115    
116  =back  =back
117    
118  =cut  =cut
# Line 108  Line 120 
120  # Get the command-line options and parameters.  # Get the command-line options and parameters.
121  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],
122                                             {                                             {
123                                                trace => ["", "tracing level"],                                                sanityCheck => ["", "don't load, trace contents of first N load file records instead"],
124                                                  trace => ["2", "tracing level"],
125                                                keepSections => ["", "if specified, section files will not be deleted after being collated"],                                                keepSections => ["", "if specified, section files will not be deleted after being collated"],
126                                                phone => ["", "phone number (international format) to call when load finishes"]                                                phone => ["", "phone number (international format) to call when load finishes"],
127                                                  DBD => ["", "if specified, the name of a DBD file in the FIG directory"],
128                                             },                                             },
129                                             "<database> <group1> <group2> ...",                                             "<database> <group1> <group2> ...",
130                                             @ARGV);                                             @ARGV);
# Line 120  Line 134 
134  eval {  eval {
135      # Get the parameters.      # Get the parameters.
136      my ($database, @groups) = @parameters;      my ($database, @groups) = @parameters;
137      # Connect to the database.      # Check for an alternate DBD.
138      my $erdb = ERDB::GetDatabase($database);      my $altDBD = ($options->{DBD} ? "$FIG_Config::fig/$options->{DBD}" : undef);
139        # Connect to the database, using the alternate DBD if one was specified.
140        my $erdb = ERDB::GetDatabase($database, $altDBD);
141      # Fix the group list.      # Fix the group list.
142      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
     Trace("Real groups are: " . join(" ", @realGroups)) if T(3);  
143      # Get the source object and load directory for this database.      # Get the source object and load directory for this database.
144      my $source = $erdb->GetSourceObject();      my $source = $erdb->GetSourceObject();
145      my $directory = $erdb->LoadDirectory();      my $directory = $erdb->LoadDirectory();
# Line 132  Line 147 
147      my @sectionList = $erdb->SectionList($source);      my @sectionList = $erdb->SectionList($source);
148      # Create a statistics object to track our progress.      # Create a statistics object to track our progress.
149      my $stats = Stats->new();      my $stats = Stats->new();
150        # Find out if we're doing a sanity check.
151        my $sanityCheck = $options->{sanityCheck} || "";
152      # Start a timer.      # Start a timer.
153      my $totalStart = time();      my $totalStart = time();
154      # Loop through the groups.      # Loop through the groups.
# Line 143  Line 160 
160          my $missingTable = 0;          my $missingTable = 0;
161          # Loop through the tables in this group.          # Loop through the tables in this group.
162          for my $table (@tableList) {          for my $table (@tableList) {
163              Trace("Processing table $table.") if T(2);              Trace("Processing table $table for assembly.") if T(2);
164                # Get the section file names.
165                my @sectionFiles =
166                    map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;
167              # Get the data file name.              # Get the data file name.
168              my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);              my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
169              # Do we have it?              # Do we have it?
170              if (-f $dataFile) {              if (-f $dataFile) {
171                  # Yes. This is good news.                  # Yes. This is good news.
172                  $stats->Add('tables-found' => 1);                  $stats->Add('tables-found' => 1);
173                    Trace("Table file found for $table.") if T(3);
174              } else {              } else {
175                  # No, we must build it. Verify that we have all the sections.                  # No, we must build it. Verify that we have all the sections.
                 my @sectionFiles =  
                     map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;  
176                  my @missingFiles = grep { ! -f $_ } @sectionFiles;                  my @missingFiles = grep { ! -f $_ } @sectionFiles;
                 # Tell the user about all the missing files.  
                 for my $missingFile (@missingFiles) {  
                     $stats->Add('sections-missing' => 1);  
                     $stats->AddMessage("Data file $missingFile not found for table $table.");  
                 }  
177                  # Did we find everything?                  # Did we find everything?
178                  if (scalar @missingFiles) {                  if (scalar @missingFiles) {
179                      # No! Denote that we have a missing table.                      # No! Denote that we have a missing table.
180                      $missingTable++;                      $missingTable++;
181                      $stats->Add('tables-skipped' => 1);                      $stats->Add('tables-skipped' => 1);
182                        # If the user wants a sanity check, we want to give him some
183                        # data anyway.
184                        if ($sanityCheck) {
185                            # Get some data lines in the sections. Note we stop when we've exceeded
186                            # the number of lines expected by the sanity check.
187                            my @lines;
188                            for my $sectionFile (@sectionFiles) {
189                                if (-s $sectionFile && scalar(@lines) < $sanityCheck) {
190                                    Trace("Reading from $sectionFile for $table.") if T(3);
191                                    push @lines, Tracer::GetFile($sectionFile);
192                                }
193                            }
194                            # Create a new temporary file.
195                            my $tmpFile = "$FIG_Config::temp/$table$$.temp.dtx";
196                            my $oh = Open(undef, ">$tmpFile");
197                            # Put all the data into it.
198                            Trace(scalar(@lines) . " data lines found.") if T(3);
199                            print $oh join("\n", @lines);
200                            close $oh;
201                            # Sanity check the temp file.
202                            CheckLoadFile($erdb, $table, $tmpFile, $sanityCheck);
203                            # Clean it up.
204                            unlink $tmpFile;
205                        } else {
206                            # Otherwise tell the user about all the missing files.
207                            for my $missingFile (@missingFiles) {
208                                $stats->Add('sections-missing' => 1);
209                                $stats->AddMessage("Data file $missingFile not found for table $table.");
210                            }
211                        }
212                  } else {                  } else {
213                      # Yes! Try to assemble the sections into a data file.                      # We have all the sections. Try to assemble them into a data file.
214                      my $sortStart = time();                      my $sortStart = time();
215                      my $sortCommand = $erdb->SortNeeded($table);                      my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
216                      my $oh = Open(undef, "| $sortCommand >$dataFile");                      Trace("Sort command: $sortCommand") if T(3);
217                        # Pipe to the sort command. Note that we turn on autoflush
218                        # so there's no buffering.
219                        my $oh = Open(undef, "| $sortCommand");
220                        select $oh; $| = 1; select STDOUT;
221                        # Loop through the sections.
222                      for my $sectionFile (@sectionFiles) {                      for my $sectionFile (@sectionFiles) {
223                          Trace("Collating $sectionFile.") if T(4);                          Trace("Collating $sectionFile.") if T(3);
224                          $stats->Add('sections-loaded' => 1);                          $stats->Add("$table-sections" => 1);
225                          for my $line (Tracer::GetFile($sectionFile)) {                          # Loop through the section file.
226                              print $oh "$line\n";                          my $ih = Open(undef, "<$sectionFile");
227                              $stats->Add('lines-collated' => 1);                          while (defined (my $line = <$ih>)) {
228                                print $oh $line;
229                                $stats->Add("$table-collations" => 1);
230                          }                          }
231                      }                      }
232                      # Finish the sort step.                      # Finish the sort step.
233                        Trace("Finishing collate for $table.") if T(3);
234                      close $oh;                      close $oh;
235                      $stats->Add('tables-collated' => 1);                      $stats->Add('tables-collated' => 1);
236                      # Now that we've collated the section files, we can delete them                      $stats->Add('collate-time' => time() - $sortStart);
237                      # to make room in the data directory. The user can turn this                  }
238                      # behavior off with the keepSections option.              }
239                # Now that we know we have a full data file, we can delete the
240                # section files to make room in the data directory. The user can
241                # turn this behavior off with the keepSections option.
242                      if (! $options->{keepSections}) {                      if (! $options->{keepSections}) {
243                          for my $sectionFile (@sectionFiles) {                          for my $sectionFile (@sectionFiles) {
244                        if (-e $sectionFile) {
245                              unlink $sectionFile;                              unlink $sectionFile;
246                              $stats->Add('files-deleted' => 1);                              $stats->Add('files-deleted' => 1);
247                          }                          }
                         Trace("Section files for $table deleted.") if T(3);  
                     }  
                     $stats->Add('collate-time' => time() - $sortStart);  
248                  }                  }
249                    Trace("Section files for $table deleted.") if T(3);
250              }              }
251          }          }
252          # Were any tables missing?          # Were any tables missing?
253          if ($missingTable) {          if ($missingTable) {
254              # Yes, skip this group.              # Yes, skip this group.
255              $stats->Add('groups-skipped' => 1);              $stats->Add('groups-skipped' => 1);
256              Trace("Skipping $group group: $missingTable missing tables.") if T(3);              Trace("Skipping $group group: $missingTable missing tables.") if T(2);
257          } else {          } else {
258              # No! Load this group into the database.              # No! Process this group's files.
259              Trace("Loading $group group into database.") if T(2);              if ($sanityCheck eq "") {
260                    Trace("Loading group $group into database.") if T(2);
261                } else {
262                    Trace("Sanity check for group $group.") if T(2);
263                }
264              my $loadStart = time();              my $loadStart = time();
265              for my $table (@tableList) {              for my $table (@tableList) {
266                  my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);                  my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
267                    # Do we want a real load or a sanity check?
268                    if ($sanityCheck eq "") {
269                        # Real load.
270                  my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);                  my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
271                  $stats->Accumulate($newStats);                  $stats->Accumulate($newStats);
272                  Trace("$fileName loaded into $table.") if T(3);                  Trace("$fileName loaded into $table.") if T(3);
273                    } elsif ($sanityCheck > 0) {
274                        # Here we want a sanity check. Note that if the check value is 0,
275                        # we don't bother. The user just wants to suppress the load step.
276                        CheckLoadFile($erdb, $table, $fileName, $sanityCheck);
277                    }
278              }              }
279              $stats->Add("groups-loaded" => 1);              $stats->Add("groups-loaded" => 1);
280              $stats->Add('load-time' => 1);              $stats->Add('load-time' => 1);
# Line 232  Line 298 
298      }      }
299  }  }
300    
301    =head3 CheckLoadFile
302    
303        CheckLoadFile($erdb, $table, $fileName, $count);
304    
305    Read the first few records of a load file and trace the contents at level
306    2. This allows the user to visually compare the load file contents with
307    the database definition.
308    
309    =over 4
310    
311    =item erdb
312    
313    [[ErdbPm]] object describing the database.
314    
315    =item table
316    
317    Name of the table to check.
318    
319    =item fileName
320    
321    Name of the load file to check.
322    
323    =item count
324    
325    Number of records to check.
326    
327    =back
328    
329    =cut
330    
331    sub CheckLoadFile {
           # Trace, field by field, the values found in the first $count records of
           # the load file $fileName for table $table. There is no explicit return
           # value; all output goes through Trace at level 2.
332        # Get the parameters.
333        my ($erdb, $table, $fileName, $count) = @_;
334        # Open the file for input.
           # NOTE(review): $ih is never explicitly closed in this sub.
335        my $ih = Open(undef, "<$fileName");
336        # Slurp the first N records.
           # Each record is kept as an array reference wrapping whatever list
           # Tracer::GetLine returns. The code below indexes it by field position, so
           # presumably GetLine splits the line into its fields -- TODO confirm.
337        my @records;
338        while (! eof $ih && scalar(@records) < $count) {
339            push @records, [ Tracer::GetLine($ih) ];
340        }
           # $found may be less than $count if the file ran out of records first.
341        my $found = scalar(@records);
342        Trace("$found records for $table found in sanity check using $fileName.") if T(2);
343        # Do we have any data at all?
344        if ($found) {
345            # Yes. Get the table's descriptor. We use this to determine the field names.
346            my $relationData = $erdb->FindRelation($table);
347            Confess("Relation $table not found in database.") if (! defined $relationData);
348            my @fields = @{$relationData->{Fields}};
349            # If this is a relationship, we need the FROM and TO data.
               # NOTE(review): this call is made for every table; the results are only
               # used below for fields named "from-link" or "to-link".
350            my %ends; ($ends{from}, $ends{to}) = $erdb->GetRelationshipEntities($table);
351            # Loop through the fields. We generate one message per field.
               # The index $i selects both the field descriptor and the matching
               # position in each record.
352            for (my $i = 0; $i <= $#fields; $i++) {
353                # Get this field's information.
354                my $fieldInfo = $fields[$i];
355                my $type = $fieldInfo->{type};
356                my $name = $fieldInfo->{name};
357                if ($name =~ /^(from|to)-link$/) {
358                    # Here it's a relationship link, so add the name of the target table to
359                    # the type.
360                    $type .= " ($ends{$1})";
361                }
362                # This is going to be a multi-line trace message. We start with the field name and type.
363                my @lines = ("Values for $table($name), type $type:\n");
364                # Loop through the records. We generate one line of data per record.
                   # Record labels are zero-based.
365                for (my $j = 0; $j < $found; $j++) {
366                    # Get the field value.
                       # A record with fewer values than the table has fields yields
                       # undef here, which is reported the same way as an empty string.
367                    my $field = $records[$j]->[$i];
368                    # Compute the record label.
369                    my $line = "Record $j";
370                    # Check for unusual cases.
371                    if (! defined $field || $field eq '') {
372                        $line .= "= <empty>";
373                    } else {
374                        # Make sure we don't trace something ungodly.
                           # Only the first 40 characters are kept; $excess is the number
                           # of characters dropped, and it is appended to the output.
375                        my $excess = (length $field) - 40;
376                        if ($excess > 0) {
377                            $field = substr($field, 0, 40) . " >> + $excess characters";
378                        }
379                        $line .= ": $field";
380                    }
381                    # Save this line. We indent a little for readability.
382                    push @lines, "   $line";
383                }
384                # Trace this field.
                   # The heading already ends in a newline, so joining with "\n" leaves
                   # a blank line between the heading and the first record line.
385                Trace(join("\n", @lines)) if T(2);
386            }
387        }
388    }
389    
390    
391  1;  1;

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.6

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3