[Bio] / Sprout / ERDBLoader.pl Repository:
ViewVC logotype

Diff of /Sprout/ERDBLoader.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Tue Sep 30 15:23:55 2008 UTC revision 1.5, Wed Oct 15 11:41:12 2008 UTC
# Line 23  Line 23 
23  use ERDBLoadGroup;  use ERDBLoadGroup;
24  use ERDBGenerate;  use ERDBGenerate;
25  use Stats;  use Stats;
26    use Time::HiRes;
27    
28    
29  =head1 ERDBLoader Script  =head1 ERDBLoader Script
# Line 44  Line 45 
45  tables that are loaded at the same time. For each table in a named group that  tables that are loaded at the same time. For each table in a named group that
46  does not exist in the database, the script first attempts to find a completed  does not exist in the database, the script first attempts to find a completed
47  data file. If one does not exist, it attempts to create one by collating section  data file. If one does not exist, it attempts to create one by collating section
48  files. Once the collated section file is finished, it is loaded into the  files. Once the collated section files for a load group are finished, they are
49  database.  loaded into the database.
50    
51  =head2 Positional Parameters  =head2 Positional Parameters
52    
# Line 56  Line 57 
57  Name of the ERDB database. This should be the class name for the subclass used  Name of the ERDB database. This should be the class name for the subclass used
58  to access the database.  to access the database.
59    
   
60  =back  =back
61    
62  =head2 Command-Line Options  =head2 Command-Line Options
# Line 68  Line 68 
68  Specifies the tracing level. The higher the tracing level, the more messages  Specifies the tracing level. The higher the tracing level, the more messages
69  will appear in the trace log. Use E to specify emergency tracing.  will appear in the trace log. Use E to specify emergency tracing.
70    
 =item replace  
   
 Normally, if a table exists in the database, it will not be loaded. If this  
 option is specified, however, existing tables will be dropped and recreated.  
   
 =item resume  
   
 If specified, then the group list must contain a single group. The specified  
 group and all groups after it in the group list will be processed.  
   
71  =item user  =item user
72    
73  Name suffix to be used for log files. If omitted, the PID is used.  Name suffix to be used for log files. If omitted, the PID is used.
# Line 97  Line 87 
87    
88  Display this command's parameters and options.  Display this command's parameters and options.
89    
90    =item keepSections
91    
92    If specified, section files (the fragments of data load files created by
93    [[ERDBGeneratorPl]]) will not be deleted after they are collated.
94    
95    =item sanityCheck
96    
97    If specified, no tables will be loaded. Instead, the first I<N> records from the
98    assembled load files will be displayed so that the file contents can be
99    visually matched against the column names.
100    
101  =item warn  =item warn
102    
103  Create an event in the RSS feed when an error occurs.  Create an event in the RSS feed when an error occurs.
# Line 112  Line 113 
113  # Get the command-line options and parameters.  # Get the command-line options and parameters.
114  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],
115                                             {                                             {
116                                                  sanityCheck => ["", "don't load, trace contents of first N load file records instead"],
117                                                trace => ["", "tracing level"],                                                trace => ["", "tracing level"],
118                                                replace => ["", "if specified, existing tables will be overwritten"],                                                keepSections => ["", "if specified, section files will not be deleted after being collated"],
                                               resume => ["", "if specified, the specified group and all groups that normally come after it will be processed"],  
119                                                phone => ["", "phone number (international format) to call when load finishes"]                                                phone => ["", "phone number (international format) to call when load finishes"]
120                                             },                                             },
121                                             "<database> <group1> <group2> ...",                                             "<database> <group1> <group2> ...",
# Line 128  Line 129 
129      # Connect to the database.      # Connect to the database.
130      my $erdb = ERDB::GetDatabase($database);      my $erdb = ERDB::GetDatabase($database);
131      # Fix the group list.      # Fix the group list.
132      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, $options, \@groups);      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
133      # Get the source object and load directory for this database.      # Get the source object and load directory for this database.
134      my $source = $erdb->GetSourceObject();      my $source = $erdb->GetSourceObject();
135      my $directory = $erdb->LoadDirectory();      my $directory = $erdb->LoadDirectory();
# Line 136  Line 137 
137      my @sectionList = $erdb->SectionList($source);      my @sectionList = $erdb->SectionList($source);
138      # Create a statistics object to track our progress.      # Create a statistics object to track our progress.
139      my $stats = Stats->new();      my $stats = Stats->new();
# Find out if we're doing a sanity check. An empty string means a real load,
# a positive number N means "trace the first N records of each load file
# instead of loading", and any other value (such as 0) suppresses the load
# step without tracing anything. We test for definedness rather than truth so
# that an explicit 0 is not mistaken for "option not specified".
my $sanityCheck = (defined $options->{sanityCheck} ? $options->{sanityCheck} : "");
# Start a timer.
my $totalStart = time();
# Loop through the groups.
for my $group (@realGroups) {
    # Get the list of tables for this group.
    my @tableList = ERDBLoadGroup::GetTables($erdb, $group);
    # We need to ensure there is a data file for every table. If we fail to find one,
    # we set the following error flag, which prevents us from loading the database.
    my $missingTable = 0;
    # Loop through the tables in this group.
    for my $table (@tableList) {
        Trace("Processing table $table for assembly.") if T(2);
        # Get the section file names.
        my @sectionFiles =
            map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;
        # Get the data file name.
        my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
        # This flag is set to TRUE only when a complete data file is known to
        # exist. The section files must not be deleted unless it is set.
        my $dataFileReady = 0;
        # Do we have it?
        if (-f $dataFile) {
            # Yes. This is good news.
            $stats->Add('tables-found' => 1);
            Trace("Table file found for $table.") if T(3);
            $dataFileReady = 1;
        } else {
            # No, we must build it. Verify that we have all the sections.
            my @missingFiles = grep { ! -f $_ } @sectionFiles;
            # Tell the user about all the missing files.
            for my $missingFile (@missingFiles) {
                $stats->Add('sections-missing' => 1);
                $stats->AddMessage("Data file $missingFile not found for table $table.");
            }
            # Did we find everything?
            if (scalar @missingFiles) {
                # No! Denote that we have a missing table.
                $missingTable++;
                $stats->Add('tables-skipped' => 1);
            } else {
                # Yes! Try to assemble the sections into a data file.
                my $sortStart = time();
                my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
                Trace("Sort command: $sortCommand") if T(3);
                my $oh = Open(undef, "| $sortCommand");
                for my $sectionFile (@sectionFiles) {
                    Trace("Collating $sectionFile.") if T(3);
                    $stats->Add("$table-sections" => 1);
                    for my $line (Tracer::GetFile($sectionFile)) {
                        print $oh "$line\n";
                        $stats->Add("$table-collations" => 1);
                    }
                }
                # Finish the sort step. Closing a pipe waits for the child
                # process and returns FALSE if it failed, so this is where a
                # sort error becomes visible.
                Trace("Finishing collate for $table.") if T(3);
                if (! close $oh) {
                    # Save the status before unlink can disturb it.
                    my ($status, $error) = ($?, $!);
                    # Remove the incomplete data file. If we left it in place,
                    # the next run would find it and treat it as complete.
                    unlink $dataFile;
                    Confess("Sort command failed for table $table (status $status): $error");
                }
                $stats->Add('tables-collated' => 1);
                $stats->Add('collate-time' => time() - $sortStart);
                $dataFileReady = 1;
            }
        }
        # If we know we have a full data file, we can delete the section
        # files to make room in the data directory. The user can turn this
        # behavior off with the keepSections option. When the table was
        # skipped because sections are missing, the sections that do exist
        # are kept so that only the missing ones need to be regenerated.
        if ($dataFileReady && ! $options->{keepSections}) {
            for my $sectionFile (@sectionFiles) {
                if (-e $sectionFile) {
                    unlink $sectionFile;
                    $stats->Add('files-deleted' => 1);
                }
            }
            Trace("Section files for $table deleted.") if T(3);
        }
    }
    # Were any tables missing?
    if ($missingTable) {
        # Yes, skip this group.
        $stats->Add('groups-skipped' => 1);
        Trace("Skipping $group group: $missingTable missing tables.") if T(3);
    } else {
        # No! Process this group's files.
        if ($sanityCheck eq "") {
            Trace("Loading group $group into database.") if T(2);
        } else {
            Trace("Sanity check for group $group.") if T(2);
        }
        my $loadStart = time();
        for my $table (@tableList) {
            my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
            # Do we want a real load or a sanity check?
            if ($sanityCheck eq "") {
                # Real load.
                my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
                $stats->Accumulate($newStats);
                Trace("$fileName loaded into $table.") if T(3);
            } elsif ($sanityCheck > 0) {
                # Here we want a sanity check. Note that if the check value is 0,
                # we don't bother. The user just wants to suppress the load step.
                CheckLoadFile($erdb, $table, $fileName, $sanityCheck);
            }
        }
        $stats->Add("groups-loaded" => 1);
        # Record the elapsed time for this group's load (or sanity check).
        $stats->Add('load-time' => time() - $loadStart);
    }
}
242        $stats->Add('total-time' => time() - $totalStart);
243        # Display the statistics from this run.
244        Trace("Statistics for load:\n" . $stats->Show()) if T(2);
245  };  };
246  if ($@) {  if ($@) {
247      Trace("Script failed with error: $@") if T(0);      Trace("Script failed with error: $@") if T(0);
     $rtype = "error";  
248  } else {  } else {
249      Trace("Script complete.") if T(2);      Trace("Script complete.") if T(2);
     $rtype = "no error";  
250  }  }
251  if ($options->{phone}) {  if ($options->{phone}) {
252      my $msgID = Tracer::SendSMS($options->{phone}, "ERDBLoader terminated with $rtype.");      my $msgID = Tracer::SendSMS($options->{phone}, "ERDBLoader completed.");
253      if ($msgID) {      if ($msgID) {
254          Trace("Phone message sent with ID $msgID.") if T(2);          Trace("Phone message sent with ID $msgID.") if T(2);
255      } else {      } else {
# Line 160  Line 257 
257      }      }
258  }  }
259    
260    =head3 CheckLoadFile
261    
262        CheckLoadFile($erdb, $table, $fileName, $count);
263    
264    Read the first few records of a load file and trace the contents at level
265    2. This allows the user to visually compare the load file contents with
266    the database definition.
267    
268    =over 4
269    
270    =item erdb
271    
272    [[ErdbPm]] object describing the database.
273    
274    =item table
275    
276    Name of the table to check.
277    
278    =item fileName
279    
280    Name of the load file to check.
281    
282    =item count
283    
284    Number of records to check.
285    
286    =back
287    
288    =cut
289    
sub CheckLoadFile {
    # Read up to $count records from the front of a load file and trace
    # their contents, one trace message per table field, so the user can
    # compare the file's columns against the table definition.
    #
    #   $erdb       database object; its FindRelation method supplies the
    #               table's field list
    #   $table      name of the table to check
    #   $fileName   name of the load file to check
    #   $count      maximum number of records to read
    #
    # Nothing useful is returned. Confess is called if the table has no
    # relation descriptor.
    my ($erdb, $table, $fileName, $count) = @_;
    # Open the file for input.
    my $ih = Open(undef, "<$fileName");
    # Slurp the first N records. Each record is stored as a list of field
    # values (presumably Tracer::GetLine splits the line into its fields;
    # confirm against Tracer).
    my @records;
    while (! eof $ih && scalar(@records) < $count) {
        push @records, [ Tracer::GetLine($ih) ];
    }
    # We have all the data we are going to use, so release the file now
    # rather than holding it open while we build the trace messages.
    close $ih;
    my $found = scalar(@records);
    Trace("$found records for $table found in sanity check.") if T(3);
    # Do we have any data at all?
    if ($found) {
        # Yes. Get the table's descriptor. We use this to determine the field names.
        my $relationData = $erdb->FindRelation($table);
        Confess("Relation $table not found in database.") if (! defined $relationData);
        my @fields = @{$relationData->{Fields}};
        # Loop through the fields. We generate one message per field. The
        # field's position in the descriptor is also its position in each record.
        for my $i (0 .. $#fields) {
            # Get this field's information.
            my $fieldInfo = $fields[$i];
            my $type = $fieldInfo->{type};
            # This is going to be a multi-line trace message. We start with the field name and type.
            my @lines = ("Values for $fieldInfo->{name}, type $type:\n");
            # Loop through the records. We generate one line of data per record.
            for my $j (0 .. $found - 1) {
                # Get the field value. It will be undefined if the record
                # has fewer values than the table has fields.
                my $field = $records[$j]->[$i];
                # Compute the record label.
                my $line = "Record $j";
                # Check for unusual cases.
                if (! defined $field) {
                    $line .= "= <null>";
                } elsif ($field eq '') {
                    $line .= "= <empty>";
                } else {
                    # Make sure we don't trace something ungodly: only the first
                    # 40 characters are shown, followed by a count of the rest.
                    my $excess = (length $field) - 40;
                    if ($excess > 0) {
                        $field = substr($field, 0, 40) . " >> + $excess characters";
                    }
                    $line .= ": $field";
                }
                # Save this line. We indent a little for readability.
                push @lines, "   $line";
            }
            # Trace this field.
            Trace(join("\n", @lines)) if T(2);
        }
    }
    return;
}
342    
343    
344  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.5

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3