[Bio] / Sprout / ERDBLoader.pl Repository:
ViewVC logotype

Diff of /Sprout/ERDBLoader.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Tue Sep 30 15:23:55 2008 UTC revision 1.15, Sat Dec 25 15:57:45 2010 UTC
# Line 22  Line 22 
22  use ERDB;  use ERDB;
23  use ERDBLoadGroup;  use ERDBLoadGroup;
24  use ERDBGenerate;  use ERDBGenerate;
25    use ERDBExtras;
26  use Stats;  use Stats;
27    use Time::HiRes;
28    
29    
30  =head1 ERDBLoader Script  =head1 ERDBLoader Script
# Line 33  Line 35 
35    
36  =head2 Introduction  =head2 Introduction
37    
38  This script finishes the database load process begun by [[ERDBGeneratorPl]].  This script finishes the database load process begun by L<ERDBGenerator.pl>.
39    
40  [[ERDBGeneratorPl]] divides the source data into sections, and generates a  L<ERDBGenerator.pl> divides the source data into sections, and generates a
41  partial load file for each section of each table. To finish the load process, we  partial load file for each section of each table. To finish the load process, we
42  need to combine the partial files into single files and load the resulting  need to combine the partial files into single files and load the resulting
43  single files into the database tables.  single files into the database tables.
44    
45  Like [[ERDBGeneratorPl]], this script acts on load groups-- sets of related  Like L<ERDBGenerator.pl>, this script acts on load groups-- sets of related
46  tables that are loaded at the same time. For each table in a named group that  tables that are loaded at the same time. For each table in a named group that
47  does not exist in the database, the script first attempts to find a completed  does not exist in the database, the script first attempts to find a complete set
48  data file. If one does not exist, it attempts to create one by collating section  of section files that it will collate into a data file. If there are no sections,
49  files. Once the collated section file is finished, it is loaded into the  then it will look for a data file that is already collated. Once the collated
50  database.  section files for a load group are all verified, they are loaded into the database.
51    
52  =head2 Positional Parameters  =head2 Positional Parameters
53    
# Line 56  Line 58 
58  Name of the ERDB database. This should be the class name for the subclass used  Name of the ERDB database. This should be the class name for the subclass used
59  to access the database.  to access the database.
60    
   
61  =back  =back
62    
63  =head2 Command-Line Options  =head2 Command-Line Options
# Line 68  Line 69 
69  Specifies the tracing level. The higher the tracing level, the more messages  Specifies the tracing level. The higher the tracing level, the more messages
70  will appear in the trace log. Use E to specify emergency tracing.  will appear in the trace log. Use E to specify emergency tracing.
71    
 =item replace  
   
 Normally, if a table exists in the database, it will not be loaded. If this  
 option is specified, however, existing tables will be dropped and recreated.  
   
 =item resume  
   
 If specified, then the group list must contain a single group. The specified  
 group and all groups after it in the group list will be processed.  
   
72  =item user  =item user
73    
74  Name suffix to be used for log files. If omitted, the PID is used.  Name suffix to be used for log files. If omitted, the PID is used.
# Line 97  Line 88 
88    
89  Display this command's parameters and options.  Display this command's parameters and options.
90    
91    =item keepSections
92    
93    If specified, section files (the fragments of data load files created by
94    L<ERDBGenerator.pl>), will not be deleted after they are collated.
95    
96  =item warn  =item warn
97    
98  Create an event in the RSS feed when an error occurs.  Create an event in the RSS feed when an error occurs.
# Line 105  Line 101 
101    
102  Phone number to message when the script is complete.  Phone number to message when the script is complete.
103    
104    =item DBD
105    
106    Fully-qualified name of the DBD file. This option allows the use of an alternate
107    DBD during load so that access to the database by other processes is not
108    compromised.
109    
110    =item loadDirectory
111    
112    Directory containing the load files. This option allows you to request that
113    load files from another version of the NMPDR be used, which is useful when
114    creating a new NMPDR: we can yank in the data from the previous database while
115    waiting for the new load files to be generated.
116    
117    =item dbName
118    
119    SQL name of the target database. If not specified, the default name is used.
120    This option allows you to specify a backup or alternate database that can
121    be loaded without compromising the main database.
122    
123  =back  =back
124    
125  =cut  =cut
# Line 112  Line 127 
127  # Get the command-line options and parameters.  # Get the command-line options and parameters.
128  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],
129                                             {                                             {
130                                                trace => ["", "tracing level"],                                                dbName => ["", "if specified, the SQL name of the target database"],
131                                                replace => ["", "if specified, existing tables will be overwritten"],                                                clear => ["", "overwrite existing load files if sections are present"],
132                                                resume => ["", "if specified, the specified group and all groups that normally come after it will be processed"],                                                trace => ["2", "tracing level"],
133                                                phone => ["", "phone number (international format) to call when load finishes"]                                                keepSections => ["", "if specified, section files will not be deleted after being collated"],
134                                                  phone => ["", "phone number (international format) to call when load finishes"],
135                                                  DBD => ["", "if specified, the name of a DBD file in the FIG directory"],
136                                                  loadDirectory => ["", "if specified, an alternate directory containing the load files"],
137                                             },                                             },
138                                             "<database> <group1> <group2> ...",                                             "<database> <group1> <group2> ...",
139                                             @ARGV);                                             @ARGV);
# Line 125  Line 143 
143  eval {  eval {
144      # Get the parameters.      # Get the parameters.
145      my ($database, @groups) = @parameters;      my ($database, @groups) = @parameters;
146      # Connect to the database.      # Connect to the database and get its load directory.
147      my $erdb = ERDB::GetDatabase($database);      my $erdb = ERDB::GetDatabase($database, undef, %$options, externalDBD => 1);
148      # Fix the group list.      # Fix the group list.
149      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, $options, \@groups);      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
150      # Get the source object and load directory for this database.      # Get the source object and load directory for this database.
151      my $source = $erdb->GetSourceObject();      my $source = $erdb->GetSourceObject();
152      my $directory = $erdb->LoadDirectory();      my $directory = $options->{loadDirectory} || $erdb->LoadDirectory();
153      # Get the list of sections.      # Get the list of sections.
154      my @sectionList = $erdb->SectionList($source);      my @sectionList = $erdb->SectionList($source);
155      # Create a statistics object to track our progress.      # Create a statistics object to track our progress.
156      my $stats = Stats->new();      my $stats = Stats->new();
157      # Get the hash of group names to table names.      # We make one pass to assemble all the tables in all the groups, and
158      my $groupHash = ERDBLoadGroup::GetGroupHash($erdb);      # then another to do the actual loads. The groups that are ready to load
159        # in the second pass will go in this list.
160        my @goodGroups;
161        # Start a timer.
162        my $totalStart = time();
163      # Loop through the groups.      # Loop through the groups.
164      for my $group (@realGroups) {      for my $group (@realGroups) {
165          # Get the list of tables for this group.          # Get the list of tables for this group.
166          my $tableList = $groupHash->{$group};          my @tableList = ERDBLoadGroup::GetTables($erdb, $group);
167            # We need to insure there is a data file for every table. If we fail to find one,
168            # we set the following error flag, which prevents us from loading the database.
169            my $missingTable = 0;
170            # Loop through the tables in this group.
171            for my $table (@tableList) {
172                Trace("Processing table $table for assembly.") if T(2);
173                # Get the section file names.
174                my @sectionFiles =
175                    map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;
176                # Get the data file name.
177                my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
178                # Do we have it?
179                my $haveFile = -f $dataFile;
180                # See if we can build it. Verify that we have all the sections.
181                my @missingFiles = grep { ! -f $_ } @sectionFiles;
182                # Did we find everything?
183                if (scalar(@missingFiles) && ! $haveFile) {
184                    # No, and there's no main file! Denote that we have a missing table.
185                    $missingTable++;
186                    $stats->Add('tables-skipped' => 1);
187                    # Tell the user about all the missing files.
188                    for my $missingFile (@missingFiles) {
189                        $stats->Add('sections-missing' => 1);
190                        $stats->AddMessage("Data file $missingFile not found for table $table.");
191                    }
192                } elsif (! scalar @missingFiles) {
193                    # We have all the sections. Try to assemble them into a data file.
194                    my $sortStart = time();
195                    my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
196                    Trace("Sort command: $sortCommand") if T(3);
197                    # Pipe to the sort command. Note that we turn on autoflush
198                    # so there's no buffering.
199                    my $oh = Open(undef, "| $sortCommand");
200                    select $oh; $| = 1; select STDOUT;
201                    # Loop through the sections.
202                    for my $sectionFile (@sectionFiles) {
203                        Trace("Collating $sectionFile.") if T(3);
204                        $stats->Add("$table-sections" => 1);
205                        # Loop through the section file.
206                        my $ih = Open(undef, "<$sectionFile");
207                        while (defined (my $line = <$ih>)) {
208                            print $oh $line;
209                            $stats->Add("$table-collations" => 1);
210                        }
211      }      }
212                    # Finish the sort step.
213                    Trace("Finishing collate for $table.") if T(3);
214                    close $oh;
215                    $stats->Add('tables-collated' => 1);
216                    $stats->Add('collate-time' => time() - $sortStart);
217                } else {
218                    # We have a data file and no sections, so we use the data file.
219                    $stats->Add('tables-found' => 1);
220                }
221                # Now that we know we have a full data file, we can delete the
222                # section files to make room in the data directory. The user can
223                # turn this behavior off with the keepSections option.
224                if (! $options->{keepSections}) {
225                    for my $sectionFile (@sectionFiles) {
226                        if (-e $sectionFile) {
227                            unlink $sectionFile;
228                            $stats->Add('files-deleted' => 1);
229                        }
230                    }
231                    Trace("Section files for $table deleted.") if T(3);
232                }
233            }
234            # Were any tables missing?
235            if ($missingTable) {
236                # Yes, skip this group.
237                $stats->Add('groups-skipped' => 1);
238                Trace("Skipping $group group: $missingTable missing tables.") if T(2);
239            } else {
240                # No! File this group for processing in the second pass.
241                push @goodGroups, $group;
242            }
243        }
244        # Now we loop through the good groups, doing the actual loads.
245        for my $group (@goodGroups) {
246            # Get a group object.
247            my $groupData = $erdb->Loader($group);
248            # Do the post-processing.
249            my $postStats = $groupData->PostProcess();
250            # Determine what happened.
251            if (! defined $postStats) {
252                Trace("Post-processing not required for $group.") if T(3);
253            } else {
254                $stats->Accumulate($postStats);
255                $stats->Add('post-processing' => 1);
256            }
257            # Process this group's files.
258            Trace("Loading group $group into database.") if T(2);
259            # Get the list of tables.
260            my @tableList = $groupData->GetTables();
261            # Start a timer.
262            my $loadStart = time();
263            for my $table (@tableList) {
264                # Compute the load file name.
265                my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
266                # Do the actual load.
267                my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
268                $stats->Accumulate($newStats);
269                Trace("$fileName loaded into $table.") if T(3);
270            }
271            $stats->Add("groups-loaded" => 1);
272            $stats->Add('load-time' => (time() - $loadStart));
273        }
274        # Save the DBD.
275        Trace("Saving DBD.") if T(2);
276        $erdb->InternalizeDBD();
277        $stats->Add('total-time' => time() - $totalStart);
278        # Display the statistics from this run.
279        Trace("Statistics for load:\n" . $stats->Show()) if T(2);
280  };  };
281  if ($@) {  if ($@) {
282      Trace("Script failed with error: $@") if T(0);      Trace("Script failed with error: $@") if T(0);
     $rtype = "error";  
283  } else {  } else {
284      Trace("Script complete.") if T(2);      Trace("Script complete.") if T(2);
     $rtype = "no error";  
285  }  }
286  if ($options->{phone}) {  if ($options->{phone}) {
287      my $msgID = Tracer::SendSMS($options->{phone}, "ERDBLoader terminated with $rtype.");      my $msgID = Tracer::SendSMS($options->{phone}, "ERDBLoader completed.");
288      if ($msgID) {      if ($msgID) {
289          Trace("Phone message sent with ID $msgID.") if T(2);          Trace("Phone message sent with ID $msgID.") if T(2);
290      } else {      } else {
# Line 160  Line 292 
292      }      }
293  }  }
294    
295    
296  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.15

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3