[Bio] / Sprout / ERDBLoader.pl Repository:
ViewVC logotype

Diff of /Sprout/ERDBLoader.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Wed Oct 1 03:07:08 2008 UTC revision 1.12, Tue Jun 30 19:53:01 2009 UTC
# Line 22  Line 22 
22  use ERDB;  use ERDB;
23  use ERDBLoadGroup;  use ERDBLoadGroup;
24  use ERDBGenerate;  use ERDBGenerate;
25    use ERDBExtras;
26  use Stats;  use Stats;
27  use Time::HiRes;  use Time::HiRes;
28    
# Line 34  Line 35 
35    
36  =head2 Introduction  =head2 Introduction
37    
38  This script finishes the database load process begun by [[ERDBGeneratorPl]].  This script finishes the database load process begun by L<ERDBGenerator.pl>.
39    
40  [[ERDBGeneratorPl]] divides the source data into sections, and generates a  L<ERDBGenerator.pl> divides the source data into sections, and generates a
41  partial load file for each section of each table. To finish the load process, we  partial load file for each section of each table. To finish the load process, we
42  need to combine the partial files into single files and load the resulting  need to combine the partial files into single files and load the resulting
43  single files into the database tables.  single files into the database tables.
44    
45  Like [[ERDBGeneratorPl]], this script acts on load groups-- sets of related  Like L<ERDBGenerator.pl>, this script acts on load groups-- sets of related
46  tables that are loaded at the same time. For each table in a named group that  tables that are loaded at the same time. For each table in a named group that
47  does not exist in the database, the script first attempts to find a completed  does not exist in the database, the script first attempts to find a completed
48  data file. If one does not exist, it attempts to create one by collating section  data file. If one does not exist, it attempts to create one by collating section
# Line 57  Line 58 
58  Name of the ERDB database. This should be the class name for the subclass used  Name of the ERDB database. This should be the class name for the subclass used
59  to access the database.  to access the database.
60    
   
61  =back  =back
62    
63  =head2 Command-Line Options  =head2 Command-Line Options
# Line 69  Line 69 
69  Specifies the tracing level. The higher the tracing level, the more messages  Specifies the tracing level. The higher the tracing level, the more messages
70  will appear in the trace log. Use E to specify emergency tracing.  will appear in the trace log. Use E to specify emergency tracing.
71    
 =item resume  
   
 If specified, then the group list must contain a single group. The specified  
 group and all groups after it in the group list will be processed.  
   
72  =item user  =item user
73    
74  Name suffix to be used for log files. If omitted, the PID is used.  Name suffix to be used for log files. If omitted, the PID is used.
# Line 82  Line 77 
77    
78  If specified, turns on tracing of SQL activity.  If specified, turns on tracing of SQL activity.
79    
80    =item clear
81    
82    If specified, existing load files will be recreated from sections if the sections
83    are present.
84    
85  =item background  =item background
86    
87  Save the standard and error output to files. The files will be created  Save the standard and error output to files. The files will be created
# Line 93  Line 93 
93    
94  Display this command's parameters and options.  Display this command's parameters and options.
95    
96    =item keepSections
97    
98    If specified, section files (the fragments of data load files created by
99    L<ERDBGenerator.pl>) will not be deleted after they are collated.
100    
101  =item warn  =item warn
102    
103  Create an event in the RSS feed when an error occurs.  Create an event in the RSS feed when an error occurs.
# Line 101  Line 106 
106    
107  Phone number to message when the script is complete.  Phone number to message when the script is complete.
108    
109    =item DBD
110    
111    Fully-qualified name of the DBD file. This option allows the use of an alternate
112    DBD during load so that access to the database by other processes is not
113    compromised.
114    
115    =item loadDirectory
116    
117    Directory containing the load files. This option allows you to request that
118    load files from another version of the NMPDR be used, which is useful when
119    creating a new NMPDR: we can yank in the data from the previous database while
120    waiting for the new load files to be generated.
121    
122    =item dbName
123    
124    SQL name of the target database. If not specified, the default name is used.
125    This option allows you to specify a backup or alternate database that can
126    be loaded without compromising the main database.
127    
128  =back  =back
129    
130  =cut  =cut
# Line 108  Line 132 
132  # Get the command-line options and parameters.  # Get the command-line options and parameters.
133  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],  my ($options, @parameters) = StandardSetup([qw(ERDBLoadGroup ERDB Stats) ],
134                                             {                                             {
135                                                trace => ["", "tracing level"],                                                dbName => ["", "if specified, the SQL name of the target database"],
136                                                resume => ["", "if specified, the specified group and all groups that normally come after it will be processed"],                                                clear => ["", "overwrite existing load files if sections are present"],
137                                                phone => ["", "phone number (international format) to call when load finishes"]                                                trace => ["2", "tracing level"],
138                                                  keepSections => ["", "if specified, section files will not be deleted after being collated"],
139                                                  phone => ["", "phone number (international format) to call when load finishes"],
140                                                  DBD => ["", "if specified, the name of a DBD file in the FIG directory"],
141                                                  loadDirectory => ["", "if specified, an alternate directory containing the load files"],
142                                             },                                             },
143                                             "<database> <group1> <group2> ...",                                             "<database> <group1> <group2> ...",
144                                             @ARGV);                                             @ARGV);
# Line 120  Line 148 
148  eval {  eval {
149      # Get the parameters.      # Get the parameters.
150      my ($database, @groups) = @parameters;      my ($database, @groups) = @parameters;
151      # Connect to the database.      # Connect to the database and get its load directory.
152      my $erdb = ERDB::GetDatabase($database);      my $erdb = ERDB::GetDatabase($database, undef, %$options);
153      # Fix the group list.      # Fix the group list.
154      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, $options, \@groups);      my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
155      # Get the source object and load directory for this database.      # Get the source object and load directory for this database.
156      my $source = $erdb->GetSourceObject();      my $source = $erdb->GetSourceObject();
157      my $directory = $erdb->LoadDirectory();      my $directory = $options->{loadDirectory} || $erdb->LoadDirectory();
158      # Get the list of sections.      # Get the list of sections.
159      my @sectionList = $erdb->SectionList($source);      my @sectionList = $erdb->SectionList($source);
160      # Create a statistics object to track our progress.      # Create a statistics object to track our progress.
161      my $stats = Stats->new();      my $stats = Stats->new();
162      # Get the hash of group names to table names.      # We make one pass to assemble all the tables in all the groups, and
163      my $groupHash = ERDBLoadGroup::GetGroupHash($erdb);      # then another to do the actual loads. The groups that are ready to load
164        # in the second pass will go in this list.
165        my @goodGroups;
166      # Start a timer.      # Start a timer.
167      my $totalStart = time();      my $totalStart = time();
168      # Loop through the groups.      # Loop through the groups.
169      for my $group (@realGroups) {      for my $group (@realGroups) {
170          # Get the list of tables for this group.          # Get the list of tables for this group.
171          my $tableList = $groupHash->{$group};          my @tableList = ERDBLoadGroup::GetTables($erdb, $group);
172          # We need to ensure there is a data file for every table. If we fail to find one,          # We need to ensure there is a data file for every table. If we fail to find one,
173          # we set the following error flag, which prevents us from loading the database.          # we set the following error flag, which prevents us from loading the database.
174          my $missingTable = 0;          my $missingTable = 0;
175          # Loop through the tables in this group.          # Loop through the tables in this group.
176          for my $table (@$tableList) {          for my $table (@tableList) {
177                Trace("Processing table $table for assembly.") if T(2);
178                # Get the section file names.
179                my @sectionFiles =
180                    map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;
181              # Get the data file name.              # Get the data file name.
182              my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);              my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
183              # Do we have it?              # Do we have it?
184              if (-f $dataFile) {              if (-f $dataFile && ! $options->{clear}) {
185                  # Yes. This is good news.                  # Yes. This is good news.
186                  $stats->Add('tables-found' => 1);                  $stats->Add('tables-found' => 1);
187                    Trace("Table file found for $table.") if T(3);
188              } else {              } else {
189                  # No, we must build it. Verify that we have all the sections.                  # No, we must build it. Verify that we have all the sections.
                 my @sectionFiles =  
                     map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;  
190                  my @missingFiles = grep { ! -f $_ } @sectionFiles;                  my @missingFiles = grep { ! -f $_ } @sectionFiles;
191                    # Did we find everything?
192                    if (scalar @missingFiles) {
193                        # No! Denote that we have a missing table.
194                        $missingTable++;
195                        $stats->Add('tables-skipped' => 1);
196                  # Tell the user about all the missing files.                  # Tell the user about all the missing files.
197                  for my $missingFile (@missingFiles) {                  for my $missingFile (@missingFiles) {
198                      $stats->Add('sections-missing' => 1);                      $stats->Add('sections-missing' => 1);
199                      $stats->AddMessage("Data file $missingFile not found for table $table.");                      $stats->AddMessage("Data file $missingFile not found for table $table.");
200                  }                  }
                 # Did we find everything?  
                 if (scalar @missingFiles) {  
                     # No! Denote that we have a missing table.  
                     $missingTable = 1;  
                     $stats->Add('tables-skipped' => 1);  
201                  } else {                  } else {
202                      # Yes! Try to assemble the sections into a data file.                      # We have all the sections. Try to assemble them into a data file.
203                      my $sortStart = time();                      my $sortStart = time();
204                      my $sortCommand = $erdb->SortNeeded($table);                      my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
205                      my $oh = Open(undef, "| $sortCommand >$dataFile");                      Trace("Sort command: $sortCommand") if T(3);
206                        # Pipe to the sort command. Note that we turn on autoflush
207                        # so there's no buffering.
208                        my $oh = Open(undef, "| $sortCommand");
209                        select $oh; $| = 1; select STDOUT;
210                        # Loop through the sections.
211                      for my $sectionFile (@sectionFiles) {                      for my $sectionFile (@sectionFiles) {
212                          Trace("Collating $sectionFile.") if T(4);                          Trace("Collating $sectionFile.") if T(3);
213                          $stats->Add('sections-loaded' => 1);                          $stats->Add("$table-sections" => 1);
214                          for my $line (Tracer::GetFile($sectionFile)) {                          # Loop through the section file.
215                              print $oh "$line\n";                          my $ih = Open(undef, "<$sectionFile");
216                              $stats->Add('lines-collated' => 1);                          while (defined (my $line = <$ih>)) {
217                                print $oh $line;
218                                $stats->Add("$table-collations" => 1);
219                          }                          }
220                      }                      }
221                      # Finish the sort step.                      # Finish the sort step.
222                        Trace("Finishing collate for $table.") if T(3);
223                      close $oh;                      close $oh;
224                      $stats->Add('tables-collated' => 1);                      $stats->Add('tables-collated' => 1);
225                      # Now that we've collated the section files, we can delete them                      $stats->Add('collate-time' => time() - $sortStart);
226                      # to make room in the data directory.                  }
227                }
228                # Now that we know we have a full data file, we can delete the
229                # section files to make room in the data directory. The user can
230                # turn this behavior off with the keepSections option.
231                if (! $options->{keepSections}) {
232                      for my $sectionFile (@sectionFiles) {                      for my $sectionFile (@sectionFiles) {
233                        if (-e $sectionFile) {
234                          unlink $sectionFile;                          unlink $sectionFile;
235                          $stats->Add('files-deleted' => 1);                          $stats->Add('files-deleted' => 1);
236                      }                      }
                     $stats->Add('collate-time' => time() - $sortStart);  
237                  }                  }
238                    Trace("Section files for $table deleted.") if T(3);
239              }              }
240          }          }
241          # Were any tables missing?          # Were any tables missing?
242          if ($missingTable) {          if ($missingTable) {
243              # Yes, skip this group.              # Yes, skip this group.
244              $stats->Add('groups-skipped' => 1);              $stats->Add('groups-skipped' => 1);
245                Trace("Skipping $group group: $missingTable missing tables.") if T(2);
246            } else {
247                # No! File this group for processing in the second pass.
248                push @goodGroups, $group;
249            }
250        }
251        # Now we loop through the good groups, doing the actual loads.
252        for my $group (@goodGroups) {
253            # Get a group object.
254            my $groupData = $erdb->Loader($group);
255            # Do the post-processing.
256            my $postStats = $groupData->PostProcess();
257            # Determine what happened.
258            if (! defined $postStats) {
259                Trace("Post-processing not required for $group.") if T(3);
260          } else {          } else {
261              # No! Load this group into the database.              $stats->Accumulate($postStats);
262                $stats->Add('post-processing' => 1);
263            }
264            # Process this group's files.
265            Trace("Loading group $group into database.") if T(2);
266            # Get the list of tables.
267            my @tableList = $groupData->GetTables();
268            # Start a timer.
269              my $loadStart = time();              my $loadStart = time();
270              for my $table (@$tableList) {          for my $table (@tableList) {
271                # Compute the load file name.
272                  my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);                  my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
273                # Do the actual load.
274                  my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);                  my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
275                  $stats->Accumulate($newStats);                  $stats->Accumulate($newStats);
276                Trace("$fileName loaded into $table.") if T(3);
277              }              }
278              $stats->Add("groups-loaded" => 1);              $stats->Add("groups-loaded" => 1);
279              $stats->Add('load-time' => 1);              $stats->Add('load-time' => 1);
280          }          }
     }  
281      $stats->Add('total-time' => time() - $totalStart);      $stats->Add('total-time' => time() - $totalStart);
282      # Display the statistics from this run.      # Display the statistics from this run.
283      Trace("Statistics for load:\n" . $stats->Show()) if T(2);      Trace("Statistics for load:\n" . $stats->Show()) if T(2);
# Line 225  Line 296 
296      }      }
297  }  }
298    
299    
300  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.12

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3