[Bio] / Sprout / ERDBLoader.pl Repository:
ViewVC logotype

Annotation of /Sprout/ERDBLoader.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.14 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     use strict;
21 :     use Tracer;
22 :     use ERDB;
23 :     use ERDBLoadGroup;
24 :     use ERDBGenerate;
25 : parrello 1.11 use ERDBExtras;
26 : parrello 1.1 use Stats;
27 : parrello 1.2 use Time::HiRes;
28 : parrello 1.1
29 :    
30 :     =head1 ERDBLoader Script
31 :    
32 :     ERDBLoader [options] <database> <group1> <group2> ...
33 :    
34 :     ERDB Database Load Finisher
35 :    
36 :     =head2 Introduction
37 :    
38 : parrello 1.12 This script finishes the database load process begun by L<ERDBGenerator.pl>.
39 : parrello 1.1
40 : parrello 1.12 L<ERDBGenerator.pl> divides the source data into sections, and generates a
41 : parrello 1.1 partial load file for each section of each table. To finish the load process, we
42 :     need to combine the partial files into single files and load the resulting
43 :     single files into the database tables.
44 :    
45 : parrello 1.12 Like L<ERDBGenerator.pl>, this script acts on load groups-- sets of related
46 : parrello 1.1 tables that are loaded at the same time. For each table in a named group that
47 :     does not exist in the database, the script first attempts to find a completed
48 :     data file. If one does not exist, it attempts to create one by collating section
49 : parrello 1.2 files. Once the collated section files for a load group are finished, they are
50 :     loaded into the database.
51 : parrello 1.1
52 :     =head2 Positional Parameters
53 :    
54 :     =over 4
55 :    
56 :     =item database
57 :    
58 :     Name of the ERDB database. This should be the class name for the subclass used
59 :     to access the database.
60 :    
61 :     =back
62 :    
63 :     =head2 Command-Line Options
64 :    
65 :     =over 4
66 :    
67 :     =item trace
68 :    
69 :     Specifies the tracing level. The higher the tracing level, the more messages
70 :     will appear in the trace log. Use E to specify emergency tracing.
71 :    
72 :     =item user
73 :    
74 :     Name suffix to be used for log files. If omitted, the PID is used.
75 :    
76 :     =item sql
77 :    
78 :     If specified, turns on tracing of SQL activity.
79 :    
80 : parrello 1.7 =item clear
81 :    
82 :     If specified, existing load files will be recreated from sections if the sections
83 :     are present.
84 :    
85 : parrello 1.1 =item background
86 :    
87 :     Save the standard and error output to files. The files will be created
88 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
89 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
90 :     B<user> option above.
91 :    
92 :     =item help
93 :    
94 :     Display this command's parameters and options.
95 :    
96 : parrello 1.3 =item keepSections
97 :    
98 :     If specified, section files (the fragments of data load files created by
99 : parrello 1.12 L<ERDBGenerator.pl>) will not be deleted after they are collated.
100 : parrello 1.3
101 : parrello 1.1 =item warn
102 :    
103 :     Create an event in the RSS feed when an error occurs.
104 :    
105 :     =item phone
106 :    
107 :     Phone number to message when the script is complete.
108 :    
109 : parrello 1.6 =item DBD
110 :    
111 : parrello 1.10 Fully-qualified name of the DBD file. This option allows the use of an alternate
112 : parrello 1.9 DBD during load so that access to the database by other processes is not
113 : parrello 1.6 compromised.
114 :    
115 : parrello 1.8 =item loadDirectory
116 :    
117 :     Directory containing the load files. This option allows you to request that
118 :     load files from another version of the NMPDR be used, which is useful when
119 :     creating a new NMPDR: we can yank in the data from the previous database while
120 :     waiting for the new load files to be generated.
121 :    
122 : parrello 1.10 =item dbName
123 :    
124 :     SQL name of the target database. If not specified, the default name is used.
125 :     This option allows you to specify a backup or alternate database that can
126 :     be loaded without compromising the main database.
127 :    
128 : parrello 1.1 =back
129 :    
130 :     =cut
131 :    
# Describe the script-specific command-line options. Each entry maps an
# option name to a two-element list: its default value and its help text.
my %optionSpecs = (
    trace         => ["2", "tracing level"],
    clear         => ["", "overwrite existing load files if sections are present"],
    keepSections  => ["", "if specified, section files will not be deleted after being collated"],
    loadDirectory => ["", "if specified, an alternate directory containing the load files"],
    DBD           => ["", "if specified, the name of a DBD file in the FIG directory"],
    dbName        => ["", "if specified, the SQL name of the target database"],
    phone         => ["", "phone number (international format) to call when load finishes"],
);
# Parse the command line. The options come back as a hash reference and the
# positional parameters (database name, then group names) as a list.
# NOTE(review): the first argument is presumably the list of tracing
# categories to activate -- confirm against Tracer::StandardSetup.
my ($options, @parameters) = StandardSetup(
    [qw(ERDBLoadGroup ERDB Stats)],
    \%optionSpecs,
    "<database> <group1> <group2> ...",
    @ARGV);
# Set a variable to contain return type information.
# NOTE(review): $rtype is never assigned or read anywhere in this script.
my $rtype;
# Ensure we catch errors: anything that dies inside this block is left in $@
# for the reporting code that follows the eval.
#
# The work is done in two passes. The first pass makes sure every table of
# every requested group has a complete data file, collating section files
# through the table's sort command when necessary. The second pass loads the
# tables of every group whose data files are all present.
eval {
    # Get the parameters.
    my ($database, @groups) = @parameters;
    # Connect to the database.
    my $erdb = ERDB::GetDatabase($database, undef, %$options, externalDBD => 1);
    # Fix the group list.
    my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, \@groups);
    # Get the source object and load directory for this database. The
    # loadDirectory option overrides the database's own load directory.
    my $source = $erdb->GetSourceObject();
    my $directory = $options->{loadDirectory} || $erdb->LoadDirectory();
    # Get the list of sections.
    my @sectionList = $erdb->SectionList($source);
    # Create a statistics object to track our progress.
    my $stats = Stats->new();
    # The groups that are ready to load in the second pass go in this list.
    my @goodGroups;
    # Start a timer.
    my $totalStart = time();
    # First pass: loop through the groups, assembling data files.
    for my $group (@realGroups) {
        # Get the list of tables for this group.
        my @tableList = ERDBLoadGroup::GetTables($erdb, $group);
        # We need a data file for every table. Each table for which we
        # cannot produce one is counted here, and a nonzero count prevents
        # the group from being loaded.
        my $missingTable = 0;
        # Loop through the tables in this group.
        for my $table (@tableList) {
            Trace("Processing table $table for assembly.") if T(2);
            # Get the section file names.
            my @sectionFiles =
                map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;
            # Get the data file name.
            my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
            # This flag is set only once we know a complete data file exists.
            my $tableReady = 0;
            # Do we already have the data file (and are we allowed to use it)?
            if (-f $dataFile && ! $options->{clear}) {
                # Yes. This is good news.
                $stats->Add('tables-found' => 1);
                Trace("Table file found for $table.") if T(3);
                $tableReady = 1;
            } else {
                # No, we must build it. Verify that we have all the sections.
                my @missingFiles = grep { ! -f $_ } @sectionFiles;
                if (scalar @missingFiles) {
                    # Something is missing, so this table cannot be built.
                    $missingTable++;
                    $stats->Add('tables-skipped' => 1);
                    # Tell the user about all the missing files.
                    for my $missingFile (@missingFiles) {
                        $stats->Add('sections-missing' => 1);
                        $stats->AddMessage("Data file $missingFile not found for table $table.");
                    }
                } else {
                    # We have all the sections. Try to assemble them into a
                    # data file by piping them through the sort command.
                    my $sortStart = time();
                    my $sortCommand = $erdb->SortNeeded($table) . " >$dataFile";
                    Trace("Sort command: $sortCommand") if T(3);
                    my $oh = Open(undef, "| $sortCommand");
                    # Turn on autoflush for the pipe so there's no buffering,
                    # then restore whichever handle was selected before.
                    my $previousHandle = select $oh;
                    $| = 1;
                    select $previousHandle;
                    # Loop through the sections.
                    for my $sectionFile (@sectionFiles) {
                        Trace("Collating $sectionFile.") if T(3);
                        $stats->Add("$table-sections" => 1);
                        # Copy the section file into the pipe, line by line.
                        my $ih = Open(undef, "<$sectionFile");
                        while (defined (my $line = <$ih>)) {
                            print $oh $line;
                            $stats->Add("$table-collations" => 1);
                        }
                        close $ih;
                    }
                    # Finish the sort step. Closing a pipe waits for the
                    # command and returns false if it failed, so this is the
                    # only place a failed sort can be detected.
                    Trace("Finishing collate for $table.") if T(3);
                    if (close $oh) {
                        $stats->Add('tables-collated' => 1);
                        $tableReady = 1;
                    } else {
                        my $waitStatus = $?;
                        # The data file cannot be trusted. Remove it so a
                        # later run does not mistake it for a finished file,
                        # and keep this group out of the load pass.
                        unlink $dataFile;
                        $missingTable++;
                        $stats->Add('tables-skipped' => 1);
                        $stats->AddMessage("Sort command failed for table $table (wait status $waitStatus).");
                    }
                    $stats->Add('collate-time' => time() - $sortStart);
                }
            }
            # Only when we know we have a full data file can we delete the
            # section files to make room in the data directory. If the table
            # could not be built, the sections are kept so that they do not
            # have to be regenerated. The user can turn the deletion off
            # entirely with the keepSections option.
            if ($tableReady && ! $options->{keepSections}) {
                for my $sectionFile (@sectionFiles) {
                    if (-e $sectionFile) {
                        if (unlink $sectionFile) {
                            $stats->Add('files-deleted' => 1);
                        } else {
                            $stats->AddMessage("Could not delete section file $sectionFile: $!");
                        }
                    }
                }
                Trace("Section files for $table deleted.") if T(3);
            }
        }
        # Were any tables missing?
        if ($missingTable) {
            # Yes, skip this group.
            $stats->Add('groups-skipped' => 1);
            Trace("Skipping $group group: $missingTable missing tables.") if T(2);
        } else {
            # No! File this group for processing in the second pass.
            push @goodGroups, $group;
        }
    }
    # Second pass: loop through the good groups, doing the actual loads.
    for my $group (@goodGroups) {
        # Get a group object.
        my $groupData = $erdb->Loader($group);
        # Do the post-processing. An undefined result means that none was
        # required; otherwise the result is folded into our statistics.
        my $postStats = $groupData->PostProcess();
        if (! defined $postStats) {
            Trace("Post-processing not required for $group.") if T(3);
        } else {
            $stats->Accumulate($postStats);
            $stats->Add('post-processing' => 1);
        }
        # Process this group's files.
        Trace("Loading group $group into database.") if T(2);
        # Get the list of tables.
        my @tableList = $groupData->GetTables();
        # Start a timer.
        my $loadStart = time();
        for my $table (@tableList) {
            # Compute the load file name.
            my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
            # Do the actual load.
            my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
            $stats->Accumulate($newStats);
            Trace("$fileName loaded into $table.") if T(3);
        }
        $stats->Add("groups-loaded" => 1);
        $stats->Add('load-time' => (time() - $loadStart));
    }
    # Save the DBD.
    Trace("Saving DBD.") if T(2);
    $erdb->InternalizeDBD();
    $stats->Add('total-time' => time() - $totalStart);
    # Display the statistics from this run.
    Trace("Statistics for load:\n" . $stats->Show()) if T(2);
};
# Capture the error (if any) left behind by the guarded block before any
# other call has a chance to overwrite $@, then report the outcome.
my $failure = $@;
if (! $failure) {
    Trace("Script complete.") if T(2);
} else {
    Trace("Script failed with error: $failure") if T(0);
}
# If a phone number was supplied, send the notification and trace whether a
# message ID came back.
my $phone = $options->{phone};
if ($phone) {
    my $msgID = Tracer::SendSMS($phone, "ERDBLoader completed.");
    my $outcome = $msgID
        ? "Phone message sent with ID $msgID."
        : "Phone message not sent.";
    Trace($outcome) if T(2);
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3