[Bio] / Sprout / ERDBLoader.pl Repository:
ViewVC logotype

Annotation of /Sprout/ERDBLoader.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     use strict;
21 :     use Tracer;
22 :     use ERDB;
23 :     use ERDBLoadGroup;
24 :     use ERDBGenerate;
25 :     use Stats;
26 : parrello 1.2 use Time::HiRes;
27 : parrello 1.1
28 :    
29 :     =head1 ERDBLoader Script
30 :    
31 :     ERDBLoader [options] <database> <group1> <group2> ...
32 :    
33 :     ERDB Database Load Finisher
34 :    
35 :     =head2 Introduction
36 :    
37 :     This script finishes the database load process begun by [[ERDBGeneratorPl]].
38 :    
39 :     [[ERDBGeneratorPl]] divides the source data into sections, and generates a
40 :     partial load file for each section of each table. To finish the load process, we
41 :     need to combine the partial files into single files and load the resulting
42 :     single files into the database tables.
43 :    
44 :     Like [[ERDBGeneratorPl]], this script acts on load groups-- sets of related
45 :     tables that are loaded at the same time. For each table in a named group that
46 :     does not exist in the database, the script first attempts to find a completed
47 :     data file. If one does not exist, it attempts to create one by collating section
48 : parrello 1.2 files. Once the collated section files for a load group are finished, they are
49 :     loaded into the database.
50 : parrello 1.1
51 :     =head2 Positional Parameters
52 :    
53 :     =over 4
54 :    
55 :     =item database
56 :    
57 :     Name of the ERDB database. This should be the class name for the subclass used
58 :     to access the database.
59 :    
60 :    
61 :     =back
62 :    
63 :     =head2 Command-Line Options
64 :    
65 :     =over 4
66 :    
67 :     =item trace
68 :    
69 :     Specifies the tracing level. The higher the tracing level, the more messages
70 :     will appear in the trace log. Use E to specify emergency tracing.
71 :    
72 :     =item resume
73 :    
74 :     If specified, then the group list must contain a single group. The specified
75 :     group and all groups that normally come after it will be processed.
76 :    
77 :     =item user
78 :    
79 :     Name suffix to be used for log files. If omitted, the PID is used.
80 :    
81 :     =item sql
82 :    
83 :     If specified, turns on tracing of SQL activity.
84 :    
85 :     =item background
86 :    
87 :     Save the standard and error output to files. The files will be created
88 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
89 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
90 :     B<user> option above.
91 :    
92 :     =item help
93 :    
94 :     Display this command's parameters and options.
95 :    
96 :     =item warn
97 :    
98 :     Create an event in the RSS feed when an error occurs.
99 :    
100 :     =item phone
101 :    
102 :     Phone number (international format) to message when the script finishes, whether or not the load succeeded.
103 :    
104 :     =back
105 :    
106 :     =cut
107 :    
# Parse the command line.
#
# Script-specific options. Each entry pairs an option name with a two-element
# list; the second element is the help text shown to the user (the first
# looks like the default value -- TODO confirm against Tracer::StandardSetup).
my %optionTable = (
    trace  => ["", "tracing level"],
    resume => ["", "if specified, the specified group and all groups that normally come after it will be processed"],
    phone  => ["", "phone number (international format) to call when load finishes"],
);
# Module names handed to StandardSetup as its first argument (presumably the
# tracing categories to activate -- verify against Tracer).
my @setupModules = qw(ERDBLoadGroup ERDB Stats);
# StandardSetup returns the option hash followed by the positional parameters.
my ($options, @parameters) =
    StandardSetup(\@setupModules, \%optionTable,
                  "<database> <group1> <group2> ...", @ARGV);
# Run the whole load inside an eval so that a failure anywhere (database
# connection, collation, table load) leaves its message in $@ for the
# reporting code that follows this block instead of killing the script.
#
# For every requested load group the block works in two phases:
#   1. Make sure each table in the group has a collated data file, building
#      it from the per-section files when necessary.
#   2. If no table is missing its data, load every table of the group into
#      the database; otherwise skip the group.
# Counters and timings are accumulated in a Stats object and traced at the end.
eval {
    # The first positional parameter is the database name; the rest are groups.
    my ($database, @groups) = @parameters;
    # Connect to the database.
    my $erdb = ERDB::GetDatabase($database);
    # Let the load-group module turn the user's group list (and options) into
    # the actual list of groups to process.
    my @realGroups = ERDBLoadGroup::ComputeGroups($erdb, $options, \@groups);
    # Get the source object and load directory for this database.
    my $source = $erdb->GetSourceObject();
    my $directory = $erdb->LoadDirectory();
    # Get the list of sections.
    my @sectionList = $erdb->SectionList($source);
    # Create a statistics object to track our progress.
    my $stats = Stats->new();
    # Get the hash of group names to table names.
    my $groupHash = ERDBLoadGroup::GetGroupHash($erdb);
    # Start a timer. "use Time::HiRes;" exports nothing by default, so the
    # high-resolution clock has to be called by its full name; a bare time()
    # would silently be the whole-second built-in.
    my $totalStart = Time::HiRes::time();
    # Loop through the groups.
    for my $group (@realGroups) {
        # Get the list of tables for this group.
        my $tableList = $groupHash->{$group};
        # We need to ensure there is a data file for every table. If we fail
        # to find one, we set this flag, which prevents us from loading the
        # group into the database.
        my $missingTable = 0;
        # Phase 1: find or build the data file for each table.
        for my $table (@$tableList) {
            # Get the data file name.
            my $dataFile = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
            if (-f $dataFile) {
                # The collated file already exists. This is good news.
                $stats->Add('tables-found' => 1);
                next;
            }
            # We must build it. Verify that we have all the sections.
            my @sectionFiles =
                map { ERDBGenerate::CreateFileName($table, $_, 'data', $directory) } @sectionList;
            my @missingFiles = grep { ! -f $_ } @sectionFiles;
            # Tell the user about all the missing files.
            for my $missingFile (@missingFiles) {
                $stats->Add('sections-missing' => 1);
                $stats->AddMessage("Data file $missingFile not found for table $table.");
            }
            if (@missingFiles) {
                # Denote that we have a missing table. We keep going so the
                # remaining tables of the group are still checked and collated.
                $missingTable = 1;
                $stats->Add('tables-skipped' => 1);
                next;
            }
            # Everything is present: pipe the sections through the table's
            # sort command to assemble the data file.
            my $sortStart = Time::HiRes::time();
            my $sortCommand = $erdb->SortNeeded($table);
            my $oh = Open(undef, "| $sortCommand >$dataFile");
            for my $sectionFile (@sectionFiles) {
                Trace("Collating $sectionFile.") if T(4);
                $stats->Add('sections-loaded' => 1);
                for my $line (Tracer::GetFile($sectionFile)) {
                    print $oh "$line\n";
                    $stats->Add('lines-collated' => 1);
                }
            }
            # Finish the sort step. Closing a pipe waits for the command and
            # returns false if it failed, so this is where a broken sort shows
            # up. On failure, remove the partial output (a later run would
            # otherwise mistake it for a finished data file) and abort BEFORE
            # the section files are deleted.
            if (! close $oh) {
                my $reason = ($! ? "error closing pipe: $!" : "exit status " . ($? >> 8));
                unlink $dataFile;
                die "Sort of table $table into $dataFile failed ($reason)";
            }
            $stats->Add('tables-collated' => 1);
            # Now that we've collated the section files, we can delete them
            # to make room in the data directory.
            for my $sectionFile (@sectionFiles) {
                if (unlink $sectionFile) {
                    $stats->Add('files-deleted' => 1);
                } else {
                    $stats->AddMessage("Could not delete section file $sectionFile: $!");
                }
            }
            $stats->Add('collate-time' => Time::HiRes::time() - $sortStart);
        }
        # Phase 2: load the group, unless some table had no data.
        if ($missingTable) {
            # Skip this group.
            $stats->Add('groups-skipped' => 1);
        } else {
            # Load this group into the database.
            my $loadStart = Time::HiRes::time();
            for my $table (@$tableList) {
                my $fileName = ERDBGenerate::CreateFileName($table, undef, 'data', $directory);
                my $newStats = $erdb->LoadTable($fileName, $table, truncate => 1, failOnError => 1);
                $stats->Accumulate($newStats);
            }
            $stats->Add("groups-loaded" => 1);
            # Record the elapsed load time (previously a constant 1 was added
            # and the start time was never used).
            $stats->Add('load-time' => Time::HiRes::time() - $loadStart);
        }
    }
    $stats->Add('total-time' => Time::HiRes::time() - $totalStart);
    # Display the statistics from this run.
    Trace("Statistics for load:\n" . $stats->Show()) if T(2);
};
# Report the outcome of the load.
#
# Copy the eval error immediately: $@ is a global, and in the statement
# "Trace(...) if T(0)" the T(0) call runs before the message string is built,
# so any eval executed inside it could reset $@ and lose the error text.
my $loadError = $@;
if ($loadError) {
    Trace("Script failed with error: $loadError") if T(0);
} else {
    Trace("Script complete.") if T(2);
}
# If a phone number was supplied, send the completion text message. This
# happens whether or not the load succeeded.
if ($options->{phone}) {
    my $msgID = Tracer::SendSMS($options->{phone}, "ERDBLoader completed.");
    # A true return value is treated as the ID of the sent message.
    if ($msgID) {
        Trace("Phone message sent with ID $msgID.") if T(2);
    } else {
        Trace("Phone message not sent.") if T(2);
    }
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3