Parent Directory
|
Revision Log
Revision 1.46 - (view) (download) (as text)
1 : | parrello | 1.1 | #!/usr/bin/perl -w |
2 : | |||
3 : | =head1 Load Sprout Tables | ||
4 : | |||
5 : | parrello | 1.12 | =head2 Introduction |
6 : | |||
7 : | parrello | 1.14 | The Sprout database reflects a snapshot of the SEED taken at a particular point in |
8 : | time. At some point in the future, it will be possible to add annotations to the | ||
9 : | Sprout data. All records added to Sprout after the snapshot is taken are | ||
10 : | specially-marked so that the changes can be copied to the SEED. The SEED remains | ||
11 : | the live version of the data. | ||
12 : | |||
13 : | The snapshot is produced by reading the SEED data and writing it to sequential | ||
14 : | files. There is one file per Sprout table, and each such file's name consists of | ||
15 : | the table name with the suffix C<dtx>. Thus, the file for the C<Genome> table | ||
16 : | would be named C<Genome.dtx>. These files are used to load the actual Sprout | ||
17 : | database and to generate Glimpse indices. | ||
18 : | |||
19 : | To load all the Sprout tables and then validate the result, you need to issue three | ||
20 : | commands. | ||
21 : | |||
22 : | LoadSproutTables -dbLoad -dbCreate "*" | ||
23 : | parrello | 1.27 | TestSproutLoad [genomeID] ... |
24 : | index_sprout_lucene | ||
25 : | |||
26 : | where I<[genomeID]> is one or more genome IDs. These genomes will be tested more | ||
27 : | thoroughly than the others. | ||
28 : | parrello | 1.14 | |
29 : | All three commands send output to the console. In addition, C<LoadSproutTables> and | ||
30 : | parrello | 1.27 | C<TestSproutLoad> write tracing information to a trace log in the FIG temporary |
31 : | parrello | 1.14 | directory (B<$FIG_Config::Tmp>). At the bottom of the log file will be a complete |
32 : | list of errors. If errors occur in C<LoadSproutTables>, then the data must be corrected | ||
33 : | and the offending table group reloaded. So, for example, if there are errors in the | ||
34 : | load of the B<MadeAnnotation> and B<Compound> tables, you would need to run | ||
35 : | |||
36 : | LoadSproutTables -dbLoad Annotation Reaction | ||
37 : | |||
38 : | because B<MadeAnnotation> is in the C<Annotation> group, and B<Compound> is in the | ||
39 : | C<Reaction> group. A list of the groups is given below. | ||
40 : | |||
41 : | You can omit the C<dbLoad> option to create the load files without | ||
42 : | loading the database, and you can add a C<trace> option to change the trace level. | ||
43 : | The command below creates the Genome-related load files with a trace level of 3 and | ||
44 : | does not load them into the Sprout database. | ||
45 : | |||
46 : | LoadSproutTables -trace=3 Genome | ||
47 : | |||
48 : | C<LoadSproutTables> takes a long time to run, so setting the trace level to 3 helps | ||
49 : | to give you an idea of the progress. | ||
50 : | |||
51 : | parrello | 1.28 | =head2 The NMPDR Web Site |
52 : | |||
53 : | Sprout is the database engine for the NMPDR web site. The NMPDR web site consists of two | ||
54 : | pieces that run on two different machines. The B<WEB> machine contains HTML pages | ||
55 : | generated by a Content Management Tool. | ||
56 : | |||
57 : | parrello | 1.14 | =head2 Procedure For Loading Sprout |
58 : | |||
59 : | parrello | 1.27 | In order to load the Sprout, you need to have the B<Sprout>, B<NmpdrConfigs>, and |
60 : | B<LuceneSearch> projects checked out from CVS in addition to the standard FIG | ||
61 : | projects. You must also set up the following B<FIG_Config.pm> variables in addition | ||
62 : | to the normal ones. | ||
63 : | |||
64 : | =over 4 | ||
65 : | |||
66 : | =item sproutData | ||
67 : | |||
68 : | Name of the data directory for the Sprout load files. | ||
69 : | |||
70 : | =item var | ||
71 : | |||
72 : | Name of the directory to contain cached NMPDR pages. The most important file in | ||
73 : | this directory is C<nmpdr_page_template.html>, which contains a skeleton page | ||
74 : | from the main NMPDR web site. This skeleton page is used to generate output | ||
75 : | pages that look like the other NMPDR pages. | ||
76 : | |||
77 : | =item java | ||
78 : | |||
79 : | Path to the Java runtime environment. | ||
80 : | |||
81 : | =item sproutDB | ||
82 : | |||
83 : | Name of the Sprout database. | ||
84 : | |||
85 : | =item dbuser | ||
86 : | |||
87 : | User name for logging into the Sprout database. | ||
88 : | |||
89 : | =item dbpass | ||
90 : | |||
91 : | Password for logging into the Sprout database. | ||
92 : | |||
93 : | =item nmpdr_site_url | ||
94 : | |||
95 : | URL for the NMPDR cover pages. The NMPDR cover pages are informational and text | ||
96 : | pages that serve as the entry point to the NMPDR web site. They are generated by | ||
97 : | a Content Management tool, and some Sprout scripts need to know where to find | ||
98 : | them. | ||
99 : | |||
100 : | =item nmpdr_site_template_id | ||
101 : | |||
102 : | Page number for the template page used to generate results that look like they're | ||
103 : | part of the NMPDR web site. | ||
104 : | |||
105 : | =back | ||
106 : | |||
107 : | parrello | 1.14 | =over 4 |
108 : | |||
109 : | parrello | 1.39 | Most of the above preparation is performed by the B<NMPDRSetup> utility. |
110 : | NMPDRSetup prints the instructions for completing the process, including | ||
111 : | loading the Sprout database. The specific procedure for loading | ||
112 : | the Sprout data, however, is as follows. | ||
113 : | parrello | 1.27 | |
114 : | parrello | 1.14 | =item 1 |
115 : | |||
116 : | parrello | 1.25 | Type |
117 : | |||
118 : | parrello | 1.39 | nohup LoadSproutTables -dbLoad -user=you -background "*" >null & |
119 : | parrello | 1.25 | |
120 : | parrello | 1.39 | where C<you> is your user ID, and press ENTER. |
121 : | parrello | 1.25 | |
122 : | The above command line runs the load in the background. The standard output, | ||
123 : | standard error, and trace output will be directed to files in the FIG temporary | ||
124 : | directory. If your user name is C<Bruce> then the files will be named | ||
125 : | C<outBruce.log>, C<errBruce.log>, and C<traceBruce.log> respectively. | ||
126 : | |||
127 : | If the load fails at some point and you are able to correct the problem, use the | ||
128 : | C<resume> option to restart it. For example, if the load failed while doing the | ||
129 : | Feature load group, you would resume it using | ||
130 : | |||
131 : | nohup LoadSproutTables -dbLoad -user=you -resume -background Feature >null & | ||
132 : | parrello | 1.14 | |
133 : | =item 2 | ||
134 : | |||
135 : | parrello | 1.27 | Type |
136 : | |||
137 : | index_sprout_lucene | ||
138 : | |||
139 : | and press ENTER. This will create the Lucene indexes for the Sprout data. | ||
140 : | |||
141 : | parrello | 1.14 | =back |
142 : | |||
143 : | =head2 LoadSproutTables Command | ||
144 : | |||
145 : | C<LoadSproutTables> creates the load files for Sprout tables and optionally loads them. | ||
146 : | parrello | 1.12 | The parameters are the names of the table groups whose data is to be created. |
147 : | The legal table group names are given below. | ||
148 : | parrello | 1.1 | |
149 : | =over 4 | ||
150 : | |||
151 : | =item Genome | ||
152 : | |||
153 : | Loads B<Genome>, B<HasContig>, B<Contig>, B<IsMadeUpOf>, and B<Sequence>. | ||
154 : | |||
155 : | parrello | 1.30 | =item Feature |
156 : | |||
157 : | Loads B<Feature>, B<FeatureAlias>, B<FeatureTranslation>, B<FeatureUpstream>, | ||
158 : | parrello | 1.44 | B<IsLocatedIn>, B<FeatureLink>, B<IsAliasOf>, B<CDD>, B<HasFeature>, |
159 : | B<HasRoleInSubsystem>, B<FeatureEssential>, B<FeatureVirulent>, B<FeatureIEDB>, | ||
160 : | and B<IsPresentOnProteinOf>. | ||
161 : | parrello | 1.1 | |
162 : | =item Subsystem | ||
163 : | |||
164 : | parrello | 1.2 | Loads B<Subsystem>, B<Role>, B<SSCell>, B<ContainsFeature>, B<IsGenomeOf>, |
165 : | parrello | 1.8 | B<IsRoleOf>, B<OccursInSubsystem>, B<ParticipatesIn>, B<HasSSCell>, |
166 : | parrello | 1.11 | B<Catalyzes>, B<ConsistsOfRoles>, B<RoleSubset>, B<HasRoleSubset>, |
167 : | parrello | 1.13 | B<ConsistsOfGenomes>, B<GenomeSubset>, B<HasGenomeSubset>, B<Diagram>, |
168 : | parrello | 1.44 | B<RoleOccursIn>, B<SubSystemClass>, B<RoleEC>, |
169 : | and B<IsIdentifiedByEC>. | ||
170 : | parrello | 1.1 | |
171 : | parrello | 1.2 | =item Annotation |
172 : | |||
173 : | Loads B<SproutUser>, B<UserAccess>, B<Annotation>, B<IsTargetOfAnnotation>, | ||
174 : | B<MadeAnnotation>. | ||
175 : | |||
176 : | =item Property | ||
177 : | |||
178 : | Loads B<Property>, B<HasProperty>. | ||
179 : | |||
180 : | parrello | 1.3 | =item Group |
181 : | |||
182 : | Loads B<GenomeGroups>. | ||
183 : | |||
184 : | =item Source | ||
185 : | |||
186 : | Loads B<Source>, B<ComesFrom>, B<SourceURL>. | ||
187 : | |||
188 : | parrello | 1.4 | =item External |
189 : | |||
190 : | Loads B<ExternalAliasOrg>, B<ExternalAliasFunc>. | ||
191 : | |||
192 : | parrello | 1.8 | =item Reaction |
193 : | |||
194 : | Loads B<ReactionURL>, B<Compound>, B<CompoundName>, | ||
195 : | parrello | 1.11 | B<CompoundCAS>, B<IsAComponentOf>, B<Reaction>. |
196 : | parrello | 1.8 | |
197 : | parrello | 1.31 | =item Synonym |
198 : | |||
199 : | Loads B<SynonymGroup> and B<IsSynonymGroupFor>. | ||
200 : | |||
201 : | parrello | 1.36 | =item Family |
202 : | |||
203 : | parrello | 1.38 | Loads B<Family> and B<IsFamilyForFeature>. |
204 : | parrello | 1.36 | |
205 : | parrello | 1.41 | =item Drug |
206 : | |||
207 : | parrello | 1.44 | Loads B<PDB>, B<DocksWith>, B<IsProteinForFeature>, and B<Ligand>. |
208 : | parrello | 1.41 | |
209 : | parrello | 1.3 | =item * |
210 : | |||
211 : | Loads all of the above tables. | ||
212 : | |||
213 : | parrello | 1.1 | =back |
214 : | |||
215 : | parrello | 1.7 | The command-line options are given below. |
216 : | parrello | 1.1 | |
217 : | =over 4 | ||
218 : | |||
219 : | =item geneFile | ||
220 : | |||
221 : | The name of the file containing the genomes and their associated access codes. The | ||
222 : | file should have one line per genome, each line consisting of the genome ID followed | ||
223 : | by the access code, separated by a tab. If no file is specified, all complete genomes | ||
224 : | parrello | 1.39 | will be processed and the access code will be 1. Specify C<default> to use the |
225 : | default gene file, C<genes.tbl>, in the B<$FIG_Config::sproutData> directory. | ||
226 : | parrello | 1.1 | |
227 : | =item subsysFile | ||
228 : | |||
229 : | The name of the file containing the trusted subsystems. The file should have one line | ||
230 : | per trusted subsystem. If no file is specified, all subsystems will be trusted. | ||
231 : | |||
232 : | =item trace | ||
233 : | |||
234 : | Desired tracing level. The default is 3. | ||
235 : | |||
236 : | parrello | 1.25 | =item user |
237 : | |||
238 : | parrello | 1.35 | Suffix to use for trace, output, and error files created. |
239 : | parrello | 1.25 | |
240 : | parrello | 1.10 | =item dbLoad |
241 : | |||
242 : | If TRUE, the database tables will be loaded automatically from the load files created. | ||
243 : | |||
244 : | parrello | 1.14 | =item dbCreate |
245 : | parrello | 1.1 | |
246 : | parrello | 1.14 | If TRUE, the database will be created. If the database exists already, it will be |
247 : | dropped. Use this option with caution. | ||
248 : | parrello | 1.12 | |
249 : | parrello | 1.17 | =item loadOnly |
250 : | |||
251 : | If TRUE, the database tables will be loaded from existing load files. Load files | ||
252 : | will not be created. This option is useful if you are setting up a copy of Sprout | ||
253 : | and have load files already set up from the original version. | ||
254 : | |||
255 : | parrello | 1.25 | =item background |
256 : | |||
257 : | Redirect the standard and error output to files in the FIG temporary directory. | ||
258 : | |||
259 : | =item resume | ||
260 : | |||
261 : | Resume an interrupted load, starting with the load group specified in the first | ||
262 : | positional parameter. | ||
263 : | |||
264 : | =item sql | ||
265 : | |||
266 : | Trace SQL statements. | ||
267 : | |||
268 : | parrello | 1.32 | =item phone |
269 : | |||
270 : | Phone number to message when the load finishes. | ||
271 : | |||
272 : | parrello | 1.14 | =back |
273 : | parrello | 1.12 | |
274 : | parrello | 1.1 | =cut |
275 : | |||
276 : | use strict; | ||
277 : | use Tracer; | ||
278 : | use DocUtils; | ||
279 : | use Cwd; | ||
280 : | use FIG; | ||
281 : | use SFXlate; | ||
282 : | use File::Copy; | ||
283 : | use File::Path; | ||
284 : | use SproutLoad; | ||
285 : | use Stats; | ||
286 : | parrello | 1.9 | use SFXlate; |
287 : | parrello | 1.1 | |
288 : | parrello | 1.44 | # This is a list of the load groups in their natural order. We'll go through these in sequence, processing |
289 : | # the ones the user asks for. NOTE(review): the "Group" load group described in the POD above is not in this list, so "*" and -resume never reach it -- confirm this is intended. | ||
290 : | my @LoadGroups = qw(Genome Feature Subsystem Property Annotation Source External Reaction Synonym Family Drug); | ||
291 : | |||
292 : | parrello | 1.1 | # Get the command-line parameters and options. The hash gives, for each script-specific option, its default value and its help text; whatever is left over comes back in @parameters as the load group names. |
293 : | parrello | 1.17 | my ($options, @parameters) = StandardSetup(['SproutLoad', 'ERDBLoad', 'Stats', |
294 : | parrello | 1.26 | 'ERDB', 'Load', 'Sprout', 'Subsystem'], |
295 : | parrello | 1.18 | { geneFile => ["", "name of the genome list file"], |
296 : | subsysFile => ["", "name of the trusted subsystem file"], | ||
297 : | dbLoad => [0, "load the database from generated files"], | ||
298 : | dbCreate => [0, "drop and re-create the database"], | ||
299 : | parrello | 1.19 | loadOnly => [0, "load the database from previously generated files"], |
300 : | parrello | 1.23 | resume => [0, "resume a complete load starting with the first group specified in the parameter list"], |
301 : | parrello | 1.32 | phone => ["", "phone number (international format) to call when load finishes"], |
302 : | parrello | 1.18 | }, |
303 : | "<group1> <group2> ...", | ||
304 : | parrello | 1.17 | @ARGV); |
305 : | # If we're doing a load-only, turn on loading (loadOnly implies dbLoad). | ||
306 : | if ($options->{loadOnly}) { | ||
307 : | $options->{dbLoad} = 1 | ||
308 : | } | ||
309 : | parrello | 1.14 | if ($options->{dbCreate}) { |
310 : | # Here we want to drop and re-create the database. This happens before any load group is processed. | ||
311 : | my $db = $FIG_Config::sproutDB; | ||
312 : | parrello | 1.20 | DBKernel::CreateDB($db); |
313 : | parrello | 1.14 | } |
314 : | parrello | 1.39 | # Compute the gene file name. The special value "default" selects genes.tbl in the Sprout data directory. |
315 : | my $geneFile = $options->{geneFile}; | ||
316 : | if ($geneFile eq 'default') { | ||
317 : | $geneFile = "$FIG_Config::sproutData/genes.tbl"; | ||
318 : | } | ||
319 : | parrello | 1.9 | # Create the sprout loader object. Note that the Sprout object does not |
320 : | parrello | 1.10 | # open the database unless the "dbLoad" option is turned on. |
321 : | parrello | 1.1 | my $fig = FIG->new(); |
322 : | parrello | 1.10 | my $sprout = SFXlate->new_sprout_only(undef, undef, undef, ! $options->{dbLoad}); |
323 : | parrello | 1.39 | my $spl = SproutLoad->new($sprout, $fig, $geneFile, $options->{subsysFile}, $options); |
324 : | parrello | 1.15 | # Ensure we have an output directory. |
325 : | FIG::verify_dir($FIG_Config::sproutData); | ||
326 : | parrello | 1.46 | # Check for the "*" option, which selects every group in @LoadGroups. NOTE(review): with no positional parameters $parameters[0] is undef here, so this comparison warns under -w. |
327 : | if ($parameters[0] eq '*') { | ||
328 : | @parameters = @LoadGroups; | ||
329 : | } | ||
330 : | parrello | 1.23 | # If we're resuming, we only want to have 1 parameter. NOTE(review): "*" has already been expanded above, so combining -resume with "*" fails this check. |
331 : | my $resume = $options->{resume}; | ||
332 : | if ($resume && @parameters > 1) { | ||
333 : | Confess("If resume=1, only one load group can be specified."); | ||
334 : | } elsif (! @parameters) { | ||
335 : | parrello | 1.34 | Trace("No load groups were specified.") if T(0); |
336 : | parrello | 1.23 | } |
337 : | parrello | 1.44 | # Process the resume option here. We modify the incoming parameters to |
338 : | # contain the resume group and everything after it. | ||
339 : | if ($resume) { | ||
340 : | # Save the starting group. NOTE(review): this is undef when -resume is given without a group; the loop below then discards every group and the load stops with "not found". | ||
341 : | my $resumeGroup = $parameters[0]; | ||
342 : | # Copy the load group list into the parameter array. | ||
343 : | @parameters = @LoadGroups; | ||
344 : | # Shift out the groups until we reach our desired starting point. | ||
345 : | while (scalar(@parameters) && $parameters[0] ne $resumeGroup) { | ||
346 : | shift @parameters; | ||
347 : | } | ||
348 : | if (! @parameters) { | ||
349 : | Confess("Resume group \"$resumeGroup\" not found."); | ||
350 : | } | ||
351 : | } | ||
352 : | parrello | 1.32 | # Set a variable to contain return type information. It becomes "error" or "no error" and is used only in the phone message below. |
353 : | my $rtype; | ||
354 : | parrello | 1.44 | # Set up a statistics object for statistics about the entire load. |
355 : | my $totalStats = Stats->new(); | ||
356 : | parrello | 1.32 | # Ensure we catch errors. |
357 : | eval { | ||
358 : | # Process the parameters. | ||
359 : | for my $group (@parameters) { | ||
360 : | Trace("Processing load group $group.") if T(2); | ||
361 : | parrello | 1.44 | # Compute the string we want to execute: a call to the loader method named after the group. NOTE(review): unless "*" or -resume was used, the group name comes straight from the command line -- consider checking it against @LoadGroups before evaluating it. |
362 : | my $code = "\$spl->Load${group}Data()"; | ||
363 : | # Load this group. | ||
364 : | my $stats = eval($code); | ||
365 : | parrello | 1.45 | if ($@) { |
366 : | Confess("Load group error: $@"); | ||
367 : | } | ||
368 : | parrello | 1.44 | # Merge the statistics into the master. |
369 : | $totalStats->Accumulate($stats); | ||
370 : | } | ||
371 : | # Compute the statistical display. | ||
372 : | my $statDisplay = $totalStats->Show(); | ||
373 : | # Display it. | ||
374 : | Trace("Statistics for this load:\n$statDisplay") if T(2); | ||
375 : | # Check for a "table load failed" message. If we find one, we want | ||
376 : | # to end with an error. NOTE(review): the failure is only traced and reported by phone below; nothing here sets a non-zero exit status -- confirm whether one is wanted. | ||
377 : | if ($statDisplay =~ /table load failed/i) { | ||
378 : | Confess("One or more table loads failed."); | ||
379 : | parrello | 1.32 | } |
380 : | }; | ||
381 : | if ($@) { | ||
382 : | Trace("Load failed with error: $@") if T(0); | ||
383 : | $rtype = "error"; | ||
384 : | } else { | ||
385 : | Trace("Load complete.") if T(2); | ||
386 : | $rtype = "no error"; | ||
387 : | } | ||
388 : | parrello | 1.33 | if ($options->{phone}) { |
389 : | parrello | 1.32 | my $msgID = Tracer::SendSMS($options->{phone}, "Sprout load terminated with $rtype."); |
390 : | if ($msgID) { | ||
391 : | Trace("Phone message sent with ID $msgID.") if T(2); | ||
392 : | } else { | ||
393 : | Trace("Phone message not sent.") if T(2); | ||
394 : | parrello | 1.31 | } |
395 : | parrello | 1.1 | } |
396 : | parrello | 1.35 |
397 : | parrello | 1.1 | 1; |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |