[Bio] / Sprout / SaplingLoadCheck.pl Repository:
ViewVC logotype

Annotation of /Sprout/SaplingLoadCheck.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     =head1 Sapling Incremental Load
4 :    
5 : parrello 1.6 This script performs a periodic, incremental update of a Sapling database
6 : parrello 1.1 from the main SEED files. The update is determined by comparing the SEED files
7 :     to the current database content. In particular, the list of genomes, the list of
8 :     subsystems, and the expression data will be compared. The Taxonomy will be
9 : parrello 1.8 reloaded unless the B<notaxon> option is specified. FIGfams will be reloaded if they are a new release. It is possible
10 : parrello 1.5 that subsystems and genomes may be deleted.
11 : parrello 1.1
12 :     The currently-supported command-line options are as follows.
13 :    
14 :     =over 4
15 :    
16 : parrello 1.7 =item create
17 :    
18 :     If specified, the database is presumed to be new. The tables will be created and the DBD stored.
19 :    
20 : parrello 1.1 =item user
21 :    
22 :     Name suffix to be used for log files. If omitted, the PID is used.
23 :    
24 :     =item trace
25 :    
26 :     Numeric trace level. A higher trace level causes more messages to appear. The
27 :     default trace level is 2. Tracing will be directly to the standard output
28 :     as well as to a C<trace>I<User>C<.log> file in the FIG temporary directory,
29 :     where I<User> is the value of the B<user> option above.
30 :    
31 :     =item sql
32 :    
33 :     If specified, turns on tracing of SQL activity.
34 :    
35 :     =item background
36 :    
37 :     Save the standard and error output to files. The files will be created
38 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
39 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
40 :     B<user> option above.
41 :    
42 :     =item h
43 :    
44 :     Display this command's parameters and options.
45 :    
46 : parrello 1.2 =item dbhost
47 : parrello 1.1
48 :     Alternate database host, if the database is located somewhere other than the
49 :     default. This is necessary on some Sapling machines to insure we get a writable
50 :     copy of the database.
51 :    
52 : parrello 1.2 =item dbName
53 :    
54 :     Name of the sapling database to update, if it is not the standard one.
55 :    
56 :     =item port
57 :    
58 :     Database access port. This is useful for testing.
59 :    
60 :     =item notaxon
61 :    
62 :     If specified, the taxonomy information will NOT be reloaded.
63 :    
64 :     =item fighost
65 :    
66 :     Alternate database host for the SEED. This is useful for testing.
67 :    
68 :     =item figport
69 :    
70 :     Alternate database access port for the SEED. This is useful for testing.
71 :    
72 :     =item figdisk
73 :    
74 :     Alternate directory for the SEED. This is useful for testing.
75 :    
76 : parrello 1.7 =item figdb
77 :    
78 :     Name of the database for the SEED. This is useful for testing.
79 :    
80 : parrello 1.8 =item replaceAll
81 :    
82 :     If specified, all genomes will be replaced. This is an expensive operation, but
83 :     occasionally necessary.
84 :    
85 : parrello 1.1 =back
86 :    
87 :     =cut
88 :    
89 :     use strict;
90 :     use Tracer;
91 :     use Sapling;
92 :     use SaplingDataLoader;
93 :     use Stats;
94 :    
95 :     use SaplingExpressionLoader;
96 :     use SaplingFunctionLoader;
97 :     use SaplingGenomeLoader;
98 :     use SaplingSubsystemLoader;
99 :     use SaplingTaxonomyLoader;
100 : parrello 1.5 use SaplingFamilyLoader;
101 : parrello 1.2 use FIG;
102 : parrello 1.1
# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(SaplingDataLoader) ],
        {
            dbhost     => ["", "alternate database host machine"],
            port       => ["", "alternate database port"],
            notaxon    => ["", "if specified, the taxonomy data will NOT be reloaded"],
            fighost    => ["", "alternate SEED database host"],
            figport    => ["", "alternate SEED database port"],
            figdisk    => ["", "FIG instance directory (requires a special FIG_Config)"],
            figdb      => ["", "name of the SEED MySQL database"],
            dbName     => ["", "name of the Sapling database to update"],
            create     => ["", "create the database tables"],
            replaceAll => ["", "replace all genomes"],
        },
        "",
        @ARGV);
# Root directory under which the FIGfam release directories are found.
my $figFamRoot = "/vol/figfam-prod";
# Create the statistics object.
my $stats = Stats->new();
# Insure we catch errors.
eval {
    # Get the Sapling database.
    my $sap = Sapling->new(dbhost => $options->{dbhost}, port => $options->{port},
                           dbName => $options->{dbName});
    # Get the SEED data. We may need to update some of the configuration parameters.
    if ($options->{fighost}) {
        $FIG_Config::dbhost = $options->{fighost};
    }
    if ($options->{figport}) {
        $FIG_Config::dbport = $options->{figport};
    }
    if ($options->{figdisk}) {
        $FIG_Config::fig_disk = $options->{figdisk};
        $FIG_Config::global = "$options->{figdisk}/FIG/Data/Global";
        $FIG_Config::organisms = "$options->{figdisk}/FIG/Data/Organisms";
        $FIG_Config::data = "$options->{figdisk}/FIG/Data";
    }
    if ($options->{figdb}) {
        $FIG_Config::db = "$options->{figdb}";
    }
    my $fig = $sap->GetSourceObject();
    # Check for a table-create situation.
    if ($options->{create}) {
        # Store the DBD.
        $sap->InternalizeDBD();
        Trace("DBD stored in database.") if T(2);
        # Recreate the tables.
        $sap->CreateTables();
    }
    # Update the taxonomies.
    if (! $options->{notaxon}) {
        my $setFile = "$FIG_Config::global/genome.sets";
        if (! -f $setFile) {
            $setFile = "";
            Trace("WARNING: No OTU file found.") if T(1);
        }
        Trace("Updating taxonomy data.") if T(2);
        my $subStats = SaplingTaxonomyLoader::Process($sap, "$FIG_Config::global/Taxonomy",
                                                      $setFile);
        # Fold the taxonomy statistics into the run totals, as is done for
        # every other loader in this script. The guard protects against a
        # loader that returns nothing.
        $stats->Accumulate($subStats) if defined $subStats;
    }
    # Compute the updated subsystems. A subsystem is "changed" if it is new or
    # its version number has changed. A subsystem is "deleted" if it is in the
    # database but not the SEED. The function below returns a hash
    # reference. In the case of the changed subsystems, the hash reference
    # maps the subsystem ID to its directory name. In the case of deleted subsystems,
    # the hash reference maps the subsystem ID to an empty string.
    my $changedSubsystems = ComputeSubsystemChanges($stats, $sap, $fig);
    # Perform the subsystem changes. This returns a hash that maps each genome
    # to a list of the subsystems of which it is a direct member.
    my $subsysGenomes = UpdateSubsystems($changedSubsystems, $stats, $sap);
    # Update the genomes. This returns a list of the genomes added.
    my $newGenomes = UpdateGenomes($stats, $sap, $options->{replaceAll});
    # Loop through the genomes, applying the bindings.
    UpdateBindings($changedSubsystems, $subsysGenomes, $newGenomes, $stats, $sap);
    # Now we must compare the expression data.
    UpdateExpressionData($stats, $sap);
    # Finally, we need to update the FIGfams. First, we must find
    # the latest figfam-prod release directory.
    my @releases = sort { Tracer::Cmp($a, $b) } grep { $_ =~ /^Release\d+/ } OpenDir($figFamRoot);
    # Find the most recent release that contains a coupling.values file.
    my $figFamRel;
    for my $release (reverse @releases) {
        if (-f "$figFamRoot/$release/coupling.values") {
            $figFamRel = $release;
            last;
        }
    }
    if (! $figFamRel) {
        Confess("No FIGfam directory found.");
    } else {
        # We have a FIGfam release directory.
        my $figFamDir = "$figFamRoot/$figFamRel";
        Trace("FIGfams are currently in $figFamDir.") if T(2);
        # Get the current release from the database. GetFlat is used as a
        # list-returning method everywhere else in this script, so we must
        # call it in list context here: a scalar assignment would store the
        # number of rows found instead of the version string, and the
        # comparison below would then force a reload on every run.
        my ($dbRel) = $sap->GetFlat('FamilyType', "FamilyType(id) = ?",
                                    ['FIGfams'], 'version');
        if (! $dbRel || $dbRel ne $figFamRel) {
            # Here we have a new release, so we need to reload.
            Trace("Reloading FIGfams.") if T(2);
            my $subStats = SaplingFamilyLoader::Process($sap, $figFamDir);
            $stats->Accumulate($subStats);
            # Update the release information.
            if (! $dbRel) {
                Trace("Adding release marker for $figFamRel.") if T(2);
                $sap->InsertObject('FamilyType', id => 'FIGfams',
                                   version => $figFamRel);
            } else {
                Trace("Updating release marker for $figFamRel.") if T(2);
                $sap->UpdateEntity('FamilyType', 'FIGfams', version => $figFamRel);
            }
        }
    }
    # All done.
    Trace("Processing complete.") if T(2);
};
if ($@) {
    Trace("Script failed with error: $@") if T(0);
} else {
    Trace("Script complete.") if T(2);
}
Trace("Statistics for this run:\n" . $stats->Show()) if T(2);
222 :    
223 :     =head2 Subroutines
224 :    
=head3 ComputeSubsystemChanges

    my $changedSubsystems = ComputeSubsystemChanges($stats, $sap, $fig);

Compare the subsystems recorded in the SEED against the subsystems stored in
the Sapling and decide which ones need work. A SEED subsystem is scheduled for
loading if it is absent from the Sapling, or if its SEED version number is
greater than the version stored in the Sapling. A Sapling subsystem that has
no usable SEED counterpart is scheduled for deletion. SEED subsystems that
are not in the configured subsystem list, or whose directory is missing from
the SEED data directory, are treated as not being in the SEED.

=over 4

=item stats

A L<Stats> object that will be used to record the method's activity.

=item sap

The L<Sapling> object used to communicate with the database.

=item fig

A L<FIG> object used to communicate with the SEED.

=item RETURN

Returns a reference to a hash keyed by subsystem ID. For subsystems to be
loaded or reloaded, it maps the ID to the subsystem's directory name. For
subsystems to be deleted, it maps the ID to an empty string.

=back

=cut

sub ComputeSubsystemChanges {
    my ($stats, $sap, $fig) = @_;
    Trace("Analyzing subsystems.") if T(2);
    # The SEED side needs a direct database query, because that is where the
    # version numbers live. The SEED names must also be converted to
    # Sapling subsystem IDs.
    Trace("Reading subsystems from SEED.") if T(3);
    # This is the configured list of subsystems we care about.
    my $subsWanted = $sap->SubsystemHash();
    my $fig_dbh = $fig->db_handle;
    # Build a hash from Sapling subsystem ID to the SEED [name, version] row,
    # keeping only the configured subsystems.
    my %seedSubs;
    for my $row (@{$fig_dbh->SQL("SELECT `subsystem`, `version` FROM subsystem_metadata")}) {
        my @pair = ($sap->SubsystemID($row->[0]), $row);
        if ($subsWanted->{$pair[0]}) {
            $seedSubs{$pair[0]} = $pair[1];
        }
    }
    # Build the corresponding ID-to-version hash for the Sapling.
    Trace("Reading subsystems from Sapling.") if T(3);
    my %sapSubs;
    for my $tuple ($sap->GetAll("Subsystem", "", [], 'id version')) {
        $sapSubs{$tuple->[0]} = $tuple->[1];
    }
    # This will be the return hash.
    my %retVal;
    Trace("Scanning subsystems for changes.") if T(2);
    # Pass 1: find the SEED subsystems that must be loaded or reloaded. The key
    # list is computed before the loop starts, so it is safe to delete
    # entries from the hash inside the loop.
    for my $seedSub (sort keys %seedSubs) {
        $stats->Add(seedSubsystemsChecked => 1);
        # Pull out the version and compute the directory name.
        my $seedInfo = $seedSubs{$seedSub};
        my $seedVersion = $seedInfo->[1];
        my $seedDirectory = "$FIG_Config::data/Subsystems/$seedInfo->[0]";
        if (! -d $seedDirectory) {
            # No directory on disk, so pretend the SEED does not have it.
            Trace("Subsystem $seedSub is not found in the data directory.") if T(3);
            delete $seedSubs{$seedSub};
            next;
        }
        # The subsystem is real. Is it new or newer than what we have?
        my $sapVersion = $sapSubs{$seedSub};
        if (! defined $sapVersion) {
            $stats->Add(seedSubsystemsNewFound => 1);
            $retVal{$seedSub} = $seedDirectory;
        } elsif ($seedVersion > $sapVersion) {
            $stats->Add(seedSubsystemsChangedFound => 1);
            $retVal{$seedSub} = $seedDirectory;
            Trace("Must update $seedSub from $sapVersion to $seedVersion.") if T(3);
        }
    }
    # Pass 2: anything in the Sapling that did not survive in the SEED hash
    # must be deleted.
    Trace("Scanning for deleted subsystems.") if T(2);
    for my $sapSub (sort keys %sapSubs) {
        $stats->Add(sapSubsystemsChecked => 1);
        next if exists $seedSubs{$sapSub};
        $stats->Add(sapSubsystemDeletesFound => 1);
        $retVal{$sapSub} = '';
    }
    return \%retVal;
}
319 :    
=head3 UpdateSubsystems

    my $subsysGenomes = UpdateSubsystems($changedSubsystems, $stats, $sap);

Apply the subsystem changes to the database. Subsystems mapped to a directory
are loaded (or reloaded) from that directory; subsystems mapped to an empty
string are cleared from the database. For each loaded subsystem, the genomes
connected to it through the C<Describes IsImplementedBy IsUsedBy> path are
recorded in the return hash, so that the later bindings pass can skip those
genome/subsystem combinations.

=over 4

=item changedSubsystems

Reference to a hash keyed by the IDs of the modified subsystems. A subsystem
to be deleted maps to an empty string. A subsystem to be created or updated
maps to the name of the directory containing the subsystem data.

=item stats

L<Stats> object to be updated with statistics from this operation.

=item sap

L<Sapling> object for accessing the database.

=item RETURN

Returns a reference to a hash that maps each genome found for a loaded
subsystem to a list of the IDs of those subsystems.

=back

=cut

sub UpdateSubsystems {
    my ($changedSubsystems, $stats, $sap) = @_;
    Trace("Processing subsystem updates.") if T(2);
    # This hash maps genome IDs to lists of subsystem IDs.
    my %genomeSubsystems;
    for my $subsysID (sort keys %$changedSubsystems) {
        my $subsysDir = $changedSubsystems->{$subsysID};
        if ($subsysDir) {
            # A directory name means the subsystem is to be loaded or reloaded.
            Trace("Updating subsystem $subsysID from $subsysDir.") if T(2);
            my $loadStats = SaplingSubsystemLoader::Process($sap, $subsysID, $subsysDir);
            $stats->Accumulate($loadStats);
            $stats->Add(foundSubsystemsUpdated => 1);
            # Remember the genomes that can be skipped for this subsystem
            # when the bindings are updated.
            my @members = $sap->GetFlat('Describes IsImplementedBy IsUsedBy',
                                        'Describes(from-link) = ?', [$subsysID],
                                        'IsUsedBy(to-link)');
            for my $member (@members) {
                push @{$genomeSubsystems{$member}}, $subsysID;
                $stats->Add(bindingExceptions => 1);
            }
        } else {
            # An empty directory name means deletion. Nothing is recorded in
            # the return hash for a deleted subsystem.
            Trace("Deleting subsystem $subsysID.") if T(2);
            my $clearStats = SaplingSubsystemLoader::ClearSubsystem($sap, $subsysID);
            $stats->Accumulate($clearStats);
            $stats->Add(foundSubsystemsDeleted => 1);
        }
    }
    return \%genomeSubsystems;
}
392 :    
=head3 UpdateGenomes

    my $newGenomes = UpdateGenomes($stats, $sap, $all);

Reconcile the genomes in the Sapling database with the genome hash supplied
by the Sapling object's C<GenomeHash> method. Genomes in that hash but not in
the database are loaded; genomes in the database but not in that hash are
cleared. When I<$all> is TRUE, every genome in the hash is loaded regardless
of whether it is already in the database. The IDs of the loaded genomes are
returned so that the bindings pass can skip them.

=over 4

=item stats

L<Stats> object to contain statistics on this operation.

=item sap

L<Sapling> object for accessing the Sapling database.

=item all

If TRUE, then all genomes will be considered new, forcing a mass replacement.

=item RETURN

Returns a reference to a list of the IDs for the genomes loaded into the
Sapling by this method.

=back

=cut

sub UpdateGenomes {
    my ($stats, $sap, $all) = @_;
    # This list accumulates the IDs of the genomes we load.
    my @loaded;
    Trace("Processing genomes.") if T(2);
    # Get the hash of SEED genomes.
    Trace("Retrieving genomes from SEED.") if T(3);
    my $seedGenomes = $sap->GenomeHash();
    # Build a matching hash of the genomes already in the Sapling.
    my %sapGenomes;
    for my $id ($sap->GetFlat('Genome', "", [], 'id')) {
        $sapGenomes{$id} = 1;
    }
    Trace("Scanning for new genomes.") if T(2);
    # First pass: load whatever is missing (or everything, in replace-all mode).
    for my $seedGenome (sort keys %$seedGenomes) {
        $stats->Add(seedGenomesChecked => 1);
        my $alreadyLoaded = $sapGenomes{$seedGenome};
        # An existing genome is only touched in replace-all mode.
        next if $alreadyLoaded && ! $all;
        my $counter = ($alreadyLoaded ? 'seedGenomesReplaced' : 'seedGenomesNewFound');
        $stats->Add($counter => 1);
        my $loadStats = SaplingGenomeLoader::Process($sap, $seedGenome,
                                                     "$FIG_Config::organisms/$seedGenome");
        $stats->Accumulate($loadStats);
        push @loaded, $seedGenome;
    }
    # Second pass: clear the genomes that are no longer in the SEED hash.
    for my $sapGenome (sort keys %sapGenomes) {
        $stats->Add(sapGenomesChecked => 1);
        next if $seedGenomes->{$sapGenome};
        $stats->Add(sapGenomeDeletes => 1);
        my $clearStats = SaplingGenomeLoader::ClearGenome($sap, $sapGenome);
        $stats->Accumulate($clearStats);
    }
    return \@loaded;
}
468 :    
=head3 UpdateBindings

    UpdateBindings($changedSubsystems, $subsysGenomes, $newGenomes, $stats, $sap);

Refresh the subsystem bindings for the genomes returned by the Sapling
object's C<GenomeHash> method. Only subsystems that map to a non-empty value
in the change hash (that is, the loaded or reloaded ones) are processed.
Genomes listed as newly loaded are skipped entirely, and for each remaining
genome the subsystems listed for it in I<$subsysGenomes> are left out.

=over 4

=item changedSubsystems

Reference to a hash whose keys are the IDs of modified subsystems. Subsystems
with a non-empty value are the ones whose bindings will be processed.

=item subsysGenomes

Reference to a hash mapping genome IDs to the IDs of the subsystems for
which the genomes are direct members. Each genome ID is mapped to a list
of subsystem IDs.

=item newGenomes

Reference to a list of the genomes that have been added in this run.

=item stats

L<Stats> object for tracking the statistics of this operation.

=item sap

L<Sapling> object for connecting to the Sapling database.

=back

=cut

sub UpdateBindings {
    my ($changedSubsystems, $subsysGenomes, $newGenomes, $stats, $sap) = @_;
    # Collect the subsystems that were loaded or reloaded. Deleted subsystems
    # have an empty value and are dropped here.
    my @changes;
    for my $subsysID (keys %$changedSubsystems) {
        push @changes, $subsysID if $changedSubsystems->{$subsysID};
    }
    # Get the genome hash and mark the genomes loaded during this run.
    my $genomeHash = $sap->GenomeHash();
    my %isNew;
    $isNew{$_} = 1 for @$newGenomes;
    Trace("Processing genome bindings.") if T(2);
    for my $genome (sort keys %$genomeHash) {
        # Newly-loaded genomes are not processed here.
        next if $isNew{$genome};
        Trace("Processing bindings for $genome.") if T(3);
        # Work out which changed subsystems apply to this genome: all of
        # them, minus the ones listed for it in the exception hash.
        my %skip;
        $skip{$_} = 1 for @{$subsysGenomes->{$genome}};
        my @subsysList;
        for my $subsysID (@changes) {
            push @subsysList, $subsysID unless $skip{$subsysID};
        }
        # Build a loader for the genome and have it update the subsystems.
        my $loader = SaplingGenomeLoader->new($sap, $genome, "$FIG_Config::organisms/$genome");
        $loader->LoadSubsystems(\@subsysList);
        # Fold the loader's statistics into ours.
        $stats->Accumulate($loader->{stats});
    }
}
533 :    
=head3 UpdateExpressionData

    UpdateExpressionData($stats, $sap);

Synchronize the Sapling's expression data with the contents of the expression
data directory (C</vol/expression/current>). Each entry in that directory is
treated as a genome ID. Entries for genomes that are not in the Sapling are
skipped. Entries for Sapling genomes that have no expression data in the
database are loaded. Finally, expression data in the database for genomes
with no entry in the directory is cleared.

=over 4

=item stats

L<Stats> object to track the activity of this operation.

=item sap

L<Sapling> object for connecting to the database.

=back

=cut

sub UpdateExpressionData {
    my ($stats, $sap) = @_;
    # This is where the expression data lives on disk.
    my $expDirectory = '/vol/expression/current';
    # Hash of all the genomes in the Sapling.
    my %genomeHash = map { $_ => 1 } $sap->GetFlat('Genome', '', [], 'id');
    # Hash of the Sapling genomes that currently have expression data. Every
    # genome we find on disk is removed from this hash, so at the end it
    # holds only the genomes whose expression data must be cleared.
    my %expDataHash = map { $_ => 1 } $sap->GetFlat('Genome HadResultsProducedBy',
                                                    '', [], 'id');
    # Read the expression directory.
    my @expFiles = OpenDir($expDirectory, 1);
    Trace("Processing expression data. " . scalar(@expFiles) . " directories found.") if T(2);
    for my $expFile (@expFiles) {
        # Entries for genomes we do not have are of no interest.
        if (! $genomeHash{$expFile}) {
            $stats->Add(expressionGenomeSkipped => 1);
            next;
        }
        if ($expDataHash{$expFile}) {
            # The expression data is already in the database.
            $stats->Add(expressionGenomeFound => 1);
        } else {
            # The expression data is on disk but not in the database, so load it.
            Trace("Loading expression data for $expFile.") if T(2);
            $stats->Add(expressionGenomeNew => 1);
            my $loadStats = SaplingExpressionLoader::Process($sap, $expFile,
                                                             "$expDirectory/$expFile");
            $stats->Accumulate($loadStats);
        }
        # Either way, this genome's expression data must not be deleted.
        delete $expDataHash{$expFile};
    }
    # Whatever remains in the hash has no data on disk, so clear it.
    for my $genome (sort keys %expDataHash) {
        Trace("Deleting expression data for $genome.") if T(2);
        $stats->Add(expressionGenomeDelete => 1);
        my $clearStats = SaplingExpressionLoader::ClearExpressionData($sap, $genome);
        $stats->Accumulate($clearStats);
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3