[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Annotation of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.65 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SproutLoad;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use PageBuilder;
8 :     use ERDBLoad;
9 :     use FIG;
10 :     use Sprout;
11 :     use Stats;
12 :     use BasicLocation;
13 : parrello 1.18 use HTML;
14 : parrello 1.1
15 :     =head1 Sprout Load Methods
16 :    
17 :     =head2 Introduction
18 :    
19 :     This object contains the methods needed to copy data from the FIG data store to the
20 :     Sprout database. It makes heavy use of the ERDBLoad object to manage the load into
21 :     individual tables. The client can create an instance of this object and then
22 :     call methods for each group of tables to load. For example, the following code will
23 :     load the Genome- and Feature-related tables. (It is presumed the first command line
24 :     parameter contains the name of a file specifying the genomes.)
25 :    
26 :     my $fig = FIG->new();
27 :     my $sprout = SFXlate->new_sprout_only();
28 :     my $spl = SproutLoad->new($sprout, $fig, $ARGV[0]);
29 :     my $stats = $spl->LoadGenomeData();
30 :     $stats->Accumulate($spl->LoadFeatureData());
31 :     print $stats->Show();
32 :    
33 :     It is worth noting that the FIG object does not need to be a real one. Any object
34 :     that implements the FIG methods for data retrieval could be used. So, for example,
35 :     this object could be used to copy data from one Sprout database to another, or
36 :     from any FIG-compliant data store implemented in the future.
37 :    
38 :     To ensure that this is possible, each time the FIG object is used, it will be via
39 :     a variable called C<$fig>. This makes it fairly straightforward to determine which
40 :     FIG methods are required to load the Sprout database.
41 :    
42 : parrello 1.5 This object creates the load files; however, the tables are not created until it
43 :     is time to actually do the load from the files into the target database.
44 :    
45 : parrello 1.1 =cut
46 :    
47 :     #: Constructor SproutLoad->new();
48 :    
49 :     =head2 Public Methods
50 :    
51 :     =head3 new
52 :    
53 : parrello 1.8 C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>
54 : parrello 1.1
55 :     Construct a new Sprout Loader object, specifying the two participating databases and
56 :     the names of the files containing the lists of genomes and subsystems to use.
57 :    
58 :     =over 4
59 :    
60 :     =item sprout
61 :    
62 :     Sprout object representing the target database. This also specifies the directory to
63 :     be used for creating the load files.
64 :    
65 :     =item fig
66 :    
67 :     FIG object representing the source data store from which the data is to be taken.
68 :    
69 :     =item genomeFile
70 :    
71 :     Either the name of the file containing the list of genomes to load or a reference to
72 :     a hash of genome IDs to access codes. If nothing is specified, all complete genomes
73 :     will be loaded and the access code will default to 1. The genome list is presumed
74 :     to be all-inclusive. In other words, all existing data in the target database will
75 :     be deleted and replaced with the data on the specified genes. If a file is specified,
76 :     it should contain one genome ID and access code per line, tab-separated.
77 :    
78 :     =item subsysFile
79 :    
80 :     Either the name of the file containing the list of trusted subsystems or a reference
81 : parrello 1.34 to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
82 :     considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
83 :     in its data directory.) Only subsystem data related to the trusted subsystems is loaded.
84 : parrello 1.1
85 : parrello 1.8 =item options
86 :    
87 :     Reference to a hash of command-line options.
88 :    
89 : parrello 1.1 =back
90 :    
91 :     =cut
92 :    
# Construct a new Sprout loader.
#
# Parameters:
#   $class      - class name to bless the new object into
#   $sprout     - target database object; its LoadInfo method supplies the
#                 load directory
#   $fig        - source data store object
#   $genomeFile - undef/empty (load every genome returned by $fig->genomes(1)
#                 with access code 1), a reference to a hash of genome IDs to
#                 access codes, or the name of (or a scalar reference to the
#                 name of) a tab-delimited file of genome IDs and access codes
#   $subsysFile - undef/empty (use every subsystem for which
#                 $fig->usable_subsystem is true), a reference to a list of
#                 subsystem names, or the name of (or a scalar reference to the
#                 name of) a file with one subsystem name per line
#   $options    - reference to a hash of options (optional); when the
#                 "loadOnly" option is set the genome and subsystem lists are
#                 not computed
#
# Returns the blessed loader object. Confess is called if a genome file yields
# no genomes, if the subsystem file does not exist, or if a parameter has an
# unsupported reference type.
sub new {
    # Get the parameters.
    my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
    # The options are optional; default them explicitly rather than relying
    # on autovivification.
    $options = {} if ! defined $options;
    my $loadOnly = $options->{loadOnly};
    # Create the genome hash. We only need it if load-only is NOT specified.
    my %genomes = ();
    if (! $loadOnly) {
        if (! defined($genomeFile) || $genomeFile eq '') {
            # Here we want all the complete genomes and an access code of 1.
            my @genomeList = $fig->genomes(1);
            %genomes = map { $_ => 1 } @genomeList;
        } else {
            my $type = ref $genomeFile;
            Trace("Genome file parameter type is \"$type\".") if T(3);
            if ($type eq 'HASH') {
                # Here the user specified a hash of genome IDs to access codes,
                # which is exactly what we want.
                %genomes = %{$genomeFile};
            } elsif (! $type || $type eq 'SCALAR') {
                # The caller specified a file. A plain string is the file name
                # itself; a scalar reference must be dereferenced to get it.
                my $fileName = ($type eq 'SCALAR' ? ${$genomeFile} : $genomeFile);
                my @genomeList = Tracer::GetFile($fileName);
                # We build the genome hash using a loop rather than "map" so
                # that an omitted access code can be defaulted to 1. Note that
                # an explicit access code of 0 is preserved.
                for my $genomeLine (@genomeList) {
                    my ($genomeID, $accessCode) = split("\t", $genomeLine);
                    # Skip blank lines so they do not create a genome with an
                    # empty ID.
                    next if ! defined($genomeID) || $genomeID !~ /\S/;
                    if (! defined($accessCode)) {
                        $accessCode = 1;
                    }
                    $genomes{$genomeID} = $accessCode;
                }
                # It's an error if the genome file is empty or not found.
                if (! %genomes) {
                    Confess("No genomes found in file \"$fileName\".");
                }
            } else {
                Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
            }
        }
    }
    # Load the list of trusted subsystems. Again, we only need it if load-only
    # is NOT specified.
    my %subsystems = ();
    if (! $loadOnly) {
        if (! defined $subsysFile || $subsysFile eq '') {
            # Here we want all the usable subsystems. First we get the whole
            # list, then we keep the ones that pass the usability check.
            my @subs = $fig->all_subsystems();
            for my $sub (@subs) {
                if ($fig->usable_subsystem($sub)) {
                    $subsystems{$sub} = 1;
                }
            }
        } else {
            my $type = ref $subsysFile;
            if ($type eq 'ARRAY') {
                # Here the user passed in a list of subsystems.
                %subsystems = map { $_ => 1 } @{$subsysFile};
            } elsif (! $type || $type eq 'SCALAR') {
                # Here the list of subsystems is in a file.
                my $fileName = ($type eq 'SCALAR' ? ${$subsysFile} : $subsysFile);
                if (! -e $fileName) {
                    # It's an error if the file does not exist.
                    Confess("Trusted subsystem file not found.");
                } else {
                    # Blank lines are dropped so that they do not become a
                    # subsystem with an empty name.
                    %subsystems = map { $_ => 1 }
                                  grep { defined($_) && /\S/ }
                                  Tracer::GetFile($fileName);
                }
            } else {
                Confess("Invalid subsystem parameter in SproutLoad constructor.");
            }
        }
    }
    # Get the data directory from the Sprout object.
    my ($directory) = $sprout->LoadInfo();
    # Create the Sprout load object.
    my $retVal = {
        fig           => $fig,
        genomes       => \%genomes,
        subsystems    => \%subsystems,
        sprout        => $sprout,
        loadDirectory => $directory,
        erdb          => $sprout,
        loaders       => [],
        options       => $options
    };
    # Bless and return it.
    bless $retVal, $class;
    return $retVal;
}
184 :    
185 : parrello 1.23 =head3 LoadOnly
186 :    
187 :     C<< my $flag = $spl->LoadOnly; >>
188 :    
189 :     Return TRUE if we are in load-only mode, else FALSE.
190 :    
191 :     =cut
192 :    
# Return the value of the "loadOnly" entry in this loader's options hash
# (TRUE when we are in load-only mode).
sub LoadOnly {
    my $self = shift;
    return $self->{options}{loadOnly};
}
197 :    
198 : parrello 1.25 =head3 PrimaryOnly
199 :    
200 :     C<< my $flag = $spl->PrimaryOnly; >>
201 :    
202 :     Return TRUE if only the main entity is to be loaded, else FALSE.
203 :    
204 :     =cut
205 :    
# Return the value of the "primaryOnly" entry in this loader's options hash
# (TRUE when only the main entity is to be loaded).
sub PrimaryOnly {
    my $self = shift;
    return $self->{options}{primaryOnly};
}
210 :    
211 : parrello 1.1 =head3 LoadGenomeData
212 :    
213 :     C<< my $stats = $spl->LoadGenomeData(); >>
214 :    
215 :     Load the Genome, Contig, and Sequence data from FIG into Sprout.
216 :    
217 :     The Sequence table is the largest single relation in the Sprout database, so this
218 :     method is expected to be slow and clumsy. At some point we will need to make it
219 :     restartable, since an error 10 gigabytes through a 20-gigabyte load is bound to be
220 :     very annoying otherwise.
221 :    
222 :     The following relations are loaded by this method.
223 :    
224 :     Genome
225 :     HasContig
226 :     Contig
227 :     IsMadeUpOf
228 :     Sequence
229 :    
230 :     =over 4
231 :    
232 :     =item RETURNS
233 :    
234 :     Returns a statistics object for the loads.
235 :    
236 :     =back
237 :    
238 :     =cut
239 :     #: Return Type $%;
# Generate (unless in load-only mode) and finish the loads for the Genome,
# HasContig, Contig, IsMadeUpOf, and Sequence tables.
#
# For every genome in this loader's genome hash a Genome record is written,
# then each contig reported by the FIG object is written and its DNA is cut
# into pieces no longer than the Sprout object's MaxSequence value.
#
# Returns whatever _FinishAll returns (documented as a statistics object).
sub LoadGenomeData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object. All source data access goes through this variable.
    my $fig = $self->{fig};
    # Get the genome hash (genome ID => access code).
    my $genomeHash = $self->{genomes};
    # Create load objects for each of the tables we're loading.
    my $loadGenome = $self->_TableLoader('Genome');
    my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);
    my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);
    my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);
    my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating genome data.") if T(2);
        # The maximum sequence size is a property of the Sprout object. It does
        # not change during the load, so we ask for it only once.
        my $chunkSize = $self->{sprout}->MaxSequence();
        # A chunk size below 1 would make the chunking loop below run forever.
        if (! defined($chunkSize) || $chunkSize < 1) {
            Confess("Invalid maximum sequence size in LoadGenomeData.");
        }
        # Now we loop through the genomes, generating the data for each one.
        for my $genomeID (sort keys %{$genomeHash}) {
            Trace("Generating data for genome $genomeID.") if T(3);
            $loadGenome->Add("genomeIn");
            # The access code comes in via the genome hash.
            my $accessCode = $genomeHash->{$genomeID};
            # Get the genus, species, and strain from the scientific name.
            my ($genus, $species, @extraData) = split / /, $fig->genus_species($genomeID);
            my $extra = join " ", @extraData;
            # Get the full taxonomy.
            my $taxonomy = $fig->taxonomy_of($genomeID);
            # Output the genome record.
            $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,
                             $species, $extra, $taxonomy);
            # Now we loop through each of the genome's contigs.
            my @contigs = $fig->all_contigs($genomeID);
            for my $contigID (@contigs) {
                Trace("Processing contig $contigID for $genomeID.") if T(4);
                $loadContig->Add("contigIn");
                $loadSequence->Add("contigIn");
                # Create the contig ID.
                my $sproutContigID = "$genomeID:$contigID";
                # Create the contig record and relate it to the genome.
                $loadContig->Put($sproutContigID);
                $loadHasContig->Put($genomeID, $sproutContigID);
                # Now we get the sequence a chunk at a time. Positions are
                # 1-based, so the first chunk starts at 1.
                my $contigLen = $fig->contig_ln($genomeID, $contigID);
                for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {
                    $loadSequence->Add("chunkIn");
                    # Compute the endpoint of this chunk.
                    my $end = FIG::min($i + $chunkSize - 1, $contigLen);
                    # Get the actual DNA.
                    my $dna = $fig->get_dna($genomeID, $contigID, $i, $end);
                    # Compute the sequenceID.
                    my $seqID = "$sproutContigID.$i";
                    # Write out the data. For now, the quality vector is always "unknown".
                    $loadIsMadeUpOf->Put($sproutContigID, $seqID, $end + 1 - $i, $i);
                    $loadSequence->Put($seqID, "unknown", $dna);
                }
            }
        }
    }
    # Finish the loads.
    my $retVal = $self->_FinishAll();
    # Return the result.
    return $retVal;
}
308 :    
309 :     =head3 LoadCouplingData
310 :    
311 :     C<< my $stats = $spl->LoadCouplingData(); >>
312 :    
313 :     Load the coupling and evidence data from FIG into Sprout.
314 :    
315 :     The coupling data specifies which genome features are functionally coupled. The
316 :     evidence data explains why the coupling is functional.
317 :    
318 :     The following relations are loaded by this method.
319 :    
320 :     Coupling
321 :     IsEvidencedBy
322 :     PCH
323 :     ParticipatesInCoupling
324 :     UsesAsEvidence
325 :    
326 :     =over 4
327 :    
328 :     =item RETURNS
329 :    
330 :     Returns a statistics object for the loads.
331 :    
332 :     =back
333 :    
334 :     =cut
335 :     #: Return Type $%;
# Generate (unless in load-only mode) and finish the loads for the Coupling,
# IsEvidencedBy, PCH, ParticipatesInCoupling, and UsesAsEvidence tables.
#
# For each genome in the genome hash, every PEG is asked for its coupled PEGs.
# Each distinct coupling is written once, together with the evidence whose
# first PEG belongs to one of the genomes in the genome hash.
#
# Returns whatever _FinishAll returns (documented as a statistics object).
sub LoadCouplingData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeFilter = $self->{genomes};
    # Set up an ID counter for the PCHs.
    my $pchID = 0;
    # Start the loads.
    my $loadCoupling = $self->_TableLoader('Coupling');
    my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);
    my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);
    my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);
    my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating coupling data.") if T(2);
        # Loop through the genomes found.
        for my $genome (sort keys %{$genomeFilter}) {
            Trace("Generating coupling data for $genome.") if T(3);
            $loadCoupling->Add("genomeIn");
            # Create a hash table for holding coupled pairs. We use this to
            # prevent duplicates: if A is coupled to B, we don't want to also
            # assert that B is coupled to A. The table is reset for each
            # genome, which keeps it reasonably small.
            my %dupHash = ();
            # Get all of the genome's PEGs.
            my @pegs = $fig->pegs_of($genome);
            # Loop through the PEGs.
            for my $peg1 (@pegs) {
                $loadCoupling->Add("pegIn");
                Trace("Processing PEG $peg1 for $genome.") if T(4);
                # Get a list of the coupled PEGs.
                my @couplings = $fig->coupled_to($peg1);
                # For each coupled PEG, we create the coupling unless we have
                # already seen it.
                for my $coupleData (@couplings) {
                    my ($peg2, $score) = @{$coupleData};
                    # Compute the coupling ID.
                    my $coupleID = $self->{erdb}->CouplingID($peg1, $peg2);
                    next if exists $dupHash{$coupleID};
                    $loadCoupling->Add("couplingIn");
                    # Here we have a new coupling to store in the load files.
                    Trace("Storing coupling ($coupleID) with score $score.") if T(4);
                    # Ensure we don't do this again.
                    $dupHash{$coupleID} = $score;
                    # Write the coupling record.
                    $loadCoupling->Put($coupleID, $score);
                    # Connect it to the coupled PEGs.
                    $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);
                    $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);
                    # Get the evidence for this coupling.
                    my @evidence = $fig->coupling_evidence($peg1, $peg2);
                    # Organize the evidence into a hash table.
                    my %evidenceMap = ();
                    # Process each evidence item.
                    for my $evidenceData (@evidence) {
                        $loadPCH->Add("evidenceIn");
                        my ($peg3, $peg4, $usage) = @{$evidenceData};
                        # Only proceed if the evidence is from a Sprout genome.
                        # We test for the key's existence, not the truth of its
                        # access code, so a genome with access code 0 counts.
                        my $evidenceGenome = $fig->genome_of($peg3);
                        if (exists $genomeFilter->{$evidenceGenome}) {
                            $loadUsesAsEvidence->Add("evidenceChosen");
                            my $evidenceKey = "$coupleID $peg3 $peg4";
                            # We store this evidence in the hash if the usage
                            # is nonzero or no prior evidence has been found.
                            # This ensures that if there is duplicate evidence,
                            # we at least keep the meaningful ones. Only
                            # evidence in the hash makes it to the output.
                            if ($usage || ! exists $evidenceMap{$evidenceKey}) {
                                $evidenceMap{$evidenceKey} = $evidenceData;
                            }
                        }
                    }
                    # The keys are sorted so that PCH IDs are assigned in a
                    # reproducible order from one run to the next.
                    for my $evidenceID (sort keys %evidenceMap) {
                        # Get the ID for this evidence.
                        $pchID++;
                        # Create the evidence record.
                        my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};
                        $loadPCH->Put($pchID, $usage);
                        # Connect it to the coupling.
                        $loadIsEvidencedBy->Put($coupleID, $pchID);
                        # Connect it to the features.
                        $loadUsesAsEvidence->Put($pchID, $peg3, 1);
                        $loadUsesAsEvidence->Put($pchID, $peg4, 2);
                    }
                }
            }
        }
    }
    # All done. Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
434 :    
435 :     =head3 LoadFeatureData
436 :    
437 :     C<< my $stats = $spl->LoadFeatureData(); >>
438 :    
439 :     Load the feature data from FIG into Sprout.
440 :    
441 :     Features represent annotated genes, and are therefore the heart of the data store.
442 :    
443 :     The following relations are loaded by this method.
444 :    
445 :     Feature
446 :     FeatureAlias
447 :     FeatureLink
448 :     FeatureTranslation
449 :     FeatureUpstream
450 :     IsLocatedIn
451 : parrello 1.30 HasFeature
452 : parrello 1.1
453 :     =over 4
454 :    
455 :     =item RETURNS
456 :    
457 :     Returns a statistics object for the loads.
458 :    
459 :     =back
460 :    
461 :     =cut
462 :     #: Return Type $%;
# Generate (unless in load-only mode) and finish the loads for the Feature,
# IsLocatedIn, FeatureAlias, FeatureLink, FeatureTranslation, FeatureUpstream,
# and HasFeature tables.
#
# Features are processed genome by genome in feature-ID order; a feature ID
# that repeats the previous one is reported and skipped. Each location is
# peeled into pieces using the Sprout object's MaxSegment value.
#
# Returns whatever _FinishAll returns (documented as a statistics object).
sub LoadFeatureData {
    my $self = shift;
    # The FIG object is the source of all the feature data.
    my $fig = $self->{fig};
    # The genome table tells us which genomes to process.
    my $genomes = $self->{genomes};
    # Set up the table loaders, in the same order as always.
    my $featureLoader     = $self->_TableLoader('Feature');
    my $locatedInLoader   = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);
    my $aliasLoader       = $self->_TableLoader('FeatureAlias');
    my $linkLoader        = $self->_TableLoader('FeatureLink');
    my $translationLoader = $self->_TableLoader('FeatureTranslation');
    my $upstreamLoader    = $self->_TableLoader('FeatureUpstream');
    my $hasFeatureLoader  = $self->_TableLoader('HasFeature');
    # Locations longer than this must be split into multiple segments.
    my $maxSegment = $self->{sprout}->MaxSegment();
    if (! $self->{options}->{loadOnly}) {
        Trace("Generating feature data.") if T(2);
        # Process the genomes in ID order.
        for my $genomeID (sort keys %{$genomes}) {
            Trace("Loading features for genome $genomeID.") if T(3);
            $featureLoader->Add("genomeIn");
            # Fetch this genome's features and put them in feature-ID order so
            # that duplicates end up next to each other.
            my $featureList = $fig->all_features_detailed($genomeID);
            my @sortedTuples = sort { $a->[0] cmp $b->[0] } @{$featureList};
            my $featureCount = scalar @sortedTuples;
            Trace("$featureCount features found for genome $genomeID.") if T(3);
            # This remembers the last feature ID seen, for the duplicate check.
            my $previousID = "";
            for my $tuple (@sortedTuples) {
                # The tuple holds the ID, the location string, an entry we do
                # not use, and the feature type.
                my ($featureID, $locations, undef, $type) = @{$tuple};
                # Skip a feature we have already processed.
                if ($featureID eq $previousID) {
                    Trace("Duplicate feature $featureID found.") if T(1);
                    next;
                }
                $previousID = $featureID;
                # Count and write the feature, then attach it to its genome.
                $featureLoader->Add("featureIn");
                $featureLoader->Put($featureID, 1, $type);
                $hasFeatureLoader->Put($genomeID, $featureID, $type);
                # Write the aliases.
                for my $alias ($fig->feature_aliases($featureID)) {
                    $aliasLoader->Put($featureID, $alias);
                }
                # Write the links.
                my @links = $fig->fid_links($featureID);
                for my $link (@links) {
                    $linkLoader->Put($featureID, $link);
                }
                # Only pegs get a translation and an upstream sequence.
                if ($type eq 'peg') {
                    $translationLoader->Add("pegIn");
                    my $translation = $fig->get_translation($featureID);
                    $translationLoader->Put($featureID, $translation) if $translation;
                    # We use the default upstream values of u=200 and c=100.
                    my $upstream = $fig->upstream_of($featureID, 200, 100);
                    $upstreamLoader->Put($featureID, $upstream) if $upstream;
                }
                # Finally, relate the feature to its contig locations. No
                # single IsLocatedIn record may exceed the maximum segment
                # size, so each location is peeled into pieces. The sequence
                # number runs across all the pieces of all the locations.
                my $sequenceNumber = 1;
                for my $location (split /\s*,\s*/, $locations) {
                    my $locObject = BasicLocation->new("$genomeID:$location");
                    # Collect the peeled pieces, then whatever is left over.
                    my @pieces = ();
                    while (my $piece = $locObject->Peel($maxSegment)) {
                        $locatedInLoader->Add("peeling");
                        push @pieces, $piece;
                    }
                    push @pieces, $locObject;
                    # Write one IsLocatedIn record per piece.
                    for my $piece (@pieces) {
                        $locatedInLoader->Put($featureID, $piece->Contig, $piece->Left,
                                              $piece->Dir, $piece->Length, $sequenceNumber);
                        $sequenceNumber++;
                    }
                }
            }
        }
    } else {
        Trace("Loading from existing files.") if T(2);
    }
    # Finish the loads and pass back the result.
    my $stats = $self->_FinishAll();
    return $stats;
}
568 :    
569 :     =head3 LoadBBHData
570 :    
571 :     C<< my $stats = $spl->LoadBBHData(); >>
572 :    
573 :     Load the bidirectional best hit data from FIG into Sprout.
574 :    
575 :     Sprout does not store information on similarities. Instead, it has only the
576 :     bi-directional best hits. Even so, the BBH table is one of the largest in
577 :     the database.
578 :    
579 :     The following relations are loaded by this method.
580 :    
581 :     IsBidirectionalBestHitOf
582 :    
583 :     =over 4
584 :    
585 :     =item RETURNS
586 :    
587 :     Returns a statistics object for the loads.
588 :    
589 :     =back
590 :    
591 :     =cut
592 :     #: Return Type $%;
# Generate (unless in load-only mode) and finish the load for the
# IsBidirectionalBestHitOf table.
#
# For each genome in the genome hash, every feature is asked for its
# bidirectional best hits; a hit is written only when the target feature's
# genome is also in the genome hash.
#
# Returns whatever _FinishAll returns (documented as a statistics object).
sub LoadBBHData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the table of genome IDs.
    my $genomeHash = $self->{genomes};
    # Create load objects for each of the tables we're loading.
    my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating BBH data.") if T(2);
        # Now we loop through the genomes, generating the data for each one.
        for my $genomeID (sort keys %{$genomeHash}) {
            $loadIsBidirectionalBestHitOf->Add("genomeIn");
            Trace("Processing features for genome $genomeID.") if T(3);
            # Get the feature list for this genome.
            my $features = $fig->all_features_detailed($genomeID);
            # Count the BBHs we find.
            my $bbhCount = 0;
            # Loop through the features.
            for my $featureData (@{$features}) {
                # Only the feature ID (first tuple entry) is needed here.
                my ($featureID) = @{$featureData};
                # Get the bi-directional best hits.
                my @bbhList = $fig->bbhs($featureID);
                for my $bbhEntry (@bbhList) {
                    # Get the target feature ID and the score.
                    my ($targetID, $score) = @{$bbhEntry};
                    # Check the target feature's genome.
                    my $targetGenomeID = $fig->genome_of($targetID);
                    # Only proceed if it's one of our genomes. We test for the
                    # key's existence, not the truth of its access code, so a
                    # genome with access code 0 still counts.
                    if (exists $genomeHash->{$targetGenomeID}) {
                        $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,
                                                           $score);
                        $bbhCount++;
                    }
                }
            }
            Trace("$bbhCount BBHs found for $genomeID.") if T(3);
        }
    }
    # Finish the loads.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
640 :    
641 :     =head3 LoadSubsystemData
642 :    
643 :     C<< my $stats = $spl->LoadSubsystemData(); >>
644 :    
645 :     Load the subsystem data from FIG into Sprout.
646 :    
647 :     Subsystems are groupings of genetic roles that work together to effect a specific
648 :     chemical reaction. Similar organisms require similar subsystems. To curate a subsystem,
649 :     a spreadsheet is created with genomes on one axis and subsystem roles on the other
650 :     axis. Similar features are then mapped into the cells, allowing the annotation of one
651 :     genome's roles to be used to assist in the annotation of others.
652 :    
653 :     The following relations are loaded by this method.
654 :    
655 :     Subsystem
656 : parrello 1.46 SubsystemClass
657 : parrello 1.1 Role
658 : parrello 1.19 RoleEC
659 : parrello 1.1 SSCell
660 :     ContainsFeature
661 :     IsGenomeOf
662 :     IsRoleOf
663 :     OccursInSubsystem
664 :     ParticipatesIn
665 :     HasSSCell
666 : parrello 1.18 ConsistsOfRoles
667 :     RoleSubset
668 :     HasRoleSubset
669 :     ConsistsOfGenomes
670 :     GenomeSubset
671 :     HasGenomeSubset
672 : parrello 1.20 Catalyzes
673 : parrello 1.21 Diagram
674 :     RoleOccursIn
675 : parrello 1.1
676 :     =over 4
677 :    
678 :     =item RETURNS
679 :    
680 :     Returns a statistics object for the loads.
681 :    
682 :     =back
683 :    
684 :     =cut
685 :     #: Return Type $%;
686 :     sub LoadSubsystemData {
687 :     # Get this object instance.
688 :     my ($self) = @_;
689 :     # Get the FIG object.
690 :     my $fig = $self->{fig};
691 :     # Get the genome hash. We'll use it to filter the genomes in each
692 :     # spreadsheet.
693 :     my $genomeHash = $self->{genomes};
694 :     # Get the subsystem hash. This lists the subsystems we'll process.
695 :     my $subsysHash = $self->{subsystems};
696 :     my @subsysIDs = sort keys %{$subsysHash};
697 : parrello 1.21 # Get the map list.
698 :     my @maps = $fig->all_maps;
699 : parrello 1.1 # Create load objects for each of the tables we're loading.
700 : parrello 1.25 my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly);
701 :     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly);
702 : parrello 1.23 my $loadSubsystem = $self->_TableLoader('Subsystem');
703 : parrello 1.25 my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly);
704 :     my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly);
705 :     my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly);
706 :     my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly);
707 :     my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly);
708 :     my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly);
709 :     my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly);
710 :     my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly);
711 :     my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly);
712 :     my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly);
713 :     my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly);
714 :     my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly);
715 :     my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly);
716 :     my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly);
717 :     my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly);
718 :     my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly);
719 : parrello 1.46 my $loadSubsystemClass = $self->_TableLoader('SubsystemClass', $self->PrimaryOnly);
720 : parrello 1.23 if ($self->{options}->{loadOnly}) {
721 :     Trace("Loading from existing files.") if T(2);
722 :     } else {
723 :     Trace("Generating subsystem data.") if T(2);
724 :     # This hash will contain the role for each EC. When we're done, this
725 :     # information will be used to generate the Catalyzes table.
726 :     my %ecToRoles = ();
727 :     # Loop through the subsystems. Our first task will be to create the
728 :     # roles. We do this by looping through the subsystems and creating a
729 :     # role hash. The hash tracks each role ID so that we don't create
730 :     # duplicates. As we move along, we'll connect the roles and subsystems
731 :     # and memorize up the reactions.
732 :     my ($genomeID, $roleID);
733 :     my %roleData = ();
734 :     for my $subsysID (@subsysIDs) {
735 :     # Get the subsystem object.
736 :     my $sub = $fig->get_subsystem($subsysID);
737 : parrello 1.32 # Only proceed if the subsystem has a spreadsheet.
738 :     if (! $sub->{empty_ss}) {
739 : parrello 1.31 Trace("Creating subsystem $subsysID.") if T(3);
740 :     $loadSubsystem->Add("subsystemIn");
741 :     # Create the subsystem record.
742 :     my $curator = $sub->get_curator();
743 :     my $notes = $sub->get_notes();
744 :     $loadSubsystem->Put($subsysID, $curator, $notes);
745 : parrello 1.64 my $classList = $fig->subsystem_classification($subsysID);
746 :     my @classes = @$classList;
747 :     if (@classes) {
748 :     for my $class (@classes) {
749 :     $loadSubsystemClass->Put($subsysID, $class);
750 :     }
751 : parrello 1.46 }
752 : parrello 1.31 # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
753 :     for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
754 :     # Connect to this role.
755 :     $loadOccursInSubsystem->Add("roleIn");
756 :     $loadOccursInSubsystem->Put($roleID, $subsysID, $col);
757 :     # If it's a new role, add it to the role table.
758 :     if (! exists $roleData{$roleID}) {
759 :     # Get the role's abbreviation.
760 :     my $abbr = $sub->get_role_abbr($col);
761 :     # Add the role.
762 :     $loadRole->Put($roleID, $abbr);
763 :     $roleData{$roleID} = 1;
764 :     # Check for an EC number.
765 :     if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {
766 :     my $ec = $1;
767 :     $loadRoleEC->Put($roleID, $ec);
768 :     $ecToRoles{$ec} = $roleID;
769 :     }
770 : parrello 1.23 }
771 : parrello 1.18 }
772 : parrello 1.31 # Now we create the spreadsheet for the subsystem by matching roles to
773 :     # genomes. Each genome is a row and each role is a column. We may need
774 :     # to actually create the roles as we find them.
775 :     Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
776 :     for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
777 :     # Only proceed if this is one of our genomes.
778 :     if (exists $genomeHash->{$genomeID}) {
779 :     # Count the PEGs and cells found for verification purposes.
780 :     my $pegCount = 0;
781 :     my $cellCount = 0;
782 :     # Create a list for the PEGs we find. This list will be used
783 :     # to generate cluster numbers.
784 :     my @pegsFound = ();
785 :     # Create a hash that maps spreadsheet IDs to PEGs. We will
786 :     # use this to generate the ContainsFeature data after we have
787 :     # the cluster numbers.
788 :     my %cellPegs = ();
789 :     # Get the genome's variant code for this subsystem.
790 :     my $variantCode = $sub->get_variant_code($row);
791 :     # Loop through the subsystem's roles. We use an index because it is
792 :     # part of the spreadsheet cell ID.
793 :     for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
794 :     # Get the features in the spreadsheet cell for this genome and role.
795 : parrello 1.37 my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col);
796 : parrello 1.31 # Only proceed if features exist.
797 :     if (@pegs > 0) {
798 :     # Create the spreadsheet cell.
799 :     $cellCount++;
800 :     my $cellID = "$subsysID:$genomeID:$col";
801 :     $loadSSCell->Put($cellID);
802 :     $loadIsGenomeOf->Put($genomeID, $cellID);
803 :     $loadIsRoleOf->Put($roleID, $cellID);
804 :     $loadHasSSCell->Put($subsysID, $cellID);
805 :     # Remember its features.
806 :     push @pegsFound, @pegs;
807 :     $cellPegs{$cellID} = \@pegs;
808 :     $pegCount += @pegs;
809 :     }
810 : parrello 1.23 }
811 : parrello 1.31 # If we found some cells for this genome, we need to compute clusters and
812 :     # denote it participates in the subsystem.
813 :     if ($pegCount > 0) {
814 :     Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
815 :     $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
816 :     # Create a hash mapping PEG IDs to cluster numbers.
817 :     # We default to -1 for all of them.
818 :     my %clusterOf = map { $_ => -1 } @pegsFound;
819 : parrello 1.41 # Partition the PEGs found into clusters.
820 :     my @clusters = $fig->compute_clusters([keys %clusterOf], $sub);
821 : parrello 1.31 for (my $i = 0; $i <= $#clusters; $i++) {
822 :     my $subList = $clusters[$i];
823 :     for my $peg (@{$subList}) {
824 :     $clusterOf{$peg} = $i;
825 :     }
826 : parrello 1.23 }
827 : parrello 1.31 # Create the ContainsFeature data.
828 :     for my $cellID (keys %cellPegs) {
829 :     my $cellList = $cellPegs{$cellID};
830 :     for my $cellPeg (@$cellList) {
831 :     $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
832 :     }
833 : parrello 1.23 }
834 : parrello 1.18 }
835 :     }
836 : parrello 1.15 }
837 : parrello 1.31 # Now we need to generate the subsets. The subset names must be concatenated to
838 :     # the subsystem name to make them unique keys. There are two types of subsets:
839 :     # genome subsets and role subsets. We do the role subsets first.
840 :     my @subsetNames = $sub->get_subset_names();
841 :     for my $subsetID (@subsetNames) {
842 :     # Create the subset record.
843 :     my $actualID = "$subsysID:$subsetID";
844 :     $loadRoleSubset->Put($actualID);
845 :     # Connect the subset to the subsystem.
846 :     $loadHasRoleSubset->Put($subsysID, $actualID);
847 :     # Connect the subset to its roles.
848 :     my @roles = $sub->get_subsetC_roles($subsetID);
849 :     for my $roleID (@roles) {
850 :     $loadConsistsOfRoles->Put($actualID, $roleID);
851 :     }
852 :     }
853 :     # Next the genome subsets.
854 :     @subsetNames = $sub->get_subset_namesR();
855 :     for my $subsetID (@subsetNames) {
856 :     # Create the subset record.
857 :     my $actualID = "$subsysID:$subsetID";
858 :     $loadGenomeSubset->Put($actualID);
859 :     # Connect the subset to the subsystem.
860 :     $loadHasGenomeSubset->Put($subsysID, $actualID);
861 :     # Connect the subset to its genomes.
862 :     my @genomes = $sub->get_subsetR($subsetID);
863 :     for my $genomeID (@genomes) {
864 :     $loadConsistsOfGenomes->Put($actualID, $genomeID);
865 :     }
866 : parrello 1.23 }
867 : parrello 1.18 }
868 : parrello 1.57 }
869 :     # Now we loop through the diagrams. We need to create the diagram records
870 :     # and link each diagram to its roles. Note that only roles which occur
871 :     # in subsystems (and therefore appear in the %ecToRoles hash) are
872 :     # included.
873 :     for my $map (@maps) {
874 :     Trace("Loading diagram $map.") if T(3);
875 :     # Get the diagram's descriptive name.
876 :     my $name = $fig->map_name($map);
877 :     $loadDiagram->Put($map, $name);
878 :     # Now we need to link all the map's roles to it.
879 :     # A hash is used to prevent duplicates.
880 :     my %roleHash = ();
881 :     for my $role ($fig->map_to_ecs($map)) {
882 :     if (exists $ecToRoles{$role} && ! $roleHash{$role}) {
883 :     $loadRoleOccursIn->Put($ecToRoles{$role}, $map);
884 :     $roleHash{$role} = 1;
885 : parrello 1.23 }
886 : parrello 1.21 }
887 : parrello 1.57 }
888 :     # Before we leave, we must create the Catalyzes table. We start with the reactions,
889 :     # then use the "ecToRoles" table to convert EC numbers to role IDs.
890 :     my @reactions = $fig->all_reactions();
891 :     for my $reactionID (@reactions) {
892 :     # Get this reaction's list of roles. The results will be EC numbers.
893 :     my @roles = $fig->catalyzed_by($reactionID);
894 :     # Loop through the roles, creating catalyzation records.
895 :     for my $thisRole (@roles) {
896 :     if (exists $ecToRoles{$thisRole}) {
897 :     $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID);
898 : parrello 1.23 }
899 : parrello 1.18 }
900 :     }
901 : parrello 1.1 }
902 :     # Finish the load.
903 :     my $retVal = $self->_FinishAll();
904 :     return $retVal;
905 :     }
906 :    
907 :     =head3 LoadPropertyData
908 :    
909 :     C<< my $stats = $spl->LoadPropertyData(); >>
910 :    
911 :     Load the attribute data from FIG into Sprout.
912 :    
913 :     Attribute data in FIG corresponds to the Sprout concept of Property. As currently
914 :     implemented, each key-value attribute combination in the SEED corresponds to a
915 :     record in the B<Property> table. The B<HasProperty> relationship links the
916 :     features to the properties.
917 :    
918 :     The SEED also allows attributes to be assigned to genomes, but this is not yet
919 :     supported by Sprout.
920 :    
921 :     The following relations are loaded by this method.
922 :    
923 :     HasProperty
924 :     Property
925 :    
926 :     =over 4
927 :    
928 :     =item RETURNS
929 :    
930 :     Returns a statistics object for the loads.
931 :    
932 :     =back
933 :    
934 :     =cut
935 :     #: Return Type $%;
sub LoadPropertyData {
    # Build the Property and HasProperty load files from the attributes
    # attached to the features of every genome in this load. Each distinct
    # key/value pair is written once to Property under a sequential ID, and
    # every feature carrying that pair is linked to it through HasProperty.
    # The value returned is whatever _FinishAll() produces.
    my ($self) = @_;
    # The FIG-like data source and the hash whose keys are our genome IDs.
    my $fig = $self->{fig};
    my $genomeHash = $self->{genomes};
    # Set up one loader per table.
    my $loadProperty = $self->_TableLoader('Property');
    my $loadHasProperty = $self->_TableLoader('HasProperty', $self->PrimaryOnly);
    if (! $self->{options}->{loadOnly}) {
        Trace("Generating property data.") if T(2);
        # Maps "key<TAB>value" to the property ID already assigned to it.
        my %idOfPair = ();
        # The ID that will be given to the next new key/value pair. IDs start
        # at 1, so a false lookup result always means "not seen yet".
        my $idCounter = 1;
        for my $genomeID (keys %{$genomeHash}) {
            $loadProperty->Add("genomeIn");
            Trace("Generating properties for $genomeID.") if T(3);
            # Per-genome tallies for the statistics.
            my ($featureCount, $propertyCount) = (0, 0);
            # "all_features_detailed" is used so that features of every type
            # are visited. The feature ID is the first element of each tuple.
            for my $featureData (@{$fig->all_features_detailed($genomeID)}) {
                my $fid = $featureData->[0];
                # Attributes are requested one feature at a time so that no
                # genome-level attributes come back.
                my @attributeList = $fig->get_attributes($fid, '', '', '');
                # A feature is counted only if it has at least one attribute.
                $featureCount++ if @attributeList;
                for my $attribute (@attributeList) {
                    $propertyCount++;
                    # The first element is the feature ID again, so skip it.
                    my (undef, $key, $value, $url) = @{$attribute};
                    # A tab separates key and value in the lookup string.
                    my $pairKey = "$key\t$value";
                    my $propertyID = $idOfPair{$pairKey};
                    if (! $propertyID) {
                        # First sighting of this pair: allocate an ID and
                        # emit the Property record.
                        $propertyID = $idCounter++;
                        $idOfPair{$pairKey} = $propertyID;
                        $loadProperty->Put($propertyID, $key, $value);
                    }
                    # Link the feature to the property.
                    $loadHasProperty->Put($fid, $propertyID, $url);
                }
            }
            # Record this genome's totals.
            Trace("$propertyCount attributes processed for $featureCount features.") if T(3);
            $loadHasProperty->Add("featuresIn", $featureCount);
            $loadHasProperty->Add("propertiesIn", $propertyCount);
        }
    } else {
        Trace("Loading from existing files.") if T(2);
    }
    # Close out all the loaders and hand back the result.
    my $stats = $self->_FinishAll();
    return $stats;
}
1005 :    
1006 :     =head3 LoadAnnotationData
1007 :    
1008 :     C<< my $stats = $spl->LoadAnnotationData(); >>
1009 :    
1010 :     Load the annotation data from FIG into Sprout.
1011 :    
1012 :     Sprout annotations encompass both the assignments and the annotations in SEED.
1013 :     These describe the function performed by a PEG as well as any other useful
1014 :     information that may aid in identifying its purpose.
1015 :    
1016 :     The following relations are loaded by this method.
1017 :    
1018 :     Annotation
1019 :     IsTargetOfAnnotation
1020 :     SproutUser
1021 :     MadeAnnotation
1022 :    
1023 :     =over 4
1024 :    
1025 :     =item RETURNS
1026 :    
1027 :     Returns a statistics object for the loads.
1028 :    
1029 :     =back
1030 :    
1031 :     =cut
1032 :     #: Return Type $%;
sub LoadAnnotationData {
    # Build the Annotation, IsTargetOfAnnotation, SproutUser, UserAccess and
    # MadeAnnotation load files from the annotations of every genome in this
    # load. Annotations whose time stamp is not a plain integer are traced
    # and skipped. The value returned is whatever _FinishAll() produces.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    # Create load objects for each of the tables we're loading.
    my $loadAnnotation = $self->_TableLoader('Annotation');
    my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);
    my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);
    my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);
    my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating annotation data.") if T(2);
        # Create a hash of user names. We'll use this to prevent us from
        # generating duplicate user records. FIG and "master" are written
        # immediately below, so they are pre-registered here.
        my %users = ( FIG => 1, master => 1 );
        $loadSproutUser->Put("FIG", "Fellowship for Interpretation of Genomes");
        $loadUserAccess->Put("FIG", 1);
        $loadSproutUser->Put("master", "Master User");
        $loadUserAccess->Put("master", 1);
        # Loop through the genomes.
        for my $genomeID (sort keys %{$genomeHash}) {
            Trace("Processing $genomeID.") if T(3);
            # Annotation IDs already issued for this genome. An ID has the
            # form "peg:timestamp", so this prevents two annotations of the
            # same PEG from receiving the same ID.
            my %seenTimestamps = ();
            # Get the genome's annotations.
            my @annotations = $fig->read_all_annotations($genomeID);
            Trace("Processing annotations.") if T(2);
            for my $tuple (@annotations) {
                # Get the annotation tuple.
                my ($peg, $timestamp, $user, $text) = @{$tuple};
                # Here we fix up the annotation text. "\r" is removed,
                # and "\t" and "\n" are escaped. Note we use the "gs"
                # modifier so that new-lines inside the text do not
                # stop the substitution search.
                $text =~ s/\r//gs;
                $text =~ s/\t/\\t/gs;
                $text =~ s/\n/\\n/gs;
                # Change assignments by the master user to FIG assignments.
                $text =~ s/Set master function/Set FIG function/s;
                # Insure the time stamp is valid: only an unsigned integer
                # is accepted.
                if ($timestamp !~ /^\d+$/) {
                    Trace("Invalid time stamp \"$timestamp\" in annotations for $peg.") if T(1);
                    next;
                }
                # The time stamp is a number. Bump a copy of it until the
                # resulting key has not been used for this PEG. Only the
                # key is adjusted; the annotation record itself keeps the
                # original time stamp.
                my $keyStamp = $timestamp;
                while ($seenTimestamps{"$peg:$keyStamp"}) {
                    $keyStamp++;
                }
                my $annotationID = "$peg:$keyStamp";
                $seenTimestamps{$annotationID} = 1;
                # Insure the user exists.
                if (! $users{$user}) {
                    $loadSproutUser->Put($user, "SEED user");
                    $loadUserAccess->Put($user, 1);
                    $users{$user} = 1;
                }
                # Generate the annotation and connect it to its PEG and
                # its author.
                $loadAnnotation->Put($annotationID, $timestamp, $text);
                $loadIsTargetOfAnnotation->Put($peg, $annotationID);
                $loadMadeAnnotation->Put($user, $annotationID);
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1112 :    
1113 : parrello 1.5 =head3 LoadSourceData
1114 :    
1115 :     C<< my $stats = $spl->LoadSourceData(); >>
1116 :    
1117 :     Load the source data from FIG into Sprout.
1118 :    
1119 :     Source data links genomes to information about the organizations that
1120 :     mapped them.
1121 :    
1122 :     The following relations are loaded by this method.
1123 :    
1124 :     ComesFrom
1125 :     Source
1126 :     SourceURL
1127 :    
1128 :     There is no direct support for source attribution in FIG, so we access the SEED
1129 :     files directly.
1130 :    
1131 :     =over 4
1132 :    
1133 :     =item RETURNS
1134 :    
1135 :     Returns a statistics object for the loads.
1136 :    
1137 :     =back
1138 :    
1139 :     =cut
1140 :     #: Return Type $%;
sub LoadSourceData {
    # Build the ComesFrom, Source and SourceURL load files. The data comes
    # from the first line of each genome's PROJECT file, which is read as
    # tab-separated "sourceID, description, URL". Genomes whose PROJECT file
    # cannot be opened or is empty are silently skipped. The value returned
    # is whatever _FinishAll() produces.
    my ($self) = @_;
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    # Create load objects for each of the tables we're loading.
    my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);
    my $loadSource = $self->_TableLoader('Source');
    my $loadSourceURL = $self->_TableLoader('SourceURL');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating source data.") if T(2);
        # Create hashes to collect the Source information. %sourceURL flags
        # the sources whose URL has already been written; %sourceDesc holds
        # the description to write for each source.
        my %sourceURL = ();
        my %sourceDesc = ();
        # Loop through the genomes.
        for my $genomeID (sort keys %{$genomeHash}) {
            Trace("Processing $genomeID.") if T(3);
            # Open the project file. A lexical handle is used so that it is
            # only closed when the open actually succeeded.
            my $projectFile = "$FIG_Config::organisms/$genomeID/PROJECT";
            if (open(my $projectHandle, '<', $projectFile)) {
                # Only the first line is of interest.
                my $line = <$projectHandle>;
                close $projectHandle;
                if (defined $line) {
                    chomp $line;
                    my ($sourceID, $desc, $url) = split(/\t/, $line);
                    if (! defined $sourceID || $sourceID eq '') {
                        # A blank first line yields no source ID, so there
                        # is nothing meaningful to record for this genome.
                        Trace("No source ID found in PROJECT file for $genomeID.") if T(1);
                    } else {
                        $loadComesFrom->Put($genomeID, $sourceID);
                        # Write a source's URL only the first time we see one.
                        if ($url && ! exists $sourceURL{$sourceID}) {
                            $loadSourceURL->Put($sourceID, $url);
                            $sourceURL{$sourceID} = 1;
                        }
                        # The most recent non-empty description wins. If no
                        # description has been seen, the ID stands in for it.
                        if ($desc) {
                            $sourceDesc{$sourceID} = $desc;
                        } elsif (! exists $sourceDesc{$sourceID}) {
                            $sourceDesc{$sourceID} = $sourceID;
                        }
                    }
                }
            }
        }
        # Write the source descriptions.
        for my $sourceID (keys %sourceDesc) {
            $loadSource->Put($sourceID, $sourceDesc{$sourceID});
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1190 :    
1191 : parrello 1.6 =head3 LoadExternalData
1192 :    
1193 :     C<< my $stats = $spl->LoadExternalData(); >>
1194 :    
1195 :     Load the external data from FIG into Sprout.
1196 :    
1197 :     External data contains information about external feature IDs.
1198 :    
1199 :     The following relations are loaded by this method.
1200 :    
1201 :     ExternalAliasFunc
1202 :     ExternalAliasOrg
1203 :    
1204 :     The support for external IDs in FIG is hidden beneath layers of other data, so
1205 :     we access the SEED files directly to create these tables. This is also one of
1206 :     the few load methods that does not proceed genome by genome.
1207 :    
1208 :     =over 4
1209 :    
1210 :     =item RETURNS
1211 :    
1212 :     Returns a statistics object for the loads.
1213 :    
1214 :     =back
1215 :    
1216 :     =cut
1217 :     #: Return Type $%;
sub LoadExternalData {
    # Build the ExternalAliasOrg and ExternalAliasFunc load files from the
    # global "ext_org.table" and "ext_func.table" files. Each file is piped
    # through "sort -u" on its first tab-delimited field before being read.
    # This method does not iterate over genomes. The value returned is
    # whatever _FinishAll() produces.
    my ($self) = @_;
    # Create load objects for each of the tables we're loading.
    my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
    my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating external data.") if T(2);
        # We loop through the files one at a time. First, the organism file.
        Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |");
        while (defined(my $orgLine = <ORGS>)) {
            # Clean the input line.
            chomp $orgLine;
            # Only proceed if the line is non-blank; a blank line has no
            # protein ID to record.
            next if $orgLine eq '';
            # Parse the organism name. White space around the tab separator
            # is discarded.
            my ($protID, $name) = split /\s*\t\s*/, $orgLine;
            $loadExternalAliasOrg->Put($protID, $name);
        }
        close ORGS;
        # Now the function file.
        Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |");
        while (defined(my $funcLine = <FUNCS>)) {
            # Clean the line ending.
            chomp $funcLine;
            # Only proceed if the line is non-blank.
            if ($funcLine) {
                # Split it into fields.
                my @funcFields = split /\s*\t\s*/, $funcLine;
                # If the third field holds an EC number, append it to the
                # description in the second field.
                if ($#funcFields >= 2 && $funcFields[2] =~ /^(EC .*\S)/) {
                    $funcFields[1] .= " $1";
                }
                # Output the function line (ID and description only).
                $loadExternalAliasFunc->Put(@funcFields[0,1]);
            }
        }
        # Release the sort pipe for the function file as well.
        close FUNCS;
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1269 : parrello 1.5
1270 : parrello 1.18
1271 :     =head3 LoadReactionData
1272 :    
1273 :     C<< my $stats = $spl->LoadReactionData(); >>
1274 :    
1275 :     Load the reaction data from FIG into Sprout.
1276 :    
1277 :     Reaction data connects reactions to the compounds that participate in them.
1278 :    
1279 :     The following relations are loaded by this method.
1280 :    
1281 : parrello 1.20 Reaction
1282 : parrello 1.18 ReactionURL
1283 :     Compound
1284 :     CompoundName
1285 :     CompoundCAS
1286 :     IsAComponentOf
1287 :    
1288 :     This method proceeds reaction by reaction rather than genome by genome.
1289 :    
1290 :     =over 4
1291 :    
1292 :     =item RETURNS
1293 :    
1294 :     Returns a statistics object for the loads.
1295 :    
1296 :     =back
1297 :    
1298 :     =cut
1299 :     #: Return Type $%;
sub LoadReactionData {
    # Build the Reaction, ReactionURL, Compound, CompoundName, CompoundCAS
    # and IsAComponentOf load files. Compounds are written first, then each
    # reaction together with links to its substrates and products. The value
    # returned is whatever _FinishAll() produces.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Create load objects for each of the tables we're loading.
    my $loadReaction = $self->_TableLoader('Reaction');
    my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);
    my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);
    my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);
    my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);
    my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating reaction data.") if T(2);
        # First we create the compounds.
        my @compounds = $fig->all_compounds();
        for my $cid (@compounds) {
            # Check for names.
            my @names = $fig->names_of_compound($cid);
            # Each name will be given a priority number, starting with 1.
            my $prio = 1;
            for my $name (@names) {
                $loadCompoundName->Put($cid, $name, $prio++);
            }
            # Create the main compound record. Note that the first name
            # becomes the label; a compound with no names is labelled with
            # its own ID.
            my $label = (@names > 0 ? $names[0] : $cid);
            $loadCompound->Put($cid, $label);
            # Check for a CAS ID.
            my $cas = $fig->cas($cid);
            if ($cas) {
                $loadCompoundCAS->Put($cid, $cas);
            }
        }
        # All the compounds are set up, so we need to loop through the
        # reactions next. First, we initialize the discriminator index. This
        # is a single integer, incremented for every link written, used to
        # insure duplicate elements in a reaction are not accidentally
        # collapsed.
        my $discrim = 0;
        my @reactions = $fig->all_reactions();
        for my $reactionID (@reactions) {
            # Create the reaction record.
            $loadReaction->Put($reactionID, $fig->reversible($reactionID));
            # Compute the reaction's URL.
            my $url = HTML::reaction_link($reactionID);
            # Put it in the ReactionURL table.
            $loadReactionURL->Put($reactionID, $url);
            # Now we need all of the reaction's compounds. We get these in
            # two phases, substrates first (0) and then products (1).
            for my $product (0, 1) {
                # Get the compounds of the current type for the current
                # reaction. FIG will give us 3-tuples:
                # [ID, stoichiometry, main-flag]. At this time we do not
                # have location data in SEED, so it defaults to the empty
                # string.
                my @participants = $fig->reaction2comp($reactionID, $product);
                for my $compData (@participants) {
                    # Extract the compound data from the current tuple.
                    my ($cid, $stoich, $main) = @{$compData};
                    # Link the compound to the reaction.
                    $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main,
                                             $product, $stoich);
                }
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1369 :    
1370 : parrello 1.5 =head3 LoadGroupData
1371 :    
1372 :     C<< my $stats = $spl->LoadGroupData(); >>
1373 :    
1374 :     Load the genome Groups into Sprout.
1375 :    
1376 :     The following relations are loaded by this method.
1377 :    
1378 :     GenomeGroups
1379 :    
1380 :     There is no direct support for genome groups in FIG, so we access the SEED
1381 :     files directly.
1382 :    
1383 :     =over 4
1384 :    
1385 :     =item RETURNS
1386 :    
1387 :     Returns a statistics object for the loads.
1388 :    
1389 :     =back
1390 :    
1391 :     =cut
1392 :     #: Return Type $%;
sub LoadGroupData {
    # Build the GenomeGroups load file. For each genome in this load, the
    # first line of its NMPDR file (if the file can be opened and is not
    # empty) is recorded as that genome's group. The value returned is
    # whatever _FinishAll() produces.
    my ($self) = @_;
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    # Create a load object for the table we're loading.
    my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating group data.") if T(2);
        # Loop through the genomes.
        for my $genomeID (keys %{$genomeHash}) {
            Trace("Processing $genomeID.") if T(3);
            # Open the NMPDR group file for this genome. A lexical handle is
            # used so that it is only closed when the open actually
            # succeeded.
            my $groupFile = "$FIG_Config::organisms/$genomeID/NMPDR";
            if (open(my $groupHandle, '<', $groupFile)) {
                # Only the first line matters: there can only be one group
                # per genome.
                my $line = <$groupHandle>;
                close $groupHandle;
                if (defined $line) {
                    # Clean the line ending.
                    chomp $line;
                    # Add the group to the table.
                    $loadGenomeGroups->Put($genomeID, $line);
                }
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1426 :    
1427 : parrello 1.43 =head3 LoadSynonymData
1428 :    
1429 :     C<< my $stats = $spl->LoadSynonymData(); >>
1430 :    
1431 :     Load the synonym groups into Sprout.
1432 :    
1433 :     The following relations are loaded by this method.
1434 :    
1435 :     SynonymGroup
1436 :     IsSynonymGroupFor
1437 :    
1438 :     The source information for these relations is taken from the C<maps_to_id> method
1439 : parrello 1.56 of the B<FIG> object. Unfortunately, to make this work, we need to use direct
1440 :     SQL against the C<peg_synonyms> table of the FIG database.
1441 : parrello 1.43
1442 :     =over 4
1443 :    
1444 :     =item RETURNS
1445 :    
1446 :     Returns a statistics object for the loads.
1447 :    
1448 :     =back
1449 :    
1450 :     =cut
1451 :     #: Return Type $%;
sub LoadSynonymData {
    # Build the SynonymGroup and IsSynonymGroupFor load files by reading the
    # "peg_synonyms" table directly. Rows arrive ordered by "maps_to", which
    # serves as the group ID; the "syn_id" column is the member linked to
    # that group. Only members belonging to one of our genomes are kept.
    # The value returned is whatever _FinishAll() produces.
    my ($self) = @_;
    # The FIG-like data source and the hash whose keys are our genome IDs.
    my $fig = $self->{fig};
    my $genomeHash = $self->{genomes};
    # Set up one loader per table.
    my $loadSynonymGroup = $self->_TableLoader('SynonymGroup');
    my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor');
    if (! $self->{options}->{loadOnly}) {
        Trace("Generating synonym group data.") if T(2);
        # Run the query through the FIG database handle.
        my $dbh = $fig->db_handle();
        my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
        my $result = $sth->execute();
        if (defined($result)) {
            # The group most recently written to SynonymGroup. Because the
            # rows are sorted by group, comparing against this is enough to
            # write each group only once.
            my $lastGroup = "";
            # Number of group members written so far.
            my $featureCount = 0;
            # Walk the (group, member) pairs.
            while (my ($groupID, $memberID) = $sth->fetchrow()) {
                # Skip members that are not in one of our genomes.
                my $genomeID = FIG::genome_of($memberID);
                if (exists $genomeHash->{$genomeID}) {
                    if ($groupID ne $lastGroup) {
                        # First kept member of this group: create the group.
                        $loadSynonymGroup->Put($groupID);
                        $lastGroup = $groupID;
                    }
                    # Attach the member to its group.
                    $loadIsSynonymGroupFor->Put($groupID, $memberID);
                    # Report progress every thousand members.
                    $featureCount++;
                    Trace("$featureCount features processed.") if $featureCount % 1000 == 0 && T(3);
                }
            }
        } else {
            Confess("Database error in Synonym load: " . $sth->errstr());
        }
    } else {
        Trace("Loading from existing files.") if T(2);
    }
    # Close out all the loaders and hand back the result.
    my $stats = $self->_FinishAll();
    return $stats;
}
1506 :    
1507 : parrello 1.60 =head3 LoadFamilyData
1508 :    
1509 :     C<< my $stats = $spl->LoadFamilyData(); >>
1510 :    
1511 :     Load the protein families into Sprout.
1512 :    
1513 :     The following relations are loaded by this method.
1514 :    
1515 :     Family
1516 : parrello 1.63 IsFamilyForFeature
1517 : parrello 1.60
1518 :     The source information for these relations is taken from the C<families_for_protein>,
1519 :     C<family_function>, and C<sz_family> methods of the B<FIG> object.
1520 :    
1521 :     =over 4
1522 :    
1523 :     =item RETURNS
1524 :    
1525 :     Returns a statistics object for the loads.
1526 :    
1527 :     =back
1528 :    
1529 :     =cut
1530 :     #: Return Type $%;
sub LoadFamilyData {
    # Produce the load data for the Family and IsFamilyForFeature relations,
    # then finish every loader opened here and hand back the result of
    # _FinishAll.
    my ($self) = @_;
    # All source data is pulled through the FIG object stored on this loader.
    my $fig = $self->{fig};
    # Only genomes that appear as keys of this hash are scanned.
    my $genomeHash = $self->{genomes};
    # Open one loader per relation. The creation order is kept as-is because
    # the loaders are queued on this object in the order they are made.
    my $familyLoader = $self->_TableLoader('Family');
    my $memberLoader = $self->_TableLoader('IsFamilyForFeature');
    if (! $self->{options}->{loadOnly}) {
        Trace("Generating family data.") if T(2);
        # Families for which a Family record has already been written.
        my %seenFamily = ();
        # Walk the genomes in sorted ID order.
        for my $genomeID (sort keys %{$genomeHash}) {
            Trace("Processing features for $genomeID.") if T(2);
            # Visit every PEG of the current genome.
            for my $fid ($fig->all_features($genomeID, "peg")) {
                $memberLoader->Add("features", 1);
                # Link the feature to each family it belongs to.
                for my $family ($fig->families_for_protein($fid)) {
                    $memberLoader->Put($family, $fid);
                    # The Family record itself is emitted only on the first
                    # encounter with the family.
                    next if exists $seenFamily{$family};
                    $seenFamily{$family} = 1;
                    $familyLoader->Add("families", 1);
                    my $memberCount = $fig->sz_family($family);
                    my $function = $fig->family_function($family);
                    $familyLoader->Put($family, $memberCount, $function);
                }
            }
        }
    } else {
        # Nothing is generated in load-only mode.
        Trace("Loading from existing files.") if T(2);
    }
    # Close out the loaders and return what _FinishAll reports.
    my $loadStats = $self->_FinishAll();
    return $loadStats;
}
1574 : parrello 1.43
1575 : parrello 1.1 =head2 Internal Utility Methods
1576 :    
1577 :     =head3 TableLoader
1578 :    
1579 :     Create an ERDBLoad object for the specified table. The object is also added to
1580 :     the internal list in the C<loaders> property of this object. That enables the
1581 :     L</FinishAll> method to terminate all the active loads.
1582 :    
1583 :     This is an instance method.
1584 :    
1585 :     =over 4
1586 :    
1587 :     =item tableName
1588 :    
1589 :     Name of the table (relation) being loaded.
1590 :    
1591 : parrello 1.25 =item ignore
1592 :    
1593 :     TRUE if the table should be ignored entirely, else FALSE.
1594 :    
1595 : parrello 1.1 =item RETURN
1596 :    
1597 :     Returns an ERDBLoad object for loading the specified table.
1598 :    
1599 :     =back
1600 :    
1601 :     =cut
1602 :    
sub _TableLoader {
    # Build an ERDBLoad object for the named relation, queue it on this
    # object's "loaders" list, and return it to the caller.
    my ($self, $tableName, $ignore) = @_;
    # The loader is handed the database object, the relation name, the load
    # directory, the load-only setting, and the caller's ignore flag.
    my $loader = ERDBLoad->new(
        $self->{erdb},
        $tableName,
        $self->{loadDirectory},
        $self->LoadOnly,
        $ignore
    );
    # Queue the loader so that _FinishAll can pick it up later.
    push @{$self->{loaders}}, $loader;
    return $loader;
}
1614 :    
1615 :     =head3 FinishAll
1616 :    
1617 :     Finish all the active loads on this object.
1618 :    
1619 :     When a load is started by L</TableLoader>, the controlling B<ERDBLoad> object is cached in
1620 :     the list pointed to by the C<loaders> property of this object. This method pops the loaders
1621 :     off the list and finishes them to flush out any accumulated residue.
1622 :    
1623 :     This is an instance method.
1624 :    
1625 :     =over 4
1626 :    
1627 :     =item RETURN
1628 :    
1629 :     Returns a statistics object containing the accumulated statistics for the load.
1630 :    
1631 :     =back
1632 :    
1633 :     =cut
1634 :    
sub _FinishAll {
    # Finish every queued loader, optionally load the resulting relations into
    # the database, and return a Stats object holding the accumulated
    # statistics.
    my ($self) = @_;
    # Accumulator for the statistics returned to the caller.
    my $retVal = Stats->new();
    # The queue of loaders created through _TableLoader.
    my $pending = $self->{loaders};
    # Statistics of each finished loader, keyed on relation name.
    my %statsFor = ();
    # Pass 1: drain the queue, finishing each loader. A failure inside Finish
    # is not trapped here, so it propagates to the caller.
    while (my $loader = pop @{$pending}) {
        my $relName = $loader->RelName;
        # Loaders flagged as ignored are skipped and contribute no statistics.
        if ($loader->Ignore) {
            Trace("Relation $relName not loaded.") if T(2);
            next;
        }
        Trace("Finishing $relName.") if T(2);
        my $finishStats = $loader->Finish();
        $statsFor{$relName} = $finishStats;
    }
    # Pass 2: with every load file finished, do the optional database loads in
    # relation-name order. Finishing first means a failure here leaves all the
    # load files usable.
    for my $relName (sort keys %statsFor) {
        my $relStats = $statsFor{$relName};
        if ($self->{options}->{dbLoad}) {
            # Use the load file just created to load this relation.
            Trace("Loading relation $relName.") if T(2);
            my $dbStats = $self->{sprout}->LoadUpdate(1, [$relName]);
            # Fold the database-load numbers into this relation's statistics.
            $relStats->Accumulate($dbStats);
        }
        $retVal->Accumulate($relStats);
        Trace("Statistics for $relName:\n" . $relStats->Show()) if T(2);
    }
    return $retVal;
}
1679 :    
1680 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3