Parent Directory
|
Revision Log
Revision 1.27 - (view) (download) (as text)
1 : | parrello | 1.1 | #!/usr/bin/perl -w |
2 : | |||
3 : | package SproutLoad; | ||
4 : | |||
5 : | use strict; | ||
6 : | use Tracer; | ||
7 : | use PageBuilder; | ||
8 : | use ERDBLoad; | ||
9 : | use FIG; | ||
10 : | use Sprout; | ||
11 : | use Stats; | ||
12 : | use BasicLocation; | ||
13 : | parrello | 1.18 | use HTML; |
14 : | parrello | 1.1 | |
15 : | =head1 Sprout Load Methods | ||
16 : | |||
17 : | =head2 Introduction | ||
18 : | |||
19 : | This object contains the methods needed to copy data from the FIG data store to the | ||
20 : | Sprout database. It makes heavy use of the ERDBLoad object to manage the load into | ||
21 : | individual tables. The client can create an instance of this object and then | ||
22 : | call methods for each group of tables to load. For example, the following code will | ||
23 : | load the Genome- and Feature-related tables. (It is presumed the first command line | ||
24 : | parameter contains the name of a file specifying the genomes.) | ||
25 : | |||
26 : | my $fig = FIG->new(); | ||
27 : | my $sprout = SFXlate->new_sprout_only(); | ||
28 : | my $spl = SproutLoad->new($sprout, $fig, $ARGV[0]); | ||
29 : | my $stats = $spl->LoadGenomeData(); | ||
30 : | $stats->Accumulate($spl->LoadFeatureData()); | ||
31 : | print $stats->Show(); | ||
32 : | |||
33 : | This module makes use of the internal Sprout property C<_erdb>. | ||
34 : | |||
35 : | It is worth noting that the FIG object does not need to be a real one. Any object | ||
36 : | that implements the FIG methods for data retrieval could be used. So, for example, | ||
37 : | this object could be used to copy data from one Sprout database to another, or | ||
38 : | from any FIG-compliant data store implemented in the future. | ||
39 : | |||
40 : | To ensure that this is possible, each time the FIG object is used, it will be via | ||
41 : | a variable called C<$fig>. This makes it fairly straightforward to determine which | ||
42 : | FIG methods are required to load the Sprout database. | ||
43 : | |||
44 : | parrello | 1.5 | This object creates the load files; however, the tables are not created until it |
45 : | is time to actually do the load from the files into the target database. | ||
46 : | |||
47 : | parrello | 1.1 | =cut |
48 : | |||
49 : | #: Constructor SproutLoad->new(); | ||
50 : | |||
51 : | =head2 Public Methods | ||
52 : | |||
53 : | =head3 new | ||
54 : | |||
55 : | parrello | 1.8 | C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >> |
56 : | parrello | 1.1 | |
57 : | Construct a new Sprout Loader object, specifying the two participating databases and | ||
58 : | the name of the files containing the list of genomes and subsystems to use. | ||
59 : | |||
60 : | =over 4 | ||
61 : | |||
62 : | =item sprout | ||
63 : | |||
64 : | Sprout object representing the target database. This also specifies the directory to | ||
65 : | be used for creating the load files. | ||
66 : | |||
67 : | =item fig | ||
68 : | |||
69 : | FIG object representing the source data store from which the data is to be taken. | ||
70 : | |||
71 : | =item genomeFile | ||
72 : | |||
73 : | Either the name of the file containing the list of genomes to load or a reference to | ||
74 : | a hash of genome IDs to access codes. If nothing is specified, all complete genomes | ||
75 : | will be loaded and the access code will default to 1. The genome list is presumed | ||
76 : | to be all-inclusive. In other words, all existing data in the target database will | ||
77 : | be deleted and replaced with the data on the specified genomes. If a file is specified, | ||
78 : | it should contain one genome ID and access code per line, tab-separated. | ||
79 : | |||
80 : | =item subsysFile | ||
81 : | |||
82 : | Either the name of the file containing the list of trusted subsystems or a reference | ||
83 : | to a list of subsystem names. If nothing is specified, all known subsystems will be | ||
84 : | considered trusted. Only subsystem data related to the trusted subsystems is loaded. | ||
85 : | |||
86 : | parrello | 1.8 | =item options |
87 : | |||
88 : | Reference to a hash of command-line options. | ||
89 : | |||
90 : | parrello | 1.1 | =back |
91 : | |||
92 : | =cut | ||
93 : | |||
sub new {
    # Construct a SproutLoad object.
    #
    # Parameters (in order): the class name, the target Sprout object, the
    # source FIG-style object, the genome specification (file name, hash
    # reference of genome ID => access code, or undef/empty for all complete
    # genomes), the trusted-subsystem specification (file name, array
    # reference of names, or undef/empty for all subsystems), and a reference
    # to the hash of command-line options.
    #
    # Returns the blessed loader object. Confess is called when a genome file
    # yields no lines, when the subsystem file does not exist, or when either
    # specification is a reference of an unsupported type.
    my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
    # Load the list of genomes into a hash of genome ID => access code.
    my %genomes;
    if (! defined($genomeFile) || $genomeFile eq '') {
        # Here we want all the complete genomes and an access code of 1.
        my @genomeList = $fig->genomes(1);
        %genomes = map { $_ => 1 } @genomeList;
    } else {
        my $type = ref $genomeFile;
        Trace("Genome file parameter type is \"$type\".") if T(3);
        if ($type eq 'HASH') {
            # Here the user specified a hash of genome IDs to access codes, which is
            # exactly what we want.
            %genomes = %{$genomeFile};
        } elsif (! $type || $type eq 'SCALAR' ) {
            # The caller specified a file, so read the genomes from the file. (Note
            # that some PERLs return an empty string rather than SCALAR.)
            my @genomeList = Tracer::GetFile($genomeFile);
            if (! @genomeList) {
                # It's an error if the genome file is empty or not found.
                Confess("No genomes found in file \"$genomeFile\".");
            } else {
                # We build the genome hash using a loop rather than "map" so that
                # an omitted access code can be defaulted to 1.
                for my $genomeLine (@genomeList) {
                    my ($genomeID, $accessCode) = split("\t", $genomeLine);
                    # A blank line yields no genome ID; skip it rather than
                    # creating an entry with an empty key.
                    next if ! defined $genomeID || $genomeID eq '';
                    # The test must be "defined", not "undef": "undef $x"
                    # clears the variable and always evaluates to false, which
                    # would throw away every access code read from the file.
                    if (! defined $accessCode) {
                        $accessCode = 1;
                    }
                    $genomes{$genomeID} = $accessCode;
                }
            }
        } else {
            Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
        }
    }
    # Load the list of trusted subsystems.
    my %subsystems = ();
    if (! defined $subsysFile || $subsysFile eq '') {
        # Here we want all the subsystems.
        %subsystems = map { $_ => 1 } $fig->all_subsystems();
    } else {
        my $type = ref $subsysFile;
        if ($type eq 'ARRAY') {
            # Here the user passed in a list of subsystems.
            %subsystems = map { $_ => 1 } @{$subsysFile};
        } elsif (! $type || $type eq 'SCALAR') {
            # Here the list of subsystems is in a file.
            if (! -e $subsysFile) {
                # It's an error if the file does not exist.
                Confess("Trusted subsystem file not found.");
            } else {
                # GetFile automatically chomps end-of-line characters, so this
                # is an easy task.
                %subsystems = map { $_ => 1 } Tracer::GetFile($subsysFile);
            }
        } else {
            Confess("Invalid subsystem parameter in SproutLoad constructor.");
        }
    }
    # Get the data directory from the Sprout object.
    my ($directory) = $sprout->LoadInfo();
    # Create the Sprout load object.
    my $retVal = {
        fig => $fig,
        genomes => \%genomes,
        subsystems => \%subsystems,
        sprout => $sprout,
        loadDirectory => $directory,
        erdb => $sprout->{_erdb},
        loaders => [],
        options => $options
    };
    # Bless and return it.
    bless $retVal, $class;
    return $retVal;
}
173 : | |||
174 : | parrello | 1.23 | =head3 LoadOnly |
175 : | |||
176 : | C<< my $flag = $spl->LoadOnly; >> | ||
177 : | |||
178 : | Return TRUE if we are in load-only mode, else FALSE. | ||
179 : | |||
180 : | =cut | ||
181 : | |||
sub LoadOnly {
    # Report the "loadOnly" entry of this loader's options hash.
    my $self = shift;
    return $self->{options}{loadOnly};
}
186 : | |||
187 : | parrello | 1.25 | =head3 PrimaryOnly |
188 : | |||
189 : | C<< my $flag = $spl->PrimaryOnly; >> | ||
190 : | |||
191 : | Return TRUE if only the main entity is to be loaded, else FALSE. | ||
192 : | |||
193 : | =cut | ||
194 : | |||
sub PrimaryOnly {
    # Report the "primaryOnly" entry of this loader's options hash.
    my $self = shift;
    return $self->{options}{primaryOnly};
}
199 : | |||
200 : | parrello | 1.1 | =head3 LoadGenomeData |
201 : | |||
202 : | C<< my $stats = $spl->LoadGenomeData(); >> | ||
203 : | |||
204 : | Load the Genome, Contig, and Sequence data from FIG into Sprout. | ||
205 : | |||
206 : | The Sequence table is the largest single relation in the Sprout database, so this | ||
207 : | method is expected to be slow and clumsy. At some point we will need to make it | ||
208 : | restartable, since an error 10 gigabytes through a 20-gigabyte load is bound to be | ||
209 : | very annoying otherwise. | ||
210 : | |||
211 : | The following relations are loaded by this method. | ||
212 : | |||
213 : | Genome | ||
214 : | HasContig | ||
215 : | Contig | ||
216 : | IsMadeUpOf | ||
217 : | Sequence | ||
218 : | |||
219 : | =over 4 | ||
220 : | |||
221 : | =item RETURNS | ||
222 : | |||
223 : | Returns a statistics object for the loads. | ||
224 : | |||
225 : | =back | ||
226 : | |||
227 : | =cut | ||
228 : | #: Return Type $%; | ||
sub LoadGenomeData {
    # Generate the Genome, HasContig, Contig, IsMadeUpOf, and Sequence load
    # data for every genome in this loader's genome hash, then finish all
    # the table loaders. Returns whatever _FinishAll returns (documented in
    # the POD as a statistics object).
    my ($self) = @_;
    # Get the FIG object. All source-data access goes through $fig.
    my $fig = $self->{fig};
    # Get the hash of genome IDs to access codes.
    my $genomeHash = $self->{genomes};
    # Create load objects for each of the tables we're loading.
    my $loadGenome = $self->_TableLoader('Genome');
    my $loadHasContig = $self->_TableLoader('HasContig', $self->PrimaryOnly);
    my $loadContig = $self->_TableLoader('Contig', $self->PrimaryOnly);
    my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $self->PrimaryOnly);
    my $loadSequence = $self->_TableLoader('Sequence', $self->PrimaryOnly);
    if ($self->LoadOnly) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating genome data.") if T(2);
        # The maximum sequence size is a property of the Sprout object; it is
        # the same for every contig, so we fetch it once.
        my $chunkSize = $self->{sprout}->MaxSequence();
        # Now we loop through the genomes, generating the data for each one.
        for my $genomeID (sort keys %{$genomeHash}) {
            Trace("Generating data for genome $genomeID.") if T(3);
            $loadGenome->Add("genomeIn");
            # The access code comes in via the genome hash.
            my $accessCode = $genomeHash->{$genomeID};
            # Get the genus, species, and strain from the scientific name. Note that we append
            # the genome ID to the strain. In some cases this is the totality of the strain name.
            my ($genus, $species, @extraData) = split / /, $fig->genus_species($genomeID);
            my $extra = join " ", @extraData, "[$genomeID]";
            # Get the full taxonomy.
            my $taxonomy = $fig->taxonomy_of($genomeID);
            # Output the genome record.
            $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,
                             $species, $extra, $taxonomy);
            # Now we loop through each of the genome's contigs.
            my @contigs = $fig->all_contigs($genomeID);
            for my $contigID (@contigs) {
                Trace("Processing contig $contigID for $genomeID.") if T(4);
                $loadContig->Add("contigIn");
                $loadSequence->Add("contigIn");
                # Create the contig ID.
                my $sproutContigID = "$genomeID:$contigID";
                # Create the contig record and relate it to the genome.
                $loadContig->Put($sproutContigID);
                $loadHasContig->Put($genomeID, $sproutContigID);
                # Now we split the contig into sequences of at most $chunkSize
                # positions, getting the DNA a chunk at a time. Positions are
                # 1-based.
                my $contigLen = $fig->contig_ln($genomeID, $contigID);
                for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {
                    $loadSequence->Add("chunkIn");
                    # Compute the endpoint of this chunk.
                    my $end = FIG::min($i + $chunkSize - 1, $contigLen);
                    # Get the actual DNA.
                    my $dna = $fig->get_dna($genomeID, $contigID, $i, $end);
                    # Compute the sequenceID.
                    my $seqID = "$sproutContigID.$i";
                    # Write out the data. For now, the quality vector is always "unknown".
                    $loadIsMadeUpOf->Put($sproutContigID, $seqID, $end + 1 - $i, $i);
                    $loadSequence->Put($seqID, "unknown", $dna);
                }
            }
        }
    }
    # Finish the loads.
    my $retVal = $self->_FinishAll();
    # Return the result.
    return $retVal;
}
298 : | |||
299 : | =head3 LoadCouplingData | ||
300 : | |||
301 : | C<< my $stats = $spl->LoadCouplingData(); >> | ||
302 : | |||
303 : | Load the coupling and evidence data from FIG into Sprout. | ||
304 : | |||
305 : | The coupling data specifies which genome features are functionally coupled. The | ||
306 : | evidence data explains why the coupling is functional. | ||
307 : | |||
308 : | The following relations are loaded by this method. | ||
309 : | |||
310 : | Coupling | ||
311 : | IsEvidencedBy | ||
312 : | PCH | ||
313 : | ParticipatesInCoupling | ||
314 : | UsesAsEvidence | ||
315 : | |||
316 : | =over 4 | ||
317 : | |||
318 : | =item RETURNS | ||
319 : | |||
320 : | Returns a statistics object for the loads. | ||
321 : | |||
322 : | =back | ||
323 : | |||
324 : | =cut | ||
325 : | #: Return Type $%; | ||
sub LoadCouplingData {
    # Generate the Coupling, IsEvidencedBy, PCH, ParticipatesInCoupling, and
    # UsesAsEvidence load data for every genome in this loader's genome
    # hash, then finish all the table loaders. Returns whatever _FinishAll
    # returns (documented in the POD as a statistics object).
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash. Its keys are the genomes being loaded.
    my $genomeFilter = $self->{genomes};
    # Start the loads.
    my $loadCoupling = $self->_TableLoader('Coupling');
    my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $self->PrimaryOnly);
    my $loadPCH = $self->_TableLoader('PCH', $self->PrimaryOnly);
    my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $self->PrimaryOnly);
    my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $self->PrimaryOnly);
    if ($self->LoadOnly) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating coupling data.") if T(2);
        # Loop through the genomes found.
        for my $genome (sort keys %{$genomeFilter}) {
            Trace("Generating coupling data for $genome.") if T(3);
            $loadCoupling->Add("genomeIn");
            # Create a hash table for holding coupled pairs. We use this to prevent
            # duplicates. For example, if A is coupled to B, we don't want to also
            # assert that B is coupled to A, because we already know it. Fortunately,
            # all couplings occur within a genome, so we can keep the hash table
            # size reasonably small.
            my %dupHash = ();
            # Get all of the genome's PEGs.
            my @pegs = $fig->pegs_of($genome);
            # Loop through the PEGs.
            for my $peg1 (@pegs) {
                $loadCoupling->Add("pegIn");
                Trace("Processing PEG $peg1 for $genome.") if T(4);
                # Get a list of the coupled PEGs.
                my @couplings = $fig->coupled_to($peg1);
                # For each coupled PEG, we need to verify that a coupling already
                # exists. If not, we have to create one.
                for my $coupleData (@couplings) {
                    my ($peg2, $score) = @{$coupleData};
                    # Compute the coupling ID.
                    my $coupleID = Sprout::CouplingID($peg1, $peg2);
                    if (! exists $dupHash{$coupleID}) {
                        $loadCoupling->Add("couplingIn");
                        # Here we have a new coupling to store in the load files.
                        Trace("Storing coupling ($coupleID) with score $score.") if T(4);
                        # Ensure we don't do this again.
                        $dupHash{$coupleID} = $score;
                        # Write the coupling record.
                        $loadCoupling->Put($coupleID, $score);
                        # Connect it to the coupled PEGs.
                        $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);
                        $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);
                        # Get the evidence for this coupling.
                        my @evidence = $fig->coupling_evidence($peg1, $peg2);
                        # Organize the evidence into a hash table.
                        my %evidenceMap = ();
                        # Process each evidence item.
                        for my $evidenceData (@evidence) {
                            $loadPCH->Add("evidenceIn");
                            my ($peg3, $peg4, $usage) = @{$evidenceData};
                            # Only proceed if the evidence is from a Sprout
                            # genome. This is a membership test on the genome
                            # hash, so it must not depend on the value of the
                            # genome's access code.
                            my $evidenceGenome = $fig->genome_of($peg3);
                            if (defined $evidenceGenome && exists $genomeFilter->{$evidenceGenome}) {
                                $loadUsesAsEvidence->Add("evidenceChosen");
                                my $evidenceKey = "$coupleID $peg3 $peg4";
                                # We store this evidence in the hash if the usage
                                # is nonzero or no prior evidence has been found. This
                                # ensures that if there is duplicate evidence, we
                                # at least keep the meaningful ones. Only evidence in
                                # the hash makes it to the output.
                                if ($usage || ! exists $evidenceMap{$evidenceKey}) {
                                    $evidenceMap{$evidenceKey} = $evidenceData;
                                }
                            }
                        }
                        for my $evidenceID (keys %evidenceMap) {
                            # Create the evidence record.
                            my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};
                            $loadPCH->Put($evidenceID, $usage);
                            # Connect it to the coupling.
                            $loadIsEvidencedBy->Put($coupleID, $evidenceID);
                            # Connect it to the features.
                            $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);
                            $loadUsesAsEvidence->Put($evidenceID, $peg4, 2);
                        }
                    }
                }
            }
        }
    }
    # All done. Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
422 : | |||
423 : | =head3 LoadFeatureData | ||
424 : | |||
425 : | C<< my $stats = $spl->LoadFeatureData(); >> | ||
426 : | |||
427 : | Load the feature data from FIG into Sprout. | ||
428 : | |||
429 : | Features represent annotated genes, and are therefore the heart of the data store. | ||
430 : | |||
431 : | The following relations are loaded by this method. | ||
432 : | |||
433 : | Feature | ||
434 : | FeatureAlias | ||
435 : | FeatureLink | ||
436 : | FeatureTranslation | ||
437 : | FeatureUpstream | ||
438 : | IsLocatedIn | ||
439 : | |||
440 : | =over 4 | ||
441 : | |||
442 : | =item RETURNS | ||
443 : | |||
444 : | Returns a statistics object for the loads. | ||
445 : | |||
446 : | =back | ||
447 : | |||
448 : | =cut | ||
449 : | #: Return Type $%; | ||
sub LoadFeatureData {
    # Generate the Feature, FeatureAlias, FeatureLink, FeatureTranslation,
    # FeatureUpstream, and IsLocatedIn load data for every genome in this
    # loader's genome hash, then finish all the table loaders. Returns
    # whatever _FinishAll returns (documented in the POD as a statistics
    # object).
    my $self = shift;
    # Source data store and the genomes being loaded.
    my $fig = $self->{fig};
    my $genomeHash = $self->{genomes};
    # One loader per output relation.
    my $loadFeature = $self->_TableLoader('Feature');
    my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $self->PrimaryOnly);
    my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
    my $loadFeatureLink = $self->_TableLoader('FeatureLink');
    my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
    my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
    # Locations longer than this are split into several IsLocatedIn rows.
    my $chunkSize = $self->{sprout}->MaxSegment();
    # In load-only mode nothing is generated; we go straight to the finish.
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
        my $loadOnlyStats = $self->_FinishAll();
        return $loadOnlyStats;
    }
    Trace("Generating feature data.") if T(2);
    for my $genomeID (sort keys %{$genomeHash}) {
        Trace("Loading features for genome $genomeID.") if T(3);
        $loadFeature->Add("genomeIn");
        # Each entry of the returned list is a tuple describing one feature.
        my $featureList = $fig->all_features_detailed($genomeID);
        for my $tuple (@{$featureList}) {
            $loadFeature->Add("featureIn");
            # We need the ID, the location string, and the type; the third
            # element of the tuple is not used here.
            my ($featureID, $locations, undef, $type) = @{$tuple};
            # Feature record first, then its aliases and links.
            $loadFeature->Put($featureID, 1, $type);
            for my $alias ($fig->feature_aliases($featureID)) {
                $loadFeatureAlias->Put($featureID, $alias);
            }
            for my $link ($fig->fid_links($featureID)) {
                $loadFeatureLink->Put($featureID, $link);
            }
            # Only pegs get a translation and an upstream sequence.
            if ($type eq 'peg') {
                $loadFeatureTranslation->Add("pegIn");
                my $translation = $fig->get_translation($featureID);
                $loadFeatureTranslation->Put($featureID, $translation)
                    if $translation;
                # We use the default upstream values of u=200 and c=100.
                my $upstream = $fig->upstream_of($featureID, 200, 100);
                $loadFeatureUpstream->Put($featureID, $upstream)
                    if $upstream;
            }
            # Relate the feature to its contig locations. Each location is
            # peeled into pieces no longer than the maximum segment size,
            # and every piece gets the next value of a running index that
            # starts at 1 for each feature.
            my $locIndex = 1;
            for my $location (split /\s*,\s*/, $locations) {
                my $locObject = BasicLocation->new("$genomeID:$location");
                # Collect the peeled pieces, followed by whatever remains of
                # the location object once Peel returns a false value.
                my @pieces = ();
                while (my $peeling = $locObject->Peel($chunkSize)) {
                    $loadIsLocatedIn->Add("peeling");
                    push @pieces, $peeling;
                }
                push @pieces, $locObject;
                for my $piece (@pieces) {
                    $loadIsLocatedIn->Put($featureID, $piece->Contig, $piece->Left,
                                          $piece->Dir, $piece->Length, $locIndex);
                    $locIndex++;
                }
            }
        }
    }
    # Finish the loads.
    my $stats = $self->_FinishAll();
    return $stats;
}
539 : | |||
540 : | =head3 LoadBBHData | ||
541 : | |||
542 : | C<< my $stats = $spl->LoadBBHData(); >> | ||
543 : | |||
544 : | Load the bidirectional best hit data from FIG into Sprout. | ||
545 : | |||
546 : | Sprout does not store information on similarities. Instead, it has only the | ||
547 : | bi-directional best hits. Even so, the BBH table is one of the largest in | ||
548 : | the database. | ||
549 : | |||
550 : | The following relations are loaded by this method. | ||
551 : | |||
552 : | IsBidirectionalBestHitOf | ||
553 : | |||
554 : | =over 4 | ||
555 : | |||
556 : | =item RETURNS | ||
557 : | |||
558 : | Returns a statistics object for the loads. | ||
559 : | |||
560 : | =back | ||
561 : | |||
562 : | =cut | ||
563 : | #: Return Type $%; | ||
sub LoadBBHData {
    # Generate the IsBidirectionalBestHitOf load data for every feature of
    # every genome in this loader's genome hash, keeping only hits whose
    # target feature belongs to one of those genomes, then finish the table
    # loaders. Returns whatever _FinishAll returns (documented in the POD as
    # a statistics object).
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the table of genome IDs. Its keys are the genomes being loaded.
    my $genomeHash = $self->{genomes};
    # Create load objects for each of the tables we're loading.
    my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf');
    if ($self->LoadOnly) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating BBH data.") if T(2);
        # Now we loop through the genomes, generating the data for each one.
        for my $genomeID (sort keys %{$genomeHash}) {
            $loadIsBidirectionalBestHitOf->Add("genomeIn");
            Trace("Processing features for genome $genomeID.") if T(3);
            # Get the feature list for this genome.
            my $features = $fig->all_features_detailed($genomeID);
            # Loop through the features.
            for my $featureData (@{$features}) {
                # Only the feature ID (the first element of the tuple) is
                # needed here.
                my ($featureID) = @{$featureData};
                # Get the bi-directional best hits.
                my @bbhList = $fig->bbhs($featureID);
                for my $bbhEntry (@bbhList) {
                    # Get the target feature ID and the score.
                    my ($targetID, $score) = @{$bbhEntry};
                    # Check the target feature's genome.
                    my $targetGenomeID = $fig->genome_of($targetID);
                    # Only proceed if it's one of our genomes. This is a
                    # membership test on the genome hash, so it must not
                    # depend on the value of the genome's access code.
                    if (defined $targetGenomeID && exists $genomeHash->{$targetGenomeID}) {
                        $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,
                                                           $score);
                    }
                }
            }
        }
    }
    # Finish the loads.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
607 : | |||
608 : | =head3 LoadSubsystemData | ||
609 : | |||
610 : | C<< my $stats = $spl->LoadSubsystemData(); >> | ||
611 : | |||
612 : | Load the subsystem data from FIG into Sprout. | ||
613 : | |||
614 : | Subsystems are groupings of genetic roles that work together to effect a specific | ||
615 : | chemical reaction. Similar organisms require similar subsystems. To curate a subsystem, | ||
616 : | a spreadsheet is created with genomes on one axis and subsystem roles on the other | ||
617 : | axis. Similar features are then mapped into the cells, allowing the annotation of one | ||
618 : | genome's roles to be used to assist in the annotation of others. | ||
619 : | |||
620 : | The following relations are loaded by this method. | ||
621 : | |||
622 : | Subsystem | ||
623 : | Role | ||
624 : | parrello | 1.19 | RoleEC |
625 : | parrello | 1.1 | SSCell |
626 : | ContainsFeature | ||
627 : | IsGenomeOf | ||
628 : | IsRoleOf | ||
629 : | OccursInSubsystem | ||
630 : | ParticipatesIn | ||
631 : | HasSSCell | ||
632 : | parrello | 1.18 | ConsistsOfRoles |
633 : | RoleSubset | ||
634 : | HasRoleSubset | ||
635 : | ConsistsOfGenomes | ||
636 : | GenomeSubset | ||
637 : | HasGenomeSubset | ||
638 : | parrello | 1.20 | Catalyzes |
639 : | parrello | 1.21 | Diagram |
640 : | RoleOccursIn | ||
641 : | parrello | 1.1 | |
642 : | =over 4 | ||
643 : | |||
644 : | =item RETURNS | ||
645 : | |||
646 : | Returns a statistics object for the loads. | ||
647 : | |||
648 : | =back | ||
649 : | |||
650 : | =cut | ||
651 : | #: Return Type $%; | ||
652 : | sub LoadSubsystemData { | ||
653 : | # Get this object instance. | ||
654 : | my ($self) = @_; | ||
655 : | # Get the FIG object. | ||
656 : | my $fig = $self->{fig}; | ||
657 : | # Get the genome hash. We'll use it to filter the genomes in each | ||
658 : | # spreadsheet. | ||
659 : | my $genomeHash = $self->{genomes}; | ||
660 : | # Get the subsystem hash. This lists the subsystems we'll process. | ||
661 : | my $subsysHash = $self->{subsystems}; | ||
662 : | my @subsysIDs = sort keys %{$subsysHash}; | ||
663 : | parrello | 1.21 | # Get the map list. |
664 : | my @maps = $fig->all_maps; | ||
665 : | parrello | 1.1 | # Create load objects for each of the tables we're loading. |
666 : | parrello | 1.25 | my $loadDiagram = $self->_TableLoader('Diagram', $self->PrimaryOnly); |
667 : | my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $self->PrimaryOnly); | ||
668 : | parrello | 1.23 | my $loadSubsystem = $self->_TableLoader('Subsystem'); |
669 : | parrello | 1.25 | my $loadRole = $self->_TableLoader('Role', $self->PrimaryOnly); |
670 : | my $loadRoleEC = $self->_TableLoader('RoleEC', $self->PrimaryOnly); | ||
671 : | my $loadCatalyzes = $self->_TableLoader('Catalyzes', $self->PrimaryOnly); | ||
672 : | my $loadSSCell = $self->_TableLoader('SSCell', $self->PrimaryOnly); | ||
673 : | my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $self->PrimaryOnly); | ||
674 : | my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $self->PrimaryOnly); | ||
675 : | my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $self->PrimaryOnly); | ||
676 : | my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $self->PrimaryOnly); | ||
677 : | my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $self->PrimaryOnly); | ||
678 : | my $loadHasSSCell = $self->_TableLoader('HasSSCell', $self->PrimaryOnly); | ||
679 : | my $loadRoleSubset = $self->_TableLoader('RoleSubset', $self->PrimaryOnly); | ||
680 : | my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $self->PrimaryOnly); | ||
681 : | my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $self->PrimaryOnly); | ||
682 : | my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $self->PrimaryOnly); | ||
683 : | my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $self->PrimaryOnly); | ||
684 : | my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $self->PrimaryOnly); | ||
685 : | parrello | 1.23 | if ($self->{options}->{loadOnly}) { |
686 : | Trace("Loading from existing files.") if T(2); | ||
687 : | } else { | ||
688 : | Trace("Generating subsystem data.") if T(2); | ||
689 : | # This hash will contain the role for each EC. When we're done, this | ||
690 : | # information will be used to generate the Catalyzes table. | ||
691 : | my %ecToRoles = (); | ||
692 : | # Loop through the subsystems. Our first task will be to create the | ||
693 : | # roles. We do this by looping through the subsystems and creating a | ||
694 : | # role hash. The hash tracks each role ID so that we don't create | ||
695 : | # duplicates. As we move along, we'll connect the roles and subsystems | ||
696 : | # and memorize up the reactions. | ||
697 : | my ($genomeID, $roleID); | ||
698 : | my %roleData = (); | ||
699 : | for my $subsysID (@subsysIDs) { | ||
700 : | Trace("Creating subsystem $subsysID.") if T(3); | ||
701 : | $loadSubsystem->Add("subsystemIn"); | ||
702 : | # Get the subsystem object. | ||
703 : | my $sub = $fig->get_subsystem($subsysID); | ||
704 : | # Create the subsystem record. | ||
705 : | my $curator = $sub->get_curator(); | ||
706 : | my $notes = $sub->get_notes(); | ||
707 : | $loadSubsystem->Put($subsysID, $curator, $notes); | ||
708 : | # Connect it to its roles. Each role is a column in the subsystem spreadsheet. | ||
709 : | for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { | ||
710 : | # Connect to this role. | ||
711 : | $loadOccursInSubsystem->Add("roleIn"); | ||
712 : | $loadOccursInSubsystem->Put($roleID, $subsysID, $col); | ||
713 : | # If it's a new role, add it to the role table. | ||
714 : | if (! exists $roleData{$roleID}) { | ||
715 : | # Get the role's abbreviation. | ||
716 : | my $abbr = $sub->get_role_abbr($col); | ||
717 : | # Add the role. | ||
718 : | $loadRole->Put($roleID, $abbr); | ||
719 : | $roleData{$roleID} = 1; | ||
720 : | # Check for an EC number. | ||
721 : | if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) { | ||
722 : | my $ec = $1; | ||
723 : | $loadRoleEC->Put($roleID, $ec); | ||
724 : | $ecToRoles{$ec} = $roleID; | ||
725 : | } | ||
726 : | parrello | 1.18 | } |
727 : | parrello | 1.1 | } |
728 : | parrello | 1.23 | # Now we create the spreadsheet for the subsystem by matching roles to |
729 : | # genomes. Each genome is a row and each role is a column. We may need | ||
730 : | # to actually create the roles as we find them. | ||
731 : | Trace("Creating subsystem $subsysID spreadsheet.") if T(3); | ||
732 : | for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) { | ||
733 : | # Only proceed if this is one of our genomes. | ||
734 : | if (exists $genomeHash->{$genomeID}) { | ||
735 : | # Count the PEGs and cells found for verification purposes. | ||
736 : | my $pegCount = 0; | ||
737 : | my $cellCount = 0; | ||
738 : | # Create a list for the PEGs we find. This list will be used | ||
739 : | # to generate cluster numbers. | ||
740 : | my @pegsFound = (); | ||
741 : | # Create a hash that maps spreadsheet IDs to PEGs. We will | ||
742 : | # use this to generate the ContainsFeature data after we have | ||
743 : | # the cluster numbers. | ||
744 : | my %cellPegs = (); | ||
745 : | # Get the genome's variant code for this subsystem. | ||
746 : | my $variantCode = $sub->get_variant_code($row); | ||
747 : | # Loop through the subsystem's roles. We use an index because it is | ||
748 : | # part of the spreadsheet cell ID. | ||
749 : | for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { | ||
750 : | # Get the features in the spreadsheet cell for this genome and role. | ||
751 : | my @pegs = $sub->get_pegs_from_cell($row, $col); | ||
752 : | # Only proceed if features exist. | ||
753 : | if (@pegs > 0) { | ||
754 : | # Create the spreadsheet cell. | ||
755 : | $cellCount++; | ||
756 : | my $cellID = "$subsysID:$genomeID:$col"; | ||
757 : | $loadSSCell->Put($cellID); | ||
758 : | $loadIsGenomeOf->Put($genomeID, $cellID); | ||
759 : | $loadIsRoleOf->Put($roleID, $cellID); | ||
760 : | $loadHasSSCell->Put($subsysID, $cellID); | ||
761 : | # Remember its features. | ||
762 : | push @pegsFound, @pegs; | ||
763 : | $cellPegs{$cellID} = \@pegs; | ||
764 : | $pegCount += @pegs; | ||
765 : | } | ||
766 : | parrello | 1.1 | } |
767 : | parrello | 1.23 | # If we found some cells for this genome, we need to compute clusters and |
768 : | # denote it participates in the subsystem. | ||
769 : | if ($pegCount > 0) { | ||
770 : | Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3); | ||
771 : | $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode); | ||
772 : | # Partition the PEGs found into clusters. | ||
773 : | my @clusters = $fig->compute_clusters(\@pegsFound, $sub); | ||
774 : | # Create a hash mapping PEG IDs to cluster numbers. | ||
775 : | # We default to -1 for all of them. | ||
776 : | my %clusterOf = map { $_ => -1 } @pegsFound; | ||
777 : | for (my $i = 0; $i <= $#clusters; $i++) { | ||
778 : | my $subList = $clusters[$i]; | ||
779 : | for my $peg (@{$subList}) { | ||
780 : | $clusterOf{$peg} = $i; | ||
781 : | } | ||
782 : | parrello | 1.18 | } |
783 : | parrello | 1.23 | # Create the ContainsFeature data. |
784 : | for my $cellID (keys %cellPegs) { | ||
785 : | my $cellList = $cellPegs{$cellID}; | ||
786 : | for my $cellPeg (@$cellList) { | ||
787 : | $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg}); | ||
788 : | } | ||
789 : | parrello | 1.18 | } |
790 : | } | ||
791 : | parrello | 1.15 | } |
792 : | parrello | 1.1 | } |
793 : | parrello | 1.23 | # Now we need to generate the subsets. The subset names must be concatenated to |
794 : | # the subsystem name to make them unique keys. There are two types of subsets: | ||
795 : | # genome subsets and role subsets. We do the role subsets first. | ||
796 : | my @subsetNames = $sub->get_subset_names(); | ||
797 : | for my $subsetID (@subsetNames) { | ||
798 : | # Create the subset record. | ||
799 : | my $actualID = "$subsysID:$subsetID"; | ||
800 : | $loadRoleSubset->Put($actualID); | ||
801 : | # Connect the subset to the subsystem. | ||
802 : | $loadHasRoleSubset->Put($subsysID, $actualID); | ||
803 : | # Connect the subset to its roles. | ||
804 : | my @roles = $sub->get_subset($subsetID); | ||
805 : | for my $roleID (@roles) { | ||
806 : | $loadConsistsOfRoles->Put($actualID, $roleID); | ||
807 : | } | ||
808 : | parrello | 1.18 | } |
809 : | parrello | 1.23 | # Next the genome subsets. |
810 : | @subsetNames = $sub->get_subset_namesR(); | ||
811 : | for my $subsetID (@subsetNames) { | ||
812 : | # Create the subset record. | ||
813 : | my $actualID = "$subsysID:$subsetID"; | ||
814 : | $loadGenomeSubset->Put($actualID); | ||
815 : | # Connect the subset to the subsystem. | ||
816 : | $loadHasGenomeSubset->Put($subsysID, $actualID); | ||
817 : | # Connect the subset to its genomes. | ||
818 : | my @genomes = $sub->get_subsetR($subsetID); | ||
819 : | for my $genomeID (@genomes) { | ||
820 : | $loadConsistsOfGenomes->Put($actualID, $genomeID); | ||
821 : | } | ||
822 : | parrello | 1.18 | } |
823 : | } | ||
824 : | parrello | 1.23 | # Now we loop through the diagrams. We need to create the diagram records |
825 : | # and link each diagram to its roles. Note that only roles which occur | ||
826 : | # in subsystems (and therefore appear in the %ecToRoles hash) are | ||
827 : | # included. | ||
828 : | for my $map (@maps) { | ||
829 : | Trace("Loading diagram $map.") if T(3); | ||
830 : | # Get the diagram's descriptive name. | ||
831 : | my $name = $fig->map_name($map); | ||
832 : | $loadDiagram->Put($map, $name); | ||
833 : | # Now we need to link all the map's roles to it. | ||
834 : | # A hash is used to prevent duplicates. | ||
835 : | my %roleHash = (); | ||
836 : | for my $role ($fig->map_to_ecs($map)) { | ||
837 : | if (exists $ecToRoles{$role} && ! $roleHash{$role}) { | ||
838 : | $loadRoleOccursIn->Put($ecToRoles{$role}, $map); | ||
839 : | $roleHash{$role} = 1; | ||
840 : | } | ||
841 : | parrello | 1.21 | } |
842 : | } | ||
843 : | parrello | 1.23 | # Before we leave, we must create the Catalyzes table. We start with the reactions, |
844 : | # then use the "ecToRoles" table to convert EC numbers to role IDs. | ||
845 : | my @reactions = $fig->all_reactions(); | ||
846 : | for my $reactionID (@reactions) { | ||
847 : | # Get this reaction's list of roles. The results will be EC numbers. | ||
848 : | my @roles = $fig->catalyzed_by($reactionID); | ||
849 : | # Loop through the roles, creating catalyzation records. | ||
850 : | for my $thisRole (@roles) { | ||
851 : | if (exists $ecToRoles{$thisRole}) { | ||
852 : | $loadCatalyzes->Put($ecToRoles{$thisRole}, $reactionID); | ||
853 : | } | ||
854 : | parrello | 1.18 | } |
855 : | } | ||
856 : | parrello | 1.1 | } |
857 : | # Finish the load. | ||
858 : | my $retVal = $self->_FinishAll(); | ||
859 : | return $retVal; | ||
860 : | } | ||
861 : | |||
=head3 LoadPropertyData

C<< my $stats = $spl->LoadPropertyData(); >>

Copy the FIG feature attributes into the Sprout property tables.

An attribute in FIG is what Sprout calls a property. Each distinct key-value
attribute combination found in the SEED becomes one record in the B<Property>
table, and the B<HasProperty> relationship connects each feature to the
properties it carries.

The SEED also allows attributes to be assigned to genomes, but this is not yet
supported by Sprout, so only feature attributes are processed.

The following relations are loaded by this method.

    HasProperty
    Property

=over 4

=item RETURNS

Returns a statistics object for the loads.

=back

=cut
#: Return Type $%;
sub LoadPropertyData {
    my $self = shift;
    # The FIG object supplies the features and their attributes.
    my $fig = $self->{fig};
    # The keys of this hash are the IDs of the genomes to process.
    my $genomes = $self->{genomes};
    # Set up one loader per relation.
    my $propertyLoader = $self->_TableLoader('Property');
    my $hasPropertyLoader = $self->_TableLoader('HasProperty', $self->PrimaryOnly);
    if (! $self->{options}->{loadOnly}) {
        Trace("Generating property data.") if T(2);
        # Maps each "key<tab>value" string to the property ID assigned to it.
        # IDs start at 1, so a false lookup result always means "not seen yet".
        my %idOfProperty = ();
        my $nextID = 1;
        for my $genomeID (keys %{$genomes}) {
            $propertyLoader->Add("genomeIn");
            Trace("Generating properties for $genomeID.") if T(3);
            # Per-genome counters, reported in the trace and the statistics.
            my ($featuresWithData, $attributesSeen) = (0, 0);
            # "all_features_detailed" is used rather than "all_features" because
            # we want every feature regardless of type. The feature ID is the
            # first field of each tuple it returns.
            for my $featureData (@{$fig->all_features_detailed($genomeID)}) {
                my $fid = $featureData->[0];
                # Attributes are requested one feature at a time so that no
                # genome-level attributes are picked up.
                my @attributes = $fig->get_attributes($fid, '', '', '');
                # A feature only counts if it has at least one attribute.
                next if ! @attributes;
                $featuresWithData++;
                $attributesSeen += scalar @attributes;
                for my $attribute (@attributes) {
                    # The first tuple element is the feature ID again, which
                    # we already have in "$fid", so it is discarded.
                    my (undef, $key, $value, $url) = @{$attribute};
                    # The key and value are joined with a tab to form the
                    # lookup string for the property ID.
                    my $lookupKey = "$key\t$value";
                    my $propertyID = $idOfProperty{$lookupKey};
                    unless ($propertyID) {
                        # First sighting of this key/value pair: allocate the
                        # next ID and write the Property record.
                        $propertyID = $nextID++;
                        $idOfProperty{$lookupKey} = $propertyID;
                        $propertyLoader->Put($propertyID, $key, $value);
                    }
                    # Link the feature to the property.
                    $hasPropertyLoader->Put($fid, $propertyID, $url);
                }
            }
            # Record what was found for this genome.
            Trace("$attributesSeen attributes processed for $featuresWithData features.") if T(3);
            $hasPropertyLoader->Add("featuresIn", $featuresWithData);
            $hasPropertyLoader->Add("propertiesIn", $attributesSeen);
        }
    } else {
        Trace("Loading from existing files.") if T(2);
    }
    # Finish all the loads and hand back the accumulated statistics.
    my $stats = $self->_FinishAll();
    return $stats;
}
960 : | |||
=head3 LoadAnnotationData

C<< my $stats = $spl->LoadAnnotationData(); >>

Load the annotation data from FIG into Sprout.

Sprout annotations encompass both the assignments and the annotations in SEED.
These describe the function performed by a PEG as well as any other useful
information that may aid in identifying its purpose.

The following relations are loaded by this method.

    Annotation
    IsTargetOfAnnotation
    SproutUser
    UserAccess
    MadeAnnotation

=over 4

=item RETURNS

Returns a statistics object for the loads.

=back

=cut
#: Return Type $%;
sub LoadAnnotationData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    # Create load objects for each of the tables we're loading.
    my $loadAnnotation = $self->_TableLoader('Annotation');
    my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $self->PrimaryOnly);
    my $loadSproutUser = $self->_TableLoader('SproutUser', $self->PrimaryOnly);
    my $loadUserAccess = $self->_TableLoader('UserAccess', $self->PrimaryOnly);
    my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $self->PrimaryOnly);
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating annotation data.") if T(2);
        # Create a hash of user names. We'll use this to prevent us from generating
        # duplicate user records. FIG and "master" are pre-registered because their
        # records are written unconditionally just below.
        my %users = ( FIG => 1, master => 1 );
        # Put in FIG and "master".
        $loadSproutUser->Put("FIG", "Fellowship for Interpretation of Genomes");
        $loadUserAccess->Put("FIG", 1);
        $loadSproutUser->Put("master", "Master User");
        $loadUserAccess->Put("master", 1);
        # Loop through the genomes.
        for my $genomeID (sort keys %{$genomeHash}) {
            Trace("Processing $genomeID.") if T(3);
            # Get the genome's PEGs.
            my @pegs = $fig->pegs_of($genomeID);
            for my $peg (@pegs) {
                Trace("Processing $peg.") if T(4);
                # Create a hash of timestamps. We use this to prevent duplicate time
                # stamps from showing up for a single PEG's annotations.
                my %seenTimestamps = ();
                # Loop through the annotations.
                for my $tuple ($fig->feature_annotations($peg, "raw")) {
                    # The first tuple element is the feature ID, which we already
                    # have in "$peg", so it is discarded.
                    my (undef, $timestamp, $user, $text) = @{$tuple};
                    # Here we fix up the annotation text. "\r" is removed, and "\t"
                    # and "\n" are escaped. The substitutions are global, so every
                    # occurrence in a multi-line text is processed.
                    $text =~ s/\r//gs;
                    $text =~ s/\t/\\t/gs;
                    $text =~ s/\n/\\n/gs;
                    # Change assignments by the master user to FIG assignments.
                    $text =~ s/Set master function/Set FIG function/s;
                    # Insure the time stamp is valid.
                    if ($timestamp =~ /^\d+$/) {
                        # Here it's a number. We need to insure the one we use to form
                        # the key is unique, so we bump it until we find a free value.
                        my $keyStamp = $timestamp;
                        while ($seenTimestamps{$keyStamp}) {
                            $keyStamp++;
                        }
                        $seenTimestamps{$keyStamp} = 1;
                        my $annotationID = "$peg:$keyStamp";
                        # Insure the user exists.
                        if (! $users{$user}) {
                            $loadSproutUser->Put($user, "SEED user");
                            $loadUserAccess->Put($user, 1);
                            $users{$user} = 1;
                        }
                        # Generate the annotation. Note the record stores the original
                        # time stamp; only the ID uses the de-duplicated value.
                        $loadAnnotation->Put($annotationID, $timestamp, $text);
                        $loadIsTargetOfAnnotation->Put($peg, $annotationID);
                        $loadMadeAnnotation->Put($user, $annotationID);
                    } else {
                        # Here we have an invalid time stamp. The annotation is skipped.
                        Trace("Invalid time stamp \"$timestamp\" in annotations for $peg.") if T(1);
                    }
                }
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1069 : | |||
=head3 LoadSourceData

C<< my $stats = $spl->LoadSourceData(); >>

Load the source data from FIG into Sprout.

Source data links genomes to information about the organizations that
mapped them.

The following relations are loaded by this method.

    ComesFrom
    Source
    SourceURL

There is no direct support for source attribution in FIG, so we access the SEED
files directly. The first line of each genome's C<PROJECT> file is read and split
on tabs into a source ID, a description, and a URL. Genomes whose C<PROJECT> file
is missing, empty, or has no source ID are skipped.

=over 4

=item RETURNS

Returns a statistics object for the loads.

=back

=cut
#: Return Type $%;
sub LoadSourceData {
    # Get this object instance.
    my ($self) = @_;
    # Get the genome hash. The FIG object is not needed here, because the
    # data comes straight from the SEED organism directories.
    my $genomeHash = $self->{genomes};
    # Create load objects for each of the tables we're loading.
    my $loadComesFrom = $self->_TableLoader('ComesFrom', $self->PrimaryOnly);
    my $loadSource = $self->_TableLoader('Source');
    my $loadSourceURL = $self->_TableLoader('SourceURL');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating source data.") if T(2);
        # Create hashes to collect the Source information. "sourceURL" tracks the
        # sources whose URL has already been written; "sourceDesc" holds the
        # description that will be written for each source at the end.
        my %sourceURL = ();
        my %sourceDesc = ();
        # Loop through the genomes.
        for my $genomeID (sort keys %{$genomeHash}) {
            Trace("Processing $genomeID.") if T(3);
            # Open the project file. A genome without one is silently skipped.
            my $projectFile = "$FIG_Config::organisms/$genomeID/PROJECT";
            if (open(my $projectHandle, '<', $projectFile)) {
                # Only the first line is of interest, so we can close the file
                # as soon as we have it.
                my $line = <$projectHandle>;
                close $projectHandle;
                if (defined $line) {
                    chomp $line;
                    my ($sourceID, $desc, $url) = split(/\t/, $line);
                    if (! defined $sourceID || $sourceID eq '') {
                        # A blank first line gives us nothing to link to.
                        Trace("No source ID found in PROJECT file for $genomeID.") if T(1);
                    } else {
                        $loadComesFrom->Put($genomeID, $sourceID);
                        # Only the first URL seen for a source is written.
                        if ($url && ! exists $sourceURL{$sourceID}) {
                            $loadSourceURL->Put($sourceID, $url);
                            $sourceURL{$sourceID} = 1;
                        }
                        # The most recent non-empty description wins. If none has
                        # been seen, the source ID stands in for the description.
                        if ($desc) {
                            $sourceDesc{$sourceID} = $desc;
                        } elsif (! exists $sourceDesc{$sourceID}) {
                            $sourceDesc{$sourceID} = $sourceID;
                        }
                    }
                }
            }
        }
        # Write the source descriptions.
        for my $sourceID (keys %sourceDesc) {
            $loadSource->Put($sourceID, $sourceDesc{$sourceID});
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1147 : | |||
=head3 LoadExternalData

C<< my $stats = $spl->LoadExternalData(); >>

Load the external data from FIG into Sprout.

External data contains information about external feature IDs.

The following relations are loaded by this method.

    ExternalAliasFunc
    ExternalAliasOrg

The support for external IDs in FIG is hidden beneath layers of other data, so
we access the SEED files directly to create these tables. This is also one of
the few load methods that does not proceed genome by genome: the global
C<ext_org.table> and C<ext_func.table> files are each read once from start
to finish.

=over 4

=item RETURNS

Returns a statistics object for the loads.

=back

=cut
#: Return Type $%;
sub LoadExternalData {
    # Get this object instance. Neither the FIG object nor the genome hash is
    # needed, because both input files are global rather than per-genome.
    my ($self) = @_;
    # Create load objects for each of the tables we're loading.
    my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
    my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating external data.") if T(2);
        # We loop through the files one at a time. First, the organism file.
        # NOTE(review): "Open" is not the Perl built-in; presumably it is the
        # Tracer utility and fails loudly if the file is missing -- confirm.
        Open(\*ORGS, "<$FIG_Config::global/ext_org.table");
        my $orgLine;
        while (defined($orgLine = <ORGS>)) {
            # Clean the input line.
            chomp $orgLine;
            # Only proceed if the line is non-blank, so that an empty line
            # does not produce a record with no ID.
            if ($orgLine) {
                # Parse the organism name.
                my ($protID, $name) = split /\s*\t\s*/, $orgLine;
                $loadExternalAliasOrg->Put($protID, $name);
            }
        }
        close ORGS;
        # Now the function file.
        my $funcLine;
        Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");
        while (defined($funcLine = <FUNCS>)) {
            # Clean the line ending.
            chomp $funcLine;
            # Only proceed if the line is non-blank.
            if ($funcLine) {
                # Split it into fields.
                my @funcFields = split /\s*\t\s*/, $funcLine;
                # If there's an EC number, append it to the description.
                if ($#funcFields >= 2 && $funcFields[2] =~ /^(EC .*\S)/) {
                    $funcFields[1] .= " $1";
                }
                # Output the function line. Only the ID and the (possibly
                # extended) description are stored.
                $loadExternalAliasFunc->Put(@funcFields[0,1]);
            }
        }
        # Release the function file as well.
        close FUNCS;
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1226 : | parrello | 1.5 | |
1227 : | parrello | 1.18 | |
=head3 LoadReactionData

C<< my $stats = $spl->LoadReactionData(); >>

Load the reaction data from FIG into Sprout.

Reaction data connects reactions to the compounds that participate in them.

The following relations are loaded by this method.

    Reaction
    ReactionURL
    Compound
    CompoundName
    CompoundCAS
    IsAComponentOf

This method proceeds reaction by reaction rather than genome by genome.

=over 4

=item RETURNS

Returns a statistics object for the loads.

=back

=cut
#: Return Type $%;
sub LoadReactionData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Create load objects for each of the tables we're loading.
    my $loadReaction = $self->_TableLoader('Reaction');
    my $loadReactionURL = $self->_TableLoader('ReactionURL', $self->PrimaryOnly);
    my $loadCompound = $self->_TableLoader('Compound', $self->PrimaryOnly);
    my $loadCompoundName = $self->_TableLoader('CompoundName', $self->PrimaryOnly);
    my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $self->PrimaryOnly);
    my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $self->PrimaryOnly);
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating reaction data.") if T(2);
        # First we create the compounds.
        my @compounds = $fig->all_compounds();
        for my $cid (@compounds) {
            # Check for names.
            my @names = $fig->names_of_compound($cid);
            # Each name will be given a priority number, starting with 1.
            my $prio = 1;
            for my $name (@names) {
                $loadCompoundName->Put($cid, $name, $prio++);
            }
            # Create the main compound record. Note that the first name
            # becomes the label. A compound with no names is labeled with
            # its own ID.
            my $label = (@names > 0 ? $names[0] : $cid);
            $loadCompound->Put($cid, $label);
            # Check for a CAS ID.
            my $cas = $fig->cas($cid);
            if ($cas) {
                $loadCompoundCAS->Put($cid, $cas);
            }
        }
        # All the compounds are set up, so we need to loop through the reactions next.
        # First, we initialize the discriminator index. This is a single integer used
        # to insure duplicate elements in a reaction are not accidentally collapsed.
        # It is never reset, so each IsAComponentOf record gets a distinct value.
        my $discrim = 0;
        my @reactions = $fig->all_reactions();
        for my $reactionID (@reactions) {
            # Create the reaction record.
            $loadReaction->Put($reactionID, $fig->reversible($reactionID));
            # Compute the reaction's URL.
            my $url = HTML::reaction_link($reactionID);
            # Put it in the ReactionURL table.
            $loadReactionURL->Put($reactionID, $url);
            # Now we need all of the reaction's compounds. We get these in two phases,
            # substrates first and then products.
            for my $product (0, 1) {
                # Get the compounds of the current type for the current reaction. FIG
                # will give us 3-tuples: [ID, stoichiometry, main-flag]. At this time
                # we do not have location data in SEED, so it defaults to the empty
                # string.
                my @components = $fig->reaction2comp($reactionID, $product);
                for my $compData (@components) {
                    # Extract the compound data from the current tuple.
                    my ($cid, $stoich, $main) = @{$compData};
                    # Link the compound to the reaction.
                    $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main,
                                             $product, $stoich);
                }
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1326 : | |||
=head3 LoadGroupData

C<< my $stats = $spl->LoadGroupData(); >>

Load the genome Groups into Sprout.

The following relations are loaded by this method.

    GenomeGroups

There is no direct support for genome groups in FIG, so we access the SEED
files directly. The group name is the first line of the C<NMPDR> file in the
genome's organism directory. Genomes without such a file, or with an empty
one, get no group record.

=over 4

=item RETURNS

Returns a statistics object for the loads.

=back

=cut
#: Return Type $%;
sub LoadGroupData {
    # Get this object instance.
    my ($self) = @_;
    # Get the genome hash. The FIG object is not needed here, because the
    # data comes straight from the SEED organism directories.
    my $genomeHash = $self->{genomes};
    # Create a load object for the table we're loading.
    my $loadGenomeGroups = $self->_TableLoader('GenomeGroups');
    if ($self->{options}->{loadOnly}) {
        Trace("Loading from existing files.") if T(2);
    } else {
        Trace("Generating group data.") if T(2);
        # Loop through the genomes.
        for my $genomeID (keys %{$genomeHash}) {
            Trace("Processing $genomeID.") if T(3);
            # Open the NMPDR group file for this genome. A genome that has no
            # such file simply does not belong to a group.
            my $groupFile = "$FIG_Config::organisms/$genomeID/NMPDR";
            if (open(my $groupHandle, '<', $groupFile)) {
                # Only the first line matters, so the file can be closed as
                # soon as it has been read.
                my $line = <$groupHandle>;
                close $groupHandle;
                if (defined $line) {
                    # Clean the line ending.
                    chomp $line;
                    # Add the group to the table. Note that there can only be
                    # one group per genome.
                    $loadGenomeGroups->Put($genomeID, $line);
                }
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1383 : | |||
1384 : | parrello | 1.1 | =head2 Internal Utility Methods |
1385 : | |||
1386 : | =head3 TableLoader | ||
1387 : | |||
1388 : | Create an ERDBLoad object for the specified table. The object is also added to | ||
1389 : | the internal list in the C<loaders> property of this object. That enables the | ||
1390 : | L</FinishAll> method to terminate all the active loads. | ||
1391 : | |||
1392 : | This is an instance method. | ||
1393 : | |||
1394 : | =over 4 | ||
1395 : | |||
1396 : | =item tableName | ||
1397 : | |||
1398 : | Name of the table (relation) being loaded. | ||
1399 : | |||
1400 : | parrello | 1.25 | =item ignore |
1401 : | |||
1402 : | TRUE if the table should be ignored entirely, else FALSE. | ||
1403 : | |||
1404 : | parrello | 1.1 | =item RETURN |
1405 : | |||
1406 : | Returns an ERDBLoad object for loading the specified table. | ||
1407 : | |||
1408 : | =back | ||
1409 : | |||
1410 : | =cut | ||
1411 : | |||
sub _TableLoader {
    # Unpack the invocant, the relation name, and the optional ignore flag.
    # $ignore is handed to the ERDBLoad constructor unchanged (it is undef
    # when the caller supplies only a table name).
    my ($self, $tableName, $ignore) = @_;
    # Build the loader from this object's database handle, load directory
    # and load-only setting.
    my $loader = ERDBLoad->new(
        $self->{erdb},
        $tableName,
        $self->{loadDirectory},
        $self->LoadOnly,
        $ignore
    );
    # Remember the loader so that _FinishAll can find and finish it later.
    push @{$self->{loaders}}, $loader;
    # Hand the new loader back to the caller.
    return $loader;
}
1423 : | |||
1424 : | =head3 FinishAll | ||
1425 : | |||
1426 : | Finish all the active loads on this object. | ||
1427 : | |||
1428 : | When a load is started by L</TableLoader>, the controlling B<ERDBLoad> object is cached in | ||
1429 : | the list pointed to by the C<loaders> property of this object. This method pops the loaders | ||
1430 : | off the list and finishes them to flush out any accumulated residue. | ||
1431 : | |||
1432 : | This is an instance method. | ||
1433 : | |||
1434 : | =over 4 | ||
1435 : | |||
1436 : | =item RETURN | ||
1437 : | |||
1438 : | Returns a statistics object containing the accumulated statistics for the load. | ||
1439 : | |||
1440 : | =back | ||
1441 : | |||
1442 : | =cut | ||
1443 : | |||
sub _FinishAll {
    # Get this object instance.
    my ($self) = @_;
    # Statistics accumulated across every loader finished below.
    my $totals = Stats->new();
    # The loaders queued up by _TableLoader.
    my $pending = $self->{loaders};
    # Drain the queue from the end, so the most recently created loader is
    # finished first. Nothing here traps errors: a failure while finishing
    # or loading propagates straight to the caller.
    while (my $loader = pop @{$pending}) {
        # Name of the relation this loader is responsible for.
        my $relName = $loader->RelName;
        # A loader flagged as ignored is skipped without being finished.
        if ($loader->Ignore) {
            Trace("Relation $relName not loaded.") if T(2);
            next;
        }
        # Here we really need to finish.
        Trace("Finishing $relName.") if T(2);
        my $stats = $loader->Finish();
        # With the dbLoad option set, the relation is also loaded into the
        # database, and the statistics of that step are folded into the
        # loader's own statistics.
        if ($self->{options}->{dbLoad}) {
            Trace("Loading relation $relName.") if T(2);
            my $dbStats = $self->{sprout}->LoadUpdate(1, [$relName]);
            $stats->Accumulate($dbStats);
        }
        # Roll this relation's numbers into the grand total and report them.
        $totals->Accumulate($stats);
        Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
    }
    # Return the load statistics.
    return $totals;
}
1477 : | |||
1478 : | 1; |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |