Parent Directory
|
Revision Log
Revision 1.9 - (view) (download) (as text)
1 : | parrello | 1.1 | #!/usr/bin/perl -w |
2 : | |||
3 : | package SproutLoad; | ||
4 : | |||
5 : | use strict; | ||
6 : | use Tracer; | ||
7 : | use PageBuilder; | ||
8 : | use ERDBLoad; | ||
9 : | use FIG; | ||
10 : | use Sprout; | ||
11 : | use Stats; | ||
12 : | use BasicLocation; | ||
13 : | |||
14 : | =head1 Sprout Load Methods | ||
15 : | |||
16 : | =head2 Introduction | ||
17 : | |||
18 : | This object contains the methods needed to copy data from the FIG data store to the | ||
19 : | Sprout database. It makes heavy use of the ERDBLoad object to manage the load into | ||
20 : | individual tables. The client can create an instance of this object and then | ||
21 : | call methods for each group of tables to load. For example, the following code will | ||
22 : | load the Genome- and Feature-related tables. (It is presumed the first command line | ||
23 : | parameter contains the name of a file specifying the genomes.) | ||
24 : | |||
25 : | my $fig = FIG->new(); | ||
26 : | my $sprout = SFXlate->new_sprout_only(); | ||
27 : | my $spl = SproutLoad->new($sprout, $fig, $ARGV[0]); | ||
28 : | my $stats = $spl->LoadGenomeData(); | ||
29 : | $stats->Accumulate($spl->LoadFeatureData()); | ||
30 : | print $stats->Show(); | ||
31 : | |||
32 : | This module makes use of the internal Sprout property C<_erdb>. | ||
33 : | |||
34 : | It is worth noting that the FIG object does not need to be a real one. Any object | ||
35 : | that implements the FIG methods for data retrieval could be used. So, for example, | ||
36 : | this object could be used to copy data from one Sprout database to another, or | ||
37 : | from any FIG-compliant data story implemented in the future. | ||
38 : | |||
39 : | To insure that this is possible, each time the FIG object is used, it will be via | ||
40 : | a variable called C<$fig>. This makes it fairly straightforward to determine which | ||
41 : | FIG methods are required to load the Sprout database. | ||
42 : | |||
43 : | parrello | 1.5 | This object creates the load files; however, the tables are not created until it |
44 : | is time to actually do the load from the files into the target database. | ||
45 : | |||
46 : | parrello | 1.1 | =cut |
47 : | |||
48 : | #: Constructor SproutLoad->new(); | ||
49 : | |||
50 : | =head2 Public Methods | ||
51 : | |||
52 : | =head3 new | ||
53 : | |||
54 : | parrello | 1.8 | C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >> |
55 : | parrello | 1.1 | |
56 : | Construct a new Sprout Loader object, specifying the two participating databases and | ||
57 : | the name of the files containing the list of genomes and subsystems to use. | ||
58 : | |||
59 : | =over 4 | ||
60 : | |||
61 : | =item sprout | ||
62 : | |||
63 : | Sprout object representing the target database. This also specifies the directory to | ||
64 : | be used for creating the load files. | ||
65 : | |||
66 : | =item fig | ||
67 : | |||
68 : | FIG object representing the source data store from which the data is to be taken. | ||
69 : | |||
70 : | =item genomeFile | ||
71 : | |||
72 : | Either the name of the file containing the list of genomes to load or a reference to | ||
73 : | a hash of genome IDs to access codes. If nothing is specified, all complete genomes | ||
74 : | will be loaded and the access code will default to 1. The genome list is presumed | ||
75 : | to be all-inclusive. In other words, all existing data in the target database will | ||
76 : | be deleted and replaced with the data on the specified genes. If a file is specified, | ||
77 : | it should contain one genome ID and access code per line, tab-separated. | ||
78 : | |||
79 : | =item subsysFile | ||
80 : | |||
81 : | Either the name of the file containing the list of trusted subsystems or a reference | ||
82 : | to a list of subsystem names. If nothing is specified, all known subsystems will be | ||
83 : | considered trusted. Only subsystem data related to the trusted subsystems is loaded. | ||
84 : | |||
85 : | parrello | 1.8 | =item options |
86 : | |||
87 : | Reference to a hash of command-line options. | ||
88 : | |||
89 : | parrello | 1.1 | =back |
90 : | |||
91 : | =cut | ||
92 : | |||
93 : | sub new { | ||
94 : | # Get the parameters. | ||
95 : | parrello | 1.8 | my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_; |
96 : | parrello | 1.1 | # Load the list of genomes into a hash. |
97 : | my %genomes; | ||
98 : | if (! defined($genomeFile) || $genomeFile eq '') { | ||
99 : | # Here we want all the complete genomes and an access code of 1. | ||
100 : | my @genomeList = $fig->genomes(1); | ||
101 : | %genomes = map { $_ => 1 } @genomeList; | ||
102 : | parrello | 1.3 | } else { |
103 : | my $type = ref $genomeFile; | ||
104 : | Trace("Genome file parameter type is \"$type\".") if T(3); | ||
105 : | if ($type eq 'HASH') { | ||
106 : | # Here the user specified a hash of genome IDs to access codes, which is | ||
107 : | # exactly what we want. | ||
108 : | %genomes = %{$genomeFile}; | ||
109 : | } elsif (! $type || $type eq 'SCALAR' ) { | ||
110 : | # The caller specified a file, so read the genomes from the file. (Note | ||
111 : | # that some PERLs return an empty string rather than SCALAR.) | ||
112 : | my @genomeList = Tracer::GetFile($genomeFile); | ||
113 : | if (! @genomeList) { | ||
114 : | # It's an error if the genome file is empty or not found. | ||
115 : | Confess("No genomes found in file \"$genomeFile\"."); | ||
116 : | } else { | ||
117 : | # We build the genome Hash using a loop rather than "map" so that | ||
118 : | # an omitted access code can be defaulted to 1. | ||
119 : | for my $genomeLine (@genomeList) { | ||
120 : | my ($genomeID, $accessCode) = split("\t", $genomeLine); | ||
121 : | if (undef $accessCode) { | ||
122 : | $accessCode = 1; | ||
123 : | } | ||
124 : | $genomes{$genomeID} = $accessCode; | ||
125 : | parrello | 1.1 | } |
126 : | } | ||
127 : | parrello | 1.3 | } else { |
128 : | Confess("Invalid genome parameter ($type) in SproutLoad constructor."); | ||
129 : | parrello | 1.1 | } |
130 : | } | ||
131 : | # Load the list of trusted subsystems. | ||
132 : | my %subsystems = (); | ||
133 : | if (! defined $subsysFile || $subsysFile eq '') { | ||
134 : | # Here we want all the subsystems. | ||
135 : | %subsystems = map { $_ => 1 } $fig->all_subsystems(); | ||
136 : | parrello | 1.4 | } else { |
137 : | my $type = ref $subsysFile; | ||
138 : | if ($type eq 'ARRAY') { | ||
139 : | # Here the user passed in a list of subsystems. | ||
140 : | %subsystems = map { $_ => 1 } @{$subsysFile}; | ||
141 : | } elsif (! $type || $type eq 'SCALAR') { | ||
142 : | # Here the list of subsystems is in a file. | ||
143 : | if (! -e $subsysFile) { | ||
144 : | # It's an error if the file does not exist. | ||
145 : | Confess("Trusted subsystem file not found."); | ||
146 : | } else { | ||
147 : | # GetFile automatically chomps end-of-line characters, so this | ||
148 : | # is an easy task. | ||
149 : | %subsystems = map { $_ => 1 } Tracer::GetFile($subsysFile); | ||
150 : | } | ||
151 : | parrello | 1.1 | } else { |
152 : | parrello | 1.4 | Confess("Invalid subsystem parameter in SproutLoad constructor."); |
153 : | parrello | 1.1 | } |
154 : | } | ||
155 : | # Get the data directory from the Sprout object. | ||
156 : | my ($directory) = $sprout->LoadInfo(); | ||
157 : | # Create the Sprout load object. | ||
158 : | my $retVal = { | ||
159 : | fig => $fig, | ||
160 : | genomes => \%genomes, | ||
161 : | subsystems => \%subsystems, | ||
162 : | sprout => $sprout, | ||
163 : | loadDirectory => $directory, | ||
164 : | erdb => $sprout->{_erdb}, | ||
165 : | parrello | 1.8 | loaders => [], |
166 : | options => $options | ||
167 : | parrello | 1.1 | }; |
168 : | # Bless and return it. | ||
169 : | bless $retVal, $class; | ||
170 : | return $retVal; | ||
171 : | } | ||
172 : | |||
173 : | =head3 LoadGenomeData | ||
174 : | |||
175 : | C<< my $stats = $spl->LoadGenomeData(); >> | ||
176 : | |||
177 : | Load the Genome, Contig, and Sequence data from FIG into Sprout. | ||
178 : | |||
179 : | The Sequence table is the largest single relation in the Sprout database, so this | ||
180 : | method is expected to be slow and clumsy. At some point we will need to make it | ||
181 : | restartable, since an error 10 gigabytes through a 20-gigabyte load is bound to be | ||
182 : | very annoying otherwise. | ||
183 : | |||
184 : | The following relations are loaded by this method. | ||
185 : | |||
186 : | Genome | ||
187 : | HasContig | ||
188 : | Contig | ||
189 : | IsMadeUpOf | ||
190 : | Sequence | ||
191 : | |||
192 : | =over 4 | ||
193 : | |||
194 : | =item RETURNS | ||
195 : | |||
196 : | Returns a statistics object for the loads. | ||
197 : | |||
198 : | =back | ||
199 : | |||
200 : | B<TO DO> | ||
201 : | |||
202 : | Real quality vectors instead of C<unknown> for everything. | ||
203 : | |||
204 : | GenomeGroup relation. (The original script took group information from the C<NMPDR> file | ||
205 : | in each genome's main directory, but no such file exists anywhere in my version of the | ||
206 : | data store.) | ||
207 : | |||
208 : | =cut | ||
209 : | #: Return Type $%; | ||
210 : | sub LoadGenomeData { | ||
211 : | # Get this object instance. | ||
212 : | my ($self) = @_; | ||
213 : | # Get the FIG object. | ||
214 : | my $fig = $self->{fig}; | ||
215 : | # Get the genome count. | ||
216 : | my $genomeHash = $self->{genomes}; | ||
217 : | my $genomeCount = (keys %{$genomeHash}); | ||
218 : | Trace("Beginning genome data load.") if T(2); | ||
219 : | # Create load objects for each of the tables we're loading. | ||
220 : | my $loadGenome = $self->_TableLoader('Genome', $genomeCount); | ||
221 : | my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300); | ||
222 : | my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300); | ||
223 : | my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000); | ||
224 : | my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000); | ||
225 : | # Now we loop through the genomes, generating the data for each one. | ||
226 : | for my $genomeID (sort keys %{$genomeHash}) { | ||
227 : | Trace("Loading data for genome $genomeID.") if T(3); | ||
228 : | parrello | 1.6 | $loadGenome->Add("genomeIn"); |
229 : | parrello | 1.1 | # The access code comes in via the genome hash. |
230 : | my $accessCode = $genomeHash->{$genomeID}; | ||
231 : | # Get the genus, species, and strain from the scientific name. Note that we append | ||
232 : | # the genome ID to the strain. In some cases this is the totality of the strain name. | ||
233 : | my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID); | ||
234 : | parrello | 1.4 | my $extra = join " ", @extraData, "[$genomeID]"; |
235 : | parrello | 1.1 | # Get the full taxonomy. |
236 : | my $taxonomy = $fig->taxonomy_of($genomeID); | ||
237 : | # Output the genome record. | ||
238 : | $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus, | ||
239 : | $species, $extra, $taxonomy); | ||
240 : | # Now we loop through each of the genome's contigs. | ||
241 : | my @contigs = $fig->all_contigs($genomeID); | ||
242 : | for my $contigID (@contigs) { | ||
243 : | Trace("Processing contig $contigID for $genomeID.") if T(4); | ||
244 : | parrello | 1.6 | $loadContig->Add("contigIn"); |
245 : | $loadSequence->Add("contigIn"); | ||
246 : | parrello | 1.1 | # Create the contig ID. |
247 : | my $sproutContigID = "$genomeID:$contigID"; | ||
248 : | # Create the contig record and relate it to the genome. | ||
249 : | $loadContig->Put($sproutContigID); | ||
250 : | $loadHasContig->Put($genomeID, $sproutContigID); | ||
251 : | # Now we need to split the contig into sequences. The maximum sequence size is | ||
252 : | # a property of the Sprout object. | ||
253 : | my $chunkSize = $self->{sprout}->MaxSequence(); | ||
254 : | # Now we get the sequence a chunk at a time. | ||
255 : | my $contigLen = $fig->contig_ln($genomeID, $contigID); | ||
256 : | for (my $i = 1; $i <= $contigLen; $i += $chunkSize) { | ||
257 : | parrello | 1.6 | $loadSequence->Add("chunkIn"); |
258 : | parrello | 1.1 | # Compute the endpoint of this chunk. |
259 : | my $end = FIG::min($i + $chunkSize - 1, $contigLen); | ||
260 : | # Get the actual DNA. | ||
261 : | my $dna = $fig->get_dna($genomeID, $contigID, $i, $end); | ||
262 : | # Compute the sequenceID. | ||
263 : | my $seqID = "$sproutContigID.$i"; | ||
264 : | # Write out the data. For now, the quality vector is always "unknown". | ||
265 : | $loadIsMadeUpOf->Put($sproutContigID, $seqID, $end + 1 - $i, $i); | ||
266 : | $loadSequence->Put($seqID, "unknown", $dna); | ||
267 : | } | ||
268 : | } | ||
269 : | } | ||
270 : | # Finish the loads. | ||
271 : | my $retVal = $self->_FinishAll(); | ||
272 : | # Return the result. | ||
273 : | return $retVal; | ||
274 : | } | ||
275 : | |||
276 : | =head3 LoadCouplingData | ||
277 : | |||
278 : | C<< my $stats = $spl->LoadCouplingData(); >> | ||
279 : | |||
280 : | Load the coupling and evidence data from FIG into Sprout. | ||
281 : | |||
282 : | The coupling data specifies which genome features are functionally coupled. The | ||
283 : | evidence data explains why the coupling is functional. | ||
284 : | |||
285 : | The following relations are loaded by this method. | ||
286 : | |||
287 : | Coupling | ||
288 : | IsEvidencedBy | ||
289 : | PCH | ||
290 : | ParticipatesInCoupling | ||
291 : | UsesAsEvidence | ||
292 : | |||
293 : | =over 4 | ||
294 : | |||
295 : | =item RETURNS | ||
296 : | |||
297 : | Returns a statistics object for the loads. | ||
298 : | |||
299 : | =back | ||
300 : | |||
301 : | =cut | ||
302 : | #: Return Type $%; | ||
303 : | sub LoadCouplingData { | ||
304 : | # Get this object instance. | ||
305 : | my ($self) = @_; | ||
306 : | # Get the FIG object. | ||
307 : | my $fig = $self->{fig}; | ||
308 : | # Get the genome hash. | ||
309 : | my $genomeFilter = $self->{genomes}; | ||
310 : | my $genomeCount = (keys %{$genomeFilter}); | ||
311 : | my $featureCount = $genomeCount * 4000; | ||
312 : | # Start the loads. | ||
313 : | my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount); | ||
314 : | my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000); | ||
315 : | my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000); | ||
316 : | my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000); | ||
317 : | my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000); | ||
318 : | Trace("Beginning coupling data load.") if T(2); | ||
319 : | # Loop through the genomes found. | ||
320 : | for my $genome (sort keys %{$genomeFilter}) { | ||
321 : | Trace("Generating coupling data for $genome.") if T(3); | ||
322 : | parrello | 1.6 | $loadCoupling->Add("genomeIn"); |
323 : | parrello | 1.1 | # Create a hash table for holding coupled pairs. We use this to prevent |
324 : | # duplicates. For example, if A is coupled to B, we don't want to also | ||
325 : | # assert that B is coupled to A, because we already know it. Fortunately, | ||
326 : | # all couplings occur within a genome, so we can keep the hash table | ||
327 : | # size reasonably small. | ||
328 : | my %dupHash = (); | ||
329 : | # Get all of the genome's PEGs. | ||
330 : | my @pegs = $fig->pegs_of($genome); | ||
331 : | # Loop through the PEGs. | ||
332 : | for my $peg1 (@pegs) { | ||
333 : | parrello | 1.6 | $loadCoupling->Add("pegIn"); |
334 : | parrello | 1.1 | Trace("Processing PEG $peg1 for $genome.") if T(4); |
335 : | # Get a list of the coupled PEGs. | ||
336 : | my @couplings = $fig->coupled_to($peg1); | ||
337 : | # For each coupled PEG, we need to verify that a coupling already | ||
338 : | # exists. If not, we have to create one. | ||
339 : | for my $coupleData (@couplings) { | ||
340 : | my ($peg2, $score) = @{$coupleData}; | ||
341 : | # Compute the coupling ID. | ||
342 : | my $coupleID = Sprout::CouplingID($peg1, $peg2); | ||
343 : | if (! exists $dupHash{$coupleID}) { | ||
344 : | parrello | 1.6 | $loadCoupling->Add("couplingIn"); |
345 : | parrello | 1.1 | # Here we have a new coupling to store in the load files. |
346 : | Trace("Storing coupling ($coupleID) with score $score.") if T(4); | ||
347 : | # Ensure we don't do this again. | ||
348 : | $dupHash{$coupleID} = $score; | ||
349 : | # Write the coupling record. | ||
350 : | $loadCoupling->Put($coupleID, $score); | ||
351 : | # Connect it to the coupled PEGs. | ||
352 : | $loadParticipatesInCoupling->Put($peg1, $coupleID, 1); | ||
353 : | $loadParticipatesInCoupling->Put($peg2, $coupleID, 2); | ||
354 : | # Get the evidence for this coupling. | ||
355 : | my @evidence = $fig->coupling_evidence($peg1, $peg2); | ||
356 : | # Organize the evidence into a hash table. | ||
357 : | my %evidenceMap = (); | ||
358 : | # Process each evidence item. | ||
359 : | for my $evidenceData (@evidence) { | ||
360 : | parrello | 1.6 | $loadPCH->Add("evidenceIn"); |
361 : | parrello | 1.1 | my ($peg3, $peg4, $usage) = @{$evidenceData}; |
362 : | # Only proceed if the evidence is from a Sprout | ||
363 : | # genome. | ||
364 : | if ($genomeFilter->{$fig->genome_of($peg3)}) { | ||
365 : | parrello | 1.6 | $loadUsesAsEvidence->Add("evidenceChosen"); |
366 : | parrello | 1.1 | my $evidenceKey = "$coupleID $peg3 $peg4"; |
367 : | # We store this evidence in the hash if the usage | ||
368 : | # is nonzero or no prior evidence has been found. This | ||
369 : | # insures that if there is duplicate evidence, we | ||
370 : | # at least keep the meaningful ones. Only evidence is | ||
371 : | # the hash makes it to the output. | ||
372 : | if ($usage || ! exists $evidenceMap{$evidenceKey}) { | ||
373 : | $evidenceMap{$evidenceKey} = $evidenceData; | ||
374 : | } | ||
375 : | } | ||
376 : | } | ||
377 : | for my $evidenceID (keys %evidenceMap) { | ||
378 : | # Create the evidence record. | ||
379 : | my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}}; | ||
380 : | $loadPCH->Put($evidenceID, $usage); | ||
381 : | # Connect it to the coupling. | ||
382 : | $loadIsEvidencedBy->Put($coupleID, $evidenceID); | ||
383 : | # Connect it to the features. | ||
384 : | $loadUsesAsEvidence->Put($evidenceID, $peg3, 1); | ||
385 : | $loadUsesAsEvidence->Put($evidenceID, $peg4, 1); | ||
386 : | } | ||
387 : | } | ||
388 : | } | ||
389 : | } | ||
390 : | } | ||
391 : | # All done. Finish the load. | ||
392 : | my $retVal = $self->_FinishAll(); | ||
393 : | return $retVal; | ||
394 : | } | ||
395 : | |||
396 : | =head3 LoadFeatureData | ||
397 : | |||
398 : | C<< my $stats = $spl->LoadFeatureData(); >> | ||
399 : | |||
400 : | Load the feature data from FIG into Sprout. | ||
401 : | |||
402 : | Features represent annotated genes, and are therefore the heart of the data store. | ||
403 : | |||
404 : | The following relations are loaded by this method. | ||
405 : | |||
406 : | Feature | ||
407 : | FeatureAlias | ||
408 : | FeatureLink | ||
409 : | FeatureTranslation | ||
410 : | FeatureUpstream | ||
411 : | IsLocatedIn | ||
412 : | |||
413 : | =over 4 | ||
414 : | |||
415 : | =item RETURNS | ||
416 : | |||
417 : | Returns a statistics object for the loads. | ||
418 : | |||
419 : | =back | ||
420 : | |||
421 : | =cut | ||
422 : | #: Return Type $%; | ||
423 : | sub LoadFeatureData { | ||
424 : | # Get this object instance. | ||
425 : | my ($self) = @_; | ||
426 : | # Get the FIG object. | ||
427 : | my $fig = $self->{fig}; | ||
428 : | parrello | 1.8 | # Find out if this is a limited run. |
429 : | my $limited = $self->{options}->{limitedFeatures}; | ||
430 : | parrello | 1.1 | # Get the table of genome IDs. |
431 : | my $genomeHash = $self->{genomes}; | ||
432 : | my $genomeCount = (keys %{$genomeHash}); | ||
433 : | my $featureCount = $genomeCount * 4000; | ||
434 : | # Create load objects for each of the tables we're loading. | ||
435 : | my $loadFeature = $self->_TableLoader('Feature', $featureCount); | ||
436 : | my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount); | ||
437 : | parrello | 1.8 | my ($loadFeatureAlias, $loadFeatureLink, $loadFeatureTranslation, $loadFeatureUpstream); |
438 : | if (! $limited) { | ||
439 : | $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6); | ||
440 : | $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10); | ||
441 : | $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount); | ||
442 : | $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount); | ||
443 : | } | ||
444 : | parrello | 1.1 | # Get the maximum sequence size. We need this later for splitting up the |
445 : | # locations. | ||
446 : | my $chunkSize = $self->{sprout}->MaxSegment(); | ||
447 : | Trace("Beginning feature data load.") if T(2); | ||
448 : | # Now we loop through the genomes, generating the data for each one. | ||
449 : | for my $genomeID (sort keys %{$genomeHash}) { | ||
450 : | Trace("Loading features for genome $genomeID.") if T(3); | ||
451 : | parrello | 1.6 | $loadFeature->Add("genomeIn"); |
452 : | parrello | 1.1 | # Get the feature list for this genome. |
453 : | my $features = $fig->all_features_detailed($genomeID); | ||
454 : | # Loop through the features. | ||
455 : | for my $featureData (@{$features}) { | ||
456 : | parrello | 1.6 | $loadFeature->Add("featureIn"); |
457 : | parrello | 1.1 | # Split the tuple. |
458 : | my ($featureID, $locations, $aliases, $type) = @{$featureData}; | ||
459 : | # Create the feature record. | ||
460 : | parrello | 1.7 | $loadFeature->Put($featureID, 1, $type); |
461 : | parrello | 1.8 | # The next stuff is for a full load only. |
462 : | if (! $limited) { | ||
463 : | # Create the aliases. | ||
464 : | for my $alias (split /\s*,\s*/, $aliases) { | ||
465 : | $loadFeatureAlias->Put($featureID, $alias); | ||
466 : | } | ||
467 : | # Get the links. | ||
468 : | my @links = $fig->fid_links($featureID); | ||
469 : | for my $link (@links) { | ||
470 : | $loadFeatureLink->Put($featureID, $link); | ||
471 : | parrello | 1.1 | } |
472 : | parrello | 1.8 | # If this is a peg, generate the translation and the upstream. |
473 : | if ($type eq 'peg') { | ||
474 : | $loadFeatureTranslation->Add("pegIn"); | ||
475 : | my $translation = $fig->get_translation($featureID); | ||
476 : | if ($translation) { | ||
477 : | $loadFeatureTranslation->Put($featureID, $translation); | ||
478 : | } | ||
479 : | # We use the default upstream values of u=200 and c=100. | ||
480 : | my $upstream = $fig->upstream_of($featureID, 200, 100); | ||
481 : | if ($upstream) { | ||
482 : | $loadFeatureUpstream->Put($featureID, $upstream); | ||
483 : | } | ||
484 : | parrello | 1.1 | } |
485 : | } | ||
486 : | # This part is the roughest. We need to relate the features to contig | ||
487 : | # locations, and the locations must be split so that none of them exceed | ||
488 : | # the maximum segment size. This simplifies the genes_in_region processing | ||
489 : | # for Sprout. | ||
490 : | parrello | 1.9 | my @locationList = map { "$genomeID:$_" } split /\s*,\s*/, $locations; |
491 : | parrello | 1.8 | # Create the location position indicator. |
492 : | my $i = 1; | ||
493 : | parrello | 1.1 | # Loop through the locations. |
494 : | for my $location (@locationList) { | ||
495 : | # Parse the location. | ||
496 : | my $locObject = BasicLocation->new($location); | ||
497 : | # Split it into a list of chunks. | ||
498 : | my @locOList = (); | ||
499 : | while (my $peeling = $locObject->Peel($chunkSize)) { | ||
500 : | parrello | 1.6 | $loadIsLocatedIn->Add("peeling"); |
501 : | parrello | 1.1 | push @locOList, $peeling; |
502 : | } | ||
503 : | push @locOList, $locObject; | ||
504 : | # Loop through the chunks, creating IsLocatedIn records. The variable | ||
505 : | # "$i" will be used to keep the location index. | ||
506 : | parrello | 1.8 | for my $locChunk (@locOList) { |
507 : | parrello | 1.1 | $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left, |
508 : | $locChunk->Dir, $locChunk->Length, $i); | ||
509 : | $i++; | ||
510 : | } | ||
511 : | } | ||
512 : | } | ||
513 : | } | ||
514 : | # Finish the loads. | ||
515 : | my $retVal = $self->_FinishAll(); | ||
516 : | return $retVal; | ||
517 : | } | ||
518 : | |||
519 : | =head3 LoadBBHData | ||
520 : | |||
521 : | C<< my $stats = $spl->LoadBBHData(); >> | ||
522 : | |||
523 : | Load the bidirectional best hit data from FIG into Sprout. | ||
524 : | |||
525 : | Sprout does not store information on similarities. Instead, it has only the | ||
526 : | bi-directional best hits. Even so, the BBH table is one of the largest in | ||
527 : | the database. | ||
528 : | |||
529 : | The following relations are loaded by this method. | ||
530 : | |||
531 : | IsBidirectionalBestHitOf | ||
532 : | |||
533 : | =over 4 | ||
534 : | |||
535 : | =item RETURNS | ||
536 : | |||
537 : | Returns a statistics object for the loads. | ||
538 : | |||
539 : | =back | ||
540 : | |||
541 : | =cut | ||
542 : | #: Return Type $%; | ||
543 : | parrello | 1.2 | sub LoadBBHData { |
544 : | parrello | 1.1 | # Get this object instance. |
545 : | my ($self) = @_; | ||
546 : | # Get the FIG object. | ||
547 : | my $fig = $self->{fig}; | ||
548 : | # Get the table of genome IDs. | ||
549 : | my $genomeHash = $self->{genomes}; | ||
550 : | my $genomeCount = (keys %{$genomeHash}); | ||
551 : | my $featureCount = $genomeCount * 4000; | ||
552 : | # Create load objects for each of the tables we're loading. | ||
553 : | my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf', | ||
554 : | $featureCount * $genomeCount); | ||
555 : | Trace("Beginning BBH load.") if T(2); | ||
556 : | # Now we loop through the genomes, generating the data for each one. | ||
557 : | for my $genomeID (sort keys %{$genomeHash}) { | ||
558 : | parrello | 1.6 | $loadIsBidirectionalBestHitOf->Add("genomeIn"); |
559 : | parrello | 1.1 | Trace("Processing features for genome $genomeID.") if T(3); |
560 : | # Get the feature list for this genome. | ||
561 : | my $features = $fig->all_features_detailed($genomeID); | ||
562 : | # Loop through the features. | ||
563 : | for my $featureData (@{$features}) { | ||
564 : | # Split the tuple. | ||
565 : | my ($featureID, $locations, $aliases, $type) = @{$featureData}; | ||
566 : | # Get the bi-directional best hits. | ||
567 : | my @bbhList = $fig->bbhs($featureID); | ||
568 : | for my $bbhEntry (@bbhList) { | ||
569 : | # Get the target feature ID and the score. | ||
570 : | my ($targetID, $score) = @{$bbhEntry}; | ||
571 : | # Check the target feature's genome. | ||
572 : | my $targetGenomeID = $fig->genome_of($targetID); | ||
573 : | # Only proceed if it's one of our genomes. | ||
574 : | if ($genomeHash->{$targetGenomeID}) { | ||
575 : | $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID, | ||
576 : | $score); | ||
577 : | } | ||
578 : | } | ||
579 : | } | ||
580 : | } | ||
581 : | # Finish the loads. | ||
582 : | my $retVal = $self->_FinishAll(); | ||
583 : | return $retVal; | ||
584 : | } | ||
585 : | |||
586 : | =head3 LoadSubsystemData | ||
587 : | |||
588 : | C<< my $stats = $spl->LoadSubsystemData(); >> | ||
589 : | |||
590 : | Load the subsystem data from FIG into Sprout. | ||
591 : | |||
592 : | Subsystems are groupings of genetic roles that work together to effect a specific | ||
593 : | chemical reaction. Similar organisms require similar subsystems. To curate a subsystem, | ||
594 : | a spreadsheet is created with genomes on one axis and subsystem roles on the other | ||
595 : | axis. Similar features are then mapped into the cells, allowing the annotation of one | ||
596 : | genome's roles to be used to assist in the annotation of others. | ||
597 : | |||
598 : | The following relations are loaded by this method. | ||
599 : | |||
600 : | Subsystem | ||
601 : | Role | ||
602 : | SSCell | ||
603 : | ContainsFeature | ||
604 : | IsGenomeOf | ||
605 : | IsRoleOf | ||
606 : | OccursInSubsystem | ||
607 : | ParticipatesIn | ||
608 : | HasSSCell | ||
609 : | |||
610 : | =over 4 | ||
611 : | |||
612 : | =item RETURNS | ||
613 : | |||
614 : | Returns a statistics object for the loads. | ||
615 : | |||
616 : | =back | ||
617 : | |||
618 : | B<TO DO> | ||
619 : | |||
620 : | Generate RoleName table? | ||
621 : | |||
622 : | =cut | ||
623 : | #: Return Type $%; | ||
624 : | sub LoadSubsystemData { | ||
625 : | # Get this object instance. | ||
626 : | my ($self) = @_; | ||
627 : | # Get the FIG object. | ||
628 : | my $fig = $self->{fig}; | ||
629 : | # Get the genome hash. We'll use it to filter the genomes in each | ||
630 : | # spreadsheet. | ||
631 : | my $genomeHash = $self->{genomes}; | ||
632 : | # Get the subsystem hash. This lists the subsystems we'll process. | ||
633 : | my $subsysHash = $self->{subsystems}; | ||
634 : | my @subsysIDs = sort keys %{$subsysHash}; | ||
635 : | my $subsysCount = @subsysIDs; | ||
636 : | my $genomeCount = (keys %{$genomeHash}); | ||
637 : | my $featureCount = $genomeCount * 4000; | ||
638 : | # Create load objects for each of the tables we're loading. | ||
639 : | my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount); | ||
640 : | my $loadRole = $self->_TableLoader('Role', $featureCount * 6); | ||
641 : | my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount); | ||
642 : | my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount); | ||
643 : | my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount); | ||
644 : | my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount); | ||
645 : | my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6); | ||
646 : | my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount); | ||
647 : | my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount); | ||
648 : | Trace("Beginning subsystem data load.") if T(2); | ||
649 : | # Loop through the subsystems. Our first task will be to create the | ||
650 : | # roles. We do this by looping through the subsystems and creating a | ||
651 : | # role hash. The hash tracks each role ID so that we don't create | ||
652 : | # duplicates. As we move along, we'll connect the roles and subsystems. | ||
653 : | my %roleData = (); | ||
654 : | for my $subsysID (@subsysIDs) { | ||
655 : | Trace("Creating subsystem $subsysID.") if T(3); | ||
656 : | parrello | 1.6 | $loadSubsystem->Add("subsystemIn"); |
657 : | parrello | 1.1 | # Create the subsystem record. |
658 : | $loadSubsystem->Put($subsysID); | ||
659 : | # Get the subsystem's roles. | ||
660 : | parrello | 1.6 | my @roles = $fig->subsystem_to_roles($subsysID); |
661 : | parrello | 1.1 | # Connect the roles to the subsystem. If a role is new, we create |
662 : | # a role record for it. | ||
663 : | for my $roleID (@roles) { | ||
664 : | parrello | 1.6 | $loadOccursInSubsystem->Add("roleIn"); |
665 : | parrello | 1.1 | $loadOccursInSubsystem->Put($roleID, $subsysID); |
666 : | if (! exists $roleData{$roleID}) { | ||
667 : | $loadRole->Put($roleID); | ||
668 : | $roleData{$roleID} = 1; | ||
669 : | } | ||
670 : | } | ||
671 : | # Now all roles for this subsystem have been filled in. We create the | ||
672 : | # spreadsheet by matches roles to genomes. To do this, we need to | ||
673 : | # get the genomes on the sheet. | ||
674 : | Trace("Creating subsystem $subsysID spreadsheet.") if T(3); | ||
675 : | my @genomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)}; | ||
676 : | for my $genomeID (@genomes) { | ||
677 : | # Only process this genome if it's one of ours. | ||
678 : | if (exists $genomeHash->{$genomeID}) { | ||
679 : | # Connect the genome to the subsystem. | ||
680 : | $loadParticipatesIn->Put($genomeID, $subsysID); | ||
681 : | # Loop through the subsystem's roles. We use an index because it is | ||
682 : | # part of the spreadsheet cell ID. | ||
683 : | for (my $i = 0; $i <= $#roles; $i++) { | ||
684 : | my $role = $roles[$i]; | ||
685 : | # Get the features in the spreadsheet cell for this genome and role. | ||
686 : | parrello | 1.6 | my @pegs = $fig->pegs_in_subsystem_cell($subsysID, $genomeID, $i); |
687 : | parrello | 1.1 | # Only proceed if features exist. |
688 : | if (@pegs > 0) { | ||
689 : | # Create the spreadsheet cell. | ||
690 : | my $cellID = "$subsysID:$genomeID:$i"; | ||
691 : | $loadSSCell->Put($cellID); | ||
692 : | $loadIsGenomeOf->Put($genomeID, $cellID); | ||
693 : | $loadIsRoleOf->Put($role, $cellID); | ||
694 : | $loadHasSSCell->Put($subsysID, $cellID); | ||
695 : | # Attach the features to it. | ||
696 : | for my $pegID (@pegs) { | ||
697 : | $loadContainsFeature->Put($cellID, $pegID); | ||
698 : | } | ||
699 : | } | ||
700 : | } | ||
701 : | } | ||
702 : | } | ||
703 : | } | ||
704 : | # Finish the load. | ||
705 : | my $retVal = $self->_FinishAll(); | ||
706 : | return $retVal; | ||
707 : | } | ||
708 : | |||
709 : | =head3 LoadDiagramData | ||
710 : | |||
711 : | C<< my $stats = $spl->LoadDiagramData(); >> | ||
712 : | |||
713 : | Load the diagram data from FIG into Sprout. | ||
714 : | |||
715 : | Diagrams are used to organize functional roles. The diagram shows the | ||
716 : | connections between chemicals that interact with a subsystem. | ||
717 : | |||
718 : | The following relations are loaded by this method. | ||
719 : | |||
720 : | Diagram | ||
721 : | RoleOccursIn | ||
722 : | |||
723 : | =over 4 | ||
724 : | |||
725 : | =item RETURNS | ||
726 : | |||
727 : | Returns a statistics object for the loads. | ||
728 : | |||
729 : | =back | ||
730 : | |||
731 : | =cut | ||
732 : | #: Return Type $%; | ||
733 : | sub LoadDiagramData { | ||
734 : | # Get this object instance. | ||
735 : | my ($self) = @_; | ||
736 : | # Get the FIG object. | ||
737 : | my $fig = $self->{fig}; | ||
738 : | # Get the map list. | ||
739 : | my @maps = $fig->all_maps; | ||
740 : | my $mapCount = @maps; | ||
741 : | my $genomeCount = (keys %{$self->{genomes}}); | ||
742 : | my $featureCount = $genomeCount * 4000; | ||
743 : | # Create load objects for each of the tables we're loading. | ||
744 : | my $loadDiagram = $self->_TableLoader('Diagram', $mapCount); | ||
745 : | my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6); | ||
746 : | Trace("Beginning diagram data load.") if T(2); | ||
747 : | # Loop through the diagrams. | ||
748 : | for my $map ($fig->all_maps) { | ||
749 : | Trace("Loading diagram $map.") if T(3); | ||
750 : | # Get the diagram's descriptive name. | ||
751 : | my $name = $fig->map_name($map); | ||
752 : | $loadDiagram->Put($map, $name); | ||
753 : | # Now we need to link all the map's roles to it. | ||
754 : | # A hash is used to prevent duplicates. | ||
755 : | my %roleHash = (); | ||
756 : | for my $role ($fig->map_to_ecs($map)) { | ||
757 : | if (! $roleHash{$role}) { | ||
758 : | $loadRoleOccursIn->Put($role, $map); | ||
759 : | $roleHash{$role} = 1; | ||
760 : | } | ||
761 : | } | ||
762 : | } | ||
763 : | # Finish the load. | ||
764 : | my $retVal = $self->_FinishAll(); | ||
765 : | return $retVal; | ||
766 : | } | ||
767 : | |||
768 : | =head3 LoadPropertyData | ||
769 : | |||
770 : | C<< my $stats = $spl->LoadPropertyData(); >> | ||
771 : | |||
772 : | Load the attribute data from FIG into Sprout. | ||
773 : | |||
774 : | Attribute data in FIG corresponds to the Sprout concept of Property. As currently | ||
775 : | implemented, each key-value attribute combination in the SEED corresponds to a | ||
776 : | record in the B<Property> table. The B<HasProperty> relationship links the | ||
777 : | features to the properties. | ||
778 : | |||
779 : | The SEED also allows attributes to be assigned to genomes, but this is not yet | ||
780 : | supported by Sprout. | ||
781 : | |||
782 : | The following relations are loaded by this method. | ||
783 : | |||
784 : | HasProperty | ||
785 : | Property | ||
786 : | |||
787 : | =over 4 | ||
788 : | |||
789 : | =item RETURNS | ||
790 : | |||
791 : | Returns a statistics object for the loads. | ||
792 : | |||
793 : | =back | ||
794 : | |||
795 : | =cut | ||
796 : | #: Return Type $%; | ||
797 : | sub LoadPropertyData { | ||
798 : | # Get this object instance. | ||
799 : | my ($self) = @_; | ||
800 : | # Get the FIG object. | ||
801 : | my $fig = $self->{fig}; | ||
802 : | # Get the genome hash. | ||
803 : | my $genomeHash = $self->{genomes}; | ||
804 : | my $genomeCount = (keys %{$genomeHash}); | ||
805 : | # Create load objects for each of the tables we're loading. | ||
806 : | my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500); | ||
807 : | my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500); | ||
808 : | Trace("Beginning property data load.") if T(2); | ||
809 : | # Create a hash for storing property IDs. | ||
810 : | my %propertyKeys = (); | ||
811 : | my $nextID = 1; | ||
812 : | # Loop through the genomes. | ||
813 : | for my $genomeID (keys %{$genomeHash}) { | ||
814 : | parrello | 1.6 | $loadProperty->Add("genomeIn"); |
815 : | parrello | 1.1 | # Get the genome's features. The feature ID is the first field in the |
816 : | # tuples returned by "all_features_detailed". We use "all_features_detailed" | ||
817 : | # rather than "all_features" because we want all features regardless of type. | ||
818 : | my @features = map { $_->[0] } @{$fig->all_features_detailed($genomeID)}; | ||
819 : | # Loop through the features, creating HasProperty records. | ||
820 : | for my $fid (@features) { | ||
821 : | parrello | 1.6 | $loadProperty->Add("featureIn"); |
822 : | parrello | 1.1 | # Get all attributes for this feature. We do this one feature at a time |
823 : | # to insure we do not get any genome attributes. | ||
824 : | my @attributeList = $fig->get_attributes($fid, '', '', ''); | ||
825 : | # Loop through the attributes. | ||
826 : | for my $tuple (@attributeList) { | ||
827 : | # Get this attribute value's data. Note that we throw away the FID, | ||
828 : | # since it will always be the same as the value if "$fid". | ||
829 : | my (undef, $key, $value, $url) = @{$tuple}; | ||
830 : | # Concatenate the key and value and check the "propertyKeys" hash to | ||
831 : | # see if we already have an ID for it. We use a tab for the separator | ||
832 : | # character. | ||
833 : | my $propertyKey = "$key\t$value"; | ||
834 : | # Use the concatenated value to check for an ID. If no ID exists, we | ||
835 : | # create one. | ||
836 : | my $propertyID = $propertyKeys{$propertyKey}; | ||
837 : | if (! $propertyID) { | ||
838 : | # Here we need to create a new property ID for this key/value pair. | ||
839 : | $propertyKeys{$propertyKey} = $nextID; | ||
840 : | $propertyID = $nextID; | ||
841 : | $nextID++; | ||
842 : | $loadProperty->Put($propertyID, $key, $value); | ||
843 : | } | ||
844 : | # Create the HasProperty entry for this feature/property association. | ||
845 : | $loadHasProperty->Put($fid, $propertyID, $url); | ||
846 : | } | ||
847 : | } | ||
848 : | } | ||
849 : | # Finish the load. | ||
850 : | my $retVal = $self->_FinishAll(); | ||
851 : | return $retVal; | ||
852 : | } | ||
853 : | |||
854 : | =head3 LoadAnnotationData | ||
855 : | |||
856 : | C<< my $stats = $spl->LoadAnnotationData(); >> | ||
857 : | |||
858 : | Load the annotation data from FIG into Sprout. | ||
859 : | |||
860 : | Sprout annotations encompass both the assignments and the annotations in SEED. | ||
861 : | These describe the function performed by a PEG as well as any other useful | ||
862 : | information that may aid in identifying its purpose. | ||
863 : | |||
864 : | The following relations are loaded by this method. | ||
865 : | |||
866 : | Annotation | ||
867 : | IsTargetOfAnnotation | ||
868 : | SproutUser | ||
869 : | MadeAnnotation | ||
870 : | |||
871 : | =over 4 | ||
872 : | |||
873 : | =item RETURNS | ||
874 : | |||
875 : | Returns a statistics object for the loads. | ||
876 : | |||
877 : | =back | ||
878 : | |||
879 : | =cut | ||
880 : | #: Return Type $%; | ||
881 : | sub LoadAnnotationData { | ||
882 : | # Get this object instance. | ||
883 : | my ($self) = @_; | ||
884 : | # Get the FIG object. | ||
885 : | my $fig = $self->{fig}; | ||
886 : | # Get the genome hash. | ||
887 : | my $genomeHash = $self->{genomes}; | ||
888 : | my $genomeCount = (keys %{$genomeHash}); | ||
889 : | # Create load objects for each of the tables we're loading. | ||
890 : | my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000); | ||
891 : | my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000); | ||
892 : | my $loadSproutUser = $self->_TableLoader('SproutUser', 100); | ||
893 : | my $loadUserAccess = $self->_TableLoader('UserAccess', 1000); | ||
894 : | my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000); | ||
895 : | Trace("Beginning annotation data load.") if T(2); | ||
896 : | # Create a hash of user names. We'll use this to prevent us from generating duplicate | ||
897 : | # user records. | ||
898 : | my %users = ( FIG => 1, master => 1 ); | ||
899 : | # Put in FIG and "master". | ||
900 : | $loadSproutUser->Put("FIG", "Fellowship for Interpretation of Genomes"); | ||
901 : | $loadUserAccess->Put("FIG", 1); | ||
902 : | $loadSproutUser->Put("master", "Master User"); | ||
903 : | $loadUserAccess->Put("master", 1); | ||
904 : | # Get the current time. | ||
905 : | my $time = time(); | ||
906 : | # Loop through the genomes. | ||
907 : | parrello | 1.6 | for my $genomeID (sort keys %{$genomeHash}) { |
908 : | parrello | 1.1 | Trace("Processing $genomeID.") if T(3); |
909 : | # Get the genome's PEGs. | ||
910 : | my @pegs = $fig->pegs_of($genomeID); | ||
911 : | for my $peg (@pegs) { | ||
912 : | Trace("Processing $peg.") if T(4); | ||
913 : | # Create a hash of timestamps. We use this to prevent duplicate time stamps | ||
914 : | # from showing up for a single PEG's annotations. | ||
915 : | my %seenTimestamps = (); | ||
916 : | # Check for a functional assignment. | ||
917 : | my $func = $fig->function_of($peg); | ||
918 : | if ($func) { | ||
919 : | # If this is NOT a hypothetical assignment, we create an | ||
920 : | # assignment annotation for it. | ||
921 : | if (! FIG::hypo($peg)) { | ||
922 : | # Note that we double the slashes so that what goes into the database is | ||
923 : | # a new-line escape sequence rather than an actual new-line. | ||
924 : | $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func"); | ||
925 : | $loadIsTargetOfAnnotation->Put($peg, "$peg:$time"); | ||
926 : | $loadMadeAnnotation->Put("FIG", "$peg:$time"); | ||
927 : | # Denote we've seen this timestamp. | ||
928 : | $seenTimestamps{$time} = 1; | ||
929 : | } | ||
930 : | # Now loop through the real annotations. | ||
931 : | for my $tuple ($fig->feature_annotations($peg, "raw")) { | ||
932 : | parrello | 1.6 | my ($fid, $timestamp, $user, $text) = @{$tuple}; |
933 : | parrello | 1.1 | # Here we fix up the annotation text. "\r" is removed, |
934 : | # and "\t" and "\n" are escaped. Note we use the "s" | ||
935 : | # modifier so that new-lines inside the text do not | ||
936 : | # stop the substitution search. | ||
937 : | $text =~ s/\r//gs; | ||
938 : | $text =~ s/\t/\\t/gs; | ||
939 : | $text =~ s/\n/\\n/gs; | ||
940 : | # Change assignments by the master user to FIG assignments. | ||
941 : | $text =~ s/Set master function/Set FIG function/s; | ||
942 : | # Insure the time stamp is valid. | ||
943 : | if ($timestamp =~ /^\d+$/) { | ||
944 : | # Here it's a number. We need to insure it's unique. | ||
945 : | while ($seenTimestamps{$timestamp}) { | ||
946 : | $timestamp++; | ||
947 : | } | ||
948 : | $seenTimestamps{$timestamp} = 1; | ||
949 : | my $annotationID = "$peg:$timestamp"; | ||
950 : | # Insure the user exists. | ||
951 : | if (! $users{$user}) { | ||
952 : | $loadSproutUser->Put($user, "SEED user"); | ||
953 : | $loadUserAccess->Put($user, 1); | ||
954 : | $users{$user} = 1; | ||
955 : | } | ||
956 : | # Generate the annotation. | ||
957 : | $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text"); | ||
958 : | $loadIsTargetOfAnnotation->Put($peg, $annotationID); | ||
959 : | $loadMadeAnnotation->Put($user, $annotationID); | ||
960 : | } else { | ||
961 : | # Here we have an invalid time stamp. | ||
962 : | Trace("Invalid time stamp \"$timestamp\" in annotations for $peg.") if T(1); | ||
963 : | } | ||
964 : | } | ||
965 : | } | ||
966 : | } | ||
967 : | } | ||
968 : | # Finish the load. | ||
969 : | my $retVal = $self->_FinishAll(); | ||
970 : | return $retVal; | ||
971 : | } | ||
972 : | |||
973 : | parrello | 1.5 | =head3 LoadSourceData |
974 : | |||
975 : | C<< my $stats = $spl->LoadSourceData(); >> | ||
976 : | |||
977 : | Load the source data from FIG into Sprout. | ||
978 : | |||
979 : | Source data links genomes to information about the organizations that | ||
980 : | mapped it. | ||
981 : | |||
982 : | The following relations are loaded by this method. | ||
983 : | |||
984 : | ComesFrom | ||
985 : | Source | ||
986 : | SourceURL | ||
987 : | |||
988 : | There is no direct support for source attribution in FIG, so we access the SEED | ||
989 : | files directly. | ||
990 : | |||
991 : | =over 4 | ||
992 : | |||
993 : | =item RETURNS | ||
994 : | |||
995 : | Returns a statistics object for the loads. | ||
996 : | |||
997 : | =back | ||
998 : | |||
999 : | =cut | ||
1000 : | #: Return Type $%; | ||
1001 : | sub LoadSourceData { | ||
1002 : | # Get this object instance. | ||
1003 : | my ($self) = @_; | ||
1004 : | # Get the FIG object. | ||
1005 : | my $fig = $self->{fig}; | ||
1006 : | # Get the genome hash. | ||
1007 : | my $genomeHash = $self->{genomes}; | ||
1008 : | my $genomeCount = (keys %{$genomeHash}); | ||
1009 : | # Create load objects for each of the tables we're loading. | ||
1010 : | my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4); | ||
1011 : | my $loadSource = $self->_TableLoader('Source', $genomeCount * 4); | ||
1012 : | my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8); | ||
1013 : | Trace("Beginning source data load.") if T(2); | ||
1014 : | # Create hashes to collect the Source information. | ||
1015 : | my %sourceURL = (); | ||
1016 : | my %sourceDesc = (); | ||
1017 : | # Loop through the genomes. | ||
1018 : | my $line; | ||
1019 : | parrello | 1.6 | for my $genomeID (sort keys %{$genomeHash}) { |
1020 : | parrello | 1.5 | Trace("Processing $genomeID.") if T(3); |
1021 : | # Open the project file. | ||
1022 : | if ((open(TMP, "<$FIG_Config::organisms/$genomeID/PROJECT")) && | ||
1023 : | defined($line = <TMP>)) { | ||
1024 : | chomp $line; | ||
1025 : | parrello | 1.6 | my($sourceID, $desc, $url) = split(/\t/,$line); |
1026 : | parrello | 1.5 | $loadComesFrom->Put($genomeID, $sourceID); |
1027 : | if ($url && ! exists $sourceURL{$genomeID}) { | ||
1028 : | $loadSourceURL->Put($sourceID, $url); | ||
1029 : | $sourceURL{$sourceID} = 1; | ||
1030 : | } | ||
1031 : | if ($desc && ! exists $sourceDesc{$sourceID}) { | ||
1032 : | $loadSource->Put($sourceID, $desc); | ||
1033 : | $sourceDesc{$sourceID} = 1; | ||
1034 : | } | ||
1035 : | } | ||
1036 : | close TMP; | ||
1037 : | } | ||
1038 : | # Finish the load. | ||
1039 : | my $retVal = $self->_FinishAll(); | ||
1040 : | return $retVal; | ||
1041 : | } | ||
1042 : | |||
1043 : | parrello | 1.6 | =head3 LoadExternalData |
1044 : | |||
1045 : | C<< my $stats = $spl->LoadExternalData(); >> | ||
1046 : | |||
1047 : | Load the external data from FIG into Sprout. | ||
1048 : | |||
1049 : | External data contains information about external feature IDs. | ||
1050 : | |||
1051 : | The following relations are loaded by this method. | ||
1052 : | |||
1053 : | ExternalAliasFunc | ||
1054 : | ExternalAliasOrg | ||
1055 : | |||
1056 : | The support for external IDs in FIG is hidden beneath layers of other data, so | ||
1057 : | we access the SEED files directly to create these tables. This is also one of | ||
1058 : | the few load methods that does not proceed genome by genome. | ||
1059 : | |||
1060 : | =over 4 | ||
1061 : | |||
1062 : | =item RETURNS | ||
1063 : | |||
1064 : | Returns a statistics object for the loads. | ||
1065 : | |||
1066 : | =back | ||
1067 : | |||
1068 : | =cut | ||
1069 : | #: Return Type $%; | ||
1070 : | sub LoadExternalData { | ||
1071 : | # Get this object instance. | ||
1072 : | my ($self) = @_; | ||
1073 : | # Get the FIG object. | ||
1074 : | my $fig = $self->{fig}; | ||
1075 : | # Get the genome hash. | ||
1076 : | my $genomeHash = $self->{genomes}; | ||
1077 : | my $genomeCount = (keys %{$genomeHash}); | ||
1078 : | # Convert the genome hash. We'll get the genus and species for each genome and make | ||
1079 : | # it the key. | ||
1080 : | my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash}); | ||
1081 : | # Create load objects for each of the tables we're loading. | ||
1082 : | my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000); | ||
1083 : | my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000); | ||
1084 : | Trace("Beginning external data load.") if T(2); | ||
1085 : | # We loop through the files one at a time. First, the organism file. | ||
1086 : | Open(\*ORGS, "<$FIG_Config::global/ext_org.table"); | ||
1087 : | my $orgLine; | ||
1088 : | while (defined($orgLine = <ORGS>)) { | ||
1089 : | # Clean the input line. | ||
1090 : | chomp $orgLine; | ||
1091 : | # Parse the organism name. | ||
1092 : | my ($protID, $name) = split /\s*\t\s*/, $orgLine; | ||
1093 : | $loadExternalAliasOrg->Put($protID, $name); | ||
1094 : | } | ||
1095 : | close ORGS; | ||
1096 : | # Now the function file. | ||
1097 : | my $funcLine; | ||
1098 : | Open(\*FUNCS, "<$FIG_Config::global/ext_func.table"); | ||
1099 : | while (defined($funcLine = <FUNCS>)) { | ||
1100 : | # Clean the line ending. | ||
1101 : | chomp $funcLine; | ||
1102 : | # Only proceed if the line is non-blank. | ||
1103 : | if ($funcLine) { | ||
1104 : | # Split it into fields. | ||
1105 : | my @funcFields = split /\s*\t\s*/, $funcLine; | ||
1106 : | # If there's an EC number, append it to the description. | ||
1107 : | if ($#funcFields >= 2 && $funcFields[2] =~ /^(EC .*\S)/) { | ||
1108 : | $funcFields[1] .= " $1"; | ||
1109 : | } | ||
1110 : | # Output the function line. | ||
1111 : | $loadExternalAliasFunc->Put(@funcFields[0,1]); | ||
1112 : | } | ||
1113 : | } | ||
1114 : | # Finish the load. | ||
1115 : | my $retVal = $self->_FinishAll(); | ||
1116 : | return $retVal; | ||
1117 : | } | ||
1118 : | parrello | 1.5 | |
1119 : | =head3 LoadGroupData | ||
1120 : | |||
1121 : | C<< my $stats = $spl->LoadGroupData(); >> | ||
1122 : | |||
1123 : | Load the genome Groups into Sprout. | ||
1124 : | |||
1125 : | The following relations are loaded by this method. | ||
1126 : | |||
1127 : | GenomeGroups | ||
1128 : | |||
1129 : | There is no direct support for genome groups in FIG, so we access the SEED | ||
1130 : | files directly. | ||
1131 : | |||
1132 : | =over 4 | ||
1133 : | |||
1134 : | =item RETURNS | ||
1135 : | |||
1136 : | Returns a statistics object for the loads. | ||
1137 : | |||
1138 : | =back | ||
1139 : | |||
1140 : | =cut | ||
1141 : | #: Return Type $%; | ||
1142 : | sub LoadGroupData { | ||
1143 : | # Get this object instance. | ||
1144 : | my ($self) = @_; | ||
1145 : | # Get the FIG object. | ||
1146 : | my $fig = $self->{fig}; | ||
1147 : | # Get the genome hash. | ||
1148 : | my $genomeHash = $self->{genomes}; | ||
1149 : | my $genomeCount = (keys %{$genomeHash}); | ||
1150 : | # Create a load object for the table we're loading. | ||
1151 : | my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4); | ||
1152 : | Trace("Beginning group data load.") if T(2); | ||
1153 : | # Loop through the genomes. | ||
1154 : | my $line; | ||
1155 : | parrello | 1.6 | for my $genomeID (keys %{$genomeHash}) { |
1156 : | parrello | 1.5 | Trace("Processing $genomeID.") if T(3); |
1157 : | # Open the NMPDR group file for this genome. | ||
1158 : | if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") && | ||
1159 : | defined($line = <TMP>)) { | ||
1160 : | # Clean the line ending. | ||
1161 : | parrello | 1.6 | chomp $line; |
1162 : | parrello | 1.5 | # Add the group to the table. Note that there can only be one group |
1163 : | # per genome. | ||
1164 : | $loadGenomeGroups->Put($genomeID, $line); | ||
1165 : | } | ||
1166 : | close TMP; | ||
1167 : | } | ||
1168 : | # Finish the load. | ||
1169 : | my $retVal = $self->_FinishAll(); | ||
1170 : | return $retVal; | ||
1171 : | } | ||
1172 : | |||
1173 : | parrello | 1.1 | =head2 Internal Utility Methods |
1174 : | |||
1175 : | =head3 TableLoader | ||
1176 : | |||
1177 : | Create an ERDBLoad object for the specified table. The object is also added to | ||
1178 : | the internal list in the C<loaders> property of this object. That enables the | ||
1179 : | L</FinishAll> method to terminate all the active loads. | ||
1180 : | |||
1181 : | This is an instance method. | ||
1182 : | |||
1183 : | =over 4 | ||
1184 : | |||
1185 : | =item tableName | ||
1186 : | |||
1187 : | Name of the table (relation) being loaded. | ||
1188 : | |||
1189 : | =item rowCount (optional) | ||
1190 : | |||
1191 : | Estimated maximum number of rows in the table. | ||
1192 : | |||
1193 : | =item RETURN | ||
1194 : | |||
1195 : | Returns an ERDBLoad object for loading the specified table. | ||
1196 : | |||
1197 : | =back | ||
1198 : | |||
1199 : | =cut | ||
1200 : | |||
1201 : | sub _TableLoader { | ||
1202 : | # Get the parameters. | ||
1203 : | my ($self, $tableName, $rowCount) = @_; | ||
1204 : | # Create the load object. | ||
1205 : | my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount); | ||
1206 : | # Cache it in the loader list. | ||
1207 : | push @{$self->{loaders}}, $retVal; | ||
1208 : | # Return it to the caller. | ||
1209 : | return $retVal; | ||
1210 : | } | ||
1211 : | |||
1212 : | =head3 FinishAll | ||
1213 : | |||
1214 : | Finish all the active loads on this object. | ||
1215 : | |||
1216 : | When a load is started by L</TableLoader>, the controlling B<ERDBLoad> object is cached in | ||
1217 : | the list pointed to be the C<loaders> property of this object. This method pops the loaders | ||
1218 : | off the list and finishes them to flush out any accumulated residue. | ||
1219 : | |||
1220 : | This is an instance method. | ||
1221 : | |||
1222 : | =over 4 | ||
1223 : | |||
1224 : | =item RETURN | ||
1225 : | |||
1226 : | Returns a statistics object containing the accumulated statistics for the load. | ||
1227 : | |||
1228 : | =back | ||
1229 : | |||
1230 : | =cut | ||
1231 : | |||
1232 : | sub _FinishAll { | ||
1233 : | # Get this object instance. | ||
1234 : | my ($self) = @_; | ||
1235 : | # Create the statistics object. | ||
1236 : | my $retVal = Stats->new(); | ||
1237 : | # Get the loader list. | ||
1238 : | my $loadList = $self->{loaders}; | ||
1239 : | # Loop through the list, finishing the loads. Note that if the finish fails, we die | ||
1240 : | # ignominiously. At some future point, we want to make the loads restartable. | ||
1241 : | while (my $loader = pop @{$loadList}) { | ||
1242 : | my $stats = $loader->Finish(); | ||
1243 : | $retVal->Accumulate($stats); | ||
1244 : | my $relName = $loader->RelName; | ||
1245 : | Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); | ||
1246 : | } | ||
1247 : | # Return the load statistics. | ||
1248 : | return $retVal; | ||
1249 : | } | ||
1250 : | |||
1251 : | 1; |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |