[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Annotation of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SproutLoad;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use PageBuilder;
8 :     use ERDBLoad;
9 :     use FIG;
10 :     use Sprout;
11 :     use Stats;
12 :     use BasicLocation;
13 :    
14 :     =head1 Sprout Load Methods
15 :    
16 :     =head2 Introduction
17 :    
18 :     This object contains the methods needed to copy data from the FIG data store to the
19 :     Sprout database. It makes heavy use of the ERDBLoad object to manage the load into
20 :     individual tables. The client can create an instance of this object and then
21 :     call methods for each group of tables to load. For example, the following code will
22 :     load the Genome- and Feature-related tables. (It is presumed the first command line
23 :     parameter contains the name of a file specifying the genomes.)
24 :    
25 :     my $fig = FIG->new();
26 :     my $sprout = SFXlate->new_sprout_only();
27 :     my $spl = SproutLoad->new($sprout, $fig, $ARGV[0]);
28 :     my $stats = $spl->LoadGenomeData();
29 :     $stats->Accumulate($spl->LoadFeatureData());
30 :     print $stats->Show();
31 :    
32 :     This module makes use of the internal Sprout property C<_erdb>.
33 :    
34 :     It is worth noting that the FIG object does not need to be a real one. Any object
35 :     that implements the FIG methods for data retrieval could be used. So, for example,
36 :     this object could be used to copy data from one Sprout database to another, or
37 :     from any FIG-compliant data store implemented in the future.
38 :    
39 :     To ensure that this is possible, each time the FIG object is used, it will be via
40 :     a variable called C<$fig>. This makes it fairly straightforward to determine which
41 :     FIG methods are required to load the Sprout database.
42 :    
43 : parrello 1.5 This object creates the load files; however, the tables are not created until it
44 :     is time to actually do the load from the files into the target database.
45 :    
46 : parrello 1.1 =cut
47 :    
48 :     #: Constructor SproutLoad->new();
49 :    
50 :     =head2 Public Methods
51 :    
52 :     =head3 new
53 :    
54 : parrello 1.8 C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>
55 : parrello 1.1
56 :     Construct a new Sprout Loader object, specifying the two participating databases and
57 :     the name of the files containing the list of genomes and subsystems to use.
58 :    
59 :     =over 4
60 :    
61 :     =item sprout
62 :    
63 :     Sprout object representing the target database. This also specifies the directory to
64 :     be used for creating the load files.
65 :    
66 :     =item fig
67 :    
68 :     FIG object representing the source data store from which the data is to be taken.
69 :    
70 :     =item genomeFile
71 :    
72 :     Either the name of the file containing the list of genomes to load or a reference to
73 :     a hash of genome IDs to access codes. If nothing is specified, all complete genomes
74 :     will be loaded and the access code will default to 1. The genome list is presumed
75 :     to be all-inclusive. In other words, all existing data in the target database will
76 :     be deleted and replaced with the data on the specified genomes. If a file is specified,
77 :     it should contain one genome ID and access code per line, tab-separated.
78 :    
79 :     =item subsysFile
80 :    
81 :     Either the name of the file containing the list of trusted subsystems or a reference
82 :     to a list of subsystem names. If nothing is specified, all known subsystems will be
83 :     considered trusted. Only subsystem data related to the trusted subsystems is loaded.
84 :    
85 : parrello 1.8 =item options
86 :    
87 :     Reference to a hash of command-line options.
88 :    
89 : parrello 1.1 =back
90 :    
91 :     =cut
92 :    
# Construct a Sprout loader.
#
# Parameters:
#   $class      - name of the class to bless the new object into
#   $sprout     - target Sprout object; its LoadInfo() supplies the load
#                 directory and its _erdb property is stored on the loader
#   $fig        - FIG-compatible source object
#   $genomeFile - name of a tab-delimited file of "genomeID<TAB>accessCode"
#                 lines, OR a hash reference mapping genome IDs to access
#                 codes, OR undef/'' to take every complete genome with an
#                 access code of 1
#   $subsysFile - name of a file of trusted subsystem names (one per line),
#                 OR an array reference of subsystem names, OR undef/'' to
#                 trust every subsystem known to $fig
#   $options    - reference to a hash of command-line options (stored as-is)
#
# Returns the blessed loader object. Confess() is called if a parameter has
# an unsupported reference type, if the genome file yields no lines, or if
# the subsystem file does not exist.
sub new {
    # Get the parameters.
    my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
    # Load the list of genomes into a hash of genome ID => access code.
    my %genomes;
    if (! defined($genomeFile) || $genomeFile eq '') {
        # Here we want all the complete genomes and an access code of 1.
        my @genomeList = $fig->genomes(1);
        %genomes = map { $_ => 1 } @genomeList;
    } else {
        my $type = ref $genomeFile;
        Trace("Genome file parameter type is \"$type\".") if T(3);
        if ($type eq 'HASH') {
            # Here the user specified a hash of genome IDs to access codes, which is
            # exactly what we want.
            %genomes = %{$genomeFile};
        } elsif (! $type || $type eq 'SCALAR' ) {
            # The caller specified a file, so read the genomes from the file. (Note
            # that some PERLs return an empty string rather than SCALAR.)
            my @genomeList = Tracer::GetFile($genomeFile);
            if (! @genomeList) {
                # It's an error if the genome file is empty or not found.
                Confess("No genomes found in file \"$genomeFile\".");
            } else {
                # We build the genome hash using a loop rather than "map" so that
                # an omitted access code can be defaulted to 1.
                for my $genomeLine (@genomeList) {
                    my ($genomeID, $accessCode) = split("\t", $genomeLine);
                    # Test for a missing code with "defined". The "undef" operator
                    # would erase a code that WAS supplied and then yield false,
                    # leaving every genome with an undefined access code.
                    if (! defined($accessCode)) {
                        $accessCode = 1;
                    }
                    $genomes{$genomeID} = $accessCode;
                }
            }
        } else {
            Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
        }
    }
    # Load the list of trusted subsystems.
    my %subsystems = ();
    if (! defined $subsysFile || $subsysFile eq '') {
        # Here we want all the subsystems.
        %subsystems = map { $_ => 1 } $fig->all_subsystems();
    } else {
        my $type = ref $subsysFile;
        if ($type eq 'ARRAY') {
            # Here the user passed in a list of subsystems.
            %subsystems = map { $_ => 1 } @{$subsysFile};
        } elsif (! $type || $type eq 'SCALAR') {
            # Here the list of subsystems is in a file.
            if (! -e $subsysFile) {
                # It's an error if the file does not exist.
                Confess("Trusted subsystem file not found.");
            } else {
                # GetFile automatically chomps end-of-line characters, so this
                # is an easy task.
                %subsystems = map { $_ => 1 } Tracer::GetFile($subsysFile);
            }
        } else {
            Confess("Invalid subsystem parameter in SproutLoad constructor.");
        }
    }
    # Get the data directory from the Sprout object.
    my ($directory) = $sprout->LoadInfo();
    # Create the Sprout load object.
    my $retVal = {
        fig => $fig,
        genomes => \%genomes,
        subsystems => \%subsystems,
        sprout => $sprout,
        loadDirectory => $directory,
        erdb => $sprout->{_erdb},
        loaders => [],
        options => $options
    };
    # Bless and return it.
    bless $retVal, $class;
    return $retVal;
}
172 :    
173 :     =head3 LoadGenomeData
174 :    
175 :     C<< my $stats = $spl->LoadGenomeData(); >>
176 :    
177 :     Load the Genome, Contig, and Sequence data from FIG into Sprout.
178 :    
179 :     The Sequence table is the largest single relation in the Sprout database, so this
180 :     method is expected to be slow and clumsy. At some point we will need to make it
181 :     restartable, since an error 10 gigabytes through a 20-gigabyte load is bound to be
182 :     very annoying otherwise.
183 :    
184 :     The following relations are loaded by this method.
185 :    
186 :     Genome
187 :     HasContig
188 :     Contig
189 :     IsMadeUpOf
190 :     Sequence
191 :    
192 :     =over 4
193 :    
194 :     =item RETURNS
195 :    
196 :     Returns a statistics object for the loads.
197 :    
198 :     =back
199 :    
200 :     B<TO DO>
201 :    
202 :     Real quality vectors instead of C<unknown> for everything.
203 :    
204 :     GenomeGroup relation. (The original script took group information from the C<NMPDR> file
205 :     in each genome's main directory, but no such file exists anywhere in my version of the
206 :     data store.)
207 :    
208 :     =cut
209 :     #: Return Type $%;
# Generate the load data for the Genome, HasContig, Contig, IsMadeUpOf, and
# Sequence relations.
#
# For every genome in the loader's genome hash (in sorted ID order) this
# writes one Genome record, then one Contig/HasContig pair per contig, then
# one IsMadeUpOf/Sequence pair per chunk of the contig's DNA. Chunks are at
# most MaxSequence() positions long and are addressed with 1-based start
# positions.
#
# Returns whatever _FinishAll() returns (documented above as a statistics
# object for the loads).
sub LoadGenomeData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object. All source-data access goes through this variable.
    my $fig = $self->{fig};
    # Get the genome count.
    my $genomeHash = $self->{genomes};
    my $genomeCount = (keys %{$genomeHash});
    Trace("Beginning genome data load.") if T(2);
    # Create load objects for each of the tables we're loading. The second
    # argument is a size estimate for the table.
    my $loadGenome = $self->_TableLoader('Genome', $genomeCount);
    my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);
    my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);
    my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);
    my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);
    # The maximum sequence size is a property of the Sprout object. It is
    # assumed to be constant for the duration of the load, so we ask for it
    # once instead of once per contig.
    my $chunkSize = $self->{sprout}->MaxSequence();
    # Now we loop through the genomes, generating the data for each one.
    for my $genomeID (sort keys %{$genomeHash}) {
        Trace("Loading data for genome $genomeID.") if T(3);
        $loadGenome->Add("genomeIn");
        # The access code comes in via the genome hash.
        my $accessCode = $genomeHash->{$genomeID};
        # Get the genus, species, and strain from the scientific name. Note that we append
        # the genome ID to the strain. In some cases this is the totality of the strain name.
        my ($genus, $species, @extraData) = split / /, $fig->genus_species($genomeID);
        my $extra = join " ", @extraData, "[$genomeID]";
        # Get the full taxonomy.
        my $taxonomy = $fig->taxonomy_of($genomeID);
        # Output the genome record.
        $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,
                         $species, $extra, $taxonomy);
        # Now we loop through each of the genome's contigs.
        my @contigs = $fig->all_contigs($genomeID);
        for my $contigID (@contigs) {
            Trace("Processing contig $contigID for $genomeID.") if T(4);
            $loadContig->Add("contigIn");
            $loadSequence->Add("contigIn");
            # Create the contig ID. Sprout contig IDs are qualified by the genome ID.
            my $sproutContigID = "$genomeID:$contigID";
            # Create the contig record and relate it to the genome.
            $loadContig->Put($sproutContigID);
            $loadHasContig->Put($genomeID, $sproutContigID);
            # Now we get the sequence a chunk at a time.
            my $contigLen = $fig->contig_ln($genomeID, $contigID);
            for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {
                $loadSequence->Add("chunkIn");
                # Compute the endpoint of this chunk. The last chunk may be short.
                my $end = FIG::min($i + $chunkSize - 1, $contigLen);
                # Get the actual DNA.
                my $dna = $fig->get_dna($genomeID, $contigID, $i, $end);
                # Compute the sequenceID from the contig ID and the start position.
                my $seqID = "$sproutContigID.$i";
                # Write out the data. For now, the quality vector is always "unknown".
                $loadIsMadeUpOf->Put($sproutContigID, $seqID, $end + 1 - $i, $i);
                $loadSequence->Put($seqID, "unknown", $dna);
            }
        }
    }
    # Finish the loads.
    my $retVal = $self->_FinishAll();
    # Return the result.
    return $retVal;
}
275 :    
276 :     =head3 LoadCouplingData
277 :    
278 :     C<< my $stats = $spl->LoadCouplingData(); >>
279 :    
280 :     Load the coupling and evidence data from FIG into Sprout.
281 :    
282 :     The coupling data specifies which genome features are functionally coupled. The
283 :     evidence data explains why the coupling is functional.
284 :    
285 :     The following relations are loaded by this method.
286 :    
287 :     Coupling
288 :     IsEvidencedBy
289 :     PCH
290 :     ParticipatesInCoupling
291 :     UsesAsEvidence
292 :    
293 :     =over 4
294 :    
295 :     =item RETURNS
296 :    
297 :     Returns a statistics object for the loads.
298 :    
299 :     =back
300 :    
301 :     =cut
302 :     #: Return Type $%;
# Generate the load data for the Coupling, IsEvidencedBy, PCH,
# ParticipatesInCoupling, and UsesAsEvidence relations.
#
# For each genome in the loader's genome hash (in sorted ID order), every PEG
# is asked for its coupled PEGs. Each distinct coupling produces one Coupling
# record, two ParticipatesInCoupling records (positions 1 and 2), and one
# PCH / IsEvidencedBy / UsesAsEvidence group per piece of evidence whose
# first feature belongs to a genome in the genome hash.
#
# Returns whatever _FinishAll() returns (documented above as a statistics
# object for the loads).
sub LoadCouplingData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeFilter = $self->{genomes};
    my $genomeCount = (keys %{$genomeFilter});
    my $featureCount = $genomeCount * 4000;
    # Start the loads. The second argument is a size estimate for the table.
    my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);
    my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);
    my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);
    my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);
    my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);
    Trace("Beginning coupling data load.") if T(2);
    # Loop through the genomes found.
    for my $genome (sort keys %{$genomeFilter}) {
        Trace("Generating coupling data for $genome.") if T(3);
        $loadCoupling->Add("genomeIn");
        # Create a hash table for holding coupled pairs. We use this to prevent
        # duplicates. For example, if A is coupled to B, we don't want to also
        # assert that B is coupled to A, because we already know it. Fortunately,
        # all couplings occur within a genome, so we can keep the hash table
        # size reasonably small.
        my %dupHash = ();
        # Get all of the genome's PEGs.
        my @pegs = $fig->pegs_of($genome);
        # Loop through the PEGs.
        for my $peg1 (@pegs) {
            $loadCoupling->Add("pegIn");
            Trace("Processing PEG $peg1 for $genome.") if T(4);
            # Get a list of the coupled PEGs. Each entry is a [peg, score] pair.
            my @couplings = $fig->coupled_to($peg1);
            # For each coupled PEG, we need to verify that a coupling already
            # exists. If not, we have to create one.
            for my $coupleData (@couplings) {
                my ($peg2, $score) = @{$coupleData};
                # Compute the coupling ID. NOTE(review): the duplicate check
                # relies on CouplingID giving the same ID for (A,B) and (B,A);
                # confirm against Sprout::CouplingID.
                my $coupleID = Sprout::CouplingID($peg1, $peg2);
                if (! exists $dupHash{$coupleID}) {
                    $loadCoupling->Add("couplingIn");
                    # Here we have a new coupling to store in the load files.
                    Trace("Storing coupling ($coupleID) with score $score.") if T(4);
                    # Ensure we don't do this again.
                    $dupHash{$coupleID} = $score;
                    # Write the coupling record.
                    $loadCoupling->Put($coupleID, $score);
                    # Connect it to the coupled PEGs.
                    $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);
                    $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);
                    # Get the evidence for this coupling.
                    my @evidence = $fig->coupling_evidence($peg1, $peg2);
                    # Organize the evidence into a hash table.
                    my %evidenceMap = ();
                    # Process each evidence item.
                    for my $evidenceData (@evidence) {
                        $loadPCH->Add("evidenceIn");
                        my ($peg3, $peg4, $usage) = @{$evidenceData};
                        # Only proceed if the evidence is from a Sprout
                        # genome. Membership is decided by the presence of
                        # the genome's key, not by the truth of its access
                        # code, so a genome whose access code is 0 (or
                        # undefined) still counts as one of ours.
                        if (exists $genomeFilter->{$fig->genome_of($peg3)}) {
                            $loadUsesAsEvidence->Add("evidenceChosen");
                            my $evidenceKey = "$coupleID $peg3 $peg4";
                            # We store this evidence in the hash if the usage
                            # is nonzero or no prior evidence has been found. This
                            # ensures that if there is duplicate evidence, we
                            # at least keep the meaningful ones. Only evidence in
                            # the hash makes it to the output.
                            if ($usage || ! exists $evidenceMap{$evidenceKey}) {
                                $evidenceMap{$evidenceKey} = $evidenceData;
                            }
                        }
                    }
                    for my $evidenceID (keys %evidenceMap) {
                        # Create the evidence record.
                        my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};
                        $loadPCH->Put($evidenceID, $usage);
                        # Connect it to the coupling.
                        $loadIsEvidencedBy->Put($coupleID, $evidenceID);
                        # Connect it to the features. The last field is the
                        # feature's position in the evidence pair, numbered
                        # the same way as ParticipatesInCoupling above.
                        $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);
                        $loadUsesAsEvidence->Put($evidenceID, $peg4, 2);
                    }
                }
            }
        }
    }
    # All done. Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
395 :    
396 :     =head3 LoadFeatureData
397 :    
398 :     C<< my $stats = $spl->LoadFeatureData(); >>
399 :    
400 :     Load the feature data from FIG into Sprout.
401 :    
402 :     Features represent annotated genes, and are therefore the heart of the data store.
403 :    
404 :     The following relations are loaded by this method.
405 :    
406 :     Feature
407 :     FeatureAlias
408 :     FeatureLink
409 :     FeatureTranslation
410 :     FeatureUpstream
411 :     IsLocatedIn
412 :    
413 :     =over 4
414 :    
415 :     =item RETURNS
416 :    
417 :     Returns a statistics object for the loads.
418 :    
419 :     =back
420 :    
421 :     =cut
422 :     #: Return Type $%;
# Generate the load data for the Feature, FeatureAlias, and IsLocatedIn
# relations and, unless the "limitedFeatures" option is set, for the
# FeatureLink, FeatureTranslation, and FeatureUpstream relations as well.
#
# Every feature of every genome in the loader's genome hash is written out.
# Each location of a feature is peeled into pieces no longer than the Sprout
# object's MaxSegment() value, and every piece becomes one IsLocatedIn record
# carrying a running 1-based index across the whole feature.
#
# Returns whatever _FinishAll() returns (documented above as a statistics
# object for the loads).
sub LoadFeatureData {
    my ($self) = @_;
    # All source data is read through the FIG object.
    my $fig = $self->{fig};
    # A limited run skips the link, translation, and upstream tables.
    my $limitedRun = $self->{options}->{limitedFeatures};
    # The genome table drives the outer loop and the size estimates.
    my $genomeTable = $self->{genomes};
    my $genomeCount = (keys %{$genomeTable});
    my $estimate = $genomeCount * 4000;
    # Open a loader for each table. The optional ones stay undefined on a
    # limited run.
    my $loadFeature = $self->_TableLoader('Feature', $estimate);
    my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $estimate);
    my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $estimate * 6);
    my ($loadFeatureLink, $loadFeatureTranslation, $loadFeatureUpstream);
    unless ($limitedRun) {
        $loadFeatureLink = $self->_TableLoader('FeatureLink', $estimate * 10);
        $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $estimate);
        $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $estimate);
    }
    # Locations longer than this are cut into several pieces.
    my $segmentSize = $self->{sprout}->MaxSegment();
    Trace("Beginning feature data load.") if T(2);
    foreach my $genomeID (sort keys %{$genomeTable}) {
        Trace("Loading features for genome $genomeID.") if T(3);
        $loadFeature->Add("genomeIn");
        # Each entry of the feature list is a tuple; slots 0, 1, and 3 hold
        # the feature ID, the comma-separated location string, and the type.
        my $featureList = $fig->all_features_detailed($genomeID);
        foreach my $tuple (@{$featureList}) {
            $loadFeature->Add("featureIn");
            my ($featureID, $locations, $type) = @{$tuple}[0, 1, 3];
            # The feature itself.
            $loadFeature->Put($featureID, 1, $type);
            # One alias record per alias.
            foreach my $alias ($fig->feature_aliases($featureID)) {
                $loadFeatureAlias->Put($featureID, $alias);
            }
            # Everything in this section belongs to a full load only.
            unless ($limitedRun) {
                # One link record per link.
                foreach my $link ($fig->fid_links($featureID)) {
                    $loadFeatureLink->Put($featureID, $link);
                }
                # Only pegs have a translation and an upstream region.
                if ($type eq 'peg') {
                    $loadFeatureTranslation->Add("pegIn");
                    my $translation = $fig->get_translation($featureID);
                    $loadFeatureTranslation->Put($featureID, $translation) if $translation;
                    # The upstream request uses the default values u=200 and c=100.
                    my $upstream = $fig->upstream_of($featureID, 200, 100);
                    $loadFeatureUpstream->Put($featureID, $upstream) if $upstream;
                }
            }
            # Finally the locations. No stored piece may exceed the maximum
            # segment size, which keeps genes_in_region processing simple for
            # Sprout. The position counter runs across all of the feature's
            # locations, not per location.
            my $position = 1;
            foreach my $location (split /\s*,\s*/, $locations) {
                my $locObject = BasicLocation->new("$genomeID:$location");
                # Peel pieces off until Peel returns false, then keep what
                # remains of the location object as the last piece.
                my @pieces;
                while (my $peeling = $locObject->Peel($segmentSize)) {
                    $loadIsLocatedIn->Add("peeling");
                    push @pieces, $peeling;
                }
                push @pieces, $locObject;
                # Write one IsLocatedIn record per piece.
                foreach my $piece (@pieces) {
                    $loadIsLocatedIn->Put($featureID, $piece->Contig(), $piece->Left(),
                                          $piece->Dir(), $piece->Length(), $position);
                    $position++;
                }
            }
        }
    }
    # Close out every loader and hand back the combined result.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
518 :    
519 :     =head3 LoadBBHData
520 :    
521 :     C<< my $stats = $spl->LoadBBHData(); >>
522 :    
523 :     Load the bidirectional best hit data from FIG into Sprout.
524 :    
525 :     Sprout does not store information on similarities. Instead, it has only the
526 :     bi-directional best hits. Even so, the BBH table is one of the largest in
527 :     the database.
528 :    
529 :     The following relations are loaded by this method.
530 :    
531 :     IsBidirectionalBestHitOf
532 :    
533 :     =over 4
534 :    
535 :     =item RETURNS
536 :    
537 :     Returns a statistics object for the loads.
538 :    
539 :     =back
540 :    
541 :     =cut
542 :     #: Return Type $%;
# Generate the load data for the IsBidirectionalBestHitOf relation.
#
# For every feature of every genome in the loader's genome hash (in sorted
# ID order), the feature's bidirectional best hits are fetched and one record
# is written for each hit whose target feature belongs to a genome in the
# genome hash.
#
# Returns whatever _FinishAll() returns (documented above as a statistics
# object for the loads).
sub LoadBBHData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the table of genome IDs.
    my $genomeHash = $self->{genomes};
    my $genomeCount = (keys %{$genomeHash});
    my $featureCount = $genomeCount * 4000;
    # Create load objects for each of the tables we're loading.
    my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',
                                                           $featureCount * $genomeCount);
    Trace("Beginning BBH load.") if T(2);
    # Now we loop through the genomes, generating the data for each one.
    for my $genomeID (sort keys %{$genomeHash}) {
        $loadIsBidirectionalBestHitOf->Add("genomeIn");
        Trace("Processing features for genome $genomeID.") if T(3);
        # Get the feature list for this genome.
        my $features = $fig->all_features_detailed($genomeID);
        # Loop through the features.
        for my $featureData (@{$features}) {
            # Only the feature ID (first element of the tuple) is needed here.
            my ($featureID) = @{$featureData};
            # Get the bi-directional best hits. Each entry is a
            # [target feature ID, score] pair.
            my @bbhList = $fig->bbhs($featureID);
            for my $bbhEntry (@bbhList) {
                # Get the target feature ID and the score.
                my ($targetID, $score) = @{$bbhEntry};
                # Check the target feature's genome.
                my $targetGenomeID = $fig->genome_of($targetID);
                # Only proceed if it's one of our genomes. Membership is
                # decided by the presence of the genome's key, not by the
                # truth of its access code, so a genome whose access code is
                # 0 (or undefined) still counts as one of ours.
                if (exists $genomeHash->{$targetGenomeID}) {
                    $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,
                                                       $score);
                }
            }
        }
    }
    # Finish the loads.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
585 :    
586 :     =head3 LoadSubsystemData
587 :    
588 :     C<< my $stats = $spl->LoadSubsystemData(); >>
589 :    
590 :     Load the subsystem data from FIG into Sprout.
591 :    
592 :     Subsystems are groupings of genetic roles that work together to effect a specific
593 :     chemical reaction. Similar organisms require similar subsystems. To curate a subsystem,
594 :     a spreadsheet is created with genomes on one axis and subsystem roles on the other
595 :     axis. Similar features are then mapped into the cells, allowing the annotation of one
596 :     genome's roles to be used to assist in the annotation of others.
597 :    
598 :     The following relations are loaded by this method.
599 :    
600 :     Subsystem
601 :     Role
602 :     SSCell
603 :     ContainsFeature
604 :     IsGenomeOf
605 :     IsRoleOf
606 :     OccursInSubsystem
607 :     ParticipatesIn
608 :     HasSSCell
609 :    
610 :     =over 4
611 :    
612 :     =item RETURNS
613 :    
614 :     Returns a statistics object for the loads.
615 :    
616 :     =back
617 :    
618 :     B<TO DO>
619 :    
620 :     Generate RoleName table?
621 :    
622 :     =cut
623 :     #: Return Type $%;
# Generate the load data for the Subsystem, Role, SSCell, ContainsFeature,
# IsGenomeOf, IsRoleOf, OccursInSubsystem, ParticipatesIn, and HasSSCell
# relations.
#
# Each trusted subsystem (in sorted ID order) gets a Subsystem record and an
# OccursInSubsystem record per role; a Role record is written the first time
# a role is seen in any subsystem. The subsystem's spreadsheet is then walked
# genome by genome, restricted to genomes in the loader's genome hash, and
# every non-empty cell is written out together with its features.
#
# Returns whatever _FinishAll() returns (documented above as a statistics
# object for the loads).
sub LoadSubsystemData {
    my ($self) = @_;
    # All source data is read through the FIG object.
    my $fig = $self->{fig};
    # The genome hash filters the spreadsheet rows; the subsystem hash names
    # the subsystems to process.
    my $genomeHash = $self->{genomes};
    my @subsysIDs = sort keys %{$self->{subsystems}};
    my $subsysCount = @subsysIDs;
    my $genomeCount = (keys %{$genomeHash});
    my $featureCount = $genomeCount * 4000;
    # Open a loader for each table, passing a size estimate for each.
    my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);
    my $loadRole = $self->_TableLoader('Role', $featureCount * 6);
    my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);
    my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);
    my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);
    my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);
    my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);
    my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);
    my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);
    Trace("Beginning subsystem data load.") if T(2);
    # Roles can be shared between subsystems, so remember which ones already
    # have a Role record.
    my %seenRole;
    foreach my $subsysID (@subsysIDs) {
        Trace("Creating subsystem $subsysID.") if T(3);
        $loadSubsystem->Add("subsystemIn");
        $loadSubsystem->Put($subsysID);
        # Tie each role to the subsystem, creating the role on first sight.
        my @roles = $fig->subsystem_to_roles($subsysID);
        foreach my $roleID (@roles) {
            $loadOccursInSubsystem->Add("roleIn");
            $loadOccursInSubsystem->Put($roleID, $subsysID);
            next if exists $seenRole{$roleID};
            $loadRole->Put($roleID);
            $seenRole{$roleID} = 1;
        }
        # With the roles in place, build the spreadsheet by matching roles to
        # the genomes on the sheet. The genome ID is the first element of
        # each entry returned by subsystem_genomes.
        Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
        my @sheetGenomes = map { $_->[0] } @{$fig->subsystem_genomes($subsysID)};
        foreach my $genomeID (@sheetGenomes) {
            # Genomes outside our genome hash are ignored.
            next unless exists $genomeHash->{$genomeID};
            $loadParticipatesIn->Put($genomeID, $subsysID);
            # Walk the roles by column index, because the index is part of
            # the spreadsheet cell ID.
            foreach my $column (0 .. $#roles) {
                my $role = $roles[$column];
                # Fetch the features in this genome/role cell; an empty cell
                # is not written at all.
                my @pegs = $fig->pegs_in_subsystem_cell($subsysID, $genomeID, $column);
                next unless @pegs > 0;
                # Write the cell and link it to its genome, role, and subsystem.
                my $cellID = "$subsysID:$genomeID:$column";
                $loadSSCell->Put($cellID);
                $loadIsGenomeOf->Put($genomeID, $cellID);
                $loadIsRoleOf->Put($role, $cellID);
                $loadHasSSCell->Put($subsysID, $cellID);
                # Attach each of the cell's features.
                foreach my $pegID (@pegs) {
                    $loadContainsFeature->Put($cellID, $pegID);
                }
            }
        }
    }
    # Close out every loader and hand back the combined result.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
708 :    
709 :     =head3 LoadDiagramData
710 :    
711 :     C<< my $stats = $spl->LoadDiagramData(); >>
712 :    
713 :     Load the diagram data from FIG into Sprout.
714 :    
715 :     Diagrams are used to organize functional roles. The diagram shows the
716 :     connections between chemicals that interact with a subsystem.
717 :    
718 :     The following relations are loaded by this method.
719 :    
720 :     Diagram
721 :     RoleOccursIn
722 :    
723 :     =over 4
724 :    
725 :     =item RETURNS
726 :    
727 :     Returns a statistics object for the loads.
728 :    
729 :     =back
730 :    
731 :     =cut
732 :     #: Return Type $%;
# Generate the load data for the Diagram and RoleOccursIn relations.
#
# One Diagram record (map ID and descriptive name) is written for every map
# known to the FIG object, followed by one RoleOccursIn record for each
# distinct role returned by map_to_ecs for that map.
#
# Returns whatever _FinishAll() returns (documented above as a statistics
# object for the loads).
sub LoadDiagramData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the map list. It is fetched once and used both for the size
    # estimate and for the main loop.
    my @maps = $fig->all_maps;
    my $mapCount = @maps;
    my $genomeCount = (keys %{$self->{genomes}});
    my $featureCount = $genomeCount * 4000;
    # Create load objects for each of the tables we're loading.
    my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);
    my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);
    Trace("Beginning diagram data load.") if T(2);
    # Loop through the diagrams.
    for my $map (@maps) {
        Trace("Loading diagram $map.") if T(3);
        # Get the diagram's descriptive name.
        my $name = $fig->map_name($map);
        $loadDiagram->Put($map, $name);
        # Now we need to link all the map's roles to it.
        # A hash is used to prevent duplicates.
        my %roleHash = ();
        for my $role ($fig->map_to_ecs($map)) {
            if (! $roleHash{$role}) {
                $loadRoleOccursIn->Put($role, $map);
                $roleHash{$role} = 1;
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
767 :    
768 :     =head3 LoadPropertyData
769 :    
770 :     C<< my $stats = $spl->LoadPropertyData(); >>
771 :    
772 :     Load the attribute data from FIG into Sprout.
773 :    
774 :     Attribute data in FIG corresponds to the Sprout concept of Property. As currently
775 :     implemented, each key-value attribute combination in the SEED corresponds to a
776 :     record in the B<Property> table. The B<HasProperty> relationship links the
777 :     features to the properties.
778 :    
779 :     The SEED also allows attributes to be assigned to genomes, but this is not yet
780 :     supported by Sprout.
781 :    
782 :     The following relations are loaded by this method.
783 :    
784 :     HasProperty
785 :     Property
786 :    
787 :     =over 4
788 :    
789 :     =item RETURNS
790 :    
791 :     Returns a statistics object for the loads.
792 :    
793 :     =back
794 :    
795 :     =cut
796 :     #: Return Type $%;
sub LoadPropertyData {
    # Load the Property and HasProperty relations from the feature attributes
    # supplied by the FIG object. Returns the result of _FinishAll.
    my ($self) = @_;
    my $fig = $self->{fig};
    my $genomeHash = $self->{genomes};
    my $genomeCount = scalar keys %{$genomeHash};
    # Both tables are sized with the same per-genome estimate.
    my $propertyLoader = $self->_TableLoader('Property', $genomeCount * 1500);
    my $hasPropertyLoader = $self->_TableLoader('HasProperty', $genomeCount * 1500);
    Trace("Beginning property data load.") if T(2);
    # Maps each "key<TAB>value" combination to the property ID assigned to it.
    # IDs are handed out sequentially starting at 1.
    my %idForPair;
    my $lastID = 0;
    for my $genomeID (keys %{$genomeHash}) {
        $propertyLoader->Add("genomeIn");
        # The feature ID is the first field of every tuple returned by
        # "all_features_detailed".
        my $featureTuples = $fig->all_features_detailed($genomeID);
        my @featureIDs = map { $_->[0] } @{$featureTuples};
        for my $fid (@featureIDs) {
            $propertyLoader->Add("featureIn");
            # Attributes are requested for one feature at a time.
            my @attributes = $fig->get_attributes($fid, '', '', '');
            for my $attribute (@attributes) {
                # The first field of the tuple is discarded; the remaining
                # three are the key, the value, and the URL.
                my (undef, $key, $value, $url) = @{$attribute};
                my $pair = "$key\t$value";
                # A pair seen for the first time gets the next ID and a
                # Property record; later sightings reuse the stored ID.
                if (! exists $idForPair{$pair}) {
                    $idForPair{$pair} = ++$lastID;
                    $propertyLoader->Put($idForPair{$pair}, $key, $value);
                }
                # Every feature/property association gets a HasProperty record.
                $hasPropertyLoader->Put($fid, $idForPair{$pair}, $url);
            }
        }
    }
    # Flush all the loaders and hand back the statistics.
    return $self->_FinishAll();
}
853 :    
854 :     =head3 LoadAnnotationData
855 :    
856 :     C<< my $stats = $spl->LoadAnnotationData(); >>
857 :    
858 :     Load the annotation data from FIG into Sprout.
859 :    
860 :     Sprout annotations encompass both the assignments and the annotations in SEED.
861 :     These describe the function performed by a PEG as well as any other useful
862 :     information that may aid in identifying its purpose.
863 :    
864 :     The following relations are loaded by this method.
865 :    
866 :     Annotation
867 :     IsTargetOfAnnotation
868 :     SproutUser
869 :     MadeAnnotation
870 :    
871 :     =over 4
872 :    
873 :     =item RETURNS
874 :    
875 :     Returns a statistics object for the loads.
876 :    
877 :     =back
878 :    
879 :     =cut
880 :     #: Return Type $%;
sub LoadAnnotationData {
    # Load the Annotation, IsTargetOfAnnotation, SproutUser, UserAccess, and
    # MadeAnnotation relations. Returns the result of _FinishAll.
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    my $genomeCount = (keys %{$genomeHash});
    # Create load objects for each of the tables we're loading. The second
    # parameter is the estimated row count.
    my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);
    my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);
    my $loadSproutUser = $self->_TableLoader('SproutUser', 100);
    my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);
    my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);
    Trace("Beginning annotation data load.") if T(2);
    # Create a hash of user names. We'll use this to prevent us from generating duplicate
    # user records. FIG and "master" are pre-seeded because their records are
    # written unconditionally below.
    my %users = ( FIG => 1, master => 1 );
    # Put in FIG and "master".
    $loadSproutUser->Put("FIG", "Fellowship for Interpretation of Genomes");
    $loadUserAccess->Put("FIG", 1);
    $loadSproutUser->Put("master", "Master User");
    $loadUserAccess->Put("master", 1);
    # Get the current time. It is used as the time stamp of every generated
    # assignment annotation.
    my $time = time();
    # Loop through the genomes.
    for my $genomeID (sort keys %{$genomeHash}) {
        Trace("Processing $genomeID.") if T(3);
        # Get the genome's PEGs.
        my @pegs = $fig->pegs_of($genomeID);
        for my $peg (@pegs) {
            Trace("Processing $peg.") if T(4);
            # Create a hash of timestamps. We use this to prevent duplicate time stamps
            # from showing up for a single PEG's annotations. The annotation ID is
            # built from the PEG ID and the time stamp, so a duplicate time stamp
            # would produce a duplicate ID.
            my %seenTimestamps = ();
            # Check for a functional assignment.
            my $func = $fig->function_of($peg);
            # NOTE(review): PEGs without a functional assignment are skipped
            # entirely, including their real annotations -- confirm this is intended.
            if ($func) {
                # If this is NOT a hypothetical assignment, we create an
                # assignment annotation for it. The test is applied to the
                # function text, not to the feature ID.
                if (! FIG::hypo($func)) {
                    # Note that we double the slashes so that what goes into the database is
                    # a new-line escape sequence rather than an actual new-line.
                    $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");
                    $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");
                    $loadMadeAnnotation->Put("FIG", "$peg:$time");
                    # Denote we've seen this timestamp.
                    $seenTimestamps{$time} = 1;
                }
                # Now loop through the real annotations.
                for my $tuple ($fig->feature_annotations($peg, "raw")) {
                    my ($fid, $timestamp, $user, $text) = @{$tuple};
                    # Here we fix up the annotation text. "\r" is removed,
                    # and "\t" and "\n" are escaped. Note we use the "s"
                    # modifier so that new-lines inside the text do not
                    # stop the substitution search.
                    $text =~ s/\r//gs;
                    $text =~ s/\t/\\t/gs;
                    $text =~ s/\n/\\n/gs;
                    # Change assignments by the master user to FIG assignments.
                    $text =~ s/Set master function/Set FIG function/s;
                    # Insure the time stamp is valid.
                    if ($timestamp =~ /^\d+$/) {
                        # Here it's a number. We need to insure it's unique, so we
                        # bump it until we find a value not yet used for this PEG.
                        while ($seenTimestamps{$timestamp}) {
                            $timestamp++;
                        }
                        $seenTimestamps{$timestamp} = 1;
                        my $annotationID = "$peg:$timestamp";
                        # Insure the user exists.
                        if (! $users{$user}) {
                            $loadSproutUser->Put($user, "SEED user");
                            $loadUserAccess->Put($user, 1);
                            $users{$user} = 1;
                        }
                        # Generate the annotation.
                        $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");
                        $loadIsTargetOfAnnotation->Put($peg, $annotationID);
                        $loadMadeAnnotation->Put($user, $annotationID);
                    } else {
                        # Here we have an invalid time stamp.
                        Trace("Invalid time stamp \"$timestamp\" in annotations for $peg.") if T(1);
                    }
                }
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
972 :    
973 : parrello 1.5 =head3 LoadSourceData
974 :    
975 :     C<< my $stats = $spl->LoadSourceData(); >>
976 :    
977 :     Load the source data from FIG into Sprout.
978 :    
979 :     Source data links genomes to information about the organizations that
980 :     mapped them.
981 :    
982 :     The following relations are loaded by this method.
983 :    
984 :     ComesFrom
985 :     Source
986 :     SourceURL
987 :    
988 :     There is no direct support for source attribution in FIG, so we access the SEED
989 :     files directly.
990 :    
991 :     =over 4
992 :    
993 :     =item RETURNS
994 :    
995 :     Returns a statistics object for the loads.
996 :    
997 :     =back
998 :    
999 :     =cut
1000 :     #: Return Type $%;
sub LoadSourceData {
    # Load the ComesFrom, Source, and SourceURL relations from the PROJECT
    # file of each genome. Returns the result of _FinishAll.
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    my $genomeCount = (keys %{$genomeHash});
    # Create load objects for each of the tables we're loading. The second
    # parameter is the estimated row count.
    my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);
    my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);
    my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);
    Trace("Beginning source data load.") if T(2);
    # Create hashes to collect the Source information. Both are keyed by
    # source ID and record which sources have already been written.
    my %sourceURL = ();
    my %sourceDesc = ();
    # Loop through the genomes.
    for my $genomeID (sort keys %{$genomeHash}) {
        Trace("Processing $genomeID.") if T(3);
        # Open the project file. A genome without a readable project file is
        # silently skipped.
        my $projectFile = "$FIG_Config::organisms/$genomeID/PROJECT";
        if (open(my $projectHandle, '<', $projectFile)) {
            # Only the first line of the file is used.
            my $line = <$projectHandle>;
            # The handle is closed here, so it is only closed when it was opened.
            close $projectHandle;
            if (defined $line) {
                chomp $line;
                my ($sourceID, $desc, $url) = split(/\t/, $line);
                $loadComesFrom->Put($genomeID, $sourceID);
                # The URL check must use the source ID, because that is the
                # key stored in the hash; otherwise every genome sharing a
                # source would write a duplicate SourceURL record.
                if ($url && ! exists $sourceURL{$sourceID}) {
                    $loadSourceURL->Put($sourceID, $url);
                    $sourceURL{$sourceID} = 1;
                }
                if ($desc && ! exists $sourceDesc{$sourceID}) {
                    $loadSource->Put($sourceID, $desc);
                    $sourceDesc{$sourceID} = 1;
                }
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1042 :    
1043 : parrello 1.6 =head3 LoadExternalData
1044 :    
1045 :     C<< my $stats = $spl->LoadExternalData(); >>
1046 :    
1047 :     Load the external data from FIG into Sprout.
1048 :    
1049 :     External data contains information about external feature IDs.
1050 :    
1051 :     The following relations are loaded by this method.
1052 :    
1053 :     ExternalAliasFunc
1054 :     ExternalAliasOrg
1055 :    
1056 :     The support for external IDs in FIG is hidden beneath layers of other data, so
1057 :     we access the SEED files directly to create these tables. This is also one of
1058 :     the few load methods that does not proceed genome by genome.
1059 :    
1060 :     =over 4
1061 :    
1062 :     =item RETURNS
1063 :    
1064 :     Returns a statistics object for the loads.
1065 :    
1066 :     =back
1067 :    
1068 :     =cut
1069 :     #: Return Type $%;
sub LoadExternalData {
    # Load the ExternalAliasOrg and ExternalAliasFunc relations from the
    # "ext_org.table" and "ext_func.table" files. Returns the result of
    # _FinishAll.
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    my $genomeCount = (keys %{$genomeHash});
    # Convert the genome hash. We'll get the genus and species for each genome and make
    # it the key.
    # NOTE(review): this hash is not referenced again in this method -- confirm
    # whether it is still needed.
    my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
    # Create load objects for each of the tables we're loading. The second
    # parameter is the estimated row count.
    my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);
    my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);
    Trace("Beginning external data load.") if T(2);
    # We loop through the files one at a time. First, the organism file.
    Open(\*ORGS, "<$FIG_Config::global/ext_org.table");
    my $orgLine;
    while (defined($orgLine = <ORGS>)) {
        # Clean the input line.
        chomp $orgLine;
        # Only proceed if the line is non-blank, the same rule that is applied
        # to the function file below. A blank line would otherwise produce a
        # record with no fields.
        if ($orgLine) {
            # Parse the organism name. White space around the tab is discarded.
            my ($protID, $name) = split /\s*\t\s*/, $orgLine;
            $loadExternalAliasOrg->Put($protID, $name);
        }
    }
    close ORGS;
    # Now the function file.
    my $funcLine;
    Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");
    while (defined($funcLine = <FUNCS>)) {
        # Clean the line ending.
        chomp $funcLine;
        # Only proceed if the line is non-blank.
        if ($funcLine) {
            # Split it into fields.
            my @funcFields = split /\s*\t\s*/, $funcLine;
            # If there's an EC number, append it to the description.
            if ($#funcFields >= 2 && $funcFields[2] =~ /^(EC .*\S)/) {
                $funcFields[1] .= " $1";
            }
            # Output the function line. Only the first two fields are stored.
            $loadExternalAliasFunc->Put(@funcFields[0,1]);
        }
    }
    # Release the function file, just as the organism file is released above.
    close FUNCS;
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1118 : parrello 1.5
1119 :     =head3 LoadGroupData
1120 :    
1121 :     C<< my $stats = $spl->LoadGroupData(); >>
1122 :    
1123 :     Load the genome Groups into Sprout.
1124 :    
1125 :     The following relations are loaded by this method.
1126 :    
1127 :     GenomeGroups
1128 :    
1129 :     There is no direct support for genome groups in FIG, so we access the SEED
1130 :     files directly.
1131 :    
1132 :     =over 4
1133 :    
1134 :     =item RETURNS
1135 :    
1136 :     Returns a statistics object for the loads.
1137 :    
1138 :     =back
1139 :    
1140 :     =cut
1141 :     #: Return Type $%;
sub LoadGroupData {
    # Load the GenomeGroups relation from the NMPDR file of each genome.
    # Returns the result of _FinishAll.
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    my $genomeCount = (keys %{$genomeHash});
    # Create a load object for the table we're loading. The second parameter
    # is the estimated row count.
    my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);
    Trace("Beginning group data load.") if T(2);
    # Loop through the genomes.
    for my $genomeID (keys %{$genomeHash}) {
        Trace("Processing $genomeID.") if T(3);
        # Open the NMPDR group file for this genome. A genome without a
        # readable group file is silently skipped.
        my $groupFile = "$FIG_Config::organisms/$genomeID/NMPDR";
        if (open(my $groupHandle, '<', $groupFile)) {
            # Only the first line of the file is used.
            my $line = <$groupHandle>;
            # The handle is closed here, so it is only closed when it was
            # opened; closing an unopened handle draws a warning under -w.
            close $groupHandle;
            if (defined $line) {
                # Clean the line ending.
                chomp $line;
                # Add the group to the table. Note that there can only be one group
                # per genome.
                $loadGenomeGroups->Put($genomeID, $line);
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1172 :    
1173 : parrello 1.1 =head2 Internal Utility Methods
1174 :    
1175 :     =head3 TableLoader
1176 :    
1177 :     Create an ERDBLoad object for the specified table. The object is also added to
1178 :     the internal list in the C<loaders> property of this object. That enables the
1179 :     L</FinishAll> method to terminate all the active loads.
1180 :    
1181 :     This is an instance method.
1182 :    
1183 :     =over 4
1184 :    
1185 :     =item tableName
1186 :    
1187 :     Name of the table (relation) being loaded.
1188 :    
1189 :     =item rowCount (optional)
1190 :    
1191 :     Estimated maximum number of rows in the table.
1192 :    
1193 :     =item RETURN
1194 :    
1195 :     Returns an ERDBLoad object for loading the specified table.
1196 :    
1197 :     =back
1198 :    
1199 :     =cut
1200 :    
sub _TableLoader {
    # Unpack the instance, the relation name, and the optional row estimate.
    my ($self, $tableName, $rowCount) = @_;
    # Build the loader from this object's database and load directory.
    my $loader = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);
    # Remember the loader so that it can be finished later along with the others.
    my $activeLoaders = ($self->{loaders} ||= []);
    push @{$activeLoaders}, $loader;
    # Hand the loader back to the caller.
    return $loader;
}
1211 :    
1212 :     =head3 FinishAll
1213 :    
1214 :     Finish all the active loads on this object.
1215 :    
1216 :     When a load is started by L</TableLoader>, the controlling B<ERDBLoad> object is cached in
1217 :     the list pointed to by the C<loaders> property of this object. This method pops the loaders
1218 :     off the list and finishes them to flush out any accumulated residue.
1219 :    
1220 :     This is an instance method.
1221 :    
1222 :     =over 4
1223 :    
1224 :     =item RETURN
1225 :    
1226 :     Returns a statistics object containing the accumulated statistics for the load.
1227 :    
1228 :     =back
1229 :    
1230 :     =cut
1231 :    
sub _FinishAll {
    my ($self) = @_;
    # This object accumulates the statistics of every loader we finish.
    my $totals = Stats->new();
    # Work directly on the cached loader list so that each loader is removed
    # from it as it is processed, last one first.
    my $pending = $self->{loaders};
    while (my $current = pop @{$pending}) {
        # Finish the load; no error trapping is done here, so a failure
        # propagates to the caller.
        my $loadStats = $current->Finish();
        $totals->Accumulate($loadStats);
        # Report the per-relation statistics when tracing is active.
        my $relName = $current->RelName;
        if (T(2)) {
            Trace("Statistics for $relName:\n" . $loadStats->Show());
        }
    }
    # The caller gets the combined statistics.
    return $totals;
}
1250 :    
1251 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3