[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Annotation of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SproutLoad;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use PageBuilder;
8 :     use ERDBLoad;
9 :     use FIG;
10 :     use Sprout;
11 :     use Stats;
12 :     use BasicLocation;
13 :    
14 :     =head1 Sprout Load Methods
15 :    
16 :     =head2 Introduction
17 :    
18 :     This object contains the methods needed to copy data from the FIG data store to the
19 :     Sprout database. It makes heavy use of the ERDBLoad object to manage the load into
20 :     individual tables. The client can create an instance of this object and then
21 :     call methods for each group of tables to load. For example, the following code will
22 :     load the Genome- and Feature-related tables. (It is presumed the first command line
23 :     parameter contains the name of a file specifying the genomes.)
24 :    
25 :     my $fig = FIG->new();
26 :     my $sprout = SFXlate->new_sprout_only();
27 :     my $spl = SproutLoad->new($sprout, $fig, $ARGV[0]);
28 :     my $stats = $spl->LoadGenomeData();
29 :     $stats->Accumulate($spl->LoadFeatureData());
30 :     print $stats->Show();
31 :    
32 :     This module makes use of the internal Sprout property C<_erdb>.
33 :    
34 :     It is worth noting that the FIG object does not need to be a real one. Any object
35 :     that implements the FIG methods for data retrieval could be used. So, for example,
36 :     this object could be used to copy data from one Sprout database to another, or
37 :     from any FIG-compliant data store implemented in the future.
38 :    
39 :     To ensure that this is possible, each time the FIG object is used, it will be via
40 :     a variable called C<$fig>. This makes it fairly straightforward to determine which
41 :     FIG methods are required to load the Sprout database.
42 :    
43 :     =cut
44 :    
45 :     #: Constructor SproutLoad->new();
46 :    
47 :     =head2 Public Methods
48 :    
49 :     =head3 new
50 :    
51 :     C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile); >>
52 :    
53 :     Construct a new Sprout Loader object, specifying the two participating databases and
54 :     the name of the files containing the list of genomes and subsystems to use.
55 :    
56 :     =over 4
57 :    
58 :     =item sprout
59 :    
60 :     Sprout object representing the target database. This also specifies the directory to
61 :     be used for creating the load files.
62 :    
63 :     =item fig
64 :    
65 :     FIG object representing the source data store from which the data is to be taken.
66 :    
67 :     =item genomeFile
68 :    
69 :     Either the name of the file containing the list of genomes to load or a reference to
70 :     a hash of genome IDs to access codes. If nothing is specified, all complete genomes
71 :     will be loaded and the access code will default to 1. The genome list is presumed
72 :     to be all-inclusive. In other words, all existing data in the target database will
73 :     be deleted and replaced with the data on the specified genomes. If a file is specified,
74 :     it should contain one genome ID and access code per line, tab-separated.
75 :    
76 :     =item subsysFile
77 :    
78 :     Either the name of the file containing the list of trusted subsystems or a reference
79 :     to a list of subsystem names. If nothing is specified, all known subsystems will be
80 :     considered trusted. Only subsystem data related to the trusted subsystems is loaded.
81 :    
82 :     =back
83 :    
84 :     =cut
85 :    
sub new {
    # Get the parameters.
    my ($class, $sprout, $fig, $genomeFile, $subsysFile) = @_;
    # Build the genome hash. It maps each genome ID to its access code.
    my %genomes;
    if (! defined($genomeFile) || $genomeFile eq '') {
        # Here we want all the complete genomes and an access code of 1.
        my @genomeList = $fig->genomes(1);
        %genomes = map { $_ => 1 } @genomeList;
    } elsif (ref $genomeFile eq 'HASH') {
        # Here the user specified a hash of genome IDs to access codes, which is
        # exactly what we want.
        %genomes = %{$genomeFile};
    } elsif (ref $genomeFile eq '') {
        # The caller specified a file name. A plain string is not a reference,
        # so "ref" returns an empty string for it (never 'SCALAR').
        my @genomeList = Tracer::GetFile($genomeFile);
        if (! @genomeList) {
            # It's an error if the genome file is empty or not found.
            Confess("No genomes found in file \"$genomeFile\".");
        } else {
            # We build the genome hash using a loop rather than "map" so that
            # an omitted access code can be defaulted to 1.
            for my $genomeLine (@genomeList) {
                my ($genomeID, $accessCode) = split("\t", $genomeLine);
                # Skip lines that carry no genome ID (e.g. blank lines).
                next if ! defined $genomeID || $genomeID eq '';
                # Test with "defined": "undef $accessCode" would erase the
                # value that was just read from the file.
                if (! defined $accessCode) {
                    $accessCode = 1;
                }
                $genomes{$genomeID} = $accessCode;
            }
        }
    } else {
        my $type = ref $genomeFile;
        Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
    }
    # Load the list of trusted subsystems. Only the keys of this hash matter.
    my %subsystems = ();
    if (! defined $subsysFile || $subsysFile eq '') {
        # Here we want all the subsystems.
        %subsystems = map { $_ => 1 } $fig->all_subsystems();
    } elsif (ref $subsysFile eq 'ARRAY') {
        # Here the user passed in a list of subsystems.
        %subsystems = map { $_ => 1 } @{$subsysFile};
    } elsif (ref $subsysFile eq '') {
        # Here the list of subsystems is in a file whose name was passed in
        # as a plain (non-reference) string.
        if (! -e $subsysFile) {
            # It's an error if the file does not exist.
            Confess("Trusted subsystem file not found.");
        } else {
            # GetFile automatically chomps end-of-line characters, so this
            # is an easy task.
            %subsystems = map { $_ => 1 } Tracer::GetFile($subsysFile);
        }
    } else {
        Confess("Invalid subsystem parameter in SproutLoad constructor.");
    }
    # Get the data directory from the Sprout object. Only the first value
    # returned by LoadInfo is used here.
    my ($directory) = $sprout->LoadInfo();
    # Create the Sprout load object.
    my $retVal = {
        fig => $fig,
        genomes => \%genomes,
        subsystems => \%subsystems,
        sprout => $sprout,
        loadDirectory => $directory,
        erdb => $sprout->{_erdb},
        loaders => []
    };
    # Bless and return it.
    bless $retVal, $class;
    return $retVal;
}
157 :    
158 :     =head3 LoadGenomeData
159 :    
160 :     C<< my $stats = $spl->LoadGenomeData(); >>
161 :    
162 :     Load the Genome, Contig, and Sequence data from FIG into Sprout.
163 :    
164 :     The Sequence table is the largest single relation in the Sprout database, so this
165 :     method is expected to be slow and clumsy. At some point we will need to make it
166 :     restartable, since an error 10 gigabytes through a 20-gigabyte load is bound to be
167 :     very annoying otherwise.
168 :    
169 :     The following relations are loaded by this method.
170 :    
171 :     Genome
172 :     HasContig
173 :     Contig
174 :     IsMadeUpOf
175 :     Sequence
176 :    
177 :     =over 4
178 :    
179 :     =item RETURNS
180 :    
181 :     Returns a statistics object for the loads.
182 :    
183 :     =back
184 :    
185 :     B<TO DO>
186 :    
187 :     Real quality vectors instead of C<unknown> for everything.
188 :    
189 :     GenomeGroup relation. (The original script took group information from the C<NMPDR> file
190 :     in each genome's main directory, but no such file exists anywhere in my version of the
191 :     data store.)
192 :    
193 :     =cut
194 :     #: Return Type $%;
sub LoadGenomeData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object. All source-data access goes through $fig.
    my $fig = $self->{fig};
    # Get the genome count. It is only used to size the table loaders.
    my $genomeHash = $self->{genomes};
    my $genomeCount = scalar keys %{$genomeHash};
    Trace("Beginning genome data load.") if T(2);
    # Create load objects for each of the tables we're loading.
    my $loadGenome = $self->_TableLoader('Genome', $genomeCount);
    my $loadHasContig = $self->_TableLoader('HasContig', $genomeCount * 300);
    my $loadContig = $self->_TableLoader('Contig', $genomeCount * 300);
    my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf', $genomeCount * 60000);
    my $loadSequence = $self->_TableLoader('Sequence', $genomeCount * 60000);
    # The maximum sequence size is a property of the Sprout object. It does
    # not change during the load, so we ask for it once.
    my $chunkSize = $self->{sprout}->MaxSequence();
    # Now we loop through the genomes, generating the data for each one.
    for my $genomeID (sort keys %{$genomeHash}) {
        Trace("Loading data for genome $genomeID.") if T(3);
        # The access code comes in via the genome hash.
        my $accessCode = $genomeHash->{$genomeID};
        # Get the genus, species, and strain from the scientific name. Note that we append
        # the genome ID to the strain. In some cases this is the totality of the strain name.
        my ($genus, $species, @extraData) = split / /, $fig->genus_species($genomeID);
        my $extra = join " ", @extraData, "[$genomeID]";
        # Get the full taxonomy.
        my $taxonomy = $fig->taxonomy_of($genomeID);
        # Output the genome record.
        $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,
                         $species, $extra, $taxonomy);
        # Now we loop through each of the genome's contigs.
        my @contigs = $fig->all_contigs($genomeID);
        for my $contigID (@contigs) {
            Trace("Processing contig $contigID for $genomeID.") if T(4);
            # Create the contig ID.
            my $sproutContigID = "$genomeID:$contigID";
            # Create the contig record and relate it to the genome.
            $loadContig->Put($sproutContigID);
            $loadHasContig->Put($genomeID, $sproutContigID);
            # Now we need to split the contig into sequences of at most
            # $chunkSize positions. Positions are 1-based.
            my $contigLen = $fig->contig_ln($genomeID, $contigID);
            for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {
                # Compute the endpoint of this chunk.
                my $end = FIG::min($i + $chunkSize - 1, $contigLen);
                # Get the actual DNA.
                my $dna = $fig->get_dna($genomeID, $contigID, $i, $end);
                # Compute the sequenceID.
                my $seqID = "$sproutContigID.$i";
                # Write out the data. For now, the quality vector is always "unknown".
                $loadIsMadeUpOf->Put($sproutContigID, $seqID, $end + 1 - $i, $i);
                $loadSequence->Put($seqID, "unknown", $dna);
            }
        }
    }
    # Finish the loads.
    my $retVal = $self->_FinishAll();
    # Return the result.
    return $retVal;
}
256 :    
257 :     =head3 LoadCouplingData
258 :    
259 :     C<< my $stats = $spl->LoadCouplingData(); >>
260 :    
261 :     Load the coupling and evidence data from FIG into Sprout.
262 :    
263 :     The coupling data specifies which genome features are functionally coupled. The
264 :     evidence data explains why the coupling is functional.
265 :    
266 :     The following relations are loaded by this method.
267 :    
268 :     Coupling
269 :     IsEvidencedBy
270 :     PCH
271 :     ParticipatesInCoupling
272 :     UsesAsEvidence
273 :    
274 :     =over 4
275 :    
276 :     =item RETURNS
277 :    
278 :     Returns a statistics object for the loads.
279 :    
280 :     =back
281 :    
282 :     =cut
283 :     #: Return Type $%;
sub LoadCouplingData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash. Its keys are the genomes being loaded.
    my $genomeFilter = $self->{genomes};
    my $genomeCount = scalar keys %{$genomeFilter};
    my $featureCount = $genomeCount * 4000;
    # Start the loads.
    my $loadCoupling = $self->_TableLoader('Coupling', $featureCount * $genomeCount);
    my $loadIsEvidencedBy = $self->_TableLoader('IsEvidencedBy', $featureCount * 8000);
    my $loadPCH = $self->_TableLoader('PCH', $featureCount * 2000);
    my $loadParticipatesInCoupling = $self->_TableLoader('ParticipatesInCoupling', $featureCount * 2000);
    my $loadUsesAsEvidence = $self->_TableLoader('UsesAsEvidence', $featureCount * 8000);
    Trace("Beginning coupling data load.") if T(2);
    # Loop through the genomes found.
    for my $genome (sort keys %{$genomeFilter}) {
        Trace("Generating coupling data for $genome.") if T(3);
        # Create a hash table for holding coupled pairs. We use this to prevent
        # duplicates. For example, if A is coupled to B, we don't want to also
        # assert that B is coupled to A, because we already know it. Fortunately,
        # all couplings occur within a genome, so we can keep the hash table
        # size reasonably small.
        my %dupHash = ();
        # Get all of the genome's PEGs.
        my @pegs = $fig->pegs_of($genome);
        # Loop through the PEGs.
        for my $peg1 (@pegs) {
            Trace("Processing PEG $peg1 for $genome.") if T(4);
            # Get a list of the coupled PEGs.
            my @couplings = $fig->coupled_to($peg1);
            # For each coupled PEG, we need to verify that a coupling already
            # exists. If not, we have to create one.
            for my $coupleData (@couplings) {
                my ($peg2, $score) = @{$coupleData};
                # Compute the coupling ID.
                my $coupleID = Sprout::CouplingID($peg1, $peg2);
                if (! exists $dupHash{$coupleID}) {
                    # Here we have a new coupling to store in the load files.
                    Trace("Storing coupling ($coupleID) with score $score.") if T(4);
                    # Ensure we don't do this again.
                    $dupHash{$coupleID} = $score;
                    # Write the coupling record.
                    $loadCoupling->Put($coupleID, $score);
                    # Connect it to the coupled PEGs.
                    $loadParticipatesInCoupling->Put($peg1, $coupleID, 1);
                    $loadParticipatesInCoupling->Put($peg2, $coupleID, 2);
                    # Get the evidence for this coupling.
                    my @evidence = $fig->coupling_evidence($peg1, $peg2);
                    # Organize the evidence into a hash table.
                    my %evidenceMap = ();
                    # Process each evidence item.
                    for my $evidenceData (@evidence) {
                        my ($peg3, $peg4, $usage) = @{$evidenceData};
                        # Only proceed if the evidence is from a Sprout
                        # genome. Membership is tested with "exists" so the
                        # result does not depend on the access code's value.
                        if (exists $genomeFilter->{$fig->genome_of($peg3)}) {
                            my $evidenceKey = "$coupleID $peg3 $peg4";
                            # We store this evidence in the hash if the usage
                            # is nonzero or no prior evidence has been found. This
                            # ensures that if there is duplicate evidence, we
                            # at least keep the meaningful ones. Only evidence in
                            # the hash makes it to the output.
                            if ($usage || ! exists $evidenceMap{$evidenceKey}) {
                                $evidenceMap{$evidenceKey} = $evidenceData;
                            }
                        }
                    }
                    for my $evidenceID (keys %evidenceMap) {
                        # Create the evidence record.
                        my ($peg3, $peg4, $usage) = @{$evidenceMap{$evidenceID}};
                        $loadPCH->Put($evidenceID, $usage);
                        # Connect it to the coupling.
                        $loadIsEvidencedBy->Put($coupleID, $evidenceID);
                        # Connect it to the features. The last value is the
                        # feature's position in the evidence pair, numbered
                        # the same way as the ParticipatesInCoupling records.
                        $loadUsesAsEvidence->Put($evidenceID, $peg3, 1);
                        $loadUsesAsEvidence->Put($evidenceID, $peg4, 2);
                    }
                }
            }
        }
    }
    # All done. Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
371 :    
372 :     =head3 LoadFeatureData
373 :    
374 :     C<< my $stats = $spl->LoadFeatureData(); >>
375 :    
376 :     Load the feature data from FIG into Sprout.
377 :    
378 :     Features represent annotated genes, and are therefore the heart of the data store.
379 :    
380 :     The following relations are loaded by this method.
381 :    
382 :     Feature
383 :     FeatureAlias
384 :     FeatureLink
385 :     FeatureTranslation
386 :     FeatureUpstream
387 :     IsLocatedIn
388 :    
389 :     =over 4
390 :    
391 :     =item RETURNS
392 :    
393 :     Returns a statistics object for the loads.
394 :    
395 :     =back
396 :    
397 :     =cut
398 :     #: Return Type $%;
sub LoadFeatureData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the table of genome IDs.
    my $genomeHash = $self->{genomes};
    my $genomeCount = scalar keys %{$genomeHash};
    my $featureCount = $genomeCount * 4000;
    # Create load objects for each of the tables we're loading.
    my $loadFeature = $self->_TableLoader('Feature', $featureCount);
    my $loadFeatureAlias = $self->_TableLoader('FeatureAlias', $featureCount * 6);
    my $loadFeatureLink = $self->_TableLoader('FeatureLink', $featureCount * 10);
    my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation', $featureCount);
    my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream', $featureCount);
    my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn', $featureCount);
    # Get the maximum segment size. We need this later for splitting up the
    # locations.
    my $chunkSize = $self->{sprout}->MaxSegment();
    Trace("Beginning feature data load.") if T(2);
    # Now we loop through the genomes, generating the data for each one.
    for my $genomeID (sort keys %{$genomeHash}) {
        Trace("Loading features for genome $genomeID.") if T(3);
        # Get the feature list for this genome.
        my $features = $fig->all_features_detailed($genomeID);
        # Loop through the features.
        for my $featureData (@{$features}) {
            # Split the tuple.
            my ($featureID, $locations, $aliases, $type) = @{$featureData};
            # Create the feature record.
            $loadFeature->Put($featureID, 1, $type);
            # Create the aliases. They arrive as a comma-separated string.
            for my $alias (split /\s*,\s*/, $aliases) {
                $loadFeatureAlias->Put($featureID, $alias);
            }
            # Get the links.
            my @links = $fig->fid_links($featureID);
            for my $link (@links) {
                $loadFeatureLink->Put($featureID, $link);
            }
            # If this is a peg, generate the translation and the upstream.
            if ($type eq 'peg') {
                my $translation = $fig->get_translation($featureID);
                if ($translation) {
                    $loadFeatureTranslation->Put($featureID, $translation);
                }
                # We use the default upstream values of u=200 and c=100.
                my $upstream = $fig->upstream_of($featureID, 200, 100);
                if ($upstream) {
                    $loadFeatureUpstream->Put($featureID, $upstream);
                }
            }
            # This part is the roughest. We need to relate the features to contig
            # locations, and the locations must be split so that none of them exceed
            # the maximum segment size. This simplifies the genes_in_region processing
            # for Sprout.
            my @locationList = split /\s*,\s*/, $locations;
            # The variable "$i" keeps the location index. It is created once per
            # feature, outside the location loop, so that a feature with several
            # locations gets a distinct index for every segment instead of
            # starting over at 1 for each location.
            my $i = 1;
            # Loop through the locations.
            for my $location (@locationList) {
                # Parse the location.
                my $locObject = BasicLocation->new($location);
                # Split it into a list of chunks. Each Peel call returns one
                # chunk until it returns a false value; whatever is then left
                # in the location object is the final chunk.
                my @locOList = ();
                while (my $peeling = $locObject->Peel($chunkSize)) {
                    push @locOList, $peeling;
                }
                push @locOList, $locObject;
                # Loop through the chunks, creating IsLocatedIn records.
                for my $locChunk (@locOList) {
                    $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,
                                          $locChunk->Dir, $locChunk->Length, $i);
                    $i++;
                }
            }
        }
    }
    # Finish the loads.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
481 :    
482 :     =head3 LoadBBHData
483 :    
484 :     C<< my $stats = $spl->LoadBBHData(); >>
485 :    
486 :     Load the bidirectional best hit data from FIG into Sprout.
487 :    
488 :     Sprout does not store information on similarities. Instead, it has only the
489 :     bi-directional best hits. Even so, the BBH table is one of the largest in
490 :     the database.
491 :    
492 :     The following relations are loaded by this method.
493 :    
494 :     IsBidirectionalBestHitOf
495 :    
496 :     =over 4
497 :    
498 :     =item RETURNS
499 :    
500 :     Returns a statistics object for the loads.
501 :    
502 :     =back
503 :    
504 :     =cut
505 :     #: Return Type $%;
sub LoadBBHData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the table of genome IDs.
    my $genomeHash = $self->{genomes};
    my $genomeCount = scalar keys %{$genomeHash};
    my $featureCount = $genomeCount * 4000;
    # Create load objects for each of the tables we're loading.
    my $loadIsBidirectionalBestHitOf = $self->_TableLoader('IsBidirectionalBestHitOf',
                                                           $featureCount * $genomeCount);
    Trace("Beginning BBH load.") if T(2);
    # Now we loop through the genomes, generating the data for each one.
    for my $genomeID (sort keys %{$genomeHash}) {
        Trace("Processing features for genome $genomeID.") if T(3);
        # Get the feature list for this genome.
        my $features = $fig->all_features_detailed($genomeID);
        # Loop through the features.
        for my $featureData (@{$features}) {
            # The feature ID is the first field of the tuple. The other
            # fields are not needed here.
            my ($featureID) = @{$featureData};
            # Get the bi-directional best hits.
            my @bbhList = $fig->bbhs($featureID);
            for my $bbhEntry (@bbhList) {
                # Get the target feature ID and the score.
                my ($targetID, $score) = @{$bbhEntry};
                # Check the target feature's genome.
                my $targetGenomeID = $fig->genome_of($targetID);
                # Only proceed if it's one of our genomes. Membership is
                # tested with "exists" so the result does not depend on the
                # value of the genome's access code.
                if (exists $genomeHash->{$targetGenomeID}) {
                    $loadIsBidirectionalBestHitOf->Put($featureID, $targetID, $targetGenomeID,
                                                       $score);
                }
            }
        }
    }
    # Finish the loads.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
547 :    
548 :     =head3 LoadSubsystemData
549 :    
550 :     C<< my $stats = $spl->LoadSubsystemData(); >>
551 :    
552 :     Load the subsystem data from FIG into Sprout.
553 :    
554 :     Subsystems are groupings of genetic roles that work together to effect a specific
555 :     chemical reaction. Similar organisms require similar subsystems. To curate a subsystem,
556 :     a spreadsheet is created with genomes on one axis and subsystem roles on the other
557 :     axis. Similar features are then mapped into the cells, allowing the annotation of one
558 :     genome's roles to be used to assist in the annotation of others.
559 :    
560 :     The following relations are loaded by this method.
561 :    
562 :     Subsystem
563 :     Role
564 :     SSCell
565 :     ContainsFeature
566 :     IsGenomeOf
567 :     IsRoleOf
568 :     OccursInSubsystem
569 :     ParticipatesIn
570 :     HasSSCell
571 :    
572 :     =over 4
573 :    
574 :     =item RETURNS
575 :    
576 :     Returns a statistics object for the loads.
577 :    
578 :     =back
579 :    
580 :     B<TO DO>
581 :    
582 :     Generate RoleName table?
583 :    
584 :     =cut
585 :     #: Return Type $%;
sub LoadSubsystemData {
    my ($self) = @_;
    # All source-data access goes through $fig.
    my $fig = $self->{fig};
    # The genome hash filters the genomes found on each spreadsheet.
    my $genomeHash = $self->{genomes};
    # The keys of the subsystem hash are the subsystems to process.
    my @subsysIDs = sort keys %{$self->{subsystems}};
    # These counts only feed the size estimates given to the table loaders.
    my $subsysCount = scalar @subsysIDs;
    my $genomeCount = scalar keys %{$genomeHash};
    my $featureCount = $genomeCount * 4000;
    # Create one loader per target table.
    my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);
    my $loadRole = $self->_TableLoader('Role', $featureCount * 6);
    my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);
    my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);
    my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);
    my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);
    my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);
    my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);
    my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);
    Trace("Beginning subsystem data load.") if T(2);
    # Roles already written to the Role table. A role can belong to several
    # subsystems, but it must only get one Role record.
    my %seenRoles;
    for my $subsysID (@subsysIDs) {
        Trace("Creating subsystem $subsysID.") if T(3);
        $loadSubsystem->Put($subsysID);
        # Link every role to this subsystem, creating the role record the
        # first time the role is encountered.
        my @roles = $fig->subsys_to_roles($subsysID);
        for my $roleID (@roles) {
            $loadOccursInSubsystem->Put($roleID, $subsysID);
            next if exists $seenRoles{$roleID};
            $loadRole->Put($roleID);
            $seenRoles{$roleID} = 1;
        }
        # With the roles in place, build the spreadsheet by matching roles
        # to the genomes listed on the sheet.
        Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
        my $genomeRows = $fig->subsystem_genomes($subsysID);
        for my $genomeRow (@{$genomeRows}) {
            # The genome ID is the first element of each row.
            my $genomeID = $genomeRow->[0];
            # Ignore genomes that are not part of this load.
            next unless exists $genomeHash->{$genomeID};
            $loadParticipatesIn->Put($genomeID, $subsysID);
            # Walk the roles by index, because the index is part of the
            # spreadsheet cell ID.
            for my $i (0 .. $#roles) {
                my $role = $roles[$i];
                # Features in the cell for this genome and role.
                my @pegs = $fig->pegs_in_subsystem_coll($subsysID, $genomeID, $i);
                # Empty cells produce no records at all.
                next unless @pegs;
                # Create the cell and connect it to its genome, role and
                # subsystem.
                my $cellID = "$subsysID:$genomeID:$i";
                $loadSSCell->Put($cellID);
                $loadIsGenomeOf->Put($genomeID, $cellID);
                $loadIsRoleOf->Put($role, $cellID);
                $loadHasSSCell->Put($subsysID, $cellID);
                # Attach the features to the cell.
                for my $pegID (@pegs) {
                    $loadContainsFeature->Put($cellID, $pegID);
                }
            }
        }
    }
    # Finish the load and hand back its statistics.
    return $self->_FinishAll();
}
668 :    
669 :     =head3 LoadDiagramData
670 :    
671 :     C<< my $stats = $spl->LoadDiagramData(); >>
672 :    
673 :     Load the diagram data from FIG into Sprout.
674 :    
675 :     Diagrams are used to organize functional roles. The diagram shows the
676 :     connections between chemicals that interact with a subsystem.
677 :    
678 :     The following relations are loaded by this method.
679 :    
680 :     Diagram
681 :     RoleOccursIn
682 :    
683 :     =over 4
684 :    
685 :     =item RETURNS
686 :    
687 :     Returns a statistics object for the loads.
688 :    
689 :     =back
690 :    
691 :     =cut
692 :     #: Return Type $%;
sub LoadDiagramData {
    # Get this object instance.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the map list. It is fetched once and used both for the size
    # estimate and for the main loop.
    my @maps = $fig->all_maps;
    my $mapCount = scalar @maps;
    my $genomeCount = scalar keys %{$self->{genomes}};
    my $featureCount = $genomeCount * 4000;
    # Create load objects for each of the tables we're loading.
    my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);
    my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);
    Trace("Beginning diagram data load.") if T(2);
    # Loop through the diagrams.
    for my $map (@maps) {
        Trace("Loading diagram $map.") if T(3);
        # Get the diagram's descriptive name.
        my $name = $fig->map_name($map);
        $loadDiagram->Put($map, $name);
        # Now we need to link all the map's roles to it.
        # A hash is used to prevent duplicates.
        my %roleHash = ();
        for my $role ($fig->map_to_ecs($map)) {
            if (! $roleHash{$role}) {
                $loadRoleOccursIn->Put($role, $map);
                $roleHash{$role} = 1;
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
727 :    
728 :     =head3 LoadPropertyData
729 :    
730 :     C<< my $stats = $spl->LoadPropertyData(); >>
731 :    
732 :     Load the attribute data from FIG into Sprout.
733 :    
734 :     Attribute data in FIG corresponds to the Sprout concept of Property. As currently
735 :     implemented, each key-value attribute combination in the SEED corresponds to a
736 :     record in the B<Property> table. The B<HasProperty> relationship links the
737 :     features to the properties.
738 :    
739 :     The SEED also allows attributes to be assigned to genomes, but this is not yet
740 :     supported by Sprout.
741 :    
742 :     The following relations are loaded by this method.
743 :    
744 :     HasProperty
745 :     Property
746 :    
747 :     =over 4
748 :    
749 :     =item RETURNS
750 :    
751 :     Returns a statistics object for the loads.
752 :    
753 :     =back
754 :    
755 :     =cut
756 :     #: Return Type $%;
sub LoadPropertyData {
    # Copy the feature attributes held by the FIG object into the Property and
    # HasProperty relations, and return the statistics from _FinishAll.
    my ($self) = @_;
    my $fig = $self->{fig};
    my $genomeHash = $self->{genomes};
    # Both relations are given the same size estimate: 1500 rows per genome.
    my $estimatedRows = (scalar keys %{$genomeHash}) * 1500;
    my $loadProperty = $self->_TableLoader('Property', $estimatedRows);
    my $loadHasProperty = $self->_TableLoader('HasProperty', $estimatedRows);
    if (T(2)) {
        Trace("Beginning property data load.");
    }
    # Maps each "key<tab>value" string to the property ID assigned to it.
    # IDs are handed out sequentially starting at 1, so an ID is always true.
    my %propertyKeys = ();
    my $nextID = 1;
    for my $genomeID (keys %{$genomeHash}) {
        # "all_features_detailed" is used rather than "all_features" because
        # we want every feature regardless of type. The feature ID is the
        # first field of each tuple it returns.
        my $featureTuples = $fig->all_features_detailed($genomeID);
        my @featureIDs = map { $_->[0] } @{$featureTuples};
        for my $fid (@featureIDs) {
            # Attributes are requested one feature at a time so that no
            # genome-level attributes are picked up.
            for my $attribute ($fig->get_attributes($fid, '', '', '')) {
                # The first field is the feature ID again, so it is discarded.
                my (undef, $key, $value, $url) = @{$attribute};
                # A tab-separated key/value string identifies the property.
                my $propertyKey = "$key\t$value";
                my $propertyID = $propertyKeys{$propertyKey};
                unless ($propertyID) {
                    # First sighting of this key/value pair: allocate the next
                    # ID, remember it, and emit the Property record.
                    $propertyID = $nextID;
                    $propertyKeys{$propertyKey} = $propertyID;
                    $nextID++;
                    $loadProperty->Put($propertyID, $key, $value);
                }
                # Link the feature to the property.
                $loadHasProperty->Put($fid, $propertyID, $url);
            }
        }
    }
    # Flush every active loader and hand back the accumulated statistics.
    my $loadStats = $self->_FinishAll();
    return $loadStats;
}
811 :    
812 :     =head3 LoadAnnotationData
813 :    
814 :     C<< my $stats = $spl->LoadAnnotationData(); >>
815 :    
816 :     Load the annotation data from FIG into Sprout.
817 :    
818 :     Sprout annotations encompass both the assignments and the annotations in SEED.
819 :     These describe the function performed by a PEG as well as any other useful
820 :     information that may aid in identifying its purpose.
821 :    
822 :     The following relations are loaded by this method.
823 :    
824 :     Annotation
825 :     IsTargetOfAnnotation
826 :     SproutUser
827 :     MadeAnnotation
828 :    
829 :     =over 4
830 :    
831 :     =item RETURNS
832 :    
833 :     Returns a statistics object for the loads.
834 :    
835 :     =back
836 :    
837 :     =cut
838 :     #: Return Type $%;
sub LoadAnnotationData {
    # Load the annotation data from the FIG object into the Annotation,
    # IsTargetOfAnnotation, SproutUser, UserAccess, and MadeAnnotation
    # relations. Returns the statistics object produced by _FinishAll.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    my $genomeCount = scalar keys %{$genomeHash};
    # Create load objects for each of the tables we're loading.
    my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);
    my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);
    my $loadSproutUser = $self->_TableLoader('SproutUser', 100);
    my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);
    my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);
    Trace("Beginning annotation data load.") if T(2);
    # Create a hash of user names. We'll use this to prevent us from generating
    # duplicate user records. FIG and "master" are pre-seeded because their
    # records are written immediately below.
    my %users = ( FIG => 1, master => 1 );
    # Put in FIG and "master".
    $loadSproutUser->Put("FIG", "Fellowship for Interpretation of Genomes");
    $loadUserAccess->Put("FIG", 1);
    $loadSproutUser->Put("master", "Master User");
    $loadUserAccess->Put("master", 1);
    # Get the current time. It stamps the generated assignment annotations.
    my $time = time();
    # Loop through the genomes. We must iterate over the KEYS of the genome
    # hash: iterating over the hash itself would also feed every hash value
    # to "pegs_of" as if it were a genome ID.
    for my $genomeID (keys %{$genomeHash}) {
        Trace("Processing $genomeID.") if T(3);
        # Get the genome's PEGs.
        my @pegs = $fig->pegs_of($genomeID);
        for my $peg (@pegs) {
            Trace("Processing $peg.") if T(4);
            # Create a hash of timestamps. We use this to prevent duplicate time
            # stamps from showing up for a single PEG's annotations.
            my %seenTimestamps = ();
            # Check for a functional assignment. Annotations are only examined
            # for PEGs that have one.
            my $func = $fig->function_of($peg);
            if ($func) {
                # If this is NOT a hypothetical assignment, we create an
                # assignment annotation for it.
                # NOTE(review): the PEG ID, not $func, is passed to FIG::hypo
                # here -- confirm which argument FIG::hypo expects.
                if (! FIG::hypo($peg)) {
                    # Note that we double the backslashes so that what goes into
                    # the database is a new-line escape sequence rather than an
                    # actual new-line.
                    $loadAnnotation->Put("$peg:$time", $time, "FIG\\nSet function to\\n$func");
                    $loadIsTargetOfAnnotation->Put($peg, "$peg:$time");
                    $loadMadeAnnotation->Put("FIG", "$peg:$time");
                    # Denote we've seen this timestamp.
                    $seenTimestamps{$time} = 1;
                }
                # Now loop through the real annotations.
                for my $tuple ($fig->feature_annotations($peg, "raw")) {
                    # Each annotation is a reference to a 4-tuple, so it must be
                    # dereferenced; assigning the reference itself would leave
                    # the time stamp, user, and text undefined.
                    my ($fid, $timestamp, $user, $text) = @{$tuple};
                    # Here we fix up the annotation text. "\r" is removed,
                    # and "\t" and "\n" are escaped. Note we use the "s"
                    # modifier so that new-lines inside the text do not
                    # stop the substitution search.
                    $text =~ s/\r//gs;
                    $text =~ s/\t/\\t/gs;
                    $text =~ s/\n/\\n/gs;
                    # Change assignments by the master user to FIG assignments.
                    $text =~ s/Set master function/Set FIG function/s;
                    # Insure the time stamp is valid.
                    if ($timestamp =~ /^\d+$/) {
                        # Here it's a number. We need to insure it's unique
                        # for this PEG, so we bump it until it is unused.
                        while ($seenTimestamps{$timestamp}) {
                            $timestamp++;
                        }
                        $seenTimestamps{$timestamp} = 1;
                        my $annotationID = "$peg:$timestamp";
                        # Insure the user exists.
                        if (! $users{$user}) {
                            $loadSproutUser->Put($user, "SEED user");
                            $loadUserAccess->Put($user, 1);
                            $users{$user} = 1;
                        }
                        # Generate the annotation.
                        $loadAnnotation->Put($annotationID, $timestamp, "$user\\n$text");
                        $loadIsTargetOfAnnotation->Put($peg, $annotationID);
                        $loadMadeAnnotation->Put($user, $annotationID);
                    } else {
                        # Here we have an invalid time stamp.
                        Trace("Invalid time stamp \"$timestamp\" in annotations for $peg.") if T(1);
                    }
                }
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
930 :    
931 :     =head2 Internal Utility Methods
932 :    
933 :     =head3 TableLoader
934 :    
935 :     Create an ERDBLoad object for the specified table. The object is also added to
936 :     the internal list in the C<loaders> property of this object. That enables the
937 :     L</FinishAll> method to terminate all the active loads.
938 :    
939 :     This is an instance method.
940 :    
941 :     =over 4
942 :    
943 :     =item tableName
944 :    
945 :     Name of the table (relation) being loaded.
946 :    
947 :     =item rowCount (optional)
948 :    
949 :     Estimated maximum number of rows in the table.
950 :    
951 :     =item RETURN
952 :    
953 :     Returns an ERDBLoad object for loading the specified table.
954 :    
955 :     =back
956 :    
957 :     =cut
958 :    
sub _TableLoader {
    # Build an ERDBLoad object for one relation and register it on this
    # object's "loaders" list so that _FinishAll can later finish it.
    my $self = shift;
    my ($tableName, $rowCount) = @_;
    # The loader is built from this object's database handle and load directory.
    my $loader = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);
    # Remember the loader, then hand it to the caller.
    push @{$self->{loaders}}, $loader;
    return $loader;
}
969 :    
970 :     =head3 FinishAll
971 :    
972 :     Finish all the active loads on this object.
973 :    
974 :     When a load is started by L</TableLoader>, the controlling B<ERDBLoad> object is cached in
975 :     the list pointed to by the C<loaders> property of this object. This method pops the loaders
976 :     off the list and finishes them to flush out any accumulated residue.
977 :    
978 :     This is an instance method.
979 :    
980 :     =over 4
981 :    
982 :     =item RETURN
983 :    
984 :     Returns a statistics object containing the accumulated statistics for the load.
985 :    
986 :     =back
987 :    
988 :     =cut
989 :    
sub _FinishAll {
    # Finish every loader registered on this object and return a single
    # statistics object holding the accumulated totals.
    my ($self) = @_;
    my $totals = Stats->new();
    my $pending = $self->{loaders};
    # Loaders are popped, so they are finished in the reverse of the order in
    # which they were registered. A failing Finish is not caught here; the
    # loads are not yet restartable.
    while (my $current = pop @{$pending}) {
        my $loadStats = $current->Finish();
        $totals->Accumulate($loadStats);
        my $relName = $current->RelName;
        if (T(2)) {
            Trace("Statistics for $relName:\n" . $loadStats->Show());
        }
    }
    return $totals;
}
1008 :    
1009 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3