[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Annotation of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.19 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SproutLoad;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use PageBuilder;
8 :     use ERDBLoad;
9 :     use FIG;
10 :     use Sprout;
11 :     use Stats;
12 :     use BasicLocation;
13 : parrello 1.18 use HTML;
14 : parrello 1.1
15 :     =head1 Sprout Load Methods
16 :    
17 :     =head2 Introduction
18 :    
19 :     This object contains the methods needed to copy data from the FIG data store to the
20 :     Sprout database. It makes heavy use of the ERDBLoad object to manage the load into
21 :     individual tables. The client can create an instance of this object and then
22 :     call methods for each group of tables to load. For example, the following code will
23 :     load the Genome- and Feature-related tables. (It is presumed the first command line
24 :     parameter contains the name of a file specifying the genomes.)
25 :    
26 :     my $fig = FIG->new();
27 :     my $sprout = SFXlate->new_sprout_only();
28 :     my $spl = SproutLoad->new($sprout, $fig, $ARGV[0]);
29 :     my $stats = $spl->LoadGenomeData();
30 :     $stats->Accumulate($spl->LoadFeatureData());
31 :     print $stats->Show();
32 :    
33 :     This module makes use of the internal Sprout property C<_erdb>.
34 :    
35 :     It is worth noting that the FIG object does not need to be a real one. Any object
36 :     that implements the FIG methods for data retrieval could be used. So, for example,
37 :     this object could be used to copy data from one Sprout database to another, or
38 :     from any FIG-compliant data store implemented in the future.
39 :    
40 :     To ensure that this is possible, each time the FIG object is used, it will be via
41 :     a variable called C<$fig>. This makes it fairly straightforward to determine which
42 :     FIG methods are required to load the Sprout database.
43 :    
44 : parrello 1.5 This object creates the load files; however, the tables are not created until it
45 :     is time to actually do the load from the files into the target database.
46 :    
47 : parrello 1.1 =cut
48 :    
49 :     #: Constructor SproutLoad->new();
50 :    
51 :     =head2 Public Methods
52 :    
53 :     =head3 new
54 :    
55 : parrello 1.8 C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >>
56 : parrello 1.1
57 :     Construct a new Sprout Loader object, specifying the two participating databases and
58 :     the name of the files containing the list of genomes and subsystems to use.
59 :    
60 :     =over 4
61 :    
62 :     =item sprout
63 :    
64 :     Sprout object representing the target database. This also specifies the directory to
65 :     be used for creating the load files.
66 :    
67 :     =item fig
68 :    
69 :     FIG object representing the source data store from which the data is to be taken.
70 :    
71 :     =item genomeFile
72 :    
73 :     Either the name of the file containing the list of genomes to load or a reference to
74 :     a hash of genome IDs to access codes. If nothing is specified, all complete genomes
75 :     will be loaded and the access code will default to 1. The genome list is presumed
76 :     to be all-inclusive. In other words, all existing data in the target database will
77 :     be deleted and replaced with the data on the specified genomes. If a file is specified,
78 :     it should contain one genome ID and access code per line, tab-separated.
79 :    
80 :     =item subsysFile
81 :    
82 :     Either the name of the file containing the list of trusted subsystems or a reference
83 :     to a list of subsystem names. If nothing is specified, all known subsystems will be
84 :     considered trusted. Only subsystem data related to the trusted subsystems is loaded.
85 :    
86 : parrello 1.8 =item options
87 :    
88 :     Reference to a hash of command-line options.
89 :    
90 : parrello 1.1 =back
91 :    
92 :     =cut
93 :    
# Construct a SproutLoad object.
#
# Parameters:
#   $class      - class name (supplied by the method-call syntax)
#   $sprout     - target Sprout object; its LoadInfo() result supplies the load
#                 directory and its _erdb property supplies the ERDB object
#   $fig        - source FIG-like object
#   $genomeFile - undef/'' (all complete genomes, access code 1), a hash
#                 reference of genome ID => access code, or the name of a
#                 tab-delimited file of genome IDs and access codes
#   $subsysFile - undef/'' (all subsystems), an array reference of subsystem
#                 names, or the name of a file listing one subsystem per line
#   $options    - reference to a hash of command-line options
#
# Returns the blessed loader object. Calls Confess() if the genome file yields
# no records, the subsystem file does not exist, or either parameter is a
# reference of an unsupported type.
sub new {
    # Get the parameters.
    my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
    # Load the list of genomes into a hash.
    my %genomes;
    if (! defined($genomeFile) || $genomeFile eq '') {
        # Here we want all the complete genomes and an access code of 1.
        my @genomeList = $fig->genomes(1);
        %genomes = map { $_ => 1 } @genomeList;
    } else {
        my $type = ref $genomeFile;
        Trace("Genome file parameter type is \"$type\".") if T(3);
        if ($type eq 'HASH') {
            # Here the user specified a hash of genome IDs to access codes, which is
            # exactly what we want.
            %genomes = %{$genomeFile};
        } elsif (! $type || $type eq 'SCALAR' ) {
            # The caller specified a file, so read the genomes from the file. (Note
            # that some PERLs return an empty string rather than SCALAR.)
            my @genomeList = Tracer::GetFile($genomeFile);
            if (! @genomeList) {
                # It's an error if the genome file is empty or not found.
                Confess("No genomes found in file \"$genomeFile\".");
            } else {
                # We build the genome Hash using a loop rather than "map" so that
                # an omitted access code can be defaulted to 1.
                for my $genomeLine (@genomeList) {
                    my ($genomeID, $accessCode) = split("\t", $genomeLine);
                    # Test with "defined": the one-argument form "undef $accessCode"
                    # would wipe out the value just read and always yield false,
                    # so the default would never be applied.
                    if (! defined $accessCode) {
                        $accessCode = 1;
                    }
                    $genomes{$genomeID} = $accessCode;
                }
            }
        } else {
            Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
        }
    }
    # Load the list of trusted subsystems.
    my %subsystems = ();
    if (! defined $subsysFile || $subsysFile eq '') {
        # Here we want all the subsystems.
        %subsystems = map { $_ => 1 } $fig->all_subsystems();
    } else {
        my $type = ref $subsysFile;
        if ($type eq 'ARRAY') {
            # Here the user passed in a list of subsystems.
            %subsystems = map { $_ => 1 } @{$subsysFile};
        } elsif (! $type || $type eq 'SCALAR') {
            # Here the list of subsystems is in a file.
            if (! -e $subsysFile) {
                # It's an error if the file does not exist.
                Confess("Trusted subsystem file not found.");
            } else {
                # GetFile automatically chomps end-of-line characters, so this
                # is an easy task.
                %subsystems = map { $_ => 1 } Tracer::GetFile($subsysFile);
            }
        } else {
            Confess("Invalid subsystem parameter in SproutLoad constructor.");
        }
    }
    # Get the data directory from the Sprout object.
    my ($directory) = $sprout->LoadInfo();
    # Create the Sprout load object.
    my $retVal = {
                  fig => $fig,
                  genomes => \%genomes,
                  subsystems => \%subsystems,
                  sprout => $sprout,
                  loadDirectory => $directory,
                  erdb => $sprout->{_erdb},
                  loaders => [],
                  options => $options
                 };
    # Bless and return it.
    bless $retVal, $class;
    return $retVal;
}
173 :    
174 :     =head3 LoadGenomeData
175 :    
176 :     C<< my $stats = $spl->LoadGenomeData(); >>
177 :    
178 :     Load the Genome, Contig, and Sequence data from FIG into Sprout.
179 :    
180 :     The Sequence table is the largest single relation in the Sprout database, so this
181 :     method is expected to be slow and clumsy. At some point we will need to make it
182 :     restartable, since an error 10 gigabytes through a 20-gigabyte load is bound to be
183 :     very annoying otherwise.
184 :    
185 :     The following relations are loaded by this method.
186 :    
187 :     Genome
188 :     HasContig
189 :     Contig
190 :     IsMadeUpOf
191 :     Sequence
192 :    
193 :     =over 4
194 :    
195 :     =item RETURNS
196 :    
197 :     Returns a statistics object for the loads.
198 :    
199 :     =back
200 :    
201 :     B<TO DO>
202 :    
203 :     Real quality vectors instead of C<unknown> for everything.
204 :    
205 :     GenomeGroup relation. (The original script took group information from the C<NMPDR> file
206 :     in each genome's main directory, but no such file exists anywhere in my version of the
207 :     data store.)
208 :    
209 :     =cut
210 :     #: Return Type $%;
sub LoadGenomeData {
    # Unpack the instance and the pieces of it we need.
    my ($self) = @_;
    my $fig = $self->{fig};
    my $genomeHash = $self->{genomes};
    # The genome count drives the size estimates handed to the table loaders.
    my $genomeTotal = scalar keys %{$genomeHash};
    Trace("Beginning genome data load.") if T(2);
    # One loader per relation produced by this method.
    my $genomeLoader    = $self->_TableLoader('Genome', $genomeTotal);
    my $hasContigLoader = $self->_TableLoader('HasContig', $genomeTotal * 300);
    my $contigLoader    = $self->_TableLoader('Contig', $genomeTotal * 300);
    my $madeUpOfLoader  = $self->_TableLoader('IsMadeUpOf', $genomeTotal * 60000);
    my $sequenceLoader  = $self->_TableLoader('Sequence', $genomeTotal * 60000);
    # Process the genomes in sorted ID order.
    for my $genomeID (sort keys %{$genomeHash}) {
        Trace("Loading data for genome $genomeID.") if T(3);
        $genomeLoader->Add("genomeIn");
        # The hash value is the genome's access code.
        my $accessCode = $genomeHash->{$genomeID};
        # Split the scientific name on single spaces: the first word is the genus,
        # the second the species, and whatever is left (plus the bracketed genome
        # ID) forms the strain text.
        my ($genus, $species, @strainWords) = split / /, $fig->genus_species($genomeID);
        my $strain = join " ", @strainWords, "[$genomeID]";
        my $taxonomy = $fig->taxonomy_of($genomeID);
        # Emit the genome record.
        $genomeLoader->Put($genomeID, $accessCode, $fig->is_complete($genomeID), $genus,
                           $species, $strain, $taxonomy);
        # Walk the contigs belonging to this genome.
        for my $contigID ($fig->all_contigs($genomeID)) {
            Trace("Processing contig $contigID for $genomeID.") if T(4);
            $contigLoader->Add("contigIn");
            $sequenceLoader->Add("contigIn");
            # Sprout contig IDs are qualified by the genome ID.
            my $sproutContigID = "$genomeID:$contigID";
            $contigLoader->Put($sproutContigID);
            $hasContigLoader->Put($genomeID, $sproutContigID);
            # The Sprout object supplies the maximum sequence chunk size.
            my $chunkSize = $self->{sprout}->MaxSequence();
            my $contigLen = $fig->contig_ln($genomeID, $contigID);
            # Carve the contig into chunks, using 1-based start positions.
            my $start = 1;
            while ($start <= $contigLen) {
                $sequenceLoader->Add("chunkIn");
                # The final chunk is clipped at the end of the contig.
                my $stop = FIG::min($start + $chunkSize - 1, $contigLen);
                my $dna = $fig->get_dna($genomeID, $contigID, $start, $stop);
                # The sequence ID is the contig ID suffixed with the start position.
                my $seqID = "$sproutContigID.$start";
                # The quality vector is always written as "unknown" for now.
                $madeUpOfLoader->Put($sproutContigID, $seqID, $stop + 1 - $start, $start);
                $sequenceLoader->Put($seqID, "unknown", $dna);
                $start += $chunkSize;
            }
        }
    }
    # Close out every loader and hand back the result of _FinishAll.
    my $stats = $self->_FinishAll();
    return $stats;
}
276 :    
277 :     =head3 LoadCouplingData
278 :    
279 :     C<< my $stats = $spl->LoadCouplingData(); >>
280 :    
281 :     Load the coupling and evidence data from FIG into Sprout.
282 :    
283 :     The coupling data specifies which genome features are functionally coupled. The
284 :     evidence data explains why the coupling is functional.
285 :    
286 :     The following relations are loaded by this method.
287 :    
288 :     Coupling
289 :     IsEvidencedBy
290 :     PCH
291 :     ParticipatesInCoupling
292 :     UsesAsEvidence
293 :    
294 :     =over 4
295 :    
296 :     =item RETURNS
297 :    
298 :     Returns a statistics object for the loads.
299 :    
300 :     =back
301 :    
302 :     =cut
303 :     #: Return Type $%;
sub LoadCouplingData {
    # Unpack the instance, the FIG object, and the genome filter hash.
    my ($self) = @_;
    my $fig = $self->{fig};
    my $genomeFilter = $self->{genomes};
    # Size estimates for the loaders are derived from the genome count.
    my $genomeTotal = scalar keys %{$genomeFilter};
    my $featureEstimate = $genomeTotal * 4000;
    # Open a loader for each relation this method produces.
    my $couplingLoader    = $self->_TableLoader('Coupling', $featureEstimate * $genomeTotal);
    my $evidencedByLoader = $self->_TableLoader('IsEvidencedBy', $featureEstimate * 8000);
    my $pchLoader         = $self->_TableLoader('PCH', $featureEstimate * 2000);
    my $participantLoader = $self->_TableLoader('ParticipatesInCoupling', $featureEstimate * 2000);
    my $usesLoader        = $self->_TableLoader('UsesAsEvidence', $featureEstimate * 8000);
    Trace("Beginning coupling data load.") if T(2);
    # Visit the genomes in sorted ID order.
    for my $genome (sort keys %{$genomeFilter}) {
        Trace("Generating coupling data for $genome.") if T(3);
        $couplingLoader->Add("genomeIn");
        # Coupling IDs already written for this genome. A coupling reached from
        # both of its PEGs must only be stored once; the hash is reset per genome.
        my %seenCouplings;
        for my $peg1 ($fig->pegs_of($genome)) {
            $couplingLoader->Add("pegIn");
            Trace("Processing PEG $peg1 for $genome.") if T(4);
            # Examine everything this PEG is coupled to.
            for my $coupleData ($fig->coupled_to($peg1)) {
                my ($peg2, $score) = @{$coupleData};
                my $coupleID = Sprout::CouplingID($peg1, $peg2);
                # Skip couplings we have already emitted.
                next if exists $seenCouplings{$coupleID};
                $couplingLoader->Add("couplingIn");
                Trace("Storing coupling ($coupleID) with score $score.") if T(4);
                $seenCouplings{$coupleID} = $score;
                # Write the coupling and tie it to both participating PEGs.
                $couplingLoader->Put($coupleID, $score);
                $participantLoader->Put($peg1, $coupleID, 1);
                $participantLoader->Put($peg2, $coupleID, 2);
                # Collect the evidence, keyed so that duplicates collapse.
                my %keptEvidence;
                for my $evidenceData ($fig->coupling_evidence($peg1, $peg2)) {
                    $pchLoader->Add("evidenceIn");
                    my ($peg3, $peg4, $usage) = @{$evidenceData};
                    # Evidence is only kept when its first PEG's genome has a
                    # true value in the genome filter.
                    my $evidenceGenome = $fig->genome_of($peg3);
                    next unless $genomeFilter->{$evidenceGenome};
                    $usesLoader->Add("evidenceChosen");
                    my $evidenceKey = "$coupleID $peg3 $peg4";
                    # An entry with a nonzero usage always replaces what is there;
                    # a zero-usage entry is stored only if the key is still empty.
                    if ($usage || ! exists $keptEvidence{$evidenceKey}) {
                        $keptEvidence{$evidenceKey} = $evidenceData;
                    }
                }
                # Emit one PCH record, plus its connections, per surviving item.
                for my $evidenceID (keys %keptEvidence) {
                    my ($peg3, $peg4, $usage) = @{$keptEvidence{$evidenceID}};
                    $pchLoader->Put($evidenceID, $usage);
                    $evidencedByLoader->Put($coupleID, $evidenceID);
                    $usesLoader->Put($evidenceID, $peg3, 1);
                    $usesLoader->Put($evidenceID, $peg4, 2);
                }
            }
        }
    }
    # Close out every loader and hand back the result of _FinishAll.
    my $stats = $self->_FinishAll();
    return $stats;
}
396 :    
397 :     =head3 LoadFeatureData
398 :    
399 :     C<< my $stats = $spl->LoadFeatureData(); >>
400 :    
401 :     Load the feature data from FIG into Sprout.
402 :    
403 :     Features represent annotated genes, and are therefore the heart of the data store.
404 :    
405 :     The following relations are loaded by this method.
406 :    
407 :     Feature
408 :     FeatureAlias
409 :     FeatureLink
410 :     FeatureTranslation
411 :     FeatureUpstream
412 :     IsLocatedIn
413 :    
414 :     =over 4
415 :    
416 :     =item RETURNS
417 :    
418 :     Returns a statistics object for the loads.
419 :    
420 :     =back
421 :    
422 :     =cut
423 :     #: Return Type $%;
sub LoadFeatureData {
    # Unpack the instance and the FIG object.
    my ($self) = @_;
    my $fig = $self->{fig};
    # A true "limitedFeatures" option suppresses the link, translation, and
    # upstream tables.
    my $limited = $self->{options}->{limitedFeatures};
    # Size estimates for the loaders are derived from the genome count.
    my $genomeHash = $self->{genomes};
    my $genomeTotal = scalar keys %{$genomeHash};
    my $featureEstimate = $genomeTotal * 4000;
    # These three loaders are always needed.
    my $featureLoader = $self->_TableLoader('Feature', $featureEstimate);
    my $locatedLoader = $self->_TableLoader('IsLocatedIn', $featureEstimate);
    my $aliasLoader   = $self->_TableLoader('FeatureAlias', $featureEstimate * 6);
    # These three exist only for a full (non-limited) run.
    my ($linkLoader, $translationLoader, $upstreamLoader);
    unless ($limited) {
        $linkLoader        = $self->_TableLoader('FeatureLink', $featureEstimate * 10);
        $translationLoader = $self->_TableLoader('FeatureTranslation', $featureEstimate);
        $upstreamLoader    = $self->_TableLoader('FeatureUpstream', $featureEstimate);
    }
    # The maximum segment size is passed to Peel when splitting locations.
    my $chunkSize = $self->{sprout}->MaxSegment();
    Trace("Beginning feature data load.") if T(2);
    # Visit the genomes in sorted ID order.
    for my $genomeID (sort keys %{$genomeHash}) {
        Trace("Loading features for genome $genomeID.") if T(3);
        $featureLoader->Add("genomeIn");
        # all_features_detailed hands back a reference to a list of tuples.
        my $features = $fig->all_features_detailed($genomeID);
        for my $featureData (@{$features}) {
            $featureLoader->Add("featureIn");
            # The third tuple element is not used here.
            my ($featureID, $locations, undef, $type) = @{$featureData};
            # Feature record first, then one alias record per alias.
            $featureLoader->Put($featureID, 1, $type);
            for my $alias ($fig->feature_aliases($featureID)) {
                $aliasLoader->Put($featureID, $alias);
            }
            # Links, translations, and upstream regions belong to full loads only.
            unless ($limited) {
                for my $link ($fig->fid_links($featureID)) {
                    $linkLoader->Put($featureID, $link);
                }
                # Only PEGs get a translation and an upstream region.
                if ($type eq 'peg') {
                    $translationLoader->Add("pegIn");
                    my $translation = $fig->get_translation($featureID);
                    $translationLoader->Put($featureID, $translation) if $translation;
                    # Default upstream values: u=200 and c=100.
                    my $upstream = $fig->upstream_of($featureID, 200, 100);
                    $upstreamLoader->Put($featureID, $upstream) if $upstream;
                }
            }
            # Relate the feature to its contig locations. Each location is peeled
            # into pieces by the maximum segment size, and every piece gets a
            # running index that spans all of the feature's locations.
            my $position = 1;
            for my $location (split /\s*,\s*/, $locations) {
                my $locObject = BasicLocation->new("$genomeID:$location");
                # Peel pieces off the location until Peel returns a false value.
                my @pieces;
                while (my $peeling = $locObject->Peel($chunkSize)) {
                    $locatedLoader->Add("peeling");
                    push @pieces, $peeling;
                }
                # The location object itself is written last, after the peelings.
                for my $locChunk (@pieces, $locObject) {
                    $locatedLoader->Put($featureID, $locChunk->Contig, $locChunk->Left,
                                        $locChunk->Dir, $locChunk->Length, $position);
                    $position++;
                }
            }
        }
    }
    # Close out every loader and hand back the result of _FinishAll.
    my $stats = $self->_FinishAll();
    return $stats;
}
519 :    
520 :     =head3 LoadBBHData
521 :    
522 :     C<< my $stats = $spl->LoadBBHData(); >>
523 :    
524 :     Load the bidirectional best hit data from FIG into Sprout.
525 :    
526 :     Sprout does not store information on similarities. Instead, it has only the
527 :     bi-directional best hits. Even so, the BBH table is one of the largest in
528 :     the database.
529 :    
530 :     The following relations are loaded by this method.
531 :    
532 :     IsBidirectionalBestHitOf
533 :    
534 :     =over 4
535 :    
536 :     =item RETURNS
537 :    
538 :     Returns a statistics object for the loads.
539 :    
540 :     =back
541 :    
542 :     =cut
543 :     #: Return Type $%;
sub LoadBBHData {
    # Unpack the instance and the FIG object.
    my ($self) = @_;
    my $fig = $self->{fig};
    # Size estimates for the loader are derived from the genome count.
    my $genomeHash = $self->{genomes};
    my $genomeTotal = scalar keys %{$genomeHash};
    my $featureEstimate = $genomeTotal * 4000;
    # Only one relation is produced by this method.
    my $bbhLoader = $self->_TableLoader('IsBidirectionalBestHitOf',
                                        $featureEstimate * $genomeTotal);
    Trace("Beginning BBH load.") if T(2);
    # Visit the genomes in sorted ID order.
    for my $genomeID (sort keys %{$genomeHash}) {
        $bbhLoader->Add("genomeIn");
        Trace("Processing features for genome $genomeID.") if T(3);
        # all_features_detailed hands back a reference to a list of tuples.
        my $features = $fig->all_features_detailed($genomeID);
        for my $featureData (@{$features}) {
            # Only the feature ID (first tuple element) is needed here.
            my ($featureID) = @{$featureData};
            # Each BBH entry is a reference to a (target ID, score) pair.
            for my $bbhEntry ($fig->bbhs($featureID)) {
                my ($targetID, $score) = @{$bbhEntry};
                my $targetGenomeID = $fig->genome_of($targetID);
                # Skip hits whose target genome has no true value in the genome hash.
                next unless $genomeHash->{$targetGenomeID};
                $bbhLoader->Put($featureID, $targetID, $targetGenomeID,
                                $score);
            }
        }
    }
    # Close out the loader and hand back the result of _FinishAll.
    my $stats = $self->_FinishAll();
    return $stats;
}
586 :    
587 :     =head3 LoadSubsystemData
588 :    
589 :     C<< my $stats = $spl->LoadSubsystemData(); >>
590 :    
591 :     Load the subsystem data from FIG into Sprout.
592 :    
593 :     Subsystems are groupings of genetic roles that work together to effect a specific
594 :     chemical reaction. Similar organisms require similar subsystems. To curate a subsystem,
595 :     a spreadsheet is created with genomes on one axis and subsystem roles on the other
596 :     axis. Similar features are then mapped into the cells, allowing the annotation of one
597 :     genome's roles to be used to assist in the annotation of others.
598 :    
599 :     The following relations are loaded by this method.
600 :    
601 :     Subsystem
602 :     Role
603 : parrello 1.19 RoleEC
604 : parrello 1.1 SSCell
605 :     ContainsFeature
606 :     IsGenomeOf
607 :     IsRoleOf
608 :     OccursInSubsystem
609 :     ParticipatesIn
610 :     HasSSCell
611 : parrello 1.18 Catalyzes
612 :     Reaction
613 :     ConsistsOfRoles
614 :     RoleSubset
615 :     HasRoleSubset
616 :     ConsistsOfGenomes
617 :     GenomeSubset
618 :     HasGenomeSubset
619 : parrello 1.1
620 :     =over 4
621 :    
622 :     =item RETURNS
623 :    
624 :     Returns a statistics object for the loads.
625 :    
626 :     =back
627 :    
628 :     =cut
629 :     #: Return Type $%;
630 :     sub LoadSubsystemData {
631 :     # Get this object instance.
632 :     my ($self) = @_;
633 :     # Get the FIG object.
634 :     my $fig = $self->{fig};
635 :     # Get the genome hash. We'll use it to filter the genomes in each
636 :     # spreadsheet.
637 :     my $genomeHash = $self->{genomes};
638 :     # Get the subsystem hash. This lists the subsystems we'll process.
639 :     my $subsysHash = $self->{subsystems};
640 :     my @subsysIDs = sort keys %{$subsysHash};
641 :     my $subsysCount = @subsysIDs;
642 :     my $genomeCount = (keys %{$genomeHash});
643 :     my $featureCount = $genomeCount * 4000;
644 :     # Create load objects for each of the tables we're loading.
645 :     my $loadSubsystem = $self->_TableLoader('Subsystem', $subsysCount);
646 :     my $loadRole = $self->_TableLoader('Role', $featureCount * 6);
647 : parrello 1.19 my $loadRoleEC = $self->_TableLoader('RoleEC', $featureCount * 6);
648 : parrello 1.1 my $loadSSCell = $self->_TableLoader('SSCell', $featureCount * $genomeCount);
649 :     my $loadContainsFeature = $self->_TableLoader('ContainsFeature', $featureCount * $subsysCount);
650 :     my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf', $featureCount * $genomeCount);
651 :     my $loadIsRoleOf = $self->_TableLoader('IsRoleOf', $featureCount * $genomeCount);
652 :     my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem', $featureCount * 6);
653 :     my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn', $subsysCount * $genomeCount);
654 :     my $loadHasSSCell = $self->_TableLoader('HasSSCell', $featureCount * $genomeCount);
655 : parrello 1.18 my $loadReaction = $self->_TableLoader('Reaction', $featureCount * $genomeCount);
656 :     my $loadCatalyzes = $self->_TableLoader('Catalyzes', $featureCount * $genomeCount);
657 :     my $loadRoleSubset = $self->_TableLoader('RoleSubset', $subsysCount * 50);
658 :     my $loadGenomeSubset = $self->_TableLoader('GenomeSubset', $subsysCount * 50);
659 :     my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles', $featureCount * $genomeCount);
660 :     my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes', $featureCount * $genomeCount);
661 :     my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset', $subsysCount * 50);
662 :     my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset', $subsysCount * 50);
663 : parrello 1.1 Trace("Beginning subsystem data load.") if T(2);
664 : parrello 1.18 # The reaction hash will contain a list of reactions for each role. When we're done,
665 :     # a complicated sort and merge will be used to generate the Reaction and Catalyzes
666 :     # tables.
667 :     my %reactionsToRoles = ();
668 : parrello 1.1 # Loop through the subsystems. Our first task will be to create the
669 :     # roles. We do this by looping through the subsystems and creating a
670 :     # role hash. The hash tracks each role ID so that we don't create
671 : parrello 1.18 # duplicates. As we move along, we'll connect the roles and subsystems
672 :     # and memorize up the reactions.
673 : parrello 1.15 my ($genomeID, $roleID);
674 : parrello 1.1 my %roleData = ();
675 :     for my $subsysID (@subsysIDs) {
676 :     Trace("Creating subsystem $subsysID.") if T(3);
677 : parrello 1.6 $loadSubsystem->Add("subsystemIn");
678 : parrello 1.15 # Get the subsystem object.
679 :     my $sub = $fig->get_subsystem($subsysID);
680 : parrello 1.18 # Get its reaction hash.
681 :     my $reactionHash = $sub->get_reactions();
682 :     # Create the subsystem record.
683 :     my $curator = $sub->get_curator();
684 :     my $notes = $sub->get_notes();
685 :     $loadSubsystem->Put($subsysID, $curator, $notes);
686 :     # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
687 : parrello 1.15 for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
688 : parrello 1.18 # Connect to this role.
689 : parrello 1.6 $loadOccursInSubsystem->Add("roleIn");
690 : parrello 1.18 $loadOccursInSubsystem->Put($roleID, $subsysID, $col);
691 :     # If it's a new role, add it to the role table.
692 : parrello 1.1 if (! exists $roleData{$roleID}) {
693 : parrello 1.18 # Get the role's abbreviation.
694 :     my $abbr = $sub->get_role_abbr($col);
695 :     # Add the role.
696 :     $loadRole->Put($roleID, $abbr);
697 : parrello 1.1 $roleData{$roleID} = 1;
698 : parrello 1.19 # Check for an EC number.
699 :     if ($roleID =~ /\(EC ([^.]+\.[^.]+\.[^.]+\.[^)]+)\)\s*$/) {
700 :     $loadRoleEC->Put($roleID, $1);
701 :     }
702 : parrello 1.18 # Add the role's reactions.
703 :     my $reactions = $reactionHash->{$roleID};
704 :     for my $reactionID (@{$reactions}) {
705 :     if (! exists $reactionsToRoles{$reactionID}) {
706 :     # Here the reaction is brand-new, so we create its reaction
707 :     # record.
708 :     $loadReaction->Put($reactionID, $fig->reversible($reactionID));
709 :     # We also create a blank list for it in the reaction hash.
710 :     $reactionsToRoles{$reactionID} = [];
711 :     }
712 :     # Add the role to the reaction's role list.
713 :     push @{$reactionsToRoles{$reactionID}}, $roleID;
714 :     }
715 : parrello 1.1 }
716 :     }
717 : parrello 1.15 # Now we create the spreadsheet for the subsystem by matching roles to
718 :     # genomes. Each genome is a row and each role is a column. We may need
719 :     # to actually create the roles as we find them.
720 : parrello 1.1 Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
721 : parrello 1.15 for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
722 :     # Only proceed if this is one of our genomes.
723 :     if (exists $genomeHash->{$genomeID}) {
724 :     # Count the PEGs and cells found for verification purposes.
725 :     my $pegCount = 0;
726 :     my $cellCount = 0;
727 : parrello 1.18 # Create a list for the PEGs we find. This list will be used
728 :     # to generate cluster numbers.
729 :     my @pegsFound = ();
730 :     # Create a hash that maps spreadsheet IDs to PEGs. We will
731 :     # use this to generate the ContainsFeature data after we have
732 :     # the cluster numbers.
733 :     my %cellPegs = ();
734 :     # Get the genome's variant code for this subsystem.
735 :     my $variantCode = $sub->get_variant_code($row);
736 : parrello 1.15 # Loop through the subsystem's roles. We use an index because it is
737 :     # part of the spreadsheet cell ID.
738 :     for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
739 :     # Get the features in the spreadsheet cell for this genome and role.
740 :     my @pegs = $sub->get_pegs_from_cell($row, $col);
741 :     # Only proceed if features exist.
742 :     if (@pegs > 0) {
743 :     # Create the spreadsheet cell.
744 :     $cellCount++;
745 :     my $cellID = "$subsysID:$genomeID:$col";
746 :     $loadSSCell->Put($cellID);
747 :     $loadIsGenomeOf->Put($genomeID, $cellID);
748 :     $loadIsRoleOf->Put($roleID, $cellID);
749 :     $loadHasSSCell->Put($subsysID, $cellID);
750 : parrello 1.18 # Remember its features.
751 :     push @pegsFound, @pegs;
752 :     $cellPegs{$cellID} = \@pegs;
753 :     $pegCount += @pegs;
754 : parrello 1.1 }
755 :     }
756 : parrello 1.18 # If we found some cells for this genome, we need to compute clusters and
757 :     # denote it participates in the subsystem.
758 : parrello 1.15 if ($pegCount > 0) {
759 :     Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
760 : parrello 1.18 $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
761 :     # Partition the PEGs found into clusters.
762 :     my @clusters = $fig->compute_clusters(\@pegsFound, $sub);
763 :     # Create a hash mapping PEG IDs to cluster numbers.
764 :     # We default to -1 for all of them.
765 :     my %clusterOf = map { $_ => -1 } @pegsFound;
766 :     for (my $i = 0; $i <= $#clusters; $i++) {
767 :     my $subList = $clusters[$i];
768 :     for my $peg (@{$subList}) {
769 :     $clusterOf{$peg} = $i;
770 :     }
771 :     }
772 :     # Create the ContainsFeature data.
773 :     for my $cellID (keys %cellPegs) {
774 :     my $cellList = $cellPegs{$cellID};
775 :     for my $cellPeg (@$cellList) {
776 :     $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
777 :     }
778 :     }
779 : parrello 1.15 }
780 : parrello 1.1 }
781 :     }
782 : parrello 1.18 # Now we need to generate the subsets. The subset names must be concatenated to
783 :     # the subsystem name to make them unique keys. There are two types of subsets:
784 :     # genome subsets and role subsets. We do the role subsets first.
785 :     my @subsetNames = $sub->get_subset_names();
786 :     for my $subsetID (@subsetNames) {
787 :     # Create the subset record.
788 :     my $actualID = "$subsysID:$subsetID";
789 :     $loadRoleSubset->Put($actualID);
790 :     # Connect the subset to the subsystem.
791 :     $loadHasRoleSubset->Put($subsysID, $actualID);
792 :     # Connect the subset to its roles.
793 :     my @roles = $sub->get_subset($subsetID);
794 :     for my $roleID (@roles) {
795 :     $loadConsistsOfRoles->Put($actualID, $roleID);
796 :     }
797 :     }
798 :     # Next the genome subsets.
799 :     @subsetNames = $sub->get_subset_namesR();
800 :     for my $subsetID (@subsetNames) {
801 :     # Create the subset record.
802 :     my $actualID = "$subsysID:$subsetID";
803 :     $loadGenomeSubset->Put($actualID);
804 :     # Connect the subset to the subsystem.
805 :     $loadHasGenomeSubset->Put($subsysID, $actualID);
806 :     # Connect the subset to its genomes.
807 :     my @genomes = $sub->get_subsetR($subsetID);
808 :     for my $genomeID (@genomes) {
809 :     $loadConsistsOfGenomes->Put($actualID, $genomeID);
810 :     }
811 :     }
812 :     }
813 :     # Before we leave, we must create the Catalyzes table. The data is all stored in
814 :     # "reactionToRoles" hash.
815 :     for my $reactionID (keys %reactionsToRoles) {
816 :     # Get this reaction's list of roles. We sort it so we can merge out duplicates.
817 :     my @roles = sort @{$reactionsToRoles{$reactionID}};
818 :     my $lastRole = "";
819 :     # Loop through the roles, creating catalyzation records.
820 :     for my $thisRole (@roles) {
821 :     if ($thisRole ne $lastRole) {
822 :     $loadCatalyzes->Put($thisRole, $reactionID);
823 :     }
824 :     }
825 : parrello 1.1 }
826 :     # Finish the load.
827 :     my $retVal = $self->_FinishAll();
828 :     return $retVal;
829 :     }
830 :    
831 :     =head3 LoadDiagramData
832 :    
833 :     C<< my $stats = $spl->LoadDiagramData(); >>
834 :    
835 :     Load the diagram data from FIG into Sprout.
836 :    
837 :     Diagrams are used to organize functional roles. The diagram shows the
838 :     connections between chemicals that interact with a subsystem.
839 :    
840 :     The following relations are loaded by this method.
841 :    
842 :     Diagram
843 :     RoleOccursIn
844 :    
845 :     =over 4
846 :    
847 :     =item RETURNS
848 :    
849 :     Returns a statistics object for the loads.
850 :    
851 :     =back
852 :    
853 :     =cut
854 :     #: Return Type $%;
sub LoadDiagramData {
    # Load the Diagram and RoleOccursIn relations from the FIG map data and
    # return the accumulated statistics object for the load.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Fetch the map list exactly once. It supplies both the row estimate for
    # the Diagram table and the list of diagrams processed below.
    my @maps = $fig->all_maps;
    my $mapCount = scalar @maps;
    # The RoleOccursIn row estimate is derived from the number of genomes.
    my $genomeCount = scalar(keys %{$self->{genomes}});
    my $featureCount = $genomeCount * 4000;
    # Create load objects for each of the tables we're loading.
    my $loadDiagram = $self->_TableLoader('Diagram', $mapCount);
    my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn', $featureCount * 6);
    Trace("Beginning diagram data load.") if T(2);
    # Loop through the diagrams.
    for my $map (@maps) {
        Trace("Loading diagram $map.") if T(3);
        # Get the diagram's descriptive name and create its record.
        my $name = $fig->map_name($map);
        $loadDiagram->Put($map, $name);
        # Link all the map's roles to it. The hash prevents the same role
        # from being connected to this map more than once.
        my %roleHash = ();
        for my $role ($fig->map_to_ecs($map)) {
            if (! $roleHash{$role}) {
                $loadRoleOccursIn->Put($role, $map);
                $roleHash{$role} = 1;
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
889 :    
890 :     =head3 LoadPropertyData
891 :    
892 :     C<< my $stats = $spl->LoadPropertyData(); >>
893 :    
894 :     Load the attribute data from FIG into Sprout.
895 :    
896 :     Attribute data in FIG corresponds to the Sprout concept of Property. As currently
897 :     implemented, each key-value attribute combination in the SEED corresponds to a
898 :     record in the B<Property> table. The B<HasProperty> relationship links the
899 :     features to the properties.
900 :    
901 :     The SEED also allows attributes to be assigned to genomes, but this is not yet
902 :     supported by Sprout.
903 :    
904 :     The following relations are loaded by this method.
905 :    
906 :     HasProperty
907 :     Property
908 :    
909 :     =over 4
910 :    
911 :     =item RETURNS
912 :    
913 :     Returns a statistics object for the loads.
914 :    
915 :     =back
916 :    
917 :     =cut
918 :     #: Return Type $%;
sub LoadPropertyData {
    # Load the Property and HasProperty relations from the FIG feature
    # attributes and return the accumulated statistics object for the load.
    my ($self) = @_;
    my $fig = $self->{fig};
    my $genomeHash = $self->{genomes};
    my $genomeCount = scalar(keys %{$genomeHash});
    # Both tables use the same row estimate.
    my $loadProperty = $self->_TableLoader('Property', $genomeCount * 1500);
    my $loadHasProperty = $self->_TableLoader('HasProperty', $genomeCount * 1500);
    Trace("Beginning property data load.") if T(2);
    # Map each distinct key/value pair to the numeric property ID assigned
    # to it. IDs are handed out sequentially, starting with 1.
    my %idOfPair = ();
    my $upcomingID = 1;
    for my $genomeID (keys %{$genomeHash}) {
        $loadProperty->Add("genomeIn");
        # The feature ID is the first field of each tuple returned by
        # "all_features_detailed".
        my @fids = map { $_->[0] } @{$fig->all_features_detailed($genomeID)};
        for my $fid (@fids) {
            $loadProperty->Add("featureIn");
            # Attributes are requested one feature at a time.
            my @attributes = $fig->get_attributes($fid, '', '', '');
            for my $attribute (@attributes) {
                # The first field of the tuple is discarded; the remaining
                # three are the key, the value, and the URL.
                my (undef, $key, $value, $url) = @{$attribute};
                # The key and value, joined by a tab, identify the property.
                my $pairName = "$key\t$value";
                my $propertyID = $idOfPair{$pairName};
                unless ($propertyID) {
                    # First sighting of this pair: assign the next ID and
                    # write the Property record.
                    $propertyID = $idOfPair{$pairName} = $upcomingID++;
                    $loadProperty->Put($propertyID, $key, $value);
                }
                # Connect the feature to the property.
                $loadHasProperty->Put($fid, $propertyID, $url);
            }
        }
    }
    # Flush every active loader and hand back the statistics.
    my $stats = $self->_FinishAll();
    return $stats;
}
975 :    
976 :     =head3 LoadAnnotationData
977 :    
978 :     C<< my $stats = $spl->LoadAnnotationData(); >>
979 :    
980 :     Load the annotation data from FIG into Sprout.
981 :    
982 :     Sprout annotations encompass both the assignments and the annotations in SEED.
983 :     These describe the function performed by a PEG as well as any other useful
984 :     information that may aid in identifying its purpose.
985 :    
986 :     The following relations are loaded by this method.
987 :    
988 :     Annotation
989 :     IsTargetOfAnnotation
990 :     SproutUser
991 :     MadeAnnotation
992 :    
993 :     =over 4
994 :    
995 :     =item RETURNS
996 :    
997 :     Returns a statistics object for the loads.
998 :    
999 :     =back
1000 :    
1001 :     =cut
1002 :     #: Return Type $%;
sub LoadAnnotationData {
    # Load the Annotation, IsTargetOfAnnotation, SproutUser, UserAccess and
    # MadeAnnotation relations and return the accumulated statistics object.
    my ($self) = @_;
    # Get the FIG object.
    my $fig = $self->{fig};
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    my $genomeCount = scalar(keys %{$genomeHash});
    # Create load objects for each of the tables we're loading.
    my $loadAnnotation = $self->_TableLoader('Annotation', $genomeCount * 4000);
    my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation', $genomeCount * 4000);
    my $loadSproutUser = $self->_TableLoader('SproutUser', 100);
    my $loadUserAccess = $self->_TableLoader('UserAccess', 1000);
    my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation', $genomeCount * 4000);
    Trace("Beginning annotation data load.") if T(2);
    # Create a hash of user names. We'll use this to prevent us from generating
    # duplicate user records. FIG and "master" are written immediately.
    my %users = ( FIG => 1, master => 1 );
    $loadSproutUser->Put("FIG", "Fellowship for Interpretation of Genomes");
    $loadUserAccess->Put("FIG", 1);
    $loadSproutUser->Put("master", "Master User");
    $loadUserAccess->Put("master", 1);
    # Get the current time. It stamps the generated assignment annotations.
    my $time = time();
    # Loop through the genomes.
    for my $genomeID (sort keys %{$genomeHash}) {
        Trace("Processing $genomeID.") if T(3);
        # Get the genome's PEGs.
        my @pegs = $fig->pegs_of($genomeID);
        for my $peg (@pegs) {
            Trace("Processing $peg.") if T(4);
            # Create a hash of timestamps. We use this to prevent duplicate time
            # stamps from showing up for a single PEG's annotations.
            my %seenTimestamps = ();
            # Check for a functional assignment. An assignment annotation is
            # created only when the assignment is NOT hypothetical, so the
            # hypothetical test is applied to the function text itself rather
            # than to the PEG ID.
            my $func = $fig->function_of($peg);
            if ($func && ! FIG::hypo($func)) {
                my $assignmentID = "$peg:$time";
                # Note that we double the slashes so that what goes into the
                # database is a new-line escape sequence rather than an actual
                # new-line.
                $loadAnnotation->Put($assignmentID, $time, "FIG\\nSet function to\\n$func");
                $loadIsTargetOfAnnotation->Put($peg, $assignmentID);
                $loadMadeAnnotation->Put("FIG", $assignmentID);
                # Denote we've seen this timestamp.
                $seenTimestamps{$time} = 1;
            }
            # Now loop through the real annotations.
            for my $tuple ($fig->feature_annotations($peg, "raw")) {
                my ($fid, $timestamp, $user, $text) = @{$tuple};
                # Here we fix up the annotation text. "\r" is removed,
                # and "\t" and "\n" are escaped.
                $text =~ s/\r//gs;
                $text =~ s/\t/\\t/gs;
                $text =~ s/\n/\\n/gs;
                # Change assignments by the master user to FIG assignments.
                $text =~ s/Set master function/Set FIG function/s;
                # Insure the time stamp is valid.
                if ($timestamp =~ /^\d+$/) {
                    # Here it's a number. We need to insure the one we use to
                    # form the key is unique for this PEG, so we bump it until
                    # we find a value that has not been used yet.
                    my $keyStamp = $timestamp;
                    while ($seenTimestamps{$keyStamp}) {
                        $keyStamp++;
                    }
                    $seenTimestamps{$keyStamp} = 1;
                    my $annotationID = "$peg:$keyStamp";
                    # Insure the user exists.
                    if (! $users{$user}) {
                        $loadSproutUser->Put($user, "SEED user");
                        $loadUserAccess->Put($user, 1);
                        $users{$user} = 1;
                    }
                    # Generate the annotation. The record keeps the original
                    # time stamp; only the key uses the adjusted one.
                    $loadAnnotation->Put($annotationID, $timestamp, $text);
                    $loadIsTargetOfAnnotation->Put($peg, $annotationID);
                    $loadMadeAnnotation->Put($user, $annotationID);
                } else {
                    # Here we have an invalid time stamp.
                    Trace("Invalid time stamp \"$timestamp\" in annotations for $peg.") if T(1);
                }
            }
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1096 :    
1097 : parrello 1.5 =head3 LoadSourceData
1098 :    
1099 :     C<< my $stats = $spl->LoadSourceData(); >>
1100 :    
1101 :     Load the source data from FIG into Sprout.
1102 :    
1103 :     Source data links genomes to information about the organizations that
1104 :     mapped it.
1105 :    
1106 :     The following relations are loaded by this method.
1107 :    
1108 :     ComesFrom
1109 :     Source
1110 :     SourceURL
1111 :    
1112 :     There is no direct support for source attribution in FIG, so we access the SEED
1113 :     files directly.
1114 :    
1115 :     =over 4
1116 :    
1117 :     =item RETURNS
1118 :    
1119 :     Returns a statistics object for the loads.
1120 :    
1121 :     =back
1122 :    
1123 :     =cut
1124 :     #: Return Type $%;
sub LoadSourceData {
    # Load the ComesFrom, Source and SourceURL relations from the per-genome
    # PROJECT files and return the accumulated statistics object.
    my ($self) = @_;
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    my $genomeCount = scalar(keys %{$genomeHash});
    # Create load objects for each of the tables we're loading.
    my $loadComesFrom = $self->_TableLoader('ComesFrom', $genomeCount * 4);
    my $loadSource = $self->_TableLoader('Source', $genomeCount * 4);
    my $loadSourceURL = $self->_TableLoader('SourceURL', $genomeCount * 8);
    Trace("Beginning source data load.") if T(2);
    # Create hashes to collect the Source information.
    my %sourceURL = ();
    my %sourceDesc = ();
    # Loop through the genomes.
    for my $genomeID (sort keys %{$genomeHash}) {
        Trace("Processing $genomeID.") if T(3);
        # Read the first line of the project file. A genome whose file cannot
        # be opened, or whose file is empty, contributes no source data. The
        # handle is lexical and is closed only if the open succeeded.
        my $line;
        if (open(my $projectHandle, '<', "$FIG_Config::organisms/$genomeID/PROJECT")) {
            $line = <$projectHandle>;
            close $projectHandle;
        }
        next if ! defined $line;
        chomp $line;
        # The line is tab-delimited: source ID, description, URL.
        my ($sourceID, $desc, $url) = split(/\t/, $line);
        $loadComesFrom->Put($genomeID, $sourceID);
        # Only the first URL found for a source is recorded.
        if ($url && ! exists $sourceURL{$sourceID}) {
            $loadSourceURL->Put($sourceID, $url);
            $sourceURL{$sourceID} = 1;
        }
        # A real description always wins. If none has been seen for this
        # source, the source ID stands in for it.
        if ($desc) {
            $sourceDesc{$sourceID} = $desc;
        } elsif (! exists $sourceDesc{$sourceID}) {
            $sourceDesc{$sourceID} = $sourceID;
        }
    }
    # Write the source descriptions.
    for my $sourceID (keys %sourceDesc) {
        $loadSource->Put($sourceID, $sourceDesc{$sourceID});
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1171 :    
1172 : parrello 1.6 =head3 LoadExternalData
1173 :    
1174 :     C<< my $stats = $spl->LoadExternalData(); >>
1175 :    
1176 :     Load the external data from FIG into Sprout.
1177 :    
1178 :     External data contains information about external feature IDs.
1179 :    
1180 :     The following relations are loaded by this method.
1181 :    
1182 :     ExternalAliasFunc
1183 :     ExternalAliasOrg
1184 :    
1185 :     The support for external IDs in FIG is hidden beneath layers of other data, so
1186 :     we access the SEED files directly to create these tables. This is also one of
1187 :     the few load methods that does not proceed genome by genome.
1188 :    
1189 :     =over 4
1190 :    
1191 :     =item RETURNS
1192 :    
1193 :     Returns a statistics object for the loads.
1194 :    
1195 :     =back
1196 :    
1197 :     =cut
1198 :     #: Return Type $%;
sub LoadExternalData {
    # Load the ExternalAliasOrg and ExternalAliasFunc relations from the
    # global "ext_org.table" and "ext_func.table" files and return the
    # accumulated statistics object.
    my ($self) = @_;
    # Get the genome hash. It is only used to estimate the table sizes.
    my $genomeHash = $self->{genomes};
    my $genomeCount = scalar(keys %{$genomeHash});
    # Create load objects for each of the tables we're loading.
    my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc', $genomeCount * 4000);
    my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg', $genomeCount * 4000);
    Trace("Beginning external data load.") if T(2);
    # We loop through the files one at a time. First, the organism file.
    Open(\*ORGS, "<$FIG_Config::global/ext_org.table");
    my $orgLine;
    while (defined($orgLine = <ORGS>)) {
        # Clean the input line.
        chomp $orgLine;
        # Parse the organism name. Fields are tab-delimited, and white space
        # around the tab is discarded.
        my ($protID, $name) = split /\s*\t\s*/, $orgLine;
        $loadExternalAliasOrg->Put($protID, $name);
    }
    close ORGS;
    # Now the function file.
    my $funcLine;
    Open(\*FUNCS, "<$FIG_Config::global/ext_func.table");
    while (defined($funcLine = <FUNCS>)) {
        # Clean the line ending.
        chomp $funcLine;
        # Only proceed if the line is non-blank.
        if ($funcLine) {
            # Split it into fields.
            my @funcFields = split /\s*\t\s*/, $funcLine;
            # If there's an EC number, append it to the description.
            if ($#funcFields >= 2 && $funcFields[2] =~ /^(EC .*\S)/) {
                $funcFields[1] .= " $1";
            }
            # Output the function line.
            $loadExternalAliasFunc->Put(@funcFields[0,1]);
        }
    }
    # Release the function file, just as we did for the organism file.
    close FUNCS;
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1247 : parrello 1.5
1248 : parrello 1.18
1249 :     =head3 LoadReactionData
1250 :    
1251 :     C<< my $stats = $spl->LoadReactionData(); >>
1252 :    
1253 :     Load the reaction data from FIG into Sprout.
1254 :    
1255 :     Reaction data connects reactions to the compounds that participate in them.
1256 :    
1257 :     The following relations are loaded by this method.
1258 :    
1259 :     ReactionURL
1260 :     Compound
1261 :     CompoundName
1262 :     CompoundCAS
1263 :     IsAComponentOf
1264 :    
1265 :     This method proceeds reaction by reaction rather than genome by genome.
1266 :    
1267 :     =over 4
1268 :    
1269 :     =item RETURNS
1270 :    
1271 :     Returns a statistics object for the loads.
1272 :    
1273 :     =back
1274 :    
1275 :     =cut
1276 :     #: Return Type $%;
sub LoadReactionData {
    # Load the ReactionURL, Compound, CompoundName, CompoundCAS and
    # IsAComponentOf relations and return the accumulated statistics object.
    my ($self) = @_;
    my $fig = $self->{fig};
    my $genomeHash = $self->{genomes};
    # The genome count is only used to scale the row estimates.
    my $genomeCount = scalar(keys %{$genomeHash});
    my $loadReactionURL = $self->_TableLoader('ReactionURL', $genomeCount * 4000);
    my $loadCompound = $self->_TableLoader('Compound', $genomeCount * 4000);
    my $loadCompoundName = $self->_TableLoader('CompoundName', $genomeCount * 8000);
    my $loadCompoundCAS = $self->_TableLoader('CompoundCAS', $genomeCount * 4000);
    my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf', $genomeCount * 12000);
    Trace("Beginning reaction/compound data load.") if T(2);
    # Compounds whose own records have already been written.
    my %compoundDone = ();
    for my $reactionID ($fig->all_reactions()) {
        # Record the reaction's URL.
        $loadReactionURL->Put($reactionID, HTML::reaction_link($reactionID));
        # Two passes per reaction: flag 0 first, then flag 1. The flag is
        # passed to "reaction2comp" and stored with each link record.
        for my $product (0, 1) {
            # Each tuple is [ID, stoichiometry, main-flag].
            for my $tuple ($fig->reaction2comp($reactionID, $product)) {
                my ($cid, $stoich, $main) = @{$tuple};
                # Link the compound to the reaction. The third field is
                # always written as an empty string.
                $loadIsAComponentOf->Put($cid, $reactionID, "", $main, $product, $stoich);
                # The compound's own records are only written once.
                next if exists $compoundDone{$cid};
                $compoundDone{$cid} = 1;
                $loadCompound->Put($cid);
                # A CAS record is written only when a CAS ID is available.
                my $cas = $fig->cas($cid);
                $loadCompoundCAS->Put($cid, $cas) if $cas;
                # Each name is stored with its position in the name list as
                # its priority; the first name gets priority 0.
                my @names = $fig->names_of_compound($cid);
                for my $prio (0 .. $#names) {
                    $loadCompoundName->Put($cid, $names[$prio], $prio);
                }
            }
        }
    }
    # Flush every active loader and hand back the statistics.
    my $stats = $self->_FinishAll();
    return $stats;
}
1338 :    
1339 : parrello 1.5 =head3 LoadGroupData
1340 :    
1341 :     C<< my $stats = $spl->LoadGroupData(); >>
1342 :    
1343 :     Load the genome Groups into Sprout.
1344 :    
1345 :     The following relations are loaded by this method.
1346 :    
1347 :     GenomeGroups
1348 :    
1349 :     There is no direct support for genome groups in FIG, so we access the SEED
1350 :     files directly.
1351 :    
1352 :     =over 4
1353 :    
1354 :     =item RETURNS
1355 :    
1356 :     Returns a statistics object for the loads.
1357 :    
1358 :     =back
1359 :    
1360 :     =cut
1361 :     #: Return Type $%;
sub LoadGroupData {
    # Load the GenomeGroups relation from the per-genome NMPDR files and
    # return the accumulated statistics object.
    my ($self) = @_;
    # Get the genome hash.
    my $genomeHash = $self->{genomes};
    my $genomeCount = scalar(keys %{$genomeHash});
    # Create a load object for the table we're loading.
    my $loadGenomeGroups = $self->_TableLoader('GenomeGroups', $genomeCount * 4);
    Trace("Beginning group data load.") if T(2);
    # Loop through the genomes.
    for my $genomeID (keys %{$genomeHash}) {
        Trace("Processing $genomeID.") if T(3);
        # Read the first line of the NMPDR group file for this genome. A
        # genome whose file cannot be opened, or whose file is empty, is not
        # in a group. The handle is lexical and is closed only if the open
        # succeeded.
        my $line;
        if (open(my $groupHandle, '<', "$FIG_Config::organisms/$genomeID/NMPDR")) {
            $line = <$groupHandle>;
            close $groupHandle;
        }
        if (defined $line) {
            # Clean the line ending.
            chomp $line;
            # Add the group to the table. Note that there can only be one
            # group per genome.
            $loadGenomeGroups->Put($genomeID, $line);
        }
    }
    # Finish the load.
    my $retVal = $self->_FinishAll();
    return $retVal;
}
1392 :    
1393 : parrello 1.1 =head2 Internal Utility Methods
1394 :    
1395 :     =head3 TableLoader
1396 :    
1397 :     Create an ERDBLoad object for the specified table. The object is also added to
1398 :     the internal list in the C<loaders> property of this object. That enables the
1399 :     L</FinishAll> method to terminate all the active loads.
1400 :    
1401 :     This is an instance method.
1402 :    
1403 :     =over 4
1404 :    
1405 :     =item tableName
1406 :    
1407 :     Name of the table (relation) being loaded.
1408 :    
1409 :     =item rowCount (optional)
1410 :    
1411 :     Estimated maximum number of rows in the table.
1412 :    
1413 :     =item RETURN
1414 :    
1415 :     Returns an ERDBLoad object for loading the specified table.
1416 :    
1417 :     =back
1418 :    
1419 :     =cut
1420 :    
sub _TableLoader {
    # Build an ERDBLoad object for one table, remember it in this object's
    # "loaders" list, and return it.
    my $self = shift;
    my ($tableName, $rowCount) = @_;
    # The loader works against this object's database and load directory.
    my $loader = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $rowCount);
    # Remember the loader on the "loaders" list.
    push @{$self->{loaders}}, $loader;
    return $loader;
}
1431 :    
1432 :     =head3 FinishAll
1433 :    
1434 :     Finish all the active loads on this object.
1435 :    
1436 :     When a load is started by L</TableLoader>, the controlling B<ERDBLoad> object is cached in
1437 :     the list pointed to by the C<loaders> property of this object. This method pops the loaders
1438 :     off the list and finishes them to flush out any accumulated residue.
1439 :    
1440 :     This is an instance method.
1441 :    
1442 :     =over 4
1443 :    
1444 :     =item RETURN
1445 :    
1446 :     Returns a statistics object containing the accumulated statistics for the load.
1447 :    
1448 :     =back
1449 :    
1450 :     =cut
1451 :    
sub _FinishAll {
    # Finish every loader cached on this object and return a statistics
    # object holding the totals.
    my $self = shift;
    my $totals = Stats->new();
    my $pending = $self->{loaders};
    # Work from the end of the list, removing each loader as it is finished.
    while (my $loader = pop @{$pending}) {
        my $relName = $loader->RelName;
        Trace("Finishing load for $relName.") if T(2);
        my $loaderStats = $loader->Finish();
        # When the "dbLoad" option is set, the relation is also loaded into
        # the database and those statistics are folded into the loader's own.
        if ($self->{options}->{dbLoad}) {
            Trace("Loading relation $relName.") if T(2);
            my $dbStats = $self->{sprout}->LoadUpdate(1, [$relName]);
            $loaderStats->Accumulate($dbStats);
        }
        # Add this relation's numbers to the running totals and report them.
        $totals->Accumulate($loaderStats);
        Trace("Statistics for $relName:\n" . $loaderStats->Show()) if T(2);
    }
    return $totals;
}
1479 :    
1480 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3