[Bio] / Sprout / SproutLoad.pm Repository:
ViewVC logotype

Annotation of /Sprout/SproutLoad.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.96 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SproutLoad;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use PageBuilder;
8 :     use ERDBLoad;
9 :     use FIG;
10 : parrello 1.85 use FIGRules;
11 : parrello 1.1 use Sprout;
12 :     use Stats;
13 :     use BasicLocation;
14 : parrello 1.18 use HTML;
15 : parrello 1.88 use AliasAnalysis;
16 : parrello 1.93 use BioWords;
17 : parrello 1.1
18 :     =head1 Sprout Load Methods
19 :    
20 :     =head2 Introduction
21 :    
22 :     This object contains the methods needed to copy data from the FIG data store to the
23 :     Sprout database. It makes heavy use of the ERDBLoad object to manage the load into
24 :     individual tables. The client can create an instance of this object and then
25 :     call methods for each group of tables to load. For example, the following code will
26 :     load the Genome- and Feature-related tables. (It is presumed the first command line
27 :     parameter contains the name of a file specifying the genomes.)
28 :    
29 :     my $fig = FIG->new();
30 :     my $sprout = SFXlate->new_sprout_only();
31 :     my $spl = SproutLoad->new($sprout, $fig, $ARGV[0]);
32 :     my $stats = $spl->LoadGenomeData();
33 :     $stats->Accumulate($spl->LoadFeatureData());
34 :     print $stats->Show();
35 :    
36 :     It is worth noting that the FIG object does not need to be a real one. Any object
37 :     that implements the FIG methods for data retrieval could be used. So, for example,
38 :     this object could be used to copy data from one Sprout database to another, or
39 :     from any FIG-compliant data story implemented in the future.
40 :    
41 :     To insure that this is possible, each time the FIG object is used, it will be via
42 :     a variable called C<$fig>. This makes it fairly straightforward to determine which
43 :     FIG methods are required to load the Sprout database.
44 :    
45 : parrello 1.5 This object creates the load files; however, the tables are not created until it
46 :     is time to actually do the load from the files into the target database.
47 :    
48 : parrello 1.1 =cut
49 :    
50 :     #: Constructor SproutLoad->new();
51 :    
52 :     =head2 Public Methods
53 :    
54 :     =head3 new
55 :    
56 : parrello 1.90 my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options);
57 : parrello 1.1
58 :     Construct a new Sprout Loader object, specifying the two participating databases and
59 :     the name of the files containing the list of genomes and subsystems to use.
60 :    
61 :     =over 4
62 :    
63 :     =item sprout
64 :    
65 :     Sprout object representing the target database. This also specifies the directory to
66 :     be used for creating the load files.
67 :    
68 :     =item fig
69 :    
70 :     FIG object representing the source data store from which the data is to be taken.
71 :    
72 :     =item genomeFile
73 :    
74 :     Either the name of the file containing the list of genomes to load or a reference to
75 :     a hash of genome IDs to access codes. If nothing is specified, all complete genomes
76 :     will be loaded and the access code will default to 1. The genome list is presumed
77 :     to be all-inclusive. In other words, all existing data in the target database will
78 :     be deleted and replaced with the data on the specified genes. If a file is specified,
79 :     it should contain one genome ID and access code per line, tab-separated.
80 :    
81 :     =item subsysFile
82 :    
83 :     Either the name of the file containing the list of trusted subsystems or a reference
84 : parrello 1.34 to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be
85 :     considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR>
86 : parrello 1.76 in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded.
87 : parrello 1.1
88 : parrello 1.8 =item options
89 :    
90 :     Reference to a hash of command-line options.
91 :    
92 : parrello 1.1 =back
93 :    
94 :     =cut
95 :    
96 :     sub new {
97 :     # Get the parameters.
98 : parrello 1.8 my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_;
99 : parrello 1.35 # Create the genome hash.
100 :     my %genomes = ();
101 :     # We only need it if load-only is NOT specified.
102 :     if (! $options->{loadOnly}) {
103 :     if (! defined($genomeFile) || $genomeFile eq '') {
104 :     # Here we want all the complete genomes and an access code of 1.
105 :     my @genomeList = $fig->genomes(1);
106 :     %genomes = map { $_ => 1 } @genomeList;
107 : parrello 1.88 Trace(scalar(keys %genomes) . " genomes found.") if T(3);
108 : parrello 1.35 } else {
109 :     my $type = ref $genomeFile;
110 :     Trace("Genome file parameter type is \"$type\".") if T(3);
111 :     if ($type eq 'HASH') {
112 :     # Here the user specified a hash of genome IDs to access codes, which is
113 :     # exactly what we want.
114 :     %genomes = %{$genomeFile};
115 :     } elsif (! $type || $type eq 'SCALAR' ) {
116 :     # The caller specified a file, so read the genomes from the file. (Note
117 :     # that some PERLs return an empty string rather than SCALAR.)
118 :     my @genomeList = Tracer::GetFile($genomeFile);
119 :     if (! @genomeList) {
120 :     # It's an error if the genome file is empty or not found.
121 :     Confess("No genomes found in file \"$genomeFile\".");
122 :     } else {
123 :     # We build the genome Hash using a loop rather than "map" so that
124 :     # an omitted access code can be defaulted to 1.
125 :     for my $genomeLine (@genomeList) {
126 :     my ($genomeID, $accessCode) = split("\t", $genomeLine);
127 : parrello 1.65 if (! defined($accessCode)) {
128 : parrello 1.35 $accessCode = 1;
129 :     }
130 :     $genomes{$genomeID} = $accessCode;
131 : parrello 1.3 }
132 : parrello 1.1 }
133 : parrello 1.35 } else {
134 :     Confess("Invalid genome parameter ($type) in SproutLoad constructor.");
135 : parrello 1.1 }
136 :     }
137 :     }
138 :     # Load the list of trusted subsystems.
139 :     my %subsystems = ();
140 : parrello 1.35 # We only need it if load-only is NOT specified.
141 :     if (! $options->{loadOnly}) {
142 :     if (! defined $subsysFile || $subsysFile eq '') {
143 : parrello 1.55 # Here we want all the usable subsystems. First we get the whole list.
144 : parrello 1.35 my @subs = $fig->all_subsystems();
145 : parrello 1.76 # Loop through, checking for the NMPDR file.
146 : parrello 1.35 for my $sub (@subs) {
147 : parrello 1.76 if ($fig->nmpdr_subsystem($sub)) {
148 : parrello 1.35 $subsystems{$sub} = 1;
149 :     }
150 : parrello 1.33 }
151 : parrello 1.35 } else {
152 :     my $type = ref $subsysFile;
153 :     if ($type eq 'ARRAY') {
154 :     # Here the user passed in a list of subsystems.
155 :     %subsystems = map { $_ => 1 } @{$subsysFile};
156 :     } elsif (! $type || $type eq 'SCALAR') {
157 :     # Here the list of subsystems is in a file.
158 :     if (! -e $subsysFile) {
159 :     # It's an error if the file does not exist.
160 :     Confess("Trusted subsystem file not found.");
161 :     } else {
162 :     # GetFile automatically chomps end-of-line characters, so this
163 :     # is an easy task.
164 :     %subsystems = map { $_ => 1 } Tracer::GetFile($subsysFile);
165 :     }
166 : parrello 1.4 } else {
167 : parrello 1.35 Confess("Invalid subsystem parameter in SproutLoad constructor.");
168 : parrello 1.4 }
169 : parrello 1.1 }
170 : parrello 1.72 # Go through the subsys hash again, creating the keyword list for each subsystem.
171 :     for my $subsystem (keys %subsystems) {
172 :     my $name = $subsystem;
173 :     $name =~ s/_/ /g;
174 :     $subsystems{$subsystem} = $name;
175 :     }
176 : parrello 1.1 }
177 : parrello 1.85 # Get the list of NMPDR-oriented attribute keys.
178 :     my @propKeys = $fig->get_group_keys("NMPDR");
179 : parrello 1.1 # Get the data directory from the Sprout object.
180 :     my ($directory) = $sprout->LoadInfo();
181 :     # Create the Sprout load object.
182 :     my $retVal = {
183 :     fig => $fig,
184 :     genomes => \%genomes,
185 :     subsystems => \%subsystems,
186 :     sprout => $sprout,
187 :     loadDirectory => $directory,
188 : parrello 1.39 erdb => $sprout,
189 : parrello 1.8 loaders => [],
190 : parrello 1.85 options => $options,
191 :     propKeys => \@propKeys,
192 : parrello 1.1 };
193 :     # Bless and return it.
194 :     bless $retVal, $class;
195 :     return $retVal;
196 :     }
197 :    
198 : parrello 1.23 =head3 LoadOnly
199 :    
200 : parrello 1.90 my $flag = $spl->LoadOnly;
201 : parrello 1.23
202 :     Return TRUE if we are in load-only mode, else FALSE.
203 :    
204 :     =cut
205 :    
206 :     sub LoadOnly {
207 :     my ($self) = @_;
208 :     return $self->{options}->{loadOnly};
209 :     }
210 :    
211 : parrello 1.25
212 : parrello 1.1 =head3 LoadGenomeData
213 :    
214 : parrello 1.90 my $stats = $spl->LoadGenomeData();
215 : parrello 1.1
216 :     Load the Genome, Contig, and Sequence data from FIG into Sprout.
217 :    
218 :     The Sequence table is the largest single relation in the Sprout database, so this
219 :     method is expected to be slow and clumsy. At some point we will need to make it
220 :     restartable, since an error 10 gigabytes through a 20-gigabyte load is bound to be
221 :     very annoying otherwise.
222 :    
223 :     The following relations are loaded by this method.
224 :    
225 :     Genome
226 :     HasContig
227 :     Contig
228 :     IsMadeUpOf
229 :     Sequence
230 :    
231 :     =over 4
232 :    
233 :     =item RETURNS
234 :    
235 :     Returns a statistics object for the loads.
236 :    
237 :     =back
238 :    
239 :     =cut
240 :     #: Return Type $%;
241 :     sub LoadGenomeData {
242 :     # Get this object instance.
243 :     my ($self) = @_;
244 :     # Get the FIG object.
245 :     my $fig = $self->{fig};
246 :     # Get the genome count.
247 :     my $genomeHash = $self->{genomes};
248 :     my $genomeCount = (keys %{$genomeHash});
249 :     # Create load objects for each of the tables we're loading.
250 : parrello 1.23 my $loadGenome = $self->_TableLoader('Genome');
251 : parrello 1.85 my $loadHasContig = $self->_TableLoader('HasContig');
252 :     my $loadContig = $self->_TableLoader('Contig');
253 :     my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf');
254 :     my $loadSequence = $self->_TableLoader('Sequence');
255 : parrello 1.23 if ($self->{options}->{loadOnly}) {
256 :     Trace("Loading from existing files.") if T(2);
257 :     } else {
258 :     Trace("Generating genome data.") if T(2);
259 : parrello 1.92 # Get the full info for the FIG genomes.
260 :     my %genomeInfo = map { $_->[0] => { gname => $_->[1], szdna => $_->[2], maindomain => $_->[3],
261 :     pegs => $_->[4], rnas => $_->[5], complete => $_->[6] } } @{$fig->genome_info()};
262 : parrello 1.23 # Now we loop through the genomes, generating the data for each one.
263 :     for my $genomeID (sort keys %{$genomeHash}) {
264 :     Trace("Generating data for genome $genomeID.") if T(3);
265 :     $loadGenome->Add("genomeIn");
266 :     # The access code comes in via the genome hash.
267 :     my $accessCode = $genomeHash->{$genomeID};
268 : parrello 1.28 # Get the genus, species, and strain from the scientific name.
269 : parrello 1.23 my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID);
270 : parrello 1.28 my $extra = join " ", @extraData;
271 : parrello 1.23 # Get the full taxonomy.
272 :     my $taxonomy = $fig->taxonomy_of($genomeID);
273 : parrello 1.82 # Get the version. If no version is specified, we default to the genome ID by itself.
274 :     my $version = $fig->genome_version($genomeID);
275 :     if (! defined($version)) {
276 :     $version = $genomeID;
277 :     }
278 :     # Get the DNA size.
279 :     my $dnaSize = $fig->genome_szdna($genomeID);
280 : parrello 1.68 # Open the NMPDR group file for this genome.
281 :     my $group;
282 :     if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") &&
283 :     defined($group = <TMP>)) {
284 :     # Clean the line ending.
285 :     chomp $group;
286 :     } else {
287 :     # No group, so use the default.
288 :     $group = $FIG_Config::otherGroup;
289 :     }
290 :     close TMP;
291 : parrello 1.92 # Get the contigs.
292 :     my @contigs = $fig->all_contigs($genomeID);
293 :     # Get this genome's info array.
294 :     my $info = $genomeInfo{$genomeID};
295 : parrello 1.23 # Output the genome record.
296 : parrello 1.92 $loadGenome->Put($genomeID, $accessCode, $info->{complete}, scalar(@contigs),
297 :     $dnaSize, $genus, $info->{pegs}, $group, $info->{rnas}, $species, $extra, $version, $taxonomy);
298 : parrello 1.23 # Now we loop through each of the genome's contigs.
299 :     for my $contigID (@contigs) {
300 :     Trace("Processing contig $contigID for $genomeID.") if T(4);
301 :     $loadContig->Add("contigIn");
302 :     $loadSequence->Add("contigIn");
303 :     # Create the contig ID.
304 :     my $sproutContigID = "$genomeID:$contigID";
305 :     # Create the contig record and relate it to the genome.
306 :     $loadContig->Put($sproutContigID);
307 :     $loadHasContig->Put($genomeID, $sproutContigID);
308 :     # Now we need to split the contig into sequences. The maximum sequence size is
309 :     # a property of the Sprout object.
310 :     my $chunkSize = $self->{sprout}->MaxSequence();
311 :     # Now we get the sequence a chunk at a time.
312 :     my $contigLen = $fig->contig_ln($genomeID, $contigID);
313 :     for (my $i = 1; $i <= $contigLen; $i += $chunkSize) {
314 :     $loadSequence->Add("chunkIn");
315 :     # Compute the endpoint of this chunk.
316 :     my $end = FIG::min($i + $chunkSize - 1, $contigLen);
317 :     # Get the actual DNA.
318 :     my $dna = $fig->get_dna($genomeID, $contigID, $i, $end);
319 :     # Compute the sequenceID.
320 :     my $seqID = "$sproutContigID.$i";
321 :     # Write out the data. For now, the quality vector is always "unknown".
322 :     $loadIsMadeUpOf->Put($sproutContigID, $seqID, $end + 1 - $i, $i);
323 :     $loadSequence->Put($seqID, "unknown", $dna);
324 :     }
325 : parrello 1.1 }
326 :     }
327 :     }
328 :     # Finish the loads.
329 :     my $retVal = $self->_FinishAll();
330 :     # Return the result.
331 :     return $retVal;
332 :     }
333 :    
334 :     =head3 LoadFeatureData
335 :    
336 : parrello 1.90 my $stats = $spl->LoadFeatureData();
337 : parrello 1.1
338 :     Load the feature data from FIG into Sprout.
339 :    
340 :     Features represent annotated genes, and are therefore the heart of the data store.
341 :    
342 :     The following relations are loaded by this method.
343 :    
344 :     Feature
345 :     FeatureAlias
346 : parrello 1.85 IsAliasOf
347 : parrello 1.1 FeatureLink
348 :     FeatureTranslation
349 :     FeatureUpstream
350 :     IsLocatedIn
351 : parrello 1.30 HasFeature
352 : parrello 1.69 HasRoleInSubsystem
353 : parrello 1.76 FeatureEssential
354 :     FeatureVirulent
355 :     FeatureIEDB
356 : parrello 1.85 CDD
357 :     IsPresentOnProteinOf
358 : parrello 1.93 CellLocation
359 :     IsPossiblePlaceFor
360 :     ExternalDatabase
361 :     IsAlsoFoundIn
362 :     Keyword
363 : parrello 1.1
364 :     =over 4
365 :    
366 :     =item RETURNS
367 :    
368 :     Returns a statistics object for the loads.
369 :    
370 :     =back
371 :    
372 :     =cut
373 :     #: Return Type $%;
374 :     sub LoadFeatureData {
375 :     # Get this object instance.
376 :     my ($self) = @_;
377 : parrello 1.72 # Get the FIG and Sprout objects.
378 : parrello 1.1 my $fig = $self->{fig};
379 : parrello 1.72 my $sprout = $self->{sprout};
380 : parrello 1.1 # Get the table of genome IDs.
381 :     my $genomeHash = $self->{genomes};
382 :     # Create load objects for each of the tables we're loading.
383 : parrello 1.23 my $loadFeature = $self->_TableLoader('Feature');
384 : parrello 1.85 my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn');
385 : parrello 1.23 my $loadFeatureAlias = $self->_TableLoader('FeatureAlias');
386 : parrello 1.85 my $loadIsAliasOf = $self->_TableLoader('IsAliasOf');
387 : parrello 1.23 my $loadFeatureLink = $self->_TableLoader('FeatureLink');
388 :     my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation');
389 :     my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream');
390 : parrello 1.85 my $loadHasFeature = $self->_TableLoader('HasFeature');
391 :     my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem');
392 : parrello 1.76 my $loadFeatureEssential = $self->_TableLoader('FeatureEssential');
393 :     my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent');
394 :     my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB');
395 : parrello 1.85 my $loadCDD = $self->_TableLoader('CDD');
396 :     my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf');
397 : parrello 1.93 my $loadCellLocation = $self->_TableLoader('CellLocation');
398 :     my $loadIsPossiblePlaceFor = $self->_TableLoader('IsPossiblePlaceFor');
399 :     my $loadIsAlsoFoundIn = $self->_TableLoader('IsAlsoFoundIn');
400 :     my $loadExternalDatabase = $self->_TableLoader('ExternalDatabase');
401 :     my $loadKeyword = $self->_TableLoader('Keyword');
402 : parrello 1.72 # Get the subsystem hash.
403 :     my $subHash = $self->{subsystems};
404 : parrello 1.85 # Get the property keys.
405 :     my $propKeys = $self->{propKeys};
406 : parrello 1.93 # Create a hashes to hold CDD, Cell Location (PSORT), External Database, and alias values.
407 : parrello 1.85 my %CDD = ();
408 :     my %alias = ();
409 : parrello 1.93 my %cellLocation = ();
410 :     my %xdb = ();
411 :     # Create the bio-words object.
412 : parrello 1.94 my $biowords = BioWords->new(exceptions => "$FIG_Config::sproutData/Exceptions.txt",
413 :     stops => "$FIG_Config::sproutData/StopWords.txt",
414 :     cache => 0);
415 : parrello 1.93 # One of the things we have to do here is build the keyword table, and the keyword
416 :     # table needs to contain the originating text and feature count for each stem. Unfortunately,
417 :     # the number of distinct keywords is so large it causes PERL to hang if we try to
418 :     # keep them in memory. As a result, we need to track them using disk files.
419 :     # Our approach will be to use two sequential files. One will contain stems and phonexes.
420 :     # Each time a stem occurs in a feature, a record will be written to that file. The stem
421 :     # file can then be sorted and collated to determine the number of features for each
422 :     # stem. A separate file will contain keywords and stems. This last file
423 :     # will be subjected to a sort unique on stem/keyword. The file is then merged
424 :     # with the stem file to create the keyword table relation (keyword, stem, phonex, count).
425 :     my $stemFileName = "$FIG_Config::temp/stems$$.tbl";
426 :     my $keyFileName = "$FIG_Config::temp/keys$$.tbl";
427 :     my $stemh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -k1,1 >$stemFileName");
428 :     my $keyh = Open(undef, "| sort -T\"$FIG_Config::temp\" -t\"\t\" -u -k1,1 -k2,2 >$keyFileName");
429 : parrello 1.1 # Get the maximum sequence size. We need this later for splitting up the
430 :     # locations.
431 :     my $chunkSize = $self->{sprout}->MaxSegment();
432 : parrello 1.23 if ($self->{options}->{loadOnly}) {
433 :     Trace("Loading from existing files.") if T(2);
434 :     } else {
435 :     Trace("Generating feature data.") if T(2);
436 :     # Now we loop through the genomes, generating the data for each one.
437 : parrello 1.88 my @allGenomes = sort keys %{$genomeHash};
438 :     Trace(scalar(@allGenomes) . " genomes found in list.") if T(3);
439 :     for my $genomeID (@allGenomes) {
440 : parrello 1.23 Trace("Loading features for genome $genomeID.") if T(3);
441 :     $loadFeature->Add("genomeIn");
442 :     # Get the feature list for this genome.
443 : parrello 1.82 my $features = $fig->all_features_detailed_fast($genomeID);
444 : parrello 1.56 # Sort and count the list.
445 : parrello 1.57 my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features};
446 :     my $count = scalar @featureTuples;
447 : parrello 1.80 my @fids = map { $_->[0] } @featureTuples;
448 : parrello 1.54 Trace("$count features found for genome $genomeID.") if T(3);
449 : parrello 1.80 # Get the attributes for this genome and put them in a hash by feature ID.
450 : parrello 1.85 my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys);
451 : parrello 1.88 Trace("Looping through features for $genomeID.") if T(3);
452 : parrello 1.56 # Set up for our duplicate-feature check.
453 :     my $oldFeatureID = "";
454 : parrello 1.23 # Loop through the features.
455 : parrello 1.57 for my $featureTuple (@featureTuples) {
456 : parrello 1.23 # Split the tuple.
457 : parrello 1.82 my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple};
458 : parrello 1.56 # Check for duplicates.
459 :     if ($featureID eq $oldFeatureID) {
460 :     Trace("Duplicate feature $featureID found.") if T(1);
461 :     } else {
462 :     $oldFeatureID = $featureID;
463 :     # Count this feature.
464 :     $loadFeature->Add("featureIn");
465 : parrello 1.82 # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the
466 :     # Sprout database requires a single character.
467 :     if (! defined($quality) || $quality eq "") {
468 :     $quality = " ";
469 :     }
470 : parrello 1.76 # Begin building the keywords. We start with the genome ID, the
471 : parrello 1.79 # feature ID, the taxonomy, and the organism name.
472 :     my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID),
473 :     $fig->taxonomy_of($genomeID));
474 : parrello 1.81 # Create the aliases.
475 :     for my $alias ($fig->feature_aliases($featureID)) {
476 : parrello 1.85 #Connect this alias to this feature.
477 :     $loadIsAliasOf->Put($alias, $featureID);
478 : parrello 1.81 push @keywords, $alias;
479 : parrello 1.85 # If this is a locus tag, also add its natural form as a keyword.
480 :     my $naturalName = AliasAnalysis::Type(LocusTag => $alias);
481 :     if ($naturalName) {
482 :     push @keywords, $naturalName;
483 :     }
484 :     # If this is the first time for the specified alias, create its
485 :     # alias record.
486 :     if (! exists $alias{$alias}) {
487 :     $loadFeatureAlias->Put($alias);
488 :     $alias{$alias} = 1;
489 :     }
490 : parrello 1.75 }
491 : parrello 1.93 # Add the corresponding IDs. We ask for 2-tuples of the form (id, database).
492 :     my @corresponders = $fig->get_corresponding_ids($featureID, 1);
493 :     for my $tuple (@corresponders) {
494 :     my ($id, $xdb) = @{$tuple};
495 :     # Ignore SEED: that's us.
496 :     if ($xdb ne 'SEED') {
497 :     # Connect this ID to the feature.
498 :     $loadIsAlsoFoundIn->Put($featureID, $xdb, $id);
499 :     # Add it as a keyword.
500 :     push @keywords, $id;
501 :     # If this is a new database, create a record for it.
502 :     if (! exists $xdb{$xdb}) {
503 :     $xdb{$xdb} = 1;
504 :     $loadExternalDatabase->Put($xdb);
505 :     }
506 :     }
507 :     }
508 : parrello 1.75 Trace("Assignment for $featureID is: $assignment") if T(4);
509 :     # Break the assignment into words and shove it onto the
510 :     # keyword list.
511 :     push @keywords, split(/\s+/, $assignment);
512 : parrello 1.72 # Link this feature to the parent genome.
513 : parrello 1.56 $loadHasFeature->Put($genomeID, $featureID, $type);
514 :     # Get the links.
515 :     my @links = $fig->fid_links($featureID);
516 :     for my $link (@links) {
517 :     $loadFeatureLink->Put($featureID, $link);
518 : parrello 1.8 }
519 : parrello 1.56 # If this is a peg, generate the translation and the upstream.
520 :     if ($type eq 'peg') {
521 :     $loadFeatureTranslation->Add("pegIn");
522 :     my $translation = $fig->get_translation($featureID);
523 :     if ($translation) {
524 :     $loadFeatureTranslation->Put($featureID, $translation);
525 :     }
526 :     # We use the default upstream values of u=200 and c=100.
527 :     my $upstream = $fig->upstream_of($featureID, 200, 100);
528 :     if ($upstream) {
529 :     $loadFeatureUpstream->Put($featureID, $upstream);
530 :     }
531 : parrello 1.23 }
532 : parrello 1.69 # Now we need to find the subsystems this feature participates in.
533 : parrello 1.72 # We also add the subsystems to the keyword list. Before we do that,
534 : parrello 1.86 # we must convert underscores to spaces.
535 : parrello 1.69 my @subsystems = $fig->peg_to_subsystems($featureID);
536 :     for my $subsystem (@subsystems) {
537 : parrello 1.72 # Only proceed if we like this subsystem.
538 :     if (exists $subHash->{$subsystem}) {
539 :     # Store the has-role link.
540 :     $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type);
541 :     # Save the subsystem's keyword data.
542 :     my $subKeywords = $subHash->{$subsystem};
543 : parrello 1.75 push @keywords, split /\s+/, $subKeywords;
544 :     # Now we need to get this feature's role in the subsystem.
545 :     my $subObject = $fig->get_subsystem($subsystem);
546 :     my @roleColumns = $subObject->get_peg_roles($featureID);
547 :     my @allRoles = $subObject->get_roles();
548 :     for my $col (@roleColumns) {
549 :     my $role = $allRoles[$col];
550 :     push @keywords, split /\s+/, $role;
551 :     push @keywords, $subObject->get_role_abbr($col);
552 :     }
553 : parrello 1.72 }
554 :     }
555 : parrello 1.76 # There are three special attributes computed from property
556 :     # data that we build next. If the special attribute is non-empty,
557 :     # its name will be added to the keyword list. First, we get all
558 :     # the attributes for this feature. They will come back as
559 :     # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead:
560 :     # [name, value, value with URL]. (We don't need the PEG, since
561 :     # we already know it.)
562 :     my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] }
563 : parrello 1.80 @{$attributes->{$featureID}};
564 : parrello 1.76 # Now we process each of the special attributes.
565 :     if (SpecialAttribute($featureID, \@attributes,
566 : parrello 1.77 1, [0,2], '^(essential|potential_essential)$',
567 : parrello 1.76 $loadFeatureEssential)) {
568 :     push @keywords, 'essential';
569 :     $loadFeature->Add('essential');
570 : parrello 1.72 }
571 : parrello 1.76 if (SpecialAttribute($featureID, \@attributes,
572 : parrello 1.77 0, [2], '^virulen',
573 : parrello 1.76 $loadFeatureVirulent)) {
574 :     push @keywords, 'virulent';
575 :     $loadFeature->Add('virulent');
576 :     }
577 :     if (SpecialAttribute($featureID, \@attributes,
578 : parrello 1.77 0, [0,2], '^iedb_',
579 : parrello 1.76 $loadFeatureIEDB)) {
580 :     push @keywords, 'iedb';
581 :     $loadFeature->Add('iedb');
582 : parrello 1.75 }
583 : parrello 1.93 # Now we have some other attributes we need to process. To get
584 :     # through them, we convert the attribute list for this feature
585 :     # into a two-layer hash: key => subkey => value.
586 : parrello 1.85 my %attributeHash = ();
587 :     for my $attrRow (@{$attributes->{$featureID}}) {
588 :     my (undef, $key, @values) = @{$attrRow};
589 : parrello 1.93 my ($realKey, $subKey);
590 :     if ($key =~ /^([^:]+)::(.+)/) {
591 :     ($realKey, $subKey) = ($1, $2);
592 :     } else {
593 :     ($realKey, $subKey) = ($key, "");
594 :     }
595 : parrello 1.85 if (exists $attributeHash{$1}) {
596 :     $attributeHash{$1}->{$2} = \@values;
597 :     } else {
598 :     $attributeHash{$1} = {$2 => \@values};
599 :     }
600 :     }
601 : parrello 1.93 # First we handle CDD. This is a bit complicated, because
602 : parrello 1.85 # there are multiple CDDs per protein.
603 :     if (exists $attributeHash{CDD}) {
604 :     # Get the hash of CDD IDs to scores for this feature. We
605 :     # already know it exists because of the above IF.
606 :     my $cddHash = $attributeHash{CDD};
607 :     my @cddData = sort keys %{$cddHash};
608 :     for my $cdd (@cddData) {
609 :     # Extract the score for this CDD and decode it.
610 : parrello 1.93 my ($codeScore) = split(/\s*[,;]\s*/, $cddHash->{$cdd}->[0]);
611 : parrello 1.85 my $realScore = FIGRules::DecodeScore($codeScore);
612 : parrello 1.89 # We can't afford to crash because of a bad attribute
613 :     # value, hence the IF below.
614 :     if (! defined($realScore)) {
615 :     # Bad score, so count it.
616 :     $loadFeature->Add('badCDDscore');
617 : parrello 1.93 Trace("CDD score \"$codeScore\" for feature $featureID invalid.") if T(3);
618 : parrello 1.89 } else {
619 :     # Create the connection.
620 :     $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore);
621 :     # If this CDD does not yet exist, create its record.
622 :     if (! exists $CDD{$cdd}) {
623 :     $CDD{$cdd} = 1;
624 :     $loadCDD->Put($cdd);
625 :     }
626 : parrello 1.85 }
627 :     }
628 :     }
629 : parrello 1.93 # Next we do PSORT cell locations. here the confidence value
630 :     # could have the value "unknown", which we translate to -1.
631 :     if (exists $attributeHash{PSORT}) {
632 :     # This will be a hash of cell locations to confidence
633 :     # factors.
634 :     my $psortHash = $attributeHash{PSORT};
635 :     for my $psort (keys %{$psortHash}) {
636 :     # Get the confidence, and convert it to a number if necessary.
637 :     my $confidence = $psortHash->{$psort};
638 :     if ($confidence eq 'unknown') {
639 :     $confidence = -1;
640 :     }
641 :     $loadIsPossiblePlaceFor->Put($psort, $featureID, $confidence);
642 :     # If this cell location does not yet exist, create its record.
643 :     if (! exists $cellLocation{$psort}) {
644 :     $cellLocation{$psort} = 1;
645 :     $loadCellLocation->Put($psort);
646 :     }
647 :     # If this is a significant location, add it as a keyword.
648 :     if ($confidence > 2.5) {
649 :     push @keywords, $psort;
650 : parrello 1.75 }
651 :     }
652 : parrello 1.69 }
653 : parrello 1.93 # Phobius data is next. This consists of the signal peptide location and
654 :     # the transmembrane locations.
655 :     my $signalList = "";
656 :     my $transList = "";
657 :     if (exists $attributeHash{Phobius}) {
658 :     # This will be a hash of two keys (transmembrane and signal) to
659 :     # location strings. If there's no value, we stuff in an empty string.
660 : parrello 1.95 $signalList = GetCommaList($attributeHash{Phobius}->{signal});
661 :     $transList = GetCommaList($attributeHash{Phobius}->{transmembrane});
662 : parrello 1.93 }
663 :     # Here are some more numbers: isoelectric point, molecular weight, and
664 :     # the similar-to-human flag.
665 :     my $isoelectric = 0;
666 :     if (exists $attributeHash{isoelectric_point}) {
667 :     $isoelectric = $attributeHash{isoelectric_point}->{""};
668 :     }
669 :     my $similarToHuman = 0;
670 :     if (exists $attributeHash{similar_to_human} && $attributeHash{similar_to_human}->{""} eq 'yes') {
671 :     $similarToHuman = 1;
672 :     }
673 :     my $molecularWeight = 0;
674 :     if (exists $attributeHash{molecular_weight}) {
675 :     $molecularWeight = $attributeHash{molecular_weight}->{""};
676 :     }
677 :     # Create the keyword string.
678 :     my $keywordString = join(" ", @keywords);
679 :     Trace("Real keyword string for $featureID: $keywordString.") if T(4);
680 : parrello 1.79 # Get rid of annoying punctuation.
681 : parrello 1.93 $keywordString =~ s/[();@#\/]/ /g;
682 :     # Get the list of keywords in the keyword string.
683 :     my @realKeywords = grep { $biowords->IsWord($_) } $biowords->Split($keywordString);
684 :     # We need to do two things here: create the keyword string for the feature table
685 :     # and write records to the keyword and stem files. The stuff we write to
686 :     # the files will be taken from the following two hashes. The stuff used
687 :     # to create the keyword string will be taken from the list.
688 :     my (%keys, %stems, @realStems);
689 :     for my $keyword (@realKeywords) {
690 :     # Compute the stem and phonex for this keyword.
691 :     my ($stem, $phonex) = $biowords->StemLookup($keyword);
692 :     # Only proceed if a stem comes back. If no stem came back, it's a
693 :     # stop word and we throw it away.
694 :     if ($stem) {
695 :     $keys{$keyword} = $stem;
696 :     $stems{$stem} = $phonex;
697 :     push @realStems, $stem;
698 :     }
699 :     }
700 :     # Now create the keyword string.
701 :     my $cleanWords = join(" ", @realStems);
702 : parrello 1.75 Trace("Keyword string for $featureID: $cleanWords") if T(4);
703 : parrello 1.93 # Write the stem and keyword records.
704 :     for my $stem (keys %stems) {
705 :     Tracer::PutLine($stemh, [$stem, $stems{$stem}]);
706 :     }
707 :     for my $key (keys %keys) {
708 :     # The stem goes first in this file, because we want to sort
709 :     # by stem and then keyword.
710 :     Tracer::PutLine($keyh, [$keys{$key}, $key]);
711 :     }
712 : parrello 1.85 # Now we need to process the feature's locations. First, we split them up.
713 :     my @locationList = split /\s*,\s*/, $locations;
714 :     # Next, we convert them to Sprout location objects.
715 :     my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList;
716 : parrello 1.90 # Assemble them into a sprout location string for later.
717 :     my $locationString = join(", ", map { $_->String } @locObjectList);
718 : parrello 1.93 # We'll store the sequence length in here.
719 :     my $sequenceLength = 0;
720 : parrello 1.56 # This part is the roughest. We need to relate the features to contig
721 :     # locations, and the locations must be split so that none of them exceed
722 :     # the maximum segment size. This simplifies the genes_in_region processing
723 : parrello 1.85 # for Sprout. To start, we create the location position indicator.
724 : parrello 1.56 my $i = 1;
725 :     # Loop through the locations.
726 : parrello 1.85 for my $locObject (@locObjectList) {
727 : parrello 1.93 # Record the length.
728 :     $sequenceLength += $locObject->Length;
729 : parrello 1.85 # Split this location into a list of chunks.
730 : parrello 1.56 my @locOList = ();
731 :     while (my $peeling = $locObject->Peel($chunkSize)) {
732 :     $loadIsLocatedIn->Add("peeling");
733 :     push @locOList, $peeling;
734 :     }
735 :     push @locOList, $locObject;
736 :     # Loop through the chunks, creating IsLocatedIn records. The variable
737 :     # "$i" will be used to keep the location index.
738 : parrello 1.90 for my $locChunk (@locOList) {
739 : parrello 1.56 $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left,
740 :     $locChunk->Dir, $locChunk->Length, $i);
741 :     $i++;
742 :     }
743 : parrello 1.23 }
744 : parrello 1.93 # Now we get some ancillary flags.
745 :     my $locked = $fig->is_locked_fid($featureID);
746 :     my $in_genbank = $fig->peg_in_gendb($featureID);
747 : parrello 1.85 # Create the feature record.
748 : parrello 1.93 $loadFeature->Put($featureID, 1, $user, $quality, $type, $in_genbank, $isoelectric, $locked, $molecularWeight,
749 :     $sequenceLength, $signalList, $similarToHuman, $assignment, $cleanWords, $locationString,
750 :     $transList);
751 : parrello 1.1 }
752 :     }
753 : parrello 1.88 Trace("Genome $genomeID processed.") if T(3);
754 : parrello 1.1 }
755 :     }
756 : parrello 1.93 Trace("Sorting keywords.") if T(2);
757 :     # Now we need to load the keyword table from the key and stem files.
758 :     close $keyh;
759 :     close $stemh;
760 :     Trace("Loading keywords.") if T(2);
761 :     $keyh = Open(undef, "<$keyFileName");
762 :     $stemh = Open(undef, "<$stemFileName");
763 :     # We'll count the keywords in here, for tracing purposes.
764 :     my $count = 0;
765 :     # These variables track the current stem's data. When an incoming
766 :     # keyword's stem changes, these will be recomputed.
767 :     my ($currentStem, $currentPhonex, $currentCount);
768 :     # Prime the loop by reading the first stem in the stem file.
769 :     my ($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
770 :     # Loop through the keyword file.
771 :     while (! eof $keyh) {
772 :     # Read this keyword.
773 :     my ($thisStem, $thisKey) = Tracer::GetLine($keyh);
774 :     # Check to see if it's the new stem yet.
775 :     if ($thisStem ne $currentStem) {
776 :     # Yes. It's a terrible error if it's not also the next stem.
777 :     if ($thisStem ne $nextStem) {
778 :     Confess("Error in stem file. Expected \"$nextStem\", but found \"$thisStem\".");
779 :     } else {
780 :     # Here we're okay.
781 :     ($currentStem, $currentPhonex) = ($nextStem, $nextPhonex);
782 :     # Count the number of features for this stem.
783 :     $currentCount = 0;
784 :     while ($nextStem eq $thisStem) {
785 :     ($nextStem, $nextPhonex) = Tracer::GetLine($stemh);
786 :     $currentCount++;
787 :     }
788 :     }
789 :     }
790 :     # Now $currentStem is the same as $thisStem, and the other $current-vars
791 :     # contain the stem's data (phonex and count).
792 :     $loadKeyword->Put($thisKey, $currentCount, $currentPhonex, $currentStem);
793 :     if (++$count % 1000 == 0 && T(3)) {
794 :     Trace("$count keywords loaded.");
795 :     }
796 :     }
797 :     Trace("$count keywords loaded into keyword table.") if T(2);
798 : parrello 1.1 # Finish the loads.
799 :     my $retVal = $self->_FinishAll();
800 :     return $retVal;
801 :     }
802 :    
803 :     =head3 LoadSubsystemData
804 :    
805 : parrello 1.90 my $stats = $spl->LoadSubsystemData();
806 : parrello 1.1
807 :     Load the subsystem data from FIG into Sprout.
808 :    
809 :     Subsystems are groupings of genetic roles that work together to effect a specific
810 :     chemical reaction. Similar organisms require similar subsystems. To curate a subsystem,
811 :     a spreadsheet is created with genomes on one axis and subsystem roles on the other
812 :     axis. Similar features are then mapped into the cells, allowing the annotation of one
813 :     genome's roles to be used to assist in the annotation of others.
814 :    
815 :     The following relations are loaded by this method.
816 :    
817 :     Subsystem
818 : parrello 1.46 SubsystemClass
819 : parrello 1.1 Role
820 : parrello 1.19 RoleEC
821 : parrello 1.85 IsIdentifiedByEC
822 : parrello 1.1 SSCell
823 :     ContainsFeature
824 :     IsGenomeOf
825 :     IsRoleOf
826 :     OccursInSubsystem
827 :     ParticipatesIn
828 :     HasSSCell
829 : parrello 1.18 ConsistsOfRoles
830 :     RoleSubset
831 :     HasRoleSubset
832 :     ConsistsOfGenomes
833 :     GenomeSubset
834 :     HasGenomeSubset
835 : parrello 1.21 Diagram
836 :     RoleOccursIn
837 : parrello 1.93 SubsystemHopeNotes
838 : parrello 1.1
839 :     =over 4
840 :    
841 :     =item RETURNS
842 :    
843 :     Returns a statistics object for the loads.
844 :    
845 :     =back
846 :    
847 :     =cut
848 :     #: Return Type $%;
849 :     sub LoadSubsystemData {
850 :     # Get this object instance.
851 :     my ($self) = @_;
852 :     # Get the FIG object.
853 :     my $fig = $self->{fig};
854 :     # Get the genome hash. We'll use it to filter the genomes in each
855 :     # spreadsheet.
856 :     my $genomeHash = $self->{genomes};
857 :     # Get the subsystem hash. This lists the subsystems we'll process.
858 :     my $subsysHash = $self->{subsystems};
859 :     my @subsysIDs = sort keys %{$subsysHash};
860 : parrello 1.21 # Get the map list.
861 :     my @maps = $fig->all_maps;
862 : parrello 1.1 # Create load objects for each of the tables we're loading.
863 : parrello 1.85 my $loadDiagram = $self->_TableLoader('Diagram');
864 :     my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn');
865 : parrello 1.23 my $loadSubsystem = $self->_TableLoader('Subsystem');
866 : parrello 1.85 my $loadRole = $self->_TableLoader('Role');
867 :     my $loadRoleEC = $self->_TableLoader('RoleEC');
868 :     my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC');
869 :     my $loadCatalyzes = $self->_TableLoader('Catalyzes');
870 :     my $loadSSCell = $self->_TableLoader('SSCell');
871 :     my $loadContainsFeature = $self->_TableLoader('ContainsFeature');
872 :     my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf');
873 :     my $loadIsRoleOf = $self->_TableLoader('IsRoleOf');
874 :     my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem');
875 :     my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn');
876 :     my $loadHasSSCell = $self->_TableLoader('HasSSCell');
877 :     my $loadRoleSubset = $self->_TableLoader('RoleSubset');
878 :     my $loadGenomeSubset = $self->_TableLoader('GenomeSubset');
879 :     my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles');
880 :     my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes');
881 :     my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset');
882 :     my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset');
883 :     my $loadSubsystemClass = $self->_TableLoader('SubsystemClass');
884 : parrello 1.93 my $loadSubsystemHopeNotes = $self->_TableLoader('SubsystemHopeNotes');
885 : parrello 1.23 if ($self->{options}->{loadOnly}) {
886 :     Trace("Loading from existing files.") if T(2);
887 :     } else {
888 :     Trace("Generating subsystem data.") if T(2);
889 : parrello 1.85 # This hash will contain the roles for each EC. When we're done, this
890 : parrello 1.23 # information will be used to generate the Catalyzes table.
891 :     my %ecToRoles = ();
892 :     # Loop through the subsystems. Our first task will be to create the
893 :     # roles. We do this by looping through the subsystems and creating a
894 :     # role hash. The hash tracks each role ID so that we don't create
895 :     # duplicates. As we move along, we'll connect the roles and subsystems
896 :     # and memorize up the reactions.
897 :     my ($genomeID, $roleID);
898 :     my %roleData = ();
899 :     for my $subsysID (@subsysIDs) {
900 :     # Get the subsystem object.
901 :     my $sub = $fig->get_subsystem($subsysID);
902 : parrello 1.32 # Only proceed if the subsystem has a spreadsheet.
903 : parrello 1.83 if (defined($sub) && ! $sub->{empty_ss}) {
904 : parrello 1.31 Trace("Creating subsystem $subsysID.") if T(3);
905 :     $loadSubsystem->Add("subsystemIn");
906 :     # Create the subsystem record.
907 :     my $curator = $sub->get_curator();
908 :     my $notes = $sub->get_notes();
909 : parrello 1.93 my $version = $sub->get_version();
910 : parrello 1.92 my $description = $sub->get_description();
911 : parrello 1.93 $loadSubsystem->Put($subsysID, $curator, $version, $description, $notes);
912 :     # Add the hope notes.
913 :     my $hopeNotes = $sub->get_hope_curation_notes();
914 :     if ($hopeNotes) {
915 :     $loadSubsystemHopeNotes->Put($sub, $hopeNotes);
916 :     }
917 : parrello 1.72 # Now for the classification string. This comes back as a list
918 :     # reference and we convert it to a space-delimited string.
919 : parrello 1.64 my $classList = $fig->subsystem_classification($subsysID);
920 : parrello 1.78 my $classString = join($FIG_Config::splitter, grep { $_ } @$classList);
921 : parrello 1.72 $loadSubsystemClass->Put($subsysID, $classString);
922 : parrello 1.31 # Connect it to its roles. Each role is a column in the subsystem spreadsheet.
923 :     for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
924 : parrello 1.85 # Get the role's abbreviation.
925 :     my $abbr = $sub->get_role_abbr($col);
926 : parrello 1.93 # Get its essentiality.
927 :     my $aux = $fig->is_aux_role_in_subsystem($subsysID, $roleID);
928 :     # Get its reaction note.
929 :     my $hope_note = $sub->get_hope_reaction_notes($roleID) || "";
930 : parrello 1.31 # Connect to this role.
931 :     $loadOccursInSubsystem->Add("roleIn");
932 : parrello 1.93 $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $aux, $col, $hope_note);
933 : parrello 1.31 # If it's a new role, add it to the role table.
934 :     if (! exists $roleData{$roleID}) {
935 :     # Get the role's abbreviation.
936 :     # Add the role.
937 : parrello 1.85 $loadRole->Put($roleID);
938 : parrello 1.31 $roleData{$roleID} = 1;
939 :     # Check for an EC number.
940 : parrello 1.85 if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) {
941 : parrello 1.31 my $ec = $1;
942 : parrello 1.85 $loadIsIdentifiedByEC->Put($roleID, $ec);
943 :     # Check to see if this is our first encounter with this EC.
944 :     if (exists $ecToRoles{$ec}) {
945 :     # No, so just add this role to the EC list.
946 :     push @{$ecToRoles{$ec}}, $roleID;
947 :     } else {
948 :     # Output this EC.
949 :     $loadRoleEC->Put($ec);
950 :     # Create its role list.
951 :     $ecToRoles{$ec} = [$roleID];
952 :     }
953 : parrello 1.31 }
954 : parrello 1.23 }
955 : parrello 1.18 }
956 : parrello 1.31 # Now we create the spreadsheet for the subsystem by matching roles to
957 :     # genomes. Each genome is a row and each role is a column. We may need
958 :     # to actually create the roles as we find them.
959 :     Trace("Creating subsystem $subsysID spreadsheet.") if T(3);
960 :     for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) {
961 :     # Only proceed if this is one of our genomes.
962 :     if (exists $genomeHash->{$genomeID}) {
963 :     # Count the PEGs and cells found for verification purposes.
964 :     my $pegCount = 0;
965 :     my $cellCount = 0;
966 :     # Create a list for the PEGs we find. This list will be used
967 :     # to generate cluster numbers.
968 :     my @pegsFound = ();
969 :     # Create a hash that maps spreadsheet IDs to PEGs. We will
970 :     # use this to generate the ContainsFeature data after we have
971 :     # the cluster numbers.
972 :     my %cellPegs = ();
973 :     # Get the genome's variant code for this subsystem.
974 :     my $variantCode = $sub->get_variant_code($row);
975 :     # Loop through the subsystem's roles. We use an index because it is
976 :     # part of the spreadsheet cell ID.
977 :     for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) {
978 :     # Get the features in the spreadsheet cell for this genome and role.
979 : parrello 1.37 my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col);
980 : parrello 1.31 # Only proceed if features exist.
981 :     if (@pegs > 0) {
982 :     # Create the spreadsheet cell.
983 :     $cellCount++;
984 :     my $cellID = "$subsysID:$genomeID:$col";
985 :     $loadSSCell->Put($cellID);
986 :     $loadIsGenomeOf->Put($genomeID, $cellID);
987 :     $loadIsRoleOf->Put($roleID, $cellID);
988 :     $loadHasSSCell->Put($subsysID, $cellID);
989 :     # Remember its features.
990 :     push @pegsFound, @pegs;
991 :     $cellPegs{$cellID} = \@pegs;
992 :     $pegCount += @pegs;
993 :     }
994 : parrello 1.23 }
995 : parrello 1.31 # If we found some cells for this genome, we need to compute clusters and
996 :     # denote it participates in the subsystem.
997 :     if ($pegCount > 0) {
998 :     Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3);
999 :     $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode);
1000 :     # Create a hash mapping PEG IDs to cluster numbers.
1001 :     # We default to -1 for all of them.
1002 :     my %clusterOf = map { $_ => -1 } @pegsFound;
1003 : parrello 1.41 # Partition the PEGs found into clusters.
1004 :     my @clusters = $fig->compute_clusters([keys %clusterOf], $sub);
1005 : parrello 1.31 for (my $i = 0; $i <= $#clusters; $i++) {
1006 :     my $subList = $clusters[$i];
1007 :     for my $peg (@{$subList}) {
1008 :     $clusterOf{$peg} = $i;
1009 :     }
1010 : parrello 1.23 }
1011 : parrello 1.31 # Create the ContainsFeature data.
1012 :     for my $cellID (keys %cellPegs) {
1013 :     my $cellList = $cellPegs{$cellID};
1014 :     for my $cellPeg (@$cellList) {
1015 :     $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg});
1016 :     }
1017 : parrello 1.23 }
1018 : parrello 1.18 }
1019 :     }
1020 : parrello 1.15 }
1021 : parrello 1.31 # Now we need to generate the subsets. The subset names must be concatenated to
1022 :     # the subsystem name to make them unique keys. There are two types of subsets:
1023 :     # genome subsets and role subsets. We do the role subsets first.
1024 :     my @subsetNames = $sub->get_subset_names();
1025 :     for my $subsetID (@subsetNames) {
1026 :     # Create the subset record.
1027 :     my $actualID = "$subsysID:$subsetID";
1028 :     $loadRoleSubset->Put($actualID);
1029 :     # Connect the subset to the subsystem.
1030 :     $loadHasRoleSubset->Put($subsysID, $actualID);
1031 :     # Connect the subset to its roles.
1032 :     my @roles = $sub->get_subsetC_roles($subsetID);
1033 :     for my $roleID (@roles) {
1034 :     $loadConsistsOfRoles->Put($actualID, $roleID);
1035 :     }
1036 :     }
1037 :     # Next the genome subsets.
1038 :     @subsetNames = $sub->get_subset_namesR();
1039 :     for my $subsetID (@subsetNames) {
1040 :     # Create the subset record.
1041 :     my $actualID = "$subsysID:$subsetID";
1042 :     $loadGenomeSubset->Put($actualID);
1043 :     # Connect the subset to the subsystem.
1044 :     $loadHasGenomeSubset->Put($subsysID, $actualID);
1045 :     # Connect the subset to its genomes.
1046 :     my @genomes = $sub->get_subsetR($subsetID);
1047 :     for my $genomeID (@genomes) {
1048 :     $loadConsistsOfGenomes->Put($actualID, $genomeID);
1049 :     }
1050 : parrello 1.23 }
1051 : parrello 1.18 }
1052 : parrello 1.57 }
1053 :     # Now we loop through the diagrams. We need to create the diagram records
1054 :     # and link each diagram to its roles. Note that only roles which occur
1055 :     # in subsystems (and therefore appear in the %ecToRoles hash) are
1056 :     # included.
1057 :     for my $map (@maps) {
1058 :     Trace("Loading diagram $map.") if T(3);
1059 :     # Get the diagram's descriptive name.
1060 :     my $name = $fig->map_name($map);
1061 :     $loadDiagram->Put($map, $name);
1062 :     # Now we need to link all the map's roles to it.
1063 :     # A hash is used to prevent duplicates.
1064 :     my %roleHash = ();
1065 : parrello 1.87 for my $ec ($fig->map_to_ecs($map)) {
1066 :     if (exists $ecToRoles{$ec}) {
1067 :     for my $role (@{$ecToRoles{$ec}}) {
1068 :     if (! $roleHash{$role}) {
1069 :     $loadRoleOccursIn->Put($role, $map);
1070 :     $roleHash{$role} = 1;
1071 :     }
1072 :     }
1073 : parrello 1.23 }
1074 : parrello 1.21 }
1075 : parrello 1.57 }
1076 : parrello 1.1 }
1077 :     # Finish the load.
1078 :     my $retVal = $self->_FinishAll();
1079 :     return $retVal;
1080 :     }
1081 :    
1082 :     =head3 LoadPropertyData
1083 :    
1084 : parrello 1.90 my $stats = $spl->LoadPropertyData();
1085 : parrello 1.1
1086 :     Load the attribute data from FIG into Sprout.
1087 :    
1088 :     Attribute data in FIG corresponds to the Sprout concept of Property. As currently
1089 :     implemented, each key-value attribute combination in the SEED corresponds to a
1090 :     record in the B<Property> table. The B<HasProperty> relationship links the
1091 :     features to the properties.
1092 :    
1093 :     The SEED also allows attributes to be assigned to genomes, but this is not yet
1094 :     supported by Sprout.
1095 :    
1096 :     The following relations are loaded by this method.
1097 :    
1098 :     HasProperty
1099 :     Property
1100 :    
1101 :     =over 4
1102 :    
1103 :     =item RETURNS
1104 :    
1105 :     Returns a statistics object for the loads.
1106 :    
1107 :     =back
1108 :    
1109 :     =cut
1110 :     #: Return Type $%;
1111 :     sub LoadPropertyData {
1112 :     # Get this object instance.
1113 :     my ($self) = @_;
1114 :     # Get the FIG object.
1115 :     my $fig = $self->{fig};
1116 :     # Get the genome hash.
1117 :     my $genomeHash = $self->{genomes};
1118 :     # Create load objects for each of the tables we're loading.
1119 : parrello 1.23 my $loadProperty = $self->_TableLoader('Property');
1120 : parrello 1.85 my $loadHasProperty = $self->_TableLoader('HasProperty');
1121 : parrello 1.23 if ($self->{options}->{loadOnly}) {
1122 :     Trace("Loading from existing files.") if T(2);
1123 :     } else {
1124 :     Trace("Generating property data.") if T(2);
1125 :     # Create a hash for storing property IDs.
1126 :     my %propertyKeys = ();
1127 :     my $nextID = 1;
1128 : parrello 1.83 # Get the attributes we intend to store in the property table.
1129 : parrello 1.85 my $propKeys = $self->{propKeys};
1130 : parrello 1.23 # Loop through the genomes.
1131 : parrello 1.66 for my $genomeID (sort keys %{$genomeHash}) {
1132 : parrello 1.23 $loadProperty->Add("genomeIn");
1133 : parrello 1.24 Trace("Generating properties for $genomeID.") if T(3);
1134 : parrello 1.83 # Initialize a counter.
1135 : parrello 1.24 my $propertyCount = 0;
1136 : parrello 1.80 # Get the properties for this genome's features.
1137 : parrello 1.85 my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys);
1138 : parrello 1.83 Trace("Property list built for $genomeID.") if T(3);
1139 :     # Loop through the results, creating HasProperty records.
1140 :     for my $attributeData (@attributes) {
1141 :     # Pull apart the attribute tuple.
1142 :     my ($fid, $key, $value, $url) = @{$attributeData};
1143 :     # Concatenate the key and value and check the "propertyKeys" hash to
1144 :     # see if we already have an ID for it. We use a tab for the separator
1145 :     # character.
1146 :     my $propertyKey = "$key\t$value";
1147 :     # Use the concatenated value to check for an ID. If no ID exists, we
1148 :     # create one.
1149 :     my $propertyID = $propertyKeys{$propertyKey};
1150 :     if (! $propertyID) {
1151 :     # Here we need to create a new property ID for this key/value pair.
1152 :     $propertyKeys{$propertyKey} = $nextID;
1153 :     $propertyID = $nextID;
1154 :     $nextID++;
1155 :     $loadProperty->Put($propertyID, $key, $value);
1156 : parrello 1.1 }
1157 : parrello 1.83 # Create the HasProperty entry for this feature/property association.
1158 :     $loadHasProperty->Put($fid, $propertyID, $url);
1159 : parrello 1.96 $propertyCount++;
1160 : parrello 1.1 }
1161 : parrello 1.24 # Update the statistics.
1162 : parrello 1.83 Trace("$propertyCount attributes processed.") if T(3);
1163 : parrello 1.24 $loadHasProperty->Add("propertiesIn", $propertyCount);
1164 : parrello 1.1 }
1165 :     }
1166 :     # Finish the load.
1167 :     my $retVal = $self->_FinishAll();
1168 :     return $retVal;
1169 :     }
1170 :    
1171 :     =head3 LoadAnnotationData
1172 :    
1173 : parrello 1.90 my $stats = $spl->LoadAnnotationData();
1174 : parrello 1.1
1175 :     Load the annotation data from FIG into Sprout.
1176 :    
1177 :     Sprout annotations encompass both the assignments and the annotations in SEED.
1178 :     These describe the function performed by a PEG as well as any other useful
1179 :     information that may aid in identifying its purpose.
1180 :    
1181 :     The following relations are loaded by this method.
1182 :    
1183 :     Annotation
1184 :     IsTargetOfAnnotation
1185 :     SproutUser
1186 :     MadeAnnotation
1187 :    
1188 :     =over 4
1189 :    
1190 :     =item RETURNS
1191 :    
1192 :     Returns a statistics object for the loads.
1193 :    
1194 :     =back
1195 :    
1196 :     =cut
1197 :     #: Return Type $%;
1198 :     sub LoadAnnotationData {
1199 :     # Get this object instance.
1200 :     my ($self) = @_;
1201 :     # Get the FIG object.
1202 :     my $fig = $self->{fig};
1203 :     # Get the genome hash.
1204 :     my $genomeHash = $self->{genomes};
1205 :     # Create load objects for each of the tables we're loading.
1206 : parrello 1.23 my $loadAnnotation = $self->_TableLoader('Annotation');
1207 : parrello 1.85 my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation');
1208 :     my $loadSproutUser = $self->_TableLoader('SproutUser');
1209 :     my $loadUserAccess = $self->_TableLoader('UserAccess');
1210 :     my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation');
1211 : parrello 1.23 if ($self->{options}->{loadOnly}) {
1212 :     Trace("Loading from existing files.") if T(2);
1213 :     } else {
1214 :     Trace("Generating annotation data.") if T(2);
1215 :     # Create a hash of user names. We'll use this to prevent us from generating duplicate
1216 :     # user records.
1217 :     my %users = ( FIG => 1, master => 1 );
1218 :     # Put in FIG and "master".
1219 :     $loadSproutUser->Put("FIG", "Fellowship for Interpretation of Genomes");
1220 :     $loadUserAccess->Put("FIG", 1);
1221 :     $loadSproutUser->Put("master", "Master User");
1222 :     $loadUserAccess->Put("master", 1);
1223 :     # Get the current time.
1224 :     my $time = time();
1225 :     # Loop through the genomes.
1226 :     for my $genomeID (sort keys %{$genomeHash}) {
1227 :     Trace("Processing $genomeID.") if T(3);
1228 : parrello 1.38 # Create a hash of timestamps. We use this to prevent duplicate time stamps
1229 :     # from showing up for a single PEG's annotations.
1230 :     my %seenTimestamps = ();
1231 : parrello 1.36 # Get the genome's annotations.
1232 :     my @annotations = $fig->read_all_annotations($genomeID);
1233 :     Trace("Processing annotations.") if T(2);
1234 :     for my $tuple (@annotations) {
1235 :     # Get the annotation tuple.
1236 :     my ($peg, $timestamp, $user, $text) = @{$tuple};
1237 :     # Here we fix up the annotation text. "\r" is removed,
1238 : parrello 1.42 # and "\t" and "\n" are escaped. Note we use the "gs"
1239 : parrello 1.36 # modifier so that new-lines inside the text do not
1240 :     # stop the substitution search.
1241 :     $text =~ s/\r//gs;
1242 :     $text =~ s/\t/\\t/gs;
1243 :     $text =~ s/\n/\\n/gs;
1244 :     # Change assignments by the master user to FIG assignments.
1245 :     $text =~ s/Set master function/Set FIG function/s;
1246 :     # Insure the time stamp is valid.
1247 :     if ($timestamp =~ /^\d+$/) {
1248 :     # Here it's a number. We need to insure the one we use to form
1249 :     # the key is unique.
1250 :     my $keyStamp = $timestamp;
1251 :     while ($seenTimestamps{"$peg:$keyStamp"}) {
1252 :     $keyStamp++;
1253 : parrello 1.1 }
1254 : parrello 1.36 my $annotationID = "$peg:$keyStamp";
1255 :     $seenTimestamps{$annotationID} = 1;
1256 :     # Insure the user exists.
1257 :     if (! $users{$user}) {
1258 :     $loadSproutUser->Put($user, "SEED user");
1259 :     $loadUserAccess->Put($user, 1);
1260 :     $users{$user} = 1;
1261 :     }
1262 :     # Generate the annotation.
1263 :     $loadAnnotation->Put($annotationID, $timestamp, $text);
1264 :     $loadIsTargetOfAnnotation->Put($peg, $annotationID);
1265 :     $loadMadeAnnotation->Put($user, $annotationID);
1266 :     } else {
1267 :     # Here we have an invalid time stamp.
1268 :     Trace("Invalid time stamp \"$timestamp\" in annotations for $peg.") if T(1);
1269 : parrello 1.1 }
1270 :     }
1271 :     }
1272 :     }
1273 :     # Finish the load.
1274 :     my $retVal = $self->_FinishAll();
1275 :     return $retVal;
1276 :     }
1277 :    
1278 : parrello 1.5 =head3 LoadSourceData
1279 :    
1280 : parrello 1.90 my $stats = $spl->LoadSourceData();
1281 : parrello 1.5
1282 :     Load the source data from FIG into Sprout.
1283 :    
1284 :     Source data links genomes to information about the organizations that
1285 :     mapped it.
1286 :    
1287 :     The following relations are loaded by this method.
1288 :    
1289 :     ComesFrom
1290 :     Source
1291 :     SourceURL
1292 :    
1293 :     There is no direct support for source attribution in FIG, so we access the SEED
1294 :     files directly.
1295 :    
1296 :     =over 4
1297 :    
1298 :     =item RETURNS
1299 :    
1300 :     Returns a statistics object for the loads.
1301 :    
1302 :     =back
1303 :    
1304 :     =cut
1305 :     #: Return Type $%;
1306 :     sub LoadSourceData {
1307 :     # Get this object instance.
1308 :     my ($self) = @_;
1309 :     # Get the FIG object.
1310 :     my $fig = $self->{fig};
1311 :     # Get the genome hash.
1312 :     my $genomeHash = $self->{genomes};
1313 :     # Create load objects for each of the tables we're loading.
1314 : parrello 1.85 my $loadComesFrom = $self->_TableLoader('ComesFrom');
1315 : parrello 1.23 my $loadSource = $self->_TableLoader('Source');
1316 :     my $loadSourceURL = $self->_TableLoader('SourceURL');
1317 :     if ($self->{options}->{loadOnly}) {
1318 :     Trace("Loading from existing files.") if T(2);
1319 :     } else {
1320 :     Trace("Generating annotation data.") if T(2);
1321 :     # Create hashes to collect the Source information.
1322 :     my %sourceURL = ();
1323 :     my %sourceDesc = ();
1324 :     # Loop through the genomes.
1325 :     my $line;
1326 :     for my $genomeID (sort keys %{$genomeHash}) {
1327 :     Trace("Processing $genomeID.") if T(3);
1328 :     # Open the project file.
1329 :     if ((open(TMP, "<$FIG_Config::organisms/$genomeID/PROJECT")) &&
1330 :     defined($line = <TMP>)) {
1331 :     chomp $line;
1332 :     my($sourceID, $desc, $url) = split(/\t/,$line);
1333 :     $loadComesFrom->Put($genomeID, $sourceID);
1334 :     if ($url && ! exists $sourceURL{$sourceID}) {
1335 :     $loadSourceURL->Put($sourceID, $url);
1336 :     $sourceURL{$sourceID} = 1;
1337 :     }
1338 :     if ($desc) {
1339 :     $sourceDesc{$sourceID} = $desc;
1340 :     } elsif (! exists $sourceDesc{$sourceID}) {
1341 :     $sourceDesc{$sourceID} = $sourceID;
1342 :     }
1343 : parrello 1.5 }
1344 : parrello 1.23 close TMP;
1345 :     }
1346 :     # Write the source descriptions.
1347 :     for my $sourceID (keys %sourceDesc) {
1348 :     $loadSource->Put($sourceID, $sourceDesc{$sourceID});
1349 : parrello 1.5 }
1350 : parrello 1.16 }
1351 : parrello 1.5 # Finish the load.
1352 :     my $retVal = $self->_FinishAll();
1353 :     return $retVal;
1354 :     }
1355 :    
1356 : parrello 1.6 =head3 LoadExternalData
1357 :    
1358 : parrello 1.90 my $stats = $spl->LoadExternalData();
1359 : parrello 1.6
1360 :     Load the external data from FIG into Sprout.
1361 :    
1362 :     External data contains information about external feature IDs.
1363 :    
1364 :     The following relations are loaded by this method.
1365 :    
1366 :     ExternalAliasFunc
1367 :     ExternalAliasOrg
1368 :    
1369 :     The support for external IDs in FIG is hidden beneath layers of other data, so
1370 :     we access the SEED files directly to create these tables. This is also one of
1371 :     the few load methods that does not proceed genome by genome.
1372 :    
1373 :     =over 4
1374 :    
1375 :     =item RETURNS
1376 :    
1377 :     Returns a statistics object for the loads.
1378 :    
1379 :     =back
1380 :    
1381 :     =cut
1382 :     #: Return Type $%;
1383 :     sub LoadExternalData {
1384 :     # Get this object instance.
1385 :     my ($self) = @_;
1386 :     # Get the FIG object.
1387 :     my $fig = $self->{fig};
1388 :     # Get the genome hash.
1389 :     my $genomeHash = $self->{genomes};
1390 :     # Convert the genome hash. We'll get the genus and species for each genome and make
1391 :     # it the key.
1392 :     my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash});
1393 :     # Create load objects for each of the tables we're loading.
1394 : parrello 1.23 my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc');
1395 :     my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg');
1396 :     if ($self->{options}->{loadOnly}) {
1397 :     Trace("Loading from existing files.") if T(2);
1398 :     } else {
1399 :     Trace("Generating external data.") if T(2);
1400 :     # We loop through the files one at a time. First, the organism file.
1401 : parrello 1.58 Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |");
1402 : parrello 1.23 my $orgLine;
1403 :     while (defined($orgLine = <ORGS>)) {
1404 :     # Clean the input line.
1405 :     chomp $orgLine;
1406 :     # Parse the organism name.
1407 :     my ($protID, $name) = split /\s*\t\s*/, $orgLine;
1408 :     $loadExternalAliasOrg->Put($protID, $name);
1409 :     }
1410 :     close ORGS;
1411 :     # Now the function file.
1412 :     my $funcLine;
1413 : parrello 1.58 Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |");
1414 : parrello 1.23 while (defined($funcLine = <FUNCS>)) {
1415 :     # Clean the line ending.
1416 :     chomp $funcLine;
1417 :     # Only proceed if the line is non-blank.
1418 :     if ($funcLine) {
1419 :     # Split it into fields.
1420 :     my @funcFields = split /\s*\t\s*/, $funcLine;
1421 :     # If there's an EC number, append it to the description.
1422 :     if ($#funcFields >= 2 && $funcFields[2] =~ /^(EC .*\S)/) {
1423 :     $funcFields[1] .= " $1";
1424 :     }
1425 :     # Output the function line.
1426 :     $loadExternalAliasFunc->Put(@funcFields[0,1]);
1427 : parrello 1.6 }
1428 :     }
1429 :     }
1430 :     # Finish the load.
1431 :     my $retVal = $self->_FinishAll();
1432 :     return $retVal;
1433 :     }
1434 : parrello 1.5
1435 : parrello 1.18
1436 :     =head3 LoadReactionData
1437 :    
1438 : parrello 1.90 my $stats = $spl->LoadReactionData();
1439 : parrello 1.18
1440 :     Load the reaction data from FIG into Sprout.
1441 :    
1442 :     Reaction data connects reactions to the compounds that participate in them.
1443 :    
1444 :     The following relations are loaded by this method.
1445 :    
1446 : parrello 1.20 Reaction
1447 : parrello 1.18 ReactionURL
1448 :     Compound
1449 :     CompoundName
1450 :     CompoundCAS
1451 : parrello 1.85 IsIdentifiedByCAS
1452 :     HasCompoundName
1453 : parrello 1.18 IsAComponentOf
1454 : parrello 1.93 Scenario
1455 :     Catalyzes
1456 :     HasScenario
1457 :     IsInputFor
1458 :     IsOutputOf
1459 :     ExcludesReaction
1460 :     IncludesReaction
1461 :     IsOnDiagram
1462 :     IncludesReaction
1463 : parrello 1.18
1464 :     This method proceeds reaction by reaction rather than genome by genome.
1465 :    
1466 :     =over 4
1467 :    
1468 :     =item RETURNS
1469 :    
1470 :     Returns a statistics object for the loads.
1471 :    
1472 :     =back
1473 :    
1474 :     =cut
1475 :     #: Return Type $%;
1476 :     sub LoadReactionData {
1477 :     # Get this object instance.
1478 :     my ($self) = @_;
1479 :     # Get the FIG object.
1480 :     my $fig = $self->{fig};
1481 :     # Create load objects for each of the tables we're loading.
1482 : parrello 1.23 my $loadReaction = $self->_TableLoader('Reaction');
1483 : parrello 1.85 my $loadReactionURL = $self->_TableLoader('ReactionURL');
1484 :     my $loadCompound = $self->_TableLoader('Compound');
1485 :     my $loadCompoundName = $self->_TableLoader('CompoundName');
1486 :     my $loadCompoundCAS = $self->_TableLoader('CompoundCAS');
1487 :     my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf');
1488 :     my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS');
1489 :     my $loadHasCompoundName = $self->_TableLoader('HasCompoundName');
1490 : parrello 1.93 my $loadScenario = $self->_TableLoader('Scenario');
1491 :     my $loadHasScenario = $self->_TableLoader('HasScenario');
1492 :     my $loadIsInputFor = $self->_TableLoader('IsInputFor');
1493 :     my $loadIsOutputOf = $self->_TableLoader('IsOutputOf');
1494 :     my $loadIsOnDiagram = $self->_TableLoader('IsOnDiagram');
1495 :     my $loadIncludesReaction = $self->_TableLoader('IncludesReaction');
1496 :     my $loadExcludesReaction = $self->_TableLoader('ExcludesReaction');
1497 :     my $loadCatalyzes = $self->_TableLoader('Catalyzes');
1498 : parrello 1.23 if ($self->{options}->{loadOnly}) {
1499 :     Trace("Loading from existing files.") if T(2);
1500 :     } else {
1501 : parrello 1.85 Trace("Generating reaction data.") if T(2);
1502 :     # We need some hashes to prevent duplicates.
1503 :     my %compoundNames = ();
1504 :     my %compoundCASes = ();
1505 : parrello 1.23 # First we create the compounds.
1506 : parrello 1.93 my %compounds = map { $_ => 1 } $fig->all_compounds();
1507 :     for my $cid (keys %compounds) {
1508 : parrello 1.23 # Check for names.
1509 :     my @names = $fig->names_of_compound($cid);
1510 :     # Each name will be given a priority number, starting with 1.
1511 :     my $prio = 1;
1512 :     for my $name (@names) {
1513 : parrello 1.85 if (! exists $compoundNames{$name}) {
1514 :     $loadCompoundName->Put($name);
1515 :     $compoundNames{$name} = 1;
1516 :     }
1517 :     $loadHasCompoundName->Put($cid, $name, $prio++);
1518 : parrello 1.23 }
1519 :     # Create the main compound record. Note that the first name
1520 :     # becomes the label.
1521 :     my $label = (@names > 0 ? $names[0] : $cid);
1522 :     $loadCompound->Put($cid, $label);
1523 :     # Check for a CAS ID.
1524 :     my $cas = $fig->cas($cid);
1525 :     if ($cas) {
1526 : parrello 1.85 $loadIsIdentifiedByCAS->Put($cid, $cas);
1527 :     if (! exists $compoundCASes{$cas}) {
1528 :     $loadCompoundCAS->Put($cas);
1529 :     $compoundCASes{$cas} = 1;
1530 :     }
1531 : parrello 1.23 }
1532 : parrello 1.20 }
1533 : parrello 1.23 # All the compounds are set up, so we need to loop through the reactions next. First,
1534 :     # we initialize the discriminator index. This is a single integer used to insure
1535 :     # duplicate elements in a reaction are not accidentally collapsed.
1536 :     my $discrim = 0;
1537 : parrello 1.93 my %reactions = map { $_ => 1 } $fig->all_reactions();
1538 :     for my $reactionID (keys %reactions) {
1539 : parrello 1.23 # Create the reaction record.
1540 :     $loadReaction->Put($reactionID, $fig->reversible($reactionID));
1541 :     # Compute the reaction's URL.
1542 :     my $url = HTML::reaction_link($reactionID);
1543 :     # Put it in the ReactionURL table.
1544 :     $loadReactionURL->Put($reactionID, $url);
1545 :     # Now we need all of the reaction's compounds. We get these in two phases,
1546 :     # substrates first and then products.
1547 :     for my $product (0, 1) {
1548 :     # Get the compounds of the current type for the current reaction. FIG will
1549 :     # give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not
1550 :     # have location data in SEED, so it defaults to the empty string.
1551 :     my @compounds = $fig->reaction2comp($reactionID, $product);
1552 :     for my $compData (@compounds) {
1553 :     # Extract the compound data from the current tuple.
1554 :     my ($cid, $stoich, $main) = @{$compData};
1555 :     # Link the compound to the reaction.
1556 :     $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main,
1557 :     $product, $stoich);
1558 :     }
1559 : parrello 1.18 }
1560 :     }
1561 : parrello 1.93 # Now we run through the subsystems and roles, generating the scenarios
1562 :     # and connecting the reactions. We'll need some hashes to prevent
1563 :     # duplicates and a counter for compound group keys.
1564 :     my %roles = ();
1565 :     my %scenarios = ();
1566 :     my @subsystems = $fig->all_subsystems();
1567 :     for my $subName (@subsystems) {
1568 :     my $sub = $fig->get_subsystem($subName);
1569 :     Trace("Processing $subName reactions.") if T(3);
1570 :     # Get the subsystem's reactions.
1571 :     my %reactions = $sub->get_hope_reactions();
1572 :     # Loop through the roles, connecting them to the reactions.
1573 :     for my $role (keys %reactions) {
1574 :     # Only process this role if it is new.
1575 :     if (! $roles{$role}) {
1576 :     $roles{$role} = 1;
1577 :     my @reactions = @{$reactions{$role}};
1578 :     for my $reaction (@reactions) {
1579 :     $loadCatalyzes->Put($role, $reaction);
1580 :     }
1581 :     }
1582 :     }
1583 :     Trace("Processing $subName scenarios.") if T(3);
1584 :     # Get the subsystem's scenarios.
1585 :     my @scenarioNames = $sub->get_hope_scenario_names();
1586 :     # Loop through the scenarios, creating scenario data.
1587 :     for my $scenarioName (@scenarioNames) {
1588 :     # Link this scenario to this subsystem.
1589 :     $loadHasScenario->Put($subName, $scenarioName);
1590 :     # If this scenario is new, we need to create it.
1591 :     if (! $scenarios{$scenarioName}) {
1592 :     Trace("Creating scenario $scenarioName.") if T(3);
1593 :     $scenarios{$scenarioName} = 1;
1594 :     # Create the scenario itself.
1595 :     $loadScenario->Put($scenarioName);
1596 :     # Attach the input compounds.
1597 :     for my $input ($sub->get_hope_input_compounds($scenarioName)) {
1598 :     $loadIsInputFor->Put($input, $scenarioName);
1599 :     }
1600 :     # Now we need to set up the output compounds. They come in two
1601 :     # groups, which we mark 0 and 1.
1602 :     my $outputGroup = 0;
1603 :     # Set up the output compounds.
1604 :     for my $outputGroup ($sub->get_hope_output_compounds($scenarioName)) {
1605 :     # Attach the compounds.
1606 :     for my $compound (@$outputGroup) {
1607 :     $loadIsOutputOf->Put($scenarioName, $compound, $outputGroup);
1608 :     }
1609 :     }
1610 :     # Create the reaction lists.
1611 :     my @addReactions = $sub->get_hope_additional_reactions($scenarioName);
1612 :     for my $reaction (@addReactions) {
1613 :     $loadIncludesReaction->Put($scenarioName, $reaction);
1614 :     }
1615 :     my @notReactions = $sub->get_hope_ignore_reactions($scenarioName);
1616 :     for my $reaction (@notReactions) {
1617 :     $loadExcludesReaction->Put($scenarioName, $reaction);
1618 :     }
1619 :     # Link the maps.
1620 :     my @maps = $sub->get_hope_map_ids($scenarioName);
1621 :     for my $map (@maps) {
1622 :     $loadIsOnDiagram->Put($scenarioName, "map$map");
1623 :     }
1624 :     }
1625 :     }
1626 :     }
1627 : parrello 1.18 }
1628 :     # Finish the load.
1629 :     my $retVal = $self->_FinishAll();
1630 :     return $retVal;
1631 :     }
1632 :    
1633 : parrello 1.43 =head3 LoadSynonymData
1634 :    
1635 : parrello 1.90 my $stats = $spl->LoadSynonymData();
1636 : parrello 1.43
1637 :     Load the synonym groups into Sprout.
1638 :    
1639 :     The following relations are loaded by this method.
1640 :    
1641 :     SynonymGroup
1642 :     IsSynonymGroupFor
1643 :    
1644 :     The source information for these relations is taken from the C<maps_to_id> method
1645 : parrello 1.56 of the B<FIG> object. Unfortunately, to make this work, we need to use direct
1646 :     SQL against the FIG database.
1647 : parrello 1.43
1648 :     =over 4
1649 :    
1650 :     =item RETURNS
1651 :    
1652 :     Returns a statistics object for the loads.
1653 :    
1654 :     =back
1655 :    
1656 :     =cut
1657 :     #: Return Type $%;
1658 :     sub LoadSynonymData {
1659 :     # Get this object instance.
1660 :     my ($self) = @_;
1661 :     # Get the FIG object.
1662 :     my $fig = $self->{fig};
1663 :     # Get the genome hash.
1664 :     my $genomeHash = $self->{genomes};
1665 :     # Create a load object for the table we're loading.
1666 :     my $loadSynonymGroup = $self->_TableLoader('SynonymGroup');
1667 :     my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor');
1668 :     if ($self->{options}->{loadOnly}) {
1669 :     Trace("Loading from existing files.") if T(2);
1670 :     } else {
1671 :     Trace("Generating synonym group data.") if T(2);
1672 : parrello 1.56 # Get the database handle.
1673 :     my $dbh = $fig->db_handle();
1674 : parrello 1.85 # Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias.
1675 : parrello 1.59 my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to");
1676 : parrello 1.56 my $result = $sth->execute();
1677 :     if (! defined($result)) {
1678 :     Confess("Database error in Synonym load: " . $sth->errstr());
1679 :     } else {
1680 : parrello 1.90 Trace("Processing synonym results.") if T(2);
1681 : parrello 1.56 # Remember the current synonym.
1682 :     my $current_syn = "";
1683 :     # Count the features.
1684 :     my $featureCount = 0;
1685 : parrello 1.90 my $entryCount = 0;
1686 : parrello 1.56 # Loop through the synonym/peg pairs.
1687 :     while (my @row = $sth->fetchrow()) {
1688 : parrello 1.85 # Get the synonym group ID and feature ID.
1689 : parrello 1.56 my ($syn_id, $peg) = @row;
1690 : parrello 1.90 # Count this row.
1691 :     $entryCount++;
1692 :     if ($entryCount % 1000 == 0) {
1693 :     Trace("$entryCount rows processed.") if T(3);
1694 :     }
1695 : parrello 1.56 # Insure it's for one of our genomes.
1696 :     my $genomeID = FIG::genome_of($peg);
1697 :     if (exists $genomeHash->{$genomeID}) {
1698 :     # Verify the synonym.
1699 :     if ($syn_id ne $current_syn) {
1700 :     # It's new, so put it in the group table.
1701 :     $loadSynonymGroup->Put($syn_id);
1702 :     $current_syn = $syn_id;
1703 :     }
1704 :     # Connect the synonym to the peg.
1705 :     $loadIsSynonymGroupFor->Put($syn_id, $peg);
1706 :     # Count this feature.
1707 :     $featureCount++;
1708 :     if ($featureCount % 1000 == 0) {
1709 :     Trace("$featureCount features processed.") if T(3);
1710 :     }
1711 : parrello 1.43 }
1712 :     }
1713 : parrello 1.90 Trace("$entryCount rows produced $featureCount features.") if T(2);
1714 : parrello 1.43 }
1715 :     }
1716 :     # Finish the load.
1717 :     my $retVal = $self->_FinishAll();
1718 :     return $retVal;
1719 :     }
1720 :    
1721 : parrello 1.60 =head3 LoadFamilyData
1722 :    
1723 : parrello 1.90 my $stats = $spl->LoadFamilyData();
1724 : parrello 1.60
1725 :     Load the protein families into Sprout.
1726 :    
1727 :     The following relations are loaded by this method.
1728 :    
1729 :     Family
1730 : parrello 1.63 IsFamilyForFeature
1731 : parrello 1.60
1732 :     The source information for these relations is taken from the C<families_for_protein>,
1733 :     C<family_function>, and C<sz_family> methods of the B<FIG> object.
1734 :    
1735 :     =over 4
1736 :    
1737 :     =item RETURNS
1738 :    
1739 :     Returns a statistics object for the loads.
1740 :    
1741 :     =back
1742 :    
1743 :     =cut
1744 :     #: Return Type $%;
1745 :     sub LoadFamilyData {
1746 :     # Get this object instance.
1747 :     my ($self) = @_;
1748 :     # Get the FIG object.
1749 :     my $fig = $self->{fig};
1750 :     # Get the genome hash.
1751 :     my $genomeHash = $self->{genomes};
1752 :     # Create load objects for the tables we're loading.
1753 :     my $loadFamily = $self->_TableLoader('Family');
1754 : parrello 1.63 my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature');
1755 : parrello 1.60 if ($self->{options}->{loadOnly}) {
1756 :     Trace("Loading from existing files.") if T(2);
1757 :     } else {
1758 :     Trace("Generating family data.") if T(2);
1759 :     # Create a hash for the family IDs.
1760 :     my %familyHash = ();
1761 :     # Loop through the genomes.
1762 :     for my $genomeID (sort keys %{$genomeHash}) {
1763 :     Trace("Processing features for $genomeID.") if T(2);
1764 :     # Loop through this genome's PEGs.
1765 :     for my $fid ($fig->all_features($genomeID, "peg")) {
1766 : parrello 1.63 $loadIsFamilyForFeature->Add("features", 1);
1767 : parrello 1.60 # Get this feature's families.
1768 :     my @families = $fig->families_for_protein($fid);
1769 :     # Loop through the families, connecting them to the feature.
1770 :     for my $family (@families) {
1771 : parrello 1.63 $loadIsFamilyForFeature->Put($family, $fid);
1772 : parrello 1.60 # If this is a new family, create a record for it.
1773 :     if (! exists $familyHash{$family}) {
1774 : parrello 1.62 $familyHash{$family} = 1;
1775 : parrello 1.60 $loadFamily->Add("families", 1);
1776 :     my $size = $fig->sz_family($family);
1777 :     my $func = $fig->family_function($family);
1778 : parrello 1.61 $loadFamily->Put($family, $size, $func);
1779 : parrello 1.60 }
1780 :     }
1781 :     }
1782 :     }
1783 :     }
1784 :     # Finish the load.
1785 :     my $retVal = $self->_FinishAll();
1786 :     return $retVal;
1787 :     }
1788 : parrello 1.43
1789 : parrello 1.76 =head3 LoadDrugData
1790 :    
1791 : parrello 1.90 my $stats = $spl->LoadDrugData();
1792 : parrello 1.76
1793 :     Load the drug target data into Sprout.
1794 :    
1795 :     The following relations are loaded by this method.
1796 :    
1797 :     PDB
1798 : parrello 1.83 DocksWith
1799 :     IsProteinForFeature
1800 : parrello 1.76 Ligand
1801 :    
1802 : parrello 1.83 The source information for these relations is taken from attributes. The
1803 :     C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>.
1804 :     The C<zinc_name> attribute describes the ligands. The C<docking_results>
1805 :     attribute contains the information for the B<DocksWith> relationship. It is
1806 :     expected that additional attributes and tables will be added in the future.
1807 : parrello 1.76
1808 :     =over 4
1809 :    
1810 :     =item RETURNS
1811 :    
1812 :     Returns a statistics object for the loads.
1813 :    
1814 :     =back
1815 :    
1816 :     =cut
1817 :     #: Return Type $%;
1818 :     sub LoadDrugData {
1819 :     # Get this object instance.
1820 :     my ($self) = @_;
1821 :     # Get the FIG object.
1822 :     my $fig = $self->{fig};
1823 :     # Get the genome hash.
1824 :     my $genomeHash = $self->{genomes};
1825 :     # Create load objects for the tables we're loading.
1826 :     my $loadPDB = $self->_TableLoader('PDB');
1827 :     my $loadLigand = $self->_TableLoader('Ligand');
1828 : parrello 1.83 my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature');
1829 :     my $loadDocksWith = $self->_TableLoader('DocksWith');
1830 : parrello 1.76 if ($self->{options}->{loadOnly}) {
1831 :     Trace("Loading from existing files.") if T(2);
1832 :     } else {
1833 :     Trace("Generating drug target data.") if T(2);
1834 : parrello 1.83 # First comes the "DocksWith" relationship. This will give us a list of PDBs.
1835 :     # We can also encounter PDBs when we process "IsProteinForFeature". To manage
1836 :     # this process, PDB information is collected in a hash table and then
1837 :     # unspooled after both relationships are created.
1838 :     my %pdbHash = ();
1839 :     Trace("Generating docking data.") if T(2);
1840 :     # Get all the docking data. This may cause problems if there are too many PDBs,
1841 :     # at which point we'll need another algorithm. The indicator that this is
1842 :     # happening will be a timeout error in the next statement.
1843 :     my @dockData = $fig->query_attributes('$key = ? AND $value < ?',
1844 :     ['docking_results', $FIG_Config::dockLimit]);
1845 :     Trace(scalar(@dockData) . " rows of docking data found.") if T(3);
1846 :     for my $dockData (@dockData) {
1847 :     # Get the docking data components.
1848 :     my ($pdbID, $docking_key, @valueData) = @{$dockData};
1849 :     # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen.
1850 :     $pdbID = lc $pdbID;
1851 : parrello 1.84 # Strip off the object type.
1852 :     $pdbID =~ s/pdb://;
1853 : parrello 1.83 # Extract the ZINC ID from the docking key. Note that there are two possible
1854 :     # formats.
1855 :     my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/;
1856 :     if (! $zinc_id) {
1857 :     Trace("Invalid docking result key $docking_key for $pdbID.") if T(0);
1858 :     $loadDocksWith->Add("errors");
1859 : parrello 1.76 } else {
1860 : parrello 1.83 # Get the pieces of the value and parse the energy.
1861 :     # Note that we don't care about the rank, since
1862 :     # we can sort on the energy level itself in our database.
1863 :     my ($energy, $tool, $type) = @valueData;
1864 :     my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy;
1865 : parrello 1.84 # Ignore predicted results.
1866 :     if ($type ne "Predicted") {
1867 :     # Count this docking result.
1868 :     if (! exists $pdbHash{$pdbID}) {
1869 :     $pdbHash{$pdbID} = 1;
1870 :     } else {
1871 :     $pdbHash{$pdbID}++;
1872 :     }
1873 :     # Write the result to the output.
1874 :     $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool,
1875 :     $total, $vanderwaals);
1876 :     }
1877 : parrello 1.83 }
1878 :     }
1879 :     Trace("Connecting features.") if T(2);
1880 :     # Loop through the genomes.
1881 :     for my $genome (sort keys %{$genomeHash}) {
1882 :     Trace("Generating PDBs for $genome.") if T(3);
1883 :     # Get all of the PDBs that BLAST against this genome's features.
1884 :     my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%');
1885 :     for my $pdbData (@attributeData) {
1886 :     # The PDB ID is coded as a subkey.
1887 : parrello 1.84 if ($pdbData->[1] !~ /PDB::(.+)/i) {
1888 : parrello 1.83 Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0);
1889 :     $loadPDB->Add("errors");
1890 :     } else {
1891 :     my $pdbID = $1;
1892 :     # Insure the PDB is in the hash.
1893 :     if (! exists $pdbHash{$pdbID}) {
1894 :     $pdbHash{$pdbID} = 0;
1895 : parrello 1.76 }
1896 : parrello 1.83 # The score and locations are coded in the attribute value.
1897 :     if ($pdbData->[2] !~ /^([^;]+)(.*)$/) {
1898 :     Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0);
1899 :     $loadIsProteinForFeature->Add("errors");
1900 :     } else {
1901 :     my ($score, $locData) = ($1,$2);
1902 :     # The location data may not be present, so we have to start with some
1903 :     # defaults and then check.
1904 :     my ($start, $end) = (1, 0);
1905 :     if ($locData) {
1906 :     $locData =~ /(\d+)-(\d+)/;
1907 :     $start = $1;
1908 :     $end = $2;
1909 :     }
1910 :     # If we still don't have the end location, compute it from
1911 :     # the feature length.
1912 :     if (! $end) {
1913 :     # Most features have one location, but we do a list iteration
1914 :     # just in case.
1915 :     my @locations = $fig->feature_location($pdbData->[0]);
1916 :     $end = 0;
1917 :     for my $loc (@locations) {
1918 :     my $locObject = BasicLocation->new($loc);
1919 :     $end += $locObject->Length;
1920 : parrello 1.76 }
1921 :     }
1922 : parrello 1.83 # Decode the score.
1923 :     my $realScore = FIGRules::DecodeScore($score);
1924 :     # Connect the PDB to the feature.
1925 : parrello 1.90 $loadIsProteinForFeature->Put($pdbID, $pdbData->[0], $start, $realScore, $end);
1926 : parrello 1.76 }
1927 : parrello 1.83 }
1928 :     }
1929 :     }
1930 :     # We've got all our PDBs now, so we unspool them from the hash.
1931 :     Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2);
1932 :     my $count = 0;
1933 :     for my $pdbID (sort keys %pdbHash) {
1934 :     $loadPDB->Put($pdbID, $pdbHash{$pdbID});
1935 :     $count++;
1936 :     Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0);
1937 :     }
1938 :     # Finally we create the ligand table. This information can be found in the
1939 :     # zinc_name attribute.
1940 :     Trace("Loading ligands.") if T(2);
1941 :     # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates.
1942 :     my $last_zinc_id = "";
1943 :     my $zinc_id = "";
1944 :     my $done = 0;
1945 :     while (! $done) {
1946 :     # Get the next 10000 ligands. We insist that the object ID is greater than
1947 :     # the last ID we processed.
1948 :     Trace("Loading batch starting with ZINC:$zinc_id.") if T(3);
1949 :     my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000',
1950 :     ["ZINC:$zinc_id", "zinc_name"]);
1951 :     Trace(scalar(@attributeData) . " attribute rows returned.") if T(3);
1952 :     if (! @attributeData) {
1953 :     # Here there are no attributes left, so we quit the loop.
1954 :     $done = 1;
1955 :     } else {
1956 :     # Process the attribute data we've received.
1957 :     for my $zinc_data (@attributeData) {
1958 :     # The ZINC ID is found in the first return column, prefixed with the word ZINC.
1959 :     if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) {
1960 :     $zinc_id = $1;
1961 :     # Check for a duplicate.
1962 :     if ($zinc_id eq $last_zinc_id) {
1963 :     $loadLigand->Add("duplicate");
1964 :     } else {
1965 :     # Here it's safe to output the ligand. The ligand name is the attribute value
1966 :     # (third column in the row).
1967 :     $loadLigand->Put($zinc_id, $zinc_data->[2]);
1968 :     # Insure we don't try to add this ID again.
1969 :     $last_zinc_id = $zinc_id;
1970 : parrello 1.76 }
1971 : parrello 1.83 } else {
1972 :     Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0);
1973 :     $loadLigand->Add("errors");
1974 : parrello 1.76 }
1975 :     }
1976 :     }
1977 :     }
1978 : parrello 1.83 Trace("Ligands loaded.") if T(2);
1979 : parrello 1.76 }
1980 :     # Finish the load.
1981 :     my $retVal = $self->_FinishAll();
1982 :     return $retVal;
1983 :     }
1984 : parrello 1.69
1985 :    
1986 : parrello 1.1 =head2 Internal Utility Methods
1987 :    
1988 : parrello 1.76 =head3 SpecialAttribute
1989 :    
1990 : parrello 1.90 my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader);
1991 : parrello 1.76
1992 :     Look for special attributes of a given type. A special attribute is found by comparing one of
1993 :     the columns of the incoming attribute list to a search pattern. If a match is found, then
1994 : parrello 1.77 a set of columns is put into an output table connected to the specified ID.
1995 : parrello 1.76
1996 :     For example, when processing features, the attribute list we look at has three columns: attribute
1997 :     name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name
1998 :     begins with C<iedb_>. The call signature is therefore
1999 :    
2000 : parrello 1.77 my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB);
2001 : parrello 1.76
2002 :     The pattern is matched against column 0, and if we have a match, then column 2's value is put
2003 :     to the output along with the specified feature ID.
2004 :    
2005 :     =over 4
2006 :    
2007 :     =item id
2008 :    
2009 :     ID of the object whose special attributes are being loaded. This forms the first column of the
2010 :     output.
2011 :    
2012 :     =item attributes
2013 :    
2014 :     Reference to a list of tuples.
2015 :    
2016 :     =item idxMatch
2017 :    
2018 :     Index in each tuple of the column to be matched against the pattern. If the match is
2019 :     successful, an output record will be generated.
2020 :    
2021 : parrello 1.77 =item idxValues
2022 : parrello 1.76
2023 : parrello 1.77 Reference to a list containing the indexes in each tuple of the columns to be put as
2024 :     the second column of the output.
2025 : parrello 1.76
2026 :     =item pattern
2027 :    
2028 :     Pattern to be matched against the specified column. The match will be case-insensitive.
2029 :    
2030 :     =item loader
2031 :    
2032 :     An object to which each output record will be put. Usually this is an B<ERDBLoad> object,
2033 :     but technically it could be anything with a C<Put> method.
2034 :    
2035 :     =item RETURN
2036 :    
2037 :     Returns a count of the matches found.
2038 :    
2039 : parrello 1.90 =item
2040 : parrello 1.76
2041 :     =back
2042 :    
2043 :     =cut
2044 :    
2045 :     sub SpecialAttribute {
2046 :     # Get the parameters.
2047 : parrello 1.77 my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_;
2048 : parrello 1.76 # Declare the return variable.
2049 :     my $retVal = 0;
2050 :     # Loop through the attribute rows.
2051 :     for my $row (@{$attributes}) {
2052 :     # Check for a match.
2053 :     if ($row->[$idxMatch] =~ m/$pattern/i) {
2054 : parrello 1.77 # We have a match, so output a row. This is a bit tricky, since we may
2055 :     # be putting out multiple columns of data from the input.
2056 :     my $value = join(" ", map { $row->[$_] } @{$idxValues});
2057 :     $loader->Put($id, $value);
2058 : parrello 1.76 $retVal++;
2059 :     }
2060 :     }
2061 :     Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".") if T(4) && $retVal;
2062 :     # Return the number of matches.
2063 :     return $retVal;
2064 :     }
2065 :    
2066 : parrello 1.1 =head3 TableLoader
2067 :    
2068 :     Create an ERDBLoad object for the specified table. The object is also added to
2069 :     the internal list in the C<loaders> property of this object. That enables the
2070 :     L</FinishAll> method to terminate all the active loads.
2071 :    
2072 :     This is an instance method.
2073 :    
2074 :     =over 4
2075 :    
2076 :     =item tableName
2077 :    
2078 :     Name of the table (relation) being loaded.
2079 :    
2080 :     =item RETURN
2081 :    
2082 :     Returns an ERDBLoad object for loading the specified table.
2083 :    
2084 :     =back
2085 :    
2086 :     =cut
2087 :    
2088 :     sub _TableLoader {
2089 :     # Get the parameters.
2090 : parrello 1.85 my ($self, $tableName) = @_;
2091 : parrello 1.1 # Create the load object.
2092 : parrello 1.85 my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly);
2093 : parrello 1.1 # Cache it in the loader list.
2094 :     push @{$self->{loaders}}, $retVal;
2095 :     # Return it to the caller.
2096 :     return $retVal;
2097 :     }
2098 :    
2099 :     =head3 FinishAll
2100 :    
2101 :     Finish all the active loads on this object.
2102 :    
2103 :     When a load is started by L</TableLoader>, the controlling B<ERDBLoad> object is cached in
2104 :     the list pointed to be the C<loaders> property of this object. This method pops the loaders
2105 :     off the list and finishes them to flush out any accumulated residue.
2106 :    
2107 :     This is an instance method.
2108 :    
2109 :     =over 4
2110 :    
2111 :     =item RETURN
2112 :    
2113 :     Returns a statistics object containing the accumulated statistics for the load.
2114 :    
2115 :     =back
2116 :    
2117 :     =cut
2118 :    
2119 :     sub _FinishAll {
2120 :     # Get this object instance.
2121 :     my ($self) = @_;
2122 :     # Create the statistics object.
2123 :     my $retVal = Stats->new();
2124 :     # Get the loader list.
2125 :     my $loadList = $self->{loaders};
2126 : parrello 1.48 # Create a hash to hold the statistics objects, keyed on relation name.
2127 :     my %loaderHash = ();
2128 : parrello 1.1 # Loop through the list, finishing the loads. Note that if the finish fails, we die
2129 : parrello 1.48 # ignominiously. At some future point, we want to make the loads more restartable.
2130 : parrello 1.1 while (my $loader = pop @{$loadList}) {
2131 : parrello 1.26 # Get the relation name.
2132 : parrello 1.19 my $relName = $loader->RelName;
2133 : parrello 1.26 # Check the ignore flag.
2134 :     if ($loader->Ignore) {
2135 :     Trace("Relation $relName not loaded.") if T(2);
2136 :     } else {
2137 :     # Here we really need to finish.
2138 :     Trace("Finishing $relName.") if T(2);
2139 :     my $stats = $loader->Finish();
2140 : parrello 1.48 $loaderHash{$relName} = $stats;
2141 :     }
2142 :     }
2143 :     # Now we loop through again, actually loading the tables. We want to finish before
2144 :     # loading so that if something goes wrong at this point, all the load files are usable
2145 :     # and we don't have to redo all that work.
2146 :     for my $relName (sort keys %loaderHash) {
2147 :     # Get the statistics for this relation.
2148 :     my $stats = $loaderHash{$relName};
2149 :     # Check for a database load.
2150 :     if ($self->{options}->{dbLoad}) {
2151 :     # Here we want to use the load file just created to load the database.
2152 :     Trace("Loading relation $relName.") if T(2);
2153 :     my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]);
2154 :     # Accumulate the statistics from the DB load.
2155 :     $stats->Accumulate($newStats);
2156 :     }
2157 :     $retVal->Accumulate($stats);
2158 :     Trace("Statistics for $relName:\n" . $stats->Show()) if T(2);
2159 : parrello 1.1 }
2160 :     # Return the load statistics.
2161 :     return $retVal;
2162 :     }
2163 : parrello 1.83
2164 : parrello 1.80 =head3 GetGenomeAttributes
2165 :    
2166 : parrello 1.90 my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys);
2167 : parrello 1.80
2168 : parrello 1.83 Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related
2169 :     attributes for all the features of a genome in a single call, then organizes them into
2170 :     a hash.
2171 : parrello 1.80
2172 :     =over 4
2173 :    
2174 :     =item fig
2175 :    
2176 :     FIG-like object for accessing attributes.
2177 :    
2178 :     =item genomeID
2179 :    
2180 :     ID of the genome who's attributes are desired.
2181 :    
2182 :     =item fids
2183 :    
2184 :     Reference to a list of the feature IDs whose attributes are to be kept.
2185 :    
2186 : parrello 1.85 =item propKeys
2187 :    
2188 :     A list of the keys to retrieve.
2189 :    
2190 : parrello 1.80 =item RETURN
2191 :    
2192 :     Returns a reference to a hash. The key of the hash is the feature ID. The value is the
2193 :     reference to a list of the feature's attribute tuples. Each tuple contains the feature ID,
2194 :     the attribute key, and one or more attribute values.
2195 :    
2196 :     =back
2197 :    
2198 :     =cut
2199 :    
2200 :     sub GetGenomeAttributes {
2201 :     # Get the parameters.
2202 : parrello 1.85 my ($fig, $genomeID, $fids, $propKeys) = @_;
2203 : parrello 1.80 # Declare the return variable.
2204 :     my $retVal = {};
2205 :     # Initialize the hash. This not only enables us to easily determine which FIDs to
2206 :     # keep, it insures that the caller sees a list reference for every known fid,
2207 :     # simplifying the logic.
2208 :     for my $fid (@{$fids}) {
2209 :     $retVal->{$fid} = [];
2210 :     }
2211 : parrello 1.85 # Get the attributes. If ev_code_cron is running, we may get a timeout error, so
2212 :     # an eval is used.
2213 :     my @aList = ();
2214 :     eval {
2215 :     @aList = $fig->get_attributes("fig|$genomeID%", $propKeys);
2216 :     Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(3);
2217 :     };
2218 :     # Check for a problem.
2219 :     if ($@) {
2220 :     Trace("Retrying attributes for $genomeID due to error: $@") if T(1);
2221 :     # Our fallback plan is to process the attributes in blocks of 100. This is much slower,
2222 :     # but allows us to continue processing.
2223 :     my $nFids = scalar @{$fids};
2224 :     for (my $i = 0; $i < $nFids; $i += 100) {
2225 :     # Determine the index of the last feature ID we'll be specifying on this pass.
2226 :     # Normally it's $i + 99, but if we're close to the end it may be less.
2227 :     my $end = ($i + 100 > $nFids ? $nFids - 1 : $i + 99);
2228 :     # Get a slice of the fid list.
2229 :     my @slice = @{$fids}[$i .. $end];
2230 :     # Get the relevant attributes.
2231 :     Trace("Retrieving attributes for fids $i to $end.") if T(3);
2232 :     my @aShort = $fig->get_attributes(\@slice, $propKeys);
2233 :     Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(3);
2234 :     push @aList, @aShort;
2235 :     }
2236 :     }
2237 :     # Now we should have all the interesting attributes in @aList. Populate the hash with
2238 :     # them.
2239 : parrello 1.80 for my $aListEntry (@aList) {
2240 :     my $fid = $aListEntry->[0];
2241 :     if (exists $retVal->{$fid}) {
2242 :     push @{$retVal->{$fid}}, $aListEntry;
2243 :     }
2244 :     }
2245 :     # Return the result.
2246 :     return $retVal;
2247 :     }
2248 : parrello 1.1
2249 : parrello 1.95 =head3 GetCommaList
2250 :    
2251 :     my $string = GetCommaList($value);
2252 :    
2253 :     Create a comma-separated list of the values in a list reference. If the
2254 :     list reference is a scalar, it will be returned unchanged. If it is
2255 :     undefined, an empty string will be returned. The idea is that we may be
2256 :     looking at a string, a list, or nothing, but whatever comes out will be a
2257 :     string.
2258 :    
2259 :     =over 4
2260 :    
2261 :     =item value
2262 :    
2263 :     Reference to a list of values to be assembled into the return string.
2264 :    
2265 :     =item RETURN
2266 :    
2267 :     Returns a scalar string containing the content of the input value.
2268 :    
2269 :     =back
2270 :    
2271 :     =cut
2272 :    
2273 :     sub GetCommaList {
2274 :     # Get the parameters.
2275 :     my ($value) = @_;
2276 :     # Declare the return variable.
2277 :     my $retVal = "";
2278 :     # Only proceed if we have an input value.
2279 :     if (defined $value) {
2280 :     # Analyze the input value.
2281 :     if (ref $value eq 'ARRAY') {
2282 :     # Here it's a list reference.
2283 :     $retVal = join(", ", @$value);
2284 :     } else {
2285 :     # Here it's not. Flatten it to a scalar.
2286 :     $retVal = "$value";
2287 :     }
2288 :     }
2289 :     # Return the result.
2290 :     return $retVal;
2291 :     }
2292 :    
2293 : parrello 1.85
2294 : parrello 1.90 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3