Parent Directory
|
Revision Log
Revision 1.89 - (view) (download) (as text)
1 : | parrello | 1.1 | #!/usr/bin/perl -w |
2 : | |||
3 : | package SproutLoad; | ||
4 : | |||
5 : | use strict; | ||
6 : | use Tracer; | ||
7 : | use PageBuilder; | ||
8 : | use ERDBLoad; | ||
9 : | use FIG; | ||
10 : | parrello | 1.85 | use FIGRules; |
11 : | parrello | 1.1 | use Sprout; |
12 : | use Stats; | ||
13 : | use BasicLocation; | ||
14 : | parrello | 1.18 | use HTML; |
15 : | parrello | 1.88 | use AliasAnalysis; |
16 : | parrello | 1.1 | |
17 : | =head1 Sprout Load Methods | ||
18 : | |||
19 : | =head2 Introduction | ||
20 : | |||
21 : | This object contains the methods needed to copy data from the FIG data store to the | ||
22 : | Sprout database. It makes heavy use of the ERDBLoad object to manage the load into | ||
23 : | individual tables. The client can create an instance of this object and then | ||
24 : | call methods for each group of tables to load. For example, the following code will | ||
25 : | load the Genome- and Feature-related tables. (It is presumed the first command line | ||
26 : | parameter contains the name of a file specifying the genomes.) | ||
27 : | |||
28 : | my $fig = FIG->new(); | ||
29 : | my $sprout = SFXlate->new_sprout_only(); | ||
30 : | my $spl = SproutLoad->new($sprout, $fig, $ARGV[0]); | ||
31 : | my $stats = $spl->LoadGenomeData(); | ||
32 : | $stats->Accumulate($spl->LoadFeatureData()); | ||
33 : | print $stats->Show(); | ||
34 : | |||
35 : | It is worth noting that the FIG object does not need to be a real one. Any object | ||
36 : | that implements the FIG methods for data retrieval could be used. So, for example, | ||
37 : | this object could be used to copy data from one Sprout database to another, or | ||
38 : | from any FIG-compliant data story implemented in the future. | ||
39 : | |||
40 : | To insure that this is possible, each time the FIG object is used, it will be via | ||
41 : | a variable called C<$fig>. This makes it fairly straightforward to determine which | ||
42 : | FIG methods are required to load the Sprout database. | ||
43 : | |||
44 : | parrello | 1.5 | This object creates the load files; however, the tables are not created until it |
45 : | is time to actually do the load from the files into the target database. | ||
46 : | |||
47 : | parrello | 1.1 | =cut |
48 : | |||
49 : | #: Constructor SproutLoad->new(); | ||
50 : | |||
51 : | =head2 Public Methods | ||
52 : | |||
53 : | =head3 new | ||
54 : | |||
55 : | parrello | 1.8 | C<< my $spl = SproutLoad->new($sprout, $fig, $genomeFile, $subsysFile, $options); >> |
56 : | parrello | 1.1 | |
57 : | Construct a new Sprout Loader object, specifying the two participating databases and | ||
58 : | the name of the files containing the list of genomes and subsystems to use. | ||
59 : | |||
60 : | =over 4 | ||
61 : | |||
62 : | =item sprout | ||
63 : | |||
64 : | Sprout object representing the target database. This also specifies the directory to | ||
65 : | be used for creating the load files. | ||
66 : | |||
67 : | =item fig | ||
68 : | |||
69 : | FIG object representing the source data store from which the data is to be taken. | ||
70 : | |||
71 : | =item genomeFile | ||
72 : | |||
73 : | Either the name of the file containing the list of genomes to load or a reference to | ||
74 : | a hash of genome IDs to access codes. If nothing is specified, all complete genomes | ||
75 : | will be loaded and the access code will default to 1. The genome list is presumed | ||
76 : | to be all-inclusive. In other words, all existing data in the target database will | ||
77 : | be deleted and replaced with the data on the specified genes. If a file is specified, | ||
78 : | it should contain one genome ID and access code per line, tab-separated. | ||
79 : | |||
80 : | =item subsysFile | ||
81 : | |||
82 : | Either the name of the file containing the list of trusted subsystems or a reference | ||
83 : | parrello | 1.34 | to a list of subsystem names. If nothing is specified, all NMPDR subsystems will be |
84 : | considered trusted. (A subsystem is considered NMPDR if it has a file named C<NMPDR> | ||
85 : | parrello | 1.76 | in its data directory.) Only subsystem data related to the NMPDR subsystems is loaded. |
86 : | parrello | 1.1 | |
87 : | parrello | 1.8 | =item options |
88 : | |||
89 : | Reference to a hash of command-line options. | ||
90 : | |||
91 : | parrello | 1.1 | =back |
92 : | |||
93 : | =cut | ||
94 : | |||
95 : | sub new { | ||
96 : | # Get the parameters. | ||
97 : | parrello | 1.8 | my ($class, $sprout, $fig, $genomeFile, $subsysFile, $options) = @_; |
98 : | parrello | 1.35 | # Create the genome hash. |
99 : | my %genomes = (); | ||
100 : | # We only need it if load-only is NOT specified. | ||
101 : | if (! $options->{loadOnly}) { | ||
102 : | if (! defined($genomeFile) || $genomeFile eq '') { | ||
103 : | # Here we want all the complete genomes and an access code of 1. | ||
104 : | my @genomeList = $fig->genomes(1); | ||
105 : | %genomes = map { $_ => 1 } @genomeList; | ||
106 : | parrello | 1.88 | Trace(scalar(keys %genomes) . " genomes found.") if T(3); |
107 : | parrello | 1.35 | } else { |
108 : | my $type = ref $genomeFile; | ||
109 : | Trace("Genome file parameter type is \"$type\".") if T(3); | ||
110 : | if ($type eq 'HASH') { | ||
111 : | # Here the user specified a hash of genome IDs to access codes, which is | ||
112 : | # exactly what we want. | ||
113 : | %genomes = %{$genomeFile}; | ||
114 : | } elsif (! $type || $type eq 'SCALAR' ) { | ||
115 : | # The caller specified a file, so read the genomes from the file. (Note | ||
116 : | # that some PERLs return an empty string rather than SCALAR.) | ||
117 : | my @genomeList = Tracer::GetFile($genomeFile); | ||
118 : | if (! @genomeList) { | ||
119 : | # It's an error if the genome file is empty or not found. | ||
120 : | Confess("No genomes found in file \"$genomeFile\"."); | ||
121 : | } else { | ||
122 : | # We build the genome Hash using a loop rather than "map" so that | ||
123 : | # an omitted access code can be defaulted to 1. | ||
124 : | for my $genomeLine (@genomeList) { | ||
125 : | my ($genomeID, $accessCode) = split("\t", $genomeLine); | ||
126 : | parrello | 1.65 | if (! defined($accessCode)) { |
127 : | parrello | 1.35 | $accessCode = 1; |
128 : | } | ||
129 : | $genomes{$genomeID} = $accessCode; | ||
130 : | parrello | 1.3 | } |
131 : | parrello | 1.1 | } |
132 : | parrello | 1.35 | } else { |
133 : | Confess("Invalid genome parameter ($type) in SproutLoad constructor."); | ||
134 : | parrello | 1.1 | } |
135 : | } | ||
136 : | } | ||
137 : | # Load the list of trusted subsystems. | ||
138 : | my %subsystems = (); | ||
139 : | parrello | 1.35 | # We only need it if load-only is NOT specified. |
140 : | if (! $options->{loadOnly}) { | ||
141 : | if (! defined $subsysFile || $subsysFile eq '') { | ||
142 : | parrello | 1.55 | # Here we want all the usable subsystems. First we get the whole list. |
143 : | parrello | 1.35 | my @subs = $fig->all_subsystems(); |
144 : | parrello | 1.76 | # Loop through, checking for the NMPDR file. |
145 : | parrello | 1.35 | for my $sub (@subs) { |
146 : | parrello | 1.76 | if ($fig->nmpdr_subsystem($sub)) { |
147 : | parrello | 1.35 | $subsystems{$sub} = 1; |
148 : | } | ||
149 : | parrello | 1.33 | } |
150 : | parrello | 1.35 | } else { |
151 : | my $type = ref $subsysFile; | ||
152 : | if ($type eq 'ARRAY') { | ||
153 : | # Here the user passed in a list of subsystems. | ||
154 : | %subsystems = map { $_ => 1 } @{$subsysFile}; | ||
155 : | } elsif (! $type || $type eq 'SCALAR') { | ||
156 : | # Here the list of subsystems is in a file. | ||
157 : | if (! -e $subsysFile) { | ||
158 : | # It's an error if the file does not exist. | ||
159 : | Confess("Trusted subsystem file not found."); | ||
160 : | } else { | ||
161 : | # GetFile automatically chomps end-of-line characters, so this | ||
162 : | # is an easy task. | ||
163 : | %subsystems = map { $_ => 1 } Tracer::GetFile($subsysFile); | ||
164 : | } | ||
165 : | parrello | 1.4 | } else { |
166 : | parrello | 1.35 | Confess("Invalid subsystem parameter in SproutLoad constructor."); |
167 : | parrello | 1.4 | } |
168 : | parrello | 1.1 | } |
169 : | parrello | 1.72 | # Go through the subsys hash again, creating the keyword list for each subsystem. |
170 : | for my $subsystem (keys %subsystems) { | ||
171 : | my $name = $subsystem; | ||
172 : | $name =~ s/_/ /g; | ||
173 : | parrello | 1.86 | # my $classes = $fig->subsystem_classification($subsystem); |
174 : | # $name .= " " . join(" ", @{$classes}); | ||
175 : | parrello | 1.72 | $subsystems{$subsystem} = $name; |
176 : | } | ||
177 : | parrello | 1.1 | } |
178 : | parrello | 1.85 | # Get the list of NMPDR-oriented attribute keys. |
179 : | my @propKeys = $fig->get_group_keys("NMPDR"); | ||
180 : | parrello | 1.1 | # Get the data directory from the Sprout object. |
181 : | my ($directory) = $sprout->LoadInfo(); | ||
182 : | # Create the Sprout load object. | ||
183 : | my $retVal = { | ||
184 : | fig => $fig, | ||
185 : | genomes => \%genomes, | ||
186 : | subsystems => \%subsystems, | ||
187 : | sprout => $sprout, | ||
188 : | loadDirectory => $directory, | ||
189 : | parrello | 1.39 | erdb => $sprout, |
190 : | parrello | 1.8 | loaders => [], |
191 : | parrello | 1.85 | options => $options, |
192 : | propKeys => \@propKeys, | ||
193 : | parrello | 1.1 | }; |
194 : | # Bless and return it. | ||
195 : | bless $retVal, $class; | ||
196 : | return $retVal; | ||
197 : | } | ||
198 : | |||
199 : | parrello | 1.23 | =head3 LoadOnly |
200 : | |||
201 : | C<< my $flag = $spl->LoadOnly; >> | ||
202 : | |||
203 : | Return TRUE if we are in load-only mode, else FALSE. | ||
204 : | |||
205 : | =cut | ||
206 : | |||
207 : | sub LoadOnly { | ||
208 : | my ($self) = @_; | ||
209 : | return $self->{options}->{loadOnly}; | ||
210 : | } | ||
211 : | |||
212 : | parrello | 1.25 | |
213 : | parrello | 1.1 | =head3 LoadGenomeData |
214 : | |||
215 : | C<< my $stats = $spl->LoadGenomeData(); >> | ||
216 : | |||
217 : | Load the Genome, Contig, and Sequence data from FIG into Sprout. | ||
218 : | |||
219 : | The Sequence table is the largest single relation in the Sprout database, so this | ||
220 : | method is expected to be slow and clumsy. At some point we will need to make it | ||
221 : | restartable, since an error 10 gigabytes through a 20-gigabyte load is bound to be | ||
222 : | very annoying otherwise. | ||
223 : | |||
224 : | The following relations are loaded by this method. | ||
225 : | |||
226 : | Genome | ||
227 : | HasContig | ||
228 : | Contig | ||
229 : | IsMadeUpOf | ||
230 : | Sequence | ||
231 : | |||
232 : | =over 4 | ||
233 : | |||
234 : | =item RETURNS | ||
235 : | |||
236 : | Returns a statistics object for the loads. | ||
237 : | |||
238 : | =back | ||
239 : | |||
240 : | =cut | ||
241 : | #: Return Type $%; | ||
242 : | sub LoadGenomeData { | ||
243 : | # Get this object instance. | ||
244 : | my ($self) = @_; | ||
245 : | # Get the FIG object. | ||
246 : | my $fig = $self->{fig}; | ||
247 : | # Get the genome count. | ||
248 : | my $genomeHash = $self->{genomes}; | ||
249 : | my $genomeCount = (keys %{$genomeHash}); | ||
250 : | # Create load objects for each of the tables we're loading. | ||
251 : | parrello | 1.23 | my $loadGenome = $self->_TableLoader('Genome'); |
252 : | parrello | 1.85 | my $loadHasContig = $self->_TableLoader('HasContig'); |
253 : | my $loadContig = $self->_TableLoader('Contig'); | ||
254 : | my $loadIsMadeUpOf = $self->_TableLoader('IsMadeUpOf'); | ||
255 : | my $loadSequence = $self->_TableLoader('Sequence'); | ||
256 : | parrello | 1.23 | if ($self->{options}->{loadOnly}) { |
257 : | Trace("Loading from existing files.") if T(2); | ||
258 : | } else { | ||
259 : | Trace("Generating genome data.") if T(2); | ||
260 : | # Now we loop through the genomes, generating the data for each one. | ||
261 : | for my $genomeID (sort keys %{$genomeHash}) { | ||
262 : | Trace("Generating data for genome $genomeID.") if T(3); | ||
263 : | $loadGenome->Add("genomeIn"); | ||
264 : | # The access code comes in via the genome hash. | ||
265 : | my $accessCode = $genomeHash->{$genomeID}; | ||
266 : | parrello | 1.28 | # Get the genus, species, and strain from the scientific name. |
267 : | parrello | 1.23 | my ($genus, $species, @extraData) = split / /, $self->{fig}->genus_species($genomeID); |
268 : | parrello | 1.28 | my $extra = join " ", @extraData; |
269 : | parrello | 1.23 | # Get the full taxonomy. |
270 : | my $taxonomy = $fig->taxonomy_of($genomeID); | ||
271 : | parrello | 1.82 | # Get the version. If no version is specified, we default to the genome ID by itself. |
272 : | my $version = $fig->genome_version($genomeID); | ||
273 : | if (! defined($version)) { | ||
274 : | $version = $genomeID; | ||
275 : | } | ||
276 : | # Get the DNA size. | ||
277 : | my $dnaSize = $fig->genome_szdna($genomeID); | ||
278 : | parrello | 1.68 | # Open the NMPDR group file for this genome. |
279 : | my $group; | ||
280 : | if (open(TMP, "<$FIG_Config::organisms/$genomeID/NMPDR") && | ||
281 : | defined($group = <TMP>)) { | ||
282 : | # Clean the line ending. | ||
283 : | chomp $group; | ||
284 : | } else { | ||
285 : | # No group, so use the default. | ||
286 : | $group = $FIG_Config::otherGroup; | ||
287 : | } | ||
288 : | close TMP; | ||
289 : | parrello | 1.23 | # Output the genome record. |
290 : | parrello | 1.82 | $loadGenome->Put($genomeID, $accessCode, $fig->is_complete($genomeID), |
291 : | $dnaSize, $genus, $group, $species, $extra, $version, $taxonomy); | ||
292 : | parrello | 1.23 | # Now we loop through each of the genome's contigs. |
293 : | my @contigs = $fig->all_contigs($genomeID); | ||
294 : | for my $contigID (@contigs) { | ||
295 : | Trace("Processing contig $contigID for $genomeID.") if T(4); | ||
296 : | $loadContig->Add("contigIn"); | ||
297 : | $loadSequence->Add("contigIn"); | ||
298 : | # Create the contig ID. | ||
299 : | my $sproutContigID = "$genomeID:$contigID"; | ||
300 : | # Create the contig record and relate it to the genome. | ||
301 : | $loadContig->Put($sproutContigID); | ||
302 : | $loadHasContig->Put($genomeID, $sproutContigID); | ||
303 : | # Now we need to split the contig into sequences. The maximum sequence size is | ||
304 : | # a property of the Sprout object. | ||
305 : | my $chunkSize = $self->{sprout}->MaxSequence(); | ||
306 : | # Now we get the sequence a chunk at a time. | ||
307 : | my $contigLen = $fig->contig_ln($genomeID, $contigID); | ||
308 : | for (my $i = 1; $i <= $contigLen; $i += $chunkSize) { | ||
309 : | $loadSequence->Add("chunkIn"); | ||
310 : | # Compute the endpoint of this chunk. | ||
311 : | my $end = FIG::min($i + $chunkSize - 1, $contigLen); | ||
312 : | # Get the actual DNA. | ||
313 : | my $dna = $fig->get_dna($genomeID, $contigID, $i, $end); | ||
314 : | # Compute the sequenceID. | ||
315 : | my $seqID = "$sproutContigID.$i"; | ||
316 : | # Write out the data. For now, the quality vector is always "unknown". | ||
317 : | $loadIsMadeUpOf->Put($sproutContigID, $seqID, $end + 1 - $i, $i); | ||
318 : | $loadSequence->Put($seqID, "unknown", $dna); | ||
319 : | } | ||
320 : | parrello | 1.1 | } |
321 : | } | ||
322 : | } | ||
323 : | # Finish the loads. | ||
324 : | my $retVal = $self->_FinishAll(); | ||
325 : | # Return the result. | ||
326 : | return $retVal; | ||
327 : | } | ||
328 : | |||
329 : | =head3 LoadFeatureData | ||
330 : | |||
331 : | C<< my $stats = $spl->LoadFeatureData(); >> | ||
332 : | |||
333 : | Load the feature data from FIG into Sprout. | ||
334 : | |||
335 : | Features represent annotated genes, and are therefore the heart of the data store. | ||
336 : | |||
337 : | The following relations are loaded by this method. | ||
338 : | |||
339 : | Feature | ||
340 : | FeatureAlias | ||
341 : | parrello | 1.85 | IsAliasOf |
342 : | parrello | 1.1 | FeatureLink |
343 : | FeatureTranslation | ||
344 : | FeatureUpstream | ||
345 : | IsLocatedIn | ||
346 : | parrello | 1.30 | HasFeature |
347 : | parrello | 1.69 | HasRoleInSubsystem |
348 : | parrello | 1.76 | FeatureEssential |
349 : | FeatureVirulent | ||
350 : | FeatureIEDB | ||
351 : | parrello | 1.85 | CDD |
352 : | IsPresentOnProteinOf | ||
353 : | parrello | 1.1 | |
354 : | =over 4 | ||
355 : | |||
356 : | =item RETURNS | ||
357 : | |||
358 : | Returns a statistics object for the loads. | ||
359 : | |||
360 : | =back | ||
361 : | |||
362 : | =cut | ||
363 : | #: Return Type $%; | ||
364 : | sub LoadFeatureData { | ||
365 : | # Get this object instance. | ||
366 : | my ($self) = @_; | ||
367 : | parrello | 1.72 | # Get the FIG and Sprout objects. |
368 : | parrello | 1.1 | my $fig = $self->{fig}; |
369 : | parrello | 1.72 | my $sprout = $self->{sprout}; |
370 : | parrello | 1.1 | # Get the table of genome IDs. |
371 : | my $genomeHash = $self->{genomes}; | ||
372 : | # Create load objects for each of the tables we're loading. | ||
373 : | parrello | 1.23 | my $loadFeature = $self->_TableLoader('Feature'); |
374 : | parrello | 1.85 | my $loadIsLocatedIn = $self->_TableLoader('IsLocatedIn'); |
375 : | parrello | 1.23 | my $loadFeatureAlias = $self->_TableLoader('FeatureAlias'); |
376 : | parrello | 1.85 | my $loadIsAliasOf = $self->_TableLoader('IsAliasOf'); |
377 : | parrello | 1.23 | my $loadFeatureLink = $self->_TableLoader('FeatureLink'); |
378 : | my $loadFeatureTranslation = $self->_TableLoader('FeatureTranslation'); | ||
379 : | my $loadFeatureUpstream = $self->_TableLoader('FeatureUpstream'); | ||
380 : | parrello | 1.85 | my $loadHasFeature = $self->_TableLoader('HasFeature'); |
381 : | my $loadHasRoleInSubsystem = $self->_TableLoader('HasRoleInSubsystem'); | ||
382 : | parrello | 1.76 | my $loadFeatureEssential = $self->_TableLoader('FeatureEssential'); |
383 : | my $loadFeatureVirulent = $self->_TableLoader('FeatureVirulent'); | ||
384 : | my $loadFeatureIEDB = $self->_TableLoader('FeatureIEDB'); | ||
385 : | parrello | 1.85 | my $loadCDD = $self->_TableLoader('CDD'); |
386 : | my $loadIsPresentOnProteinOf = $self->_TableLoader('IsPresentOnProteinOf'); | ||
387 : | parrello | 1.72 | # Get the subsystem hash. |
388 : | my $subHash = $self->{subsystems}; | ||
389 : | parrello | 1.85 | # Get the property keys. |
390 : | my $propKeys = $self->{propKeys}; | ||
391 : | # Create a hashes to hold CDD and alias values. | ||
392 : | my %CDD = (); | ||
393 : | my %alias = (); | ||
394 : | parrello | 1.1 | # Get the maximum sequence size. We need this later for splitting up the |
395 : | # locations. | ||
396 : | my $chunkSize = $self->{sprout}->MaxSegment(); | ||
397 : | parrello | 1.23 | if ($self->{options}->{loadOnly}) { |
398 : | Trace("Loading from existing files.") if T(2); | ||
399 : | } else { | ||
400 : | Trace("Generating feature data.") if T(2); | ||
401 : | # Now we loop through the genomes, generating the data for each one. | ||
402 : | parrello | 1.88 | my @allGenomes = sort keys %{$genomeHash}; |
403 : | Trace(scalar(@allGenomes) . " genomes found in list.") if T(3); | ||
404 : | for my $genomeID (@allGenomes) { | ||
405 : | parrello | 1.23 | Trace("Loading features for genome $genomeID.") if T(3); |
406 : | $loadFeature->Add("genomeIn"); | ||
407 : | # Get the feature list for this genome. | ||
408 : | parrello | 1.82 | my $features = $fig->all_features_detailed_fast($genomeID); |
409 : | parrello | 1.56 | # Sort and count the list. |
410 : | parrello | 1.57 | my @featureTuples = sort { $a->[0] cmp $b->[0] } @{$features}; |
411 : | my $count = scalar @featureTuples; | ||
412 : | parrello | 1.80 | my @fids = map { $_->[0] } @featureTuples; |
413 : | parrello | 1.54 | Trace("$count features found for genome $genomeID.") if T(3); |
414 : | parrello | 1.80 | # Get the attributes for this genome and put them in a hash by feature ID. |
415 : | parrello | 1.85 | my $attributes = GetGenomeAttributes($fig, $genomeID, \@fids, $propKeys); |
416 : | parrello | 1.88 | Trace("Looping through features for $genomeID.") if T(3); |
417 : | parrello | 1.56 | # Set up for our duplicate-feature check. |
418 : | my $oldFeatureID = ""; | ||
419 : | parrello | 1.23 | # Loop through the features. |
420 : | parrello | 1.57 | for my $featureTuple (@featureTuples) { |
421 : | parrello | 1.23 | # Split the tuple. |
422 : | parrello | 1.82 | my ($featureID, $locations, undef, $type, $minloc, $maxloc, $assignment, $user, $quality) = @{$featureTuple}; |
423 : | parrello | 1.56 | # Check for duplicates. |
424 : | if ($featureID eq $oldFeatureID) { | ||
425 : | Trace("Duplicate feature $featureID found.") if T(1); | ||
426 : | } else { | ||
427 : | $oldFeatureID = $featureID; | ||
428 : | # Count this feature. | ||
429 : | $loadFeature->Add("featureIn"); | ||
430 : | parrello | 1.82 | # Fix the quality. It is almost always a space, but some odd stuff might sneak through, and the |
431 : | # Sprout database requires a single character. | ||
432 : | if (! defined($quality) || $quality eq "") { | ||
433 : | $quality = " "; | ||
434 : | } | ||
435 : | parrello | 1.76 | # Begin building the keywords. We start with the genome ID, the |
436 : | parrello | 1.79 | # feature ID, the taxonomy, and the organism name. |
437 : | my @keywords = ($genomeID, $featureID, $fig->genus_species($genomeID), | ||
438 : | $fig->taxonomy_of($genomeID)); | ||
439 : | parrello | 1.81 | # Create the aliases. |
440 : | for my $alias ($fig->feature_aliases($featureID)) { | ||
441 : | parrello | 1.85 | #Connect this alias to this feature. |
442 : | $loadIsAliasOf->Put($alias, $featureID); | ||
443 : | parrello | 1.81 | push @keywords, $alias; |
444 : | parrello | 1.85 | # If this is a locus tag, also add its natural form as a keyword. |
445 : | my $naturalName = AliasAnalysis::Type(LocusTag => $alias); | ||
446 : | if ($naturalName) { | ||
447 : | push @keywords, $naturalName; | ||
448 : | } | ||
449 : | # If this is the first time for the specified alias, create its | ||
450 : | # alias record. | ||
451 : | if (! exists $alias{$alias}) { | ||
452 : | $loadFeatureAlias->Put($alias); | ||
453 : | $alias{$alias} = 1; | ||
454 : | } | ||
455 : | parrello | 1.75 | } |
456 : | Trace("Assignment for $featureID is: $assignment") if T(4); | ||
457 : | # Break the assignment into words and shove it onto the | ||
458 : | # keyword list. | ||
459 : | push @keywords, split(/\s+/, $assignment); | ||
460 : | parrello | 1.72 | # Link this feature to the parent genome. |
461 : | parrello | 1.56 | $loadHasFeature->Put($genomeID, $featureID, $type); |
462 : | # Get the links. | ||
463 : | my @links = $fig->fid_links($featureID); | ||
464 : | for my $link (@links) { | ||
465 : | $loadFeatureLink->Put($featureID, $link); | ||
466 : | parrello | 1.8 | } |
467 : | parrello | 1.56 | # If this is a peg, generate the translation and the upstream. |
468 : | if ($type eq 'peg') { | ||
469 : | $loadFeatureTranslation->Add("pegIn"); | ||
470 : | my $translation = $fig->get_translation($featureID); | ||
471 : | if ($translation) { | ||
472 : | $loadFeatureTranslation->Put($featureID, $translation); | ||
473 : | } | ||
474 : | # We use the default upstream values of u=200 and c=100. | ||
475 : | my $upstream = $fig->upstream_of($featureID, 200, 100); | ||
476 : | if ($upstream) { | ||
477 : | $loadFeatureUpstream->Put($featureID, $upstream); | ||
478 : | } | ||
479 : | parrello | 1.23 | } |
480 : | parrello | 1.69 | # Now we need to find the subsystems this feature participates in. |
481 : | parrello | 1.72 | # We also add the subsystems to the keyword list. Before we do that, |
482 : | parrello | 1.86 | # we must convert underscores to spaces. |
483 : | parrello | 1.69 | my @subsystems = $fig->peg_to_subsystems($featureID); |
484 : | for my $subsystem (@subsystems) { | ||
485 : | parrello | 1.72 | # Only proceed if we like this subsystem. |
486 : | if (exists $subHash->{$subsystem}) { | ||
487 : | # Store the has-role link. | ||
488 : | $loadHasRoleInSubsystem->Put($featureID, $subsystem, $genomeID, $type); | ||
489 : | # Save the subsystem's keyword data. | ||
490 : | my $subKeywords = $subHash->{$subsystem}; | ||
491 : | parrello | 1.75 | push @keywords, split /\s+/, $subKeywords; |
492 : | # Now we need to get this feature's role in the subsystem. | ||
493 : | my $subObject = $fig->get_subsystem($subsystem); | ||
494 : | my @roleColumns = $subObject->get_peg_roles($featureID); | ||
495 : | my @allRoles = $subObject->get_roles(); | ||
496 : | for my $col (@roleColumns) { | ||
497 : | my $role = $allRoles[$col]; | ||
498 : | push @keywords, split /\s+/, $role; | ||
499 : | push @keywords, $subObject->get_role_abbr($col); | ||
500 : | } | ||
501 : | parrello | 1.72 | } |
502 : | } | ||
503 : | parrello | 1.76 | # There are three special attributes computed from property |
504 : | # data that we build next. If the special attribute is non-empty, | ||
505 : | # its name will be added to the keyword list. First, we get all | ||
506 : | # the attributes for this feature. They will come back as | ||
507 : | # 4-tuples: [peg, name, value, URL]. We use a 3-tuple instead: | ||
508 : | # [name, value, value with URL]. (We don't need the PEG, since | ||
509 : | # we already know it.) | ||
510 : | my @attributes = map { [$_->[1], $_->[2], Tracer::CombineURL($_->[2], $_->[3])] } | ||
511 : | parrello | 1.80 | @{$attributes->{$featureID}}; |
512 : | parrello | 1.76 | # Now we process each of the special attributes. |
513 : | if (SpecialAttribute($featureID, \@attributes, | ||
514 : | parrello | 1.77 | 1, [0,2], '^(essential|potential_essential)$', |
515 : | parrello | 1.76 | $loadFeatureEssential)) { |
516 : | push @keywords, 'essential'; | ||
517 : | $loadFeature->Add('essential'); | ||
518 : | parrello | 1.72 | } |
519 : | parrello | 1.76 | if (SpecialAttribute($featureID, \@attributes, |
520 : | parrello | 1.77 | 0, [2], '^virulen', |
521 : | parrello | 1.76 | $loadFeatureVirulent)) { |
522 : | push @keywords, 'virulent'; | ||
523 : | $loadFeature->Add('virulent'); | ||
524 : | } | ||
525 : | if (SpecialAttribute($featureID, \@attributes, | ||
526 : | parrello | 1.77 | 0, [0,2], '^iedb_', |
527 : | parrello | 1.76 | $loadFeatureIEDB)) { |
528 : | push @keywords, 'iedb'; | ||
529 : | $loadFeature->Add('iedb'); | ||
530 : | parrello | 1.75 | } |
531 : | parrello | 1.85 | # Now we have some other attributes we need to process. Currently, |
532 : | # this is CDD and CELLO, but we expect the number to increase. | ||
533 : | my %attributeHash = (); | ||
534 : | for my $attrRow (@{$attributes->{$featureID}}) { | ||
535 : | my (undef, $key, @values) = @{$attrRow}; | ||
536 : | $key =~ /^([^:]+)::(.+)/; | ||
537 : | if (exists $attributeHash{$1}) { | ||
538 : | $attributeHash{$1}->{$2} = \@values; | ||
539 : | } else { | ||
540 : | $attributeHash{$1} = {$2 => \@values}; | ||
541 : | } | ||
542 : | } | ||
543 : | my $celloValue = "unknown"; | ||
544 : | # Pull in the CELLO attribute. There will never be more than one. | ||
545 : | # If we have one, it's a feature attribute AND a keyword. | ||
546 : | my @celloData = keys %{$attributeHash{CELLO}}; | ||
547 : | if (@celloData) { | ||
548 : | $celloValue = $celloData[0]; | ||
549 : | push @keywords, $celloValue; | ||
550 : | } | ||
551 : | # Now we handle CDD. This is a bit more complicated, because | ||
552 : | # there are multiple CDDs per protein. | ||
553 : | if (exists $attributeHash{CDD}) { | ||
554 : | # Get the hash of CDD IDs to scores for this feature. We | ||
555 : | # already know it exists because of the above IF. | ||
556 : | my $cddHash = $attributeHash{CDD}; | ||
557 : | my @cddData = sort keys %{$cddHash}; | ||
558 : | for my $cdd (@cddData) { | ||
559 : | # Extract the score for this CDD and decode it. | ||
560 : | parrello | 1.89 | my ($codeScore) = split(/\s*,\s*/, $cddHash->{$cdd}->[1]); |
561 : | parrello | 1.85 | my $realScore = FIGRules::DecodeScore($codeScore); |
562 : | parrello | 1.89 | # We can't afford to crash because of a bad attribute |
563 : | # value, hence the IF below. | ||
564 : | if (! defined($realScore)) { | ||
565 : | # Bad score, so count it. | ||
566 : | $loadFeature->Add('badCDDscore'); | ||
567 : | } else { | ||
568 : | # Create the connection. | ||
569 : | $loadIsPresentOnProteinOf->Put($cdd, $featureID, $realScore); | ||
570 : | # If this CDD does not yet exist, create its record. | ||
571 : | if (! exists $CDD{$cdd}) { | ||
572 : | $CDD{$cdd} = 1; | ||
573 : | $loadCDD->Put($cdd); | ||
574 : | } | ||
575 : | parrello | 1.85 | } |
576 : | } | ||
577 : | } | ||
578 : | parrello | 1.75 | # Now we need to bust up hyphenated words in the keyword |
579 : | parrello | 1.79 | # list. We keep them separate and put them at the end so |
580 : | # the original word order is available. | ||
581 : | parrello | 1.75 | my $keywordString = ""; |
582 : | parrello | 1.79 | my $bustedString = ""; |
583 : | parrello | 1.75 | for my $keyword (@keywords) { |
584 : | parrello | 1.80 | if (length $keyword >= 3) { |
585 : | parrello | 1.75 | $keywordString .= " $keyword"; |
586 : | if ($keyword =~ /-/) { | ||
587 : | parrello | 1.80 | my @words = split /-/, $keyword; |
588 : | parrello | 1.79 | $bustedString .= join(" ", "", @words); |
589 : | parrello | 1.75 | } |
590 : | } | ||
591 : | parrello | 1.69 | } |
592 : | parrello | 1.79 | $keywordString .= $bustedString; |
593 : | # Get rid of annoying punctuation. | ||
594 : | $keywordString =~ s/[();]//g; | ||
595 : | parrello | 1.72 | # Clean the keyword list. |
596 : | parrello | 1.75 | my $cleanWords = $sprout->CleanKeywords($keywordString); |
597 : | Trace("Keyword string for $featureID: $cleanWords") if T(4); | ||
598 : | parrello | 1.85 | # Now we need to process the feature's locations. First, we split them up. |
599 : | my @locationList = split /\s*,\s*/, $locations; | ||
600 : | # Next, we convert them to Sprout location objects. | ||
601 : | my @locObjectList = map { BasicLocation->new("$genomeID:$_") } @locationList; | ||
602 : | parrello | 1.56 | # This part is the roughest. We need to relate the features to contig |
603 : | # locations, and the locations must be split so that none of them exceed | ||
604 : | # the maximum segment size. This simplifies the genes_in_region processing | ||
605 : | parrello | 1.85 | # for Sprout. To start, we create the location position indicator. |
606 : | parrello | 1.56 | my $i = 1; |
607 : | # Loop through the locations. | ||
608 : | parrello | 1.85 | for my $locObject (@locObjectList) { |
609 : | # Split this location into a list of chunks. | ||
610 : | parrello | 1.56 | my @locOList = (); |
611 : | while (my $peeling = $locObject->Peel($chunkSize)) { | ||
612 : | $loadIsLocatedIn->Add("peeling"); | ||
613 : | push @locOList, $peeling; | ||
614 : | } | ||
615 : | push @locOList, $locObject; | ||
616 : | # Loop through the chunks, creating IsLocatedIn records. The variable | ||
617 : | # "$i" will be used to keep the location index. | ||
618 : | for my $locChunk (@locOList) { | ||
619 : | $loadIsLocatedIn->Put($featureID, $locChunk->Contig, $locChunk->Left, | ||
620 : | $locChunk->Dir, $locChunk->Length, $i); | ||
621 : | $i++; | ||
622 : | } | ||
623 : | parrello | 1.23 | } |
624 : | parrello | 1.85 | # Finally, reassemble the location objects into a list of Sprout location strings. |
625 : | $locations = join(", ", map { $_->String } @locObjectList); | ||
626 : | # Create the feature record. | ||
627 : | $loadFeature->Put($featureID, 1, $user, $quality, $celloValue, $type, $assignment, $cleanWords, $locations); | ||
628 : | parrello | 1.1 | } |
629 : | } | ||
630 : | parrello | 1.88 | Trace("Genome $genomeID processed.") if T(3); |
631 : | parrello | 1.1 | } |
632 : | } | ||
633 : | # Finish the loads. | ||
634 : | my $retVal = $self->_FinishAll(); | ||
635 : | return $retVal; | ||
636 : | } | ||
637 : | |||
638 : | =head3 LoadSubsystemData | ||
639 : | |||
640 : | C<< my $stats = $spl->LoadSubsystemData(); >> | ||
641 : | |||
642 : | Load the subsystem data from FIG into Sprout. | ||
643 : | |||
644 : | Subsystems are groupings of genetic roles that work together to effect a specific | ||
645 : | chemical reaction. Similar organisms require similar subsystems. To curate a subsystem, | ||
646 : | a spreadsheet is created with genomes on one axis and subsystem roles on the other | ||
647 : | axis. Similar features are then mapped into the cells, allowing the annotation of one | ||
648 : | genome's roles to be used to assist in the annotation of others. | ||
649 : | |||
650 : | The following relations are loaded by this method. | ||
651 : | |||
652 : | Subsystem | ||
653 : | parrello | 1.46 | SubsystemClass |
654 : | parrello | 1.1 | Role |
655 : | parrello | 1.19 | RoleEC |
656 : | parrello | 1.85 | IsIdentifiedByEC |
657 : | parrello | 1.1 | SSCell |
658 : | ContainsFeature | ||
659 : | IsGenomeOf | ||
660 : | IsRoleOf | ||
661 : | OccursInSubsystem | ||
662 : | ParticipatesIn | ||
663 : | HasSSCell | ||
664 : | parrello | 1.18 | ConsistsOfRoles |
665 : | RoleSubset | ||
666 : | HasRoleSubset | ||
667 : | ConsistsOfGenomes | ||
668 : | GenomeSubset | ||
669 : | HasGenomeSubset | ||
670 : | parrello | 1.20 | Catalyzes |
671 : | parrello | 1.21 | Diagram |
672 : | RoleOccursIn | ||
673 : | parrello | 1.1 | |
674 : | =over 4 | ||
675 : | |||
676 : | =item RETURNS | ||
677 : | |||
678 : | Returns a statistics object for the loads. | ||
679 : | |||
680 : | =back | ||
681 : | |||
682 : | =cut | ||
683 : | #: Return Type $%; | ||
684 : | sub LoadSubsystemData { | ||
685 : | # Get this object instance. | ||
686 : | my ($self) = @_; | ||
687 : | # Get the FIG object. | ||
688 : | my $fig = $self->{fig}; | ||
689 : | # Get the genome hash. We'll use it to filter the genomes in each | ||
690 : | # spreadsheet. | ||
691 : | my $genomeHash = $self->{genomes}; | ||
692 : | # Get the subsystem hash. This lists the subsystems we'll process. | ||
693 : | my $subsysHash = $self->{subsystems}; | ||
694 : | my @subsysIDs = sort keys %{$subsysHash}; | ||
695 : | parrello | 1.21 | # Get the map list. |
696 : | my @maps = $fig->all_maps; | ||
697 : | parrello | 1.1 | # Create load objects for each of the tables we're loading. |
698 : | parrello | 1.85 | my $loadDiagram = $self->_TableLoader('Diagram'); |
699 : | my $loadRoleOccursIn = $self->_TableLoader('RoleOccursIn'); | ||
700 : | parrello | 1.23 | my $loadSubsystem = $self->_TableLoader('Subsystem'); |
701 : | parrello | 1.85 | my $loadRole = $self->_TableLoader('Role'); |
702 : | my $loadRoleEC = $self->_TableLoader('RoleEC'); | ||
703 : | my $loadIsIdentifiedByEC = $self->_TableLoader('IsIdentifiedByEC'); | ||
704 : | my $loadCatalyzes = $self->_TableLoader('Catalyzes'); | ||
705 : | my $loadSSCell = $self->_TableLoader('SSCell'); | ||
706 : | my $loadContainsFeature = $self->_TableLoader('ContainsFeature'); | ||
707 : | my $loadIsGenomeOf = $self->_TableLoader('IsGenomeOf'); | ||
708 : | my $loadIsRoleOf = $self->_TableLoader('IsRoleOf'); | ||
709 : | my $loadOccursInSubsystem = $self->_TableLoader('OccursInSubsystem'); | ||
710 : | my $loadParticipatesIn = $self->_TableLoader('ParticipatesIn'); | ||
711 : | my $loadHasSSCell = $self->_TableLoader('HasSSCell'); | ||
712 : | my $loadRoleSubset = $self->_TableLoader('RoleSubset'); | ||
713 : | my $loadGenomeSubset = $self->_TableLoader('GenomeSubset'); | ||
714 : | my $loadConsistsOfRoles = $self->_TableLoader('ConsistsOfRoles'); | ||
715 : | my $loadConsistsOfGenomes = $self->_TableLoader('ConsistsOfGenomes'); | ||
716 : | my $loadHasRoleSubset = $self->_TableLoader('HasRoleSubset'); | ||
717 : | my $loadHasGenomeSubset = $self->_TableLoader('HasGenomeSubset'); | ||
718 : | my $loadSubsystemClass = $self->_TableLoader('SubsystemClass'); | ||
719 : | parrello | 1.23 | if ($self->{options}->{loadOnly}) { |
720 : | Trace("Loading from existing files.") if T(2); | ||
721 : | } else { | ||
722 : | Trace("Generating subsystem data.") if T(2); | ||
723 : | parrello | 1.85 | # This hash will contain the roles for each EC. When we're done, this |
724 : | parrello | 1.23 | # information will be used to generate the Catalyzes table. |
725 : | my %ecToRoles = (); | ||
726 : | # Loop through the subsystems. Our first task will be to create the | ||
727 : | # roles. We do this by looping through the subsystems and creating a | ||
728 : | # role hash. The hash tracks each role ID so that we don't create | ||
729 : | # duplicates. As we move along, we'll connect the roles and subsystems | ||
730 : | # and memorize up the reactions. | ||
731 : | my ($genomeID, $roleID); | ||
732 : | my %roleData = (); | ||
733 : | for my $subsysID (@subsysIDs) { | ||
734 : | # Get the subsystem object. | ||
735 : | my $sub = $fig->get_subsystem($subsysID); | ||
736 : | parrello | 1.32 | # Only proceed if the subsystem has a spreadsheet. |
737 : | parrello | 1.83 | if (defined($sub) && ! $sub->{empty_ss}) { |
738 : | parrello | 1.31 | Trace("Creating subsystem $subsysID.") if T(3); |
739 : | $loadSubsystem->Add("subsystemIn"); | ||
740 : | # Create the subsystem record. | ||
741 : | my $curator = $sub->get_curator(); | ||
742 : | my $notes = $sub->get_notes(); | ||
743 : | $loadSubsystem->Put($subsysID, $curator, $notes); | ||
744 : | parrello | 1.72 | # Now for the classification string. This comes back as a list |
745 : | # reference and we convert it to a space-delimited string. | ||
746 : | parrello | 1.64 | my $classList = $fig->subsystem_classification($subsysID); |
747 : | parrello | 1.78 | my $classString = join($FIG_Config::splitter, grep { $_ } @$classList); |
748 : | parrello | 1.72 | $loadSubsystemClass->Put($subsysID, $classString); |
749 : | parrello | 1.31 | # Connect it to its roles. Each role is a column in the subsystem spreadsheet. |
750 : | for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { | ||
751 : | parrello | 1.85 | # Get the role's abbreviation. |
752 : | my $abbr = $sub->get_role_abbr($col); | ||
753 : | parrello | 1.31 | # Connect to this role. |
754 : | $loadOccursInSubsystem->Add("roleIn"); | ||
755 : | parrello | 1.85 | $loadOccursInSubsystem->Put($roleID, $subsysID, $abbr, $col); |
756 : | parrello | 1.31 | # If it's a new role, add it to the role table. |
757 : | if (! exists $roleData{$roleID}) { | ||
758 : | # Get the role's abbreviation. | ||
759 : | # Add the role. | ||
760 : | parrello | 1.85 | $loadRole->Put($roleID); |
761 : | parrello | 1.31 | $roleData{$roleID} = 1; |
762 : | # Check for an EC number. | ||
763 : | parrello | 1.85 | if ($roleID =~ /\(EC (\d+\.\d+\.\d+\.\d+)\s*\)\s*$/) { |
764 : | parrello | 1.31 | my $ec = $1; |
765 : | parrello | 1.85 | $loadIsIdentifiedByEC->Put($roleID, $ec); |
766 : | # Check to see if this is our first encounter with this EC. | ||
767 : | if (exists $ecToRoles{$ec}) { | ||
768 : | # No, so just add this role to the EC list. | ||
769 : | push @{$ecToRoles{$ec}}, $roleID; | ||
770 : | } else { | ||
771 : | # Output this EC. | ||
772 : | $loadRoleEC->Put($ec); | ||
773 : | # Create its role list. | ||
774 : | $ecToRoles{$ec} = [$roleID]; | ||
775 : | } | ||
776 : | parrello | 1.31 | } |
777 : | parrello | 1.23 | } |
778 : | parrello | 1.18 | } |
779 : | parrello | 1.31 | # Now we create the spreadsheet for the subsystem by matching roles to |
780 : | # genomes. Each genome is a row and each role is a column. We may need | ||
781 : | # to actually create the roles as we find them. | ||
782 : | Trace("Creating subsystem $subsysID spreadsheet.") if T(3); | ||
783 : | for (my $row = 0; defined($genomeID = $sub->get_genome($row)); $row++) { | ||
784 : | # Only proceed if this is one of our genomes. | ||
785 : | if (exists $genomeHash->{$genomeID}) { | ||
786 : | # Count the PEGs and cells found for verification purposes. | ||
787 : | my $pegCount = 0; | ||
788 : | my $cellCount = 0; | ||
789 : | # Create a list for the PEGs we find. This list will be used | ||
790 : | # to generate cluster numbers. | ||
791 : | my @pegsFound = (); | ||
792 : | # Create a hash that maps spreadsheet IDs to PEGs. We will | ||
793 : | # use this to generate the ContainsFeature data after we have | ||
794 : | # the cluster numbers. | ||
795 : | my %cellPegs = (); | ||
796 : | # Get the genome's variant code for this subsystem. | ||
797 : | my $variantCode = $sub->get_variant_code($row); | ||
798 : | # Loop through the subsystem's roles. We use an index because it is | ||
799 : | # part of the spreadsheet cell ID. | ||
800 : | for (my $col = 0; defined($roleID = $sub->get_role($col)); $col++) { | ||
801 : | # Get the features in the spreadsheet cell for this genome and role. | ||
802 : | parrello | 1.37 | my @pegs = grep { !$fig->is_deleted_fid($_) } $sub->get_pegs_from_cell($row, $col); |
803 : | parrello | 1.31 | # Only proceed if features exist. |
804 : | if (@pegs > 0) { | ||
805 : | # Create the spreadsheet cell. | ||
806 : | $cellCount++; | ||
807 : | my $cellID = "$subsysID:$genomeID:$col"; | ||
808 : | $loadSSCell->Put($cellID); | ||
809 : | $loadIsGenomeOf->Put($genomeID, $cellID); | ||
810 : | $loadIsRoleOf->Put($roleID, $cellID); | ||
811 : | $loadHasSSCell->Put($subsysID, $cellID); | ||
812 : | # Remember its features. | ||
813 : | push @pegsFound, @pegs; | ||
814 : | $cellPegs{$cellID} = \@pegs; | ||
815 : | $pegCount += @pegs; | ||
816 : | } | ||
817 : | parrello | 1.23 | } |
818 : | parrello | 1.31 | # If we found some cells for this genome, we need to compute clusters and |
819 : | # denote it participates in the subsystem. | ||
820 : | if ($pegCount > 0) { | ||
821 : | Trace("$pegCount PEGs in $cellCount cells for $genomeID.") if T(3); | ||
822 : | $loadParticipatesIn->Put($genomeID, $subsysID, $variantCode); | ||
823 : | # Create a hash mapping PEG IDs to cluster numbers. | ||
824 : | # We default to -1 for all of them. | ||
825 : | my %clusterOf = map { $_ => -1 } @pegsFound; | ||
826 : | parrello | 1.41 | # Partition the PEGs found into clusters. |
827 : | my @clusters = $fig->compute_clusters([keys %clusterOf], $sub); | ||
828 : | parrello | 1.31 | for (my $i = 0; $i <= $#clusters; $i++) { |
829 : | my $subList = $clusters[$i]; | ||
830 : | for my $peg (@{$subList}) { | ||
831 : | $clusterOf{$peg} = $i; | ||
832 : | } | ||
833 : | parrello | 1.23 | } |
834 : | parrello | 1.31 | # Create the ContainsFeature data. |
835 : | for my $cellID (keys %cellPegs) { | ||
836 : | my $cellList = $cellPegs{$cellID}; | ||
837 : | for my $cellPeg (@$cellList) { | ||
838 : | $loadContainsFeature->Put($cellID, $cellPeg, $clusterOf{$cellPeg}); | ||
839 : | } | ||
840 : | parrello | 1.23 | } |
841 : | parrello | 1.18 | } |
842 : | } | ||
843 : | parrello | 1.15 | } |
844 : | parrello | 1.31 | # Now we need to generate the subsets. The subset names must be concatenated to |
845 : | # the subsystem name to make them unique keys. There are two types of subsets: | ||
846 : | # genome subsets and role subsets. We do the role subsets first. | ||
847 : | my @subsetNames = $sub->get_subset_names(); | ||
848 : | for my $subsetID (@subsetNames) { | ||
849 : | # Create the subset record. | ||
850 : | my $actualID = "$subsysID:$subsetID"; | ||
851 : | $loadRoleSubset->Put($actualID); | ||
852 : | # Connect the subset to the subsystem. | ||
853 : | $loadHasRoleSubset->Put($subsysID, $actualID); | ||
854 : | # Connect the subset to its roles. | ||
855 : | my @roles = $sub->get_subsetC_roles($subsetID); | ||
856 : | for my $roleID (@roles) { | ||
857 : | $loadConsistsOfRoles->Put($actualID, $roleID); | ||
858 : | } | ||
859 : | } | ||
860 : | # Next the genome subsets. | ||
861 : | @subsetNames = $sub->get_subset_namesR(); | ||
862 : | for my $subsetID (@subsetNames) { | ||
863 : | # Create the subset record. | ||
864 : | my $actualID = "$subsysID:$subsetID"; | ||
865 : | $loadGenomeSubset->Put($actualID); | ||
866 : | # Connect the subset to the subsystem. | ||
867 : | $loadHasGenomeSubset->Put($subsysID, $actualID); | ||
868 : | # Connect the subset to its genomes. | ||
869 : | my @genomes = $sub->get_subsetR($subsetID); | ||
870 : | for my $genomeID (@genomes) { | ||
871 : | $loadConsistsOfGenomes->Put($actualID, $genomeID); | ||
872 : | } | ||
873 : | parrello | 1.23 | } |
874 : | parrello | 1.18 | } |
875 : | parrello | 1.57 | } |
876 : | # Now we loop through the diagrams. We need to create the diagram records | ||
877 : | # and link each diagram to its roles. Note that only roles which occur | ||
878 : | # in subsystems (and therefore appear in the %ecToRoles hash) are | ||
879 : | # included. | ||
880 : | for my $map (@maps) { | ||
881 : | Trace("Loading diagram $map.") if T(3); | ||
882 : | # Get the diagram's descriptive name. | ||
883 : | my $name = $fig->map_name($map); | ||
884 : | $loadDiagram->Put($map, $name); | ||
885 : | # Now we need to link all the map's roles to it. | ||
886 : | # A hash is used to prevent duplicates. | ||
887 : | my %roleHash = (); | ||
888 : | parrello | 1.87 | for my $ec ($fig->map_to_ecs($map)) { |
889 : | if (exists $ecToRoles{$ec}) { | ||
890 : | for my $role (@{$ecToRoles{$ec}}) { | ||
891 : | if (! $roleHash{$role}) { | ||
892 : | $loadRoleOccursIn->Put($role, $map); | ||
893 : | $roleHash{$role} = 1; | ||
894 : | } | ||
895 : | } | ||
896 : | parrello | 1.23 | } |
897 : | parrello | 1.21 | } |
898 : | parrello | 1.57 | } |
899 : | # Before we leave, we must create the Catalyzes table. We start with the reactions, | ||
900 : | # then use the "ecToRoles" table to convert EC numbers to role IDs. | ||
901 : | my @reactions = $fig->all_reactions(); | ||
902 : | for my $reactionID (@reactions) { | ||
903 : | # Get this reaction's list of roles. The results will be EC numbers. | ||
904 : | parrello | 1.85 | my @ecs = $fig->catalyzed_by($reactionID); |
905 : | parrello | 1.57 | # Loop through the roles, creating catalyzation records. |
906 : | parrello | 1.85 | for my $thisEC (@ecs) { |
907 : | if (exists $ecToRoles{$thisEC}) { | ||
908 : | for my $thisRole (@{$ecToRoles{$thisEC}}) { | ||
909 : | $loadCatalyzes->Put($thisRole, $reactionID); | ||
910 : | } | ||
911 : | parrello | 1.23 | } |
912 : | parrello | 1.18 | } |
913 : | } | ||
914 : | parrello | 1.1 | } |
915 : | # Finish the load. | ||
916 : | my $retVal = $self->_FinishAll(); | ||
917 : | return $retVal; | ||
918 : | } | ||
919 : | |||
920 : | =head3 LoadPropertyData | ||
921 : | |||
922 : | C<< my $stats = $spl->LoadPropertyData(); >> | ||
923 : | |||
924 : | Load the attribute data from FIG into Sprout. | ||
925 : | |||
926 : | Attribute data in FIG corresponds to the Sprout concept of Property. As currently | ||
927 : | implemented, each key-value attribute combination in the SEED corresponds to a | ||
928 : | record in the B<Property> table. The B<HasProperty> relationship links the | ||
929 : | features to the properties. | ||
930 : | |||
931 : | The SEED also allows attributes to be assigned to genomes, but this is not yet | ||
932 : | supported by Sprout. | ||
933 : | |||
934 : | The following relations are loaded by this method. | ||
935 : | |||
936 : | HasProperty | ||
937 : | Property | ||
938 : | |||
939 : | =over 4 | ||
940 : | |||
941 : | =item RETURNS | ||
942 : | |||
943 : | Returns a statistics object for the loads. | ||
944 : | |||
945 : | =back | ||
946 : | |||
947 : | =cut | ||
948 : | #: Return Type $%; | ||
949 : | sub LoadPropertyData { | ||
950 : | # Get this object instance. | ||
951 : | my ($self) = @_; | ||
952 : | # Get the FIG object. | ||
953 : | my $fig = $self->{fig}; | ||
954 : | # Get the genome hash. | ||
955 : | my $genomeHash = $self->{genomes}; | ||
956 : | # Create load objects for each of the tables we're loading. | ||
957 : | parrello | 1.23 | my $loadProperty = $self->_TableLoader('Property'); |
958 : | parrello | 1.85 | my $loadHasProperty = $self->_TableLoader('HasProperty'); |
959 : | parrello | 1.23 | if ($self->{options}->{loadOnly}) { |
960 : | Trace("Loading from existing files.") if T(2); | ||
961 : | } else { | ||
962 : | Trace("Generating property data.") if T(2); | ||
963 : | # Create a hash for storing property IDs. | ||
964 : | my %propertyKeys = (); | ||
965 : | my $nextID = 1; | ||
966 : | parrello | 1.83 | # Get the attributes we intend to store in the property table. |
967 : | parrello | 1.85 | my $propKeys = $self->{propKeys}; |
968 : | parrello | 1.23 | # Loop through the genomes. |
969 : | parrello | 1.66 | for my $genomeID (sort keys %{$genomeHash}) { |
970 : | parrello | 1.23 | $loadProperty->Add("genomeIn"); |
971 : | parrello | 1.24 | Trace("Generating properties for $genomeID.") if T(3); |
972 : | parrello | 1.83 | # Initialize a counter. |
973 : | parrello | 1.24 | my $propertyCount = 0; |
974 : | parrello | 1.80 | # Get the properties for this genome's features. |
975 : | parrello | 1.85 | my @attributes = $fig->get_attributes("fig|$genomeID%", $propKeys); |
976 : | parrello | 1.83 | Trace("Property list built for $genomeID.") if T(3); |
977 : | # Loop through the results, creating HasProperty records. | ||
978 : | for my $attributeData (@attributes) { | ||
979 : | # Pull apart the attribute tuple. | ||
980 : | my ($fid, $key, $value, $url) = @{$attributeData}; | ||
981 : | # Concatenate the key and value and check the "propertyKeys" hash to | ||
982 : | # see if we already have an ID for it. We use a tab for the separator | ||
983 : | # character. | ||
984 : | my $propertyKey = "$key\t$value"; | ||
985 : | # Use the concatenated value to check for an ID. If no ID exists, we | ||
986 : | # create one. | ||
987 : | my $propertyID = $propertyKeys{$propertyKey}; | ||
988 : | if (! $propertyID) { | ||
989 : | # Here we need to create a new property ID for this key/value pair. | ||
990 : | $propertyKeys{$propertyKey} = $nextID; | ||
991 : | $propertyID = $nextID; | ||
992 : | $nextID++; | ||
993 : | $loadProperty->Put($propertyID, $key, $value); | ||
994 : | parrello | 1.1 | } |
995 : | parrello | 1.83 | # Create the HasProperty entry for this feature/property association. |
996 : | $loadHasProperty->Put($fid, $propertyID, $url); | ||
997 : | parrello | 1.1 | } |
998 : | parrello | 1.24 | # Update the statistics. |
999 : | parrello | 1.83 | Trace("$propertyCount attributes processed.") if T(3); |
1000 : | parrello | 1.24 | $loadHasProperty->Add("propertiesIn", $propertyCount); |
1001 : | parrello | 1.1 | } |
1002 : | } | ||
1003 : | # Finish the load. | ||
1004 : | my $retVal = $self->_FinishAll(); | ||
1005 : | return $retVal; | ||
1006 : | } | ||
1007 : | |||
1008 : | =head3 LoadAnnotationData | ||
1009 : | |||
1010 : | C<< my $stats = $spl->LoadAnnotationData(); >> | ||
1011 : | |||
1012 : | Load the annotation data from FIG into Sprout. | ||
1013 : | |||
1014 : | Sprout annotations encompass both the assignments and the annotations in SEED. | ||
1015 : | These describe the function performed by a PEG as well as any other useful | ||
1016 : | information that may aid in identifying its purpose. | ||
1017 : | |||
1018 : | The following relations are loaded by this method. | ||
1019 : | |||
1020 : | Annotation | ||
1021 : | IsTargetOfAnnotation | ||
1022 : | SproutUser | ||
1023 : | MadeAnnotation | ||
1024 : | |||
1025 : | =over 4 | ||
1026 : | |||
1027 : | =item RETURNS | ||
1028 : | |||
1029 : | Returns a statistics object for the loads. | ||
1030 : | |||
1031 : | =back | ||
1032 : | |||
1033 : | =cut | ||
1034 : | #: Return Type $%; | ||
1035 : | sub LoadAnnotationData { | ||
1036 : | # Get this object instance. | ||
1037 : | my ($self) = @_; | ||
1038 : | # Get the FIG object. | ||
1039 : | my $fig = $self->{fig}; | ||
1040 : | # Get the genome hash. | ||
1041 : | my $genomeHash = $self->{genomes}; | ||
1042 : | # Create load objects for each of the tables we're loading. | ||
1043 : | parrello | 1.23 | my $loadAnnotation = $self->_TableLoader('Annotation'); |
1044 : | parrello | 1.85 | my $loadIsTargetOfAnnotation = $self->_TableLoader('IsTargetOfAnnotation'); |
1045 : | my $loadSproutUser = $self->_TableLoader('SproutUser'); | ||
1046 : | my $loadUserAccess = $self->_TableLoader('UserAccess'); | ||
1047 : | my $loadMadeAnnotation = $self->_TableLoader('MadeAnnotation'); | ||
1048 : | parrello | 1.23 | if ($self->{options}->{loadOnly}) { |
1049 : | Trace("Loading from existing files.") if T(2); | ||
1050 : | } else { | ||
1051 : | Trace("Generating annotation data.") if T(2); | ||
1052 : | # Create a hash of user names. We'll use this to prevent us from generating duplicate | ||
1053 : | # user records. | ||
1054 : | my %users = ( FIG => 1, master => 1 ); | ||
1055 : | # Put in FIG and "master". | ||
1056 : | $loadSproutUser->Put("FIG", "Fellowship for Interpretation of Genomes"); | ||
1057 : | $loadUserAccess->Put("FIG", 1); | ||
1058 : | $loadSproutUser->Put("master", "Master User"); | ||
1059 : | $loadUserAccess->Put("master", 1); | ||
1060 : | # Get the current time. | ||
1061 : | my $time = time(); | ||
1062 : | # Loop through the genomes. | ||
1063 : | for my $genomeID (sort keys %{$genomeHash}) { | ||
1064 : | Trace("Processing $genomeID.") if T(3); | ||
1065 : | parrello | 1.38 | # Create a hash of timestamps. We use this to prevent duplicate time stamps |
1066 : | # from showing up for a single PEG's annotations. | ||
1067 : | my %seenTimestamps = (); | ||
1068 : | parrello | 1.36 | # Get the genome's annotations. |
1069 : | my @annotations = $fig->read_all_annotations($genomeID); | ||
1070 : | Trace("Processing annotations.") if T(2); | ||
1071 : | for my $tuple (@annotations) { | ||
1072 : | # Get the annotation tuple. | ||
1073 : | my ($peg, $timestamp, $user, $text) = @{$tuple}; | ||
1074 : | # Here we fix up the annotation text. "\r" is removed, | ||
1075 : | parrello | 1.42 | # and "\t" and "\n" are escaped. Note we use the "gs" |
1076 : | parrello | 1.36 | # modifier so that new-lines inside the text do not |
1077 : | # stop the substitution search. | ||
1078 : | $text =~ s/\r//gs; | ||
1079 : | $text =~ s/\t/\\t/gs; | ||
1080 : | $text =~ s/\n/\\n/gs; | ||
1081 : | # Change assignments by the master user to FIG assignments. | ||
1082 : | $text =~ s/Set master function/Set FIG function/s; | ||
1083 : | # Insure the time stamp is valid. | ||
1084 : | if ($timestamp =~ /^\d+$/) { | ||
1085 : | # Here it's a number. We need to insure the one we use to form | ||
1086 : | # the key is unique. | ||
1087 : | my $keyStamp = $timestamp; | ||
1088 : | while ($seenTimestamps{"$peg:$keyStamp"}) { | ||
1089 : | $keyStamp++; | ||
1090 : | parrello | 1.1 | } |
1091 : | parrello | 1.36 | my $annotationID = "$peg:$keyStamp"; |
1092 : | $seenTimestamps{$annotationID} = 1; | ||
1093 : | # Insure the user exists. | ||
1094 : | if (! $users{$user}) { | ||
1095 : | $loadSproutUser->Put($user, "SEED user"); | ||
1096 : | $loadUserAccess->Put($user, 1); | ||
1097 : | $users{$user} = 1; | ||
1098 : | } | ||
1099 : | # Generate the annotation. | ||
1100 : | $loadAnnotation->Put($annotationID, $timestamp, $text); | ||
1101 : | $loadIsTargetOfAnnotation->Put($peg, $annotationID); | ||
1102 : | $loadMadeAnnotation->Put($user, $annotationID); | ||
1103 : | } else { | ||
1104 : | # Here we have an invalid time stamp. | ||
1105 : | Trace("Invalid time stamp \"$timestamp\" in annotations for $peg.") if T(1); | ||
1106 : | parrello | 1.1 | } |
1107 : | } | ||
1108 : | } | ||
1109 : | } | ||
1110 : | # Finish the load. | ||
1111 : | my $retVal = $self->_FinishAll(); | ||
1112 : | return $retVal; | ||
1113 : | } | ||
1114 : | |||
1115 : | parrello | 1.5 | =head3 LoadSourceData |
1116 : | |||
1117 : | C<< my $stats = $spl->LoadSourceData(); >> | ||
1118 : | |||
1119 : | Load the source data from FIG into Sprout. | ||
1120 : | |||
1121 : | Source data links genomes to information about the organizations that | ||
1122 : | mapped it. | ||
1123 : | |||
1124 : | The following relations are loaded by this method. | ||
1125 : | |||
1126 : | ComesFrom | ||
1127 : | Source | ||
1128 : | SourceURL | ||
1129 : | |||
1130 : | There is no direct support for source attribution in FIG, so we access the SEED | ||
1131 : | files directly. | ||
1132 : | |||
1133 : | =over 4 | ||
1134 : | |||
1135 : | =item RETURNS | ||
1136 : | |||
1137 : | Returns a statistics object for the loads. | ||
1138 : | |||
1139 : | =back | ||
1140 : | |||
1141 : | =cut | ||
1142 : | #: Return Type $%; | ||
1143 : | sub LoadSourceData { | ||
1144 : | # Get this object instance. | ||
1145 : | my ($self) = @_; | ||
1146 : | # Get the FIG object. | ||
1147 : | my $fig = $self->{fig}; | ||
1148 : | # Get the genome hash. | ||
1149 : | my $genomeHash = $self->{genomes}; | ||
1150 : | # Create load objects for each of the tables we're loading. | ||
1151 : | parrello | 1.85 | my $loadComesFrom = $self->_TableLoader('ComesFrom'); |
1152 : | parrello | 1.23 | my $loadSource = $self->_TableLoader('Source'); |
1153 : | my $loadSourceURL = $self->_TableLoader('SourceURL'); | ||
1154 : | if ($self->{options}->{loadOnly}) { | ||
1155 : | Trace("Loading from existing files.") if T(2); | ||
1156 : | } else { | ||
1157 : | Trace("Generating annotation data.") if T(2); | ||
1158 : | # Create hashes to collect the Source information. | ||
1159 : | my %sourceURL = (); | ||
1160 : | my %sourceDesc = (); | ||
1161 : | # Loop through the genomes. | ||
1162 : | my $line; | ||
1163 : | for my $genomeID (sort keys %{$genomeHash}) { | ||
1164 : | Trace("Processing $genomeID.") if T(3); | ||
1165 : | # Open the project file. | ||
1166 : | if ((open(TMP, "<$FIG_Config::organisms/$genomeID/PROJECT")) && | ||
1167 : | defined($line = <TMP>)) { | ||
1168 : | chomp $line; | ||
1169 : | my($sourceID, $desc, $url) = split(/\t/,$line); | ||
1170 : | $loadComesFrom->Put($genomeID, $sourceID); | ||
1171 : | if ($url && ! exists $sourceURL{$sourceID}) { | ||
1172 : | $loadSourceURL->Put($sourceID, $url); | ||
1173 : | $sourceURL{$sourceID} = 1; | ||
1174 : | } | ||
1175 : | if ($desc) { | ||
1176 : | $sourceDesc{$sourceID} = $desc; | ||
1177 : | } elsif (! exists $sourceDesc{$sourceID}) { | ||
1178 : | $sourceDesc{$sourceID} = $sourceID; | ||
1179 : | } | ||
1180 : | parrello | 1.5 | } |
1181 : | parrello | 1.23 | close TMP; |
1182 : | } | ||
1183 : | # Write the source descriptions. | ||
1184 : | for my $sourceID (keys %sourceDesc) { | ||
1185 : | $loadSource->Put($sourceID, $sourceDesc{$sourceID}); | ||
1186 : | parrello | 1.5 | } |
1187 : | parrello | 1.16 | } |
1188 : | parrello | 1.5 | # Finish the load. |
1189 : | my $retVal = $self->_FinishAll(); | ||
1190 : | return $retVal; | ||
1191 : | } | ||
1192 : | |||
1193 : | parrello | 1.6 | =head3 LoadExternalData |
1194 : | |||
1195 : | C<< my $stats = $spl->LoadExternalData(); >> | ||
1196 : | |||
1197 : | Load the external data from FIG into Sprout. | ||
1198 : | |||
1199 : | External data contains information about external feature IDs. | ||
1200 : | |||
1201 : | The following relations are loaded by this method. | ||
1202 : | |||
1203 : | ExternalAliasFunc | ||
1204 : | ExternalAliasOrg | ||
1205 : | |||
1206 : | The support for external IDs in FIG is hidden beneath layers of other data, so | ||
1207 : | we access the SEED files directly to create these tables. This is also one of | ||
1208 : | the few load methods that does not proceed genome by genome. | ||
1209 : | |||
1210 : | =over 4 | ||
1211 : | |||
1212 : | =item RETURNS | ||
1213 : | |||
1214 : | Returns a statistics object for the loads. | ||
1215 : | |||
1216 : | =back | ||
1217 : | |||
1218 : | =cut | ||
1219 : | #: Return Type $%; | ||
1220 : | sub LoadExternalData { | ||
1221 : | # Get this object instance. | ||
1222 : | my ($self) = @_; | ||
1223 : | # Get the FIG object. | ||
1224 : | my $fig = $self->{fig}; | ||
1225 : | # Get the genome hash. | ||
1226 : | my $genomeHash = $self->{genomes}; | ||
1227 : | # Convert the genome hash. We'll get the genus and species for each genome and make | ||
1228 : | # it the key. | ||
1229 : | my %speciesHash = map { $fig->genus_species($_) => $_ } (keys %{$genomeHash}); | ||
1230 : | # Create load objects for each of the tables we're loading. | ||
1231 : | parrello | 1.23 | my $loadExternalAliasFunc = $self->_TableLoader('ExternalAliasFunc'); |
1232 : | my $loadExternalAliasOrg = $self->_TableLoader('ExternalAliasOrg'); | ||
1233 : | if ($self->{options}->{loadOnly}) { | ||
1234 : | Trace("Loading from existing files.") if T(2); | ||
1235 : | } else { | ||
1236 : | Trace("Generating external data.") if T(2); | ||
1237 : | # We loop through the files one at a time. First, the organism file. | ||
1238 : | parrello | 1.58 | Open(\*ORGS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_org.table |"); |
1239 : | parrello | 1.23 | my $orgLine; |
1240 : | while (defined($orgLine = <ORGS>)) { | ||
1241 : | # Clean the input line. | ||
1242 : | chomp $orgLine; | ||
1243 : | # Parse the organism name. | ||
1244 : | my ($protID, $name) = split /\s*\t\s*/, $orgLine; | ||
1245 : | $loadExternalAliasOrg->Put($protID, $name); | ||
1246 : | } | ||
1247 : | close ORGS; | ||
1248 : | # Now the function file. | ||
1249 : | my $funcLine; | ||
1250 : | parrello | 1.58 | Open(\*FUNCS, "sort +0 -1 -u -t\"\t\" $FIG_Config::global/ext_func.table |"); |
1251 : | parrello | 1.23 | while (defined($funcLine = <FUNCS>)) { |
1252 : | # Clean the line ending. | ||
1253 : | chomp $funcLine; | ||
1254 : | # Only proceed if the line is non-blank. | ||
1255 : | if ($funcLine) { | ||
1256 : | # Split it into fields. | ||
1257 : | my @funcFields = split /\s*\t\s*/, $funcLine; | ||
1258 : | # If there's an EC number, append it to the description. | ||
1259 : | if ($#funcFields >= 2 && $funcFields[2] =~ /^(EC .*\S)/) { | ||
1260 : | $funcFields[1] .= " $1"; | ||
1261 : | } | ||
1262 : | # Output the function line. | ||
1263 : | $loadExternalAliasFunc->Put(@funcFields[0,1]); | ||
1264 : | parrello | 1.6 | } |
1265 : | } | ||
1266 : | } | ||
1267 : | # Finish the load. | ||
1268 : | my $retVal = $self->_FinishAll(); | ||
1269 : | return $retVal; | ||
1270 : | } | ||
1271 : | parrello | 1.5 | |
1272 : | parrello | 1.18 | |
1273 : | =head3 LoadReactionData | ||
1274 : | |||
1275 : | C<< my $stats = $spl->LoadReactionData(); >> | ||
1276 : | |||
1277 : | Load the reaction data from FIG into Sprout. | ||
1278 : | |||
1279 : | Reaction data connects reactions to the compounds that participate in them. | ||
1280 : | |||
1281 : | The following relations are loaded by this method. | ||
1282 : | |||
1283 : | parrello | 1.20 | Reaction |
1284 : | parrello | 1.18 | ReactionURL |
1285 : | Compound | ||
1286 : | CompoundName | ||
1287 : | CompoundCAS | ||
1288 : | parrello | 1.85 | IsIdentifiedByCAS |
1289 : | HasCompoundName | ||
1290 : | parrello | 1.18 | IsAComponentOf |
1291 : | |||
1292 : | This method proceeds reaction by reaction rather than genome by genome. | ||
1293 : | |||
1294 : | =over 4 | ||
1295 : | |||
1296 : | =item RETURNS | ||
1297 : | |||
1298 : | Returns a statistics object for the loads. | ||
1299 : | |||
1300 : | =back | ||
1301 : | |||
1302 : | =cut | ||
1303 : | #: Return Type $%; | ||
1304 : | sub LoadReactionData { | ||
1305 : | # Get this object instance. | ||
1306 : | my ($self) = @_; | ||
1307 : | # Get the FIG object. | ||
1308 : | my $fig = $self->{fig}; | ||
1309 : | # Create load objects for each of the tables we're loading. | ||
1310 : | parrello | 1.23 | my $loadReaction = $self->_TableLoader('Reaction'); |
1311 : | parrello | 1.85 | my $loadReactionURL = $self->_TableLoader('ReactionURL'); |
1312 : | my $loadCompound = $self->_TableLoader('Compound'); | ||
1313 : | my $loadCompoundName = $self->_TableLoader('CompoundName'); | ||
1314 : | my $loadCompoundCAS = $self->_TableLoader('CompoundCAS'); | ||
1315 : | my $loadIsAComponentOf = $self->_TableLoader('IsAComponentOf'); | ||
1316 : | my $loadIsIdentifiedByCAS = $self->_TableLoader('IsIdentifiedByCAS'); | ||
1317 : | my $loadHasCompoundName = $self->_TableLoader('HasCompoundName'); | ||
1318 : | parrello | 1.23 | if ($self->{options}->{loadOnly}) { |
1319 : | Trace("Loading from existing files.") if T(2); | ||
1320 : | } else { | ||
1321 : | parrello | 1.85 | Trace("Generating reaction data.") if T(2); |
1322 : | # We need some hashes to prevent duplicates. | ||
1323 : | my %compoundNames = (); | ||
1324 : | my %compoundCASes = (); | ||
1325 : | parrello | 1.23 | # First we create the compounds. |
1326 : | my @compounds = $fig->all_compounds(); | ||
1327 : | for my $cid (@compounds) { | ||
1328 : | # Check for names. | ||
1329 : | my @names = $fig->names_of_compound($cid); | ||
1330 : | # Each name will be given a priority number, starting with 1. | ||
1331 : | my $prio = 1; | ||
1332 : | for my $name (@names) { | ||
1333 : | parrello | 1.85 | if (! exists $compoundNames{$name}) { |
1334 : | $loadCompoundName->Put($name); | ||
1335 : | $compoundNames{$name} = 1; | ||
1336 : | } | ||
1337 : | $loadHasCompoundName->Put($cid, $name, $prio++); | ||
1338 : | parrello | 1.23 | } |
1339 : | # Create the main compound record. Note that the first name | ||
1340 : | # becomes the label. | ||
1341 : | my $label = (@names > 0 ? $names[0] : $cid); | ||
1342 : | $loadCompound->Put($cid, $label); | ||
1343 : | # Check for a CAS ID. | ||
1344 : | my $cas = $fig->cas($cid); | ||
1345 : | if ($cas) { | ||
1346 : | parrello | 1.85 | $loadIsIdentifiedByCAS->Put($cid, $cas); |
1347 : | if (! exists $compoundCASes{$cas}) { | ||
1348 : | $loadCompoundCAS->Put($cas); | ||
1349 : | $compoundCASes{$cas} = 1; | ||
1350 : | } | ||
1351 : | parrello | 1.23 | } |
1352 : | parrello | 1.20 | } |
1353 : | parrello | 1.23 | # All the compounds are set up, so we need to loop through the reactions next. First, |
1354 : | # we initialize the discriminator index. This is a single integer used to insure | ||
1355 : | # duplicate elements in a reaction are not accidentally collapsed. | ||
1356 : | my $discrim = 0; | ||
1357 : | my @reactions = $fig->all_reactions(); | ||
1358 : | for my $reactionID (@reactions) { | ||
1359 : | # Create the reaction record. | ||
1360 : | $loadReaction->Put($reactionID, $fig->reversible($reactionID)); | ||
1361 : | # Compute the reaction's URL. | ||
1362 : | my $url = HTML::reaction_link($reactionID); | ||
1363 : | # Put it in the ReactionURL table. | ||
1364 : | $loadReactionURL->Put($reactionID, $url); | ||
1365 : | # Now we need all of the reaction's compounds. We get these in two phases, | ||
1366 : | # substrates first and then products. | ||
1367 : | for my $product (0, 1) { | ||
1368 : | # Get the compounds of the current type for the current reaction. FIG will | ||
1369 : | # give us 3-tuples: [ID, stoichiometry, main-flag]. At this time we do not | ||
1370 : | # have location data in SEED, so it defaults to the empty string. | ||
1371 : | my @compounds = $fig->reaction2comp($reactionID, $product); | ||
1372 : | for my $compData (@compounds) { | ||
1373 : | # Extract the compound data from the current tuple. | ||
1374 : | my ($cid, $stoich, $main) = @{$compData}; | ||
1375 : | # Link the compound to the reaction. | ||
1376 : | $loadIsAComponentOf->Put($cid, $reactionID, $discrim++, "", $main, | ||
1377 : | $product, $stoich); | ||
1378 : | } | ||
1379 : | parrello | 1.18 | } |
1380 : | } | ||
1381 : | } | ||
1382 : | # Finish the load. | ||
1383 : | my $retVal = $self->_FinishAll(); | ||
1384 : | return $retVal; | ||
1385 : | } | ||
1386 : | |||
1387 : | parrello | 1.43 | =head3 LoadSynonymData |
1388 : | |||
1389 : | C<< my $stats = $spl->LoadSynonymData(); >> | ||
1390 : | |||
1391 : | Load the synonym groups into Sprout. | ||
1392 : | |||
1393 : | The following relations are loaded by this method. | ||
1394 : | |||
1395 : | SynonymGroup | ||
1396 : | IsSynonymGroupFor | ||
1397 : | |||
1398 : | The source information for these relations is taken from the C<maps_to_id> method | ||
1399 : | parrello | 1.56 | of the B<FIG> object. Unfortunately, to make this work, we need to use direct |
1400 : | SQL against the FIG database. | ||
1401 : | parrello | 1.43 | |
1402 : | =over 4 | ||
1403 : | |||
1404 : | =item RETURNS | ||
1405 : | |||
1406 : | Returns a statistics object for the loads. | ||
1407 : | |||
1408 : | =back | ||
1409 : | |||
1410 : | =cut | ||
1411 : | #: Return Type $%; | ||
1412 : | sub LoadSynonymData { | ||
1413 : | # Get this object instance. | ||
1414 : | my ($self) = @_; | ||
1415 : | # Get the FIG object. | ||
1416 : | my $fig = $self->{fig}; | ||
1417 : | # Get the genome hash. | ||
1418 : | my $genomeHash = $self->{genomes}; | ||
1419 : | # Create a load object for the table we're loading. | ||
1420 : | my $loadSynonymGroup = $self->_TableLoader('SynonymGroup'); | ||
1421 : | my $loadIsSynonymGroupFor = $self->_TableLoader('IsSynonymGroupFor'); | ||
1422 : | if ($self->{options}->{loadOnly}) { | ||
1423 : | Trace("Loading from existing files.") if T(2); | ||
1424 : | } else { | ||
1425 : | Trace("Generating synonym group data.") if T(2); | ||
1426 : | parrello | 1.56 | # Get the database handle. |
1427 : | my $dbh = $fig->db_handle(); | ||
1428 : | parrello | 1.85 | # Ask for the synonyms. Note that "maps_to" is a group name, and "syn_id" is a PEG ID or alias. |
1429 : | parrello | 1.59 | my $sth = $dbh->prepare_command("SELECT maps_to, syn_id FROM peg_synonyms ORDER BY maps_to"); |
1430 : | parrello | 1.56 | my $result = $sth->execute(); |
1431 : | if (! defined($result)) { | ||
1432 : | Confess("Database error in Synonym load: " . $sth->errstr()); | ||
1433 : | } else { | ||
1434 : | # Remember the current synonym. | ||
1435 : | my $current_syn = ""; | ||
1436 : | # Count the features. | ||
1437 : | my $featureCount = 0; | ||
1438 : | # Loop through the synonym/peg pairs. | ||
1439 : | while (my @row = $sth->fetchrow()) { | ||
1440 : | parrello | 1.85 | # Get the synonym group ID and feature ID. |
1441 : | parrello | 1.56 | my ($syn_id, $peg) = @row; |
1442 : | # Insure it's for one of our genomes. | ||
1443 : | my $genomeID = FIG::genome_of($peg); | ||
1444 : | if (exists $genomeHash->{$genomeID}) { | ||
1445 : | # Verify the synonym. | ||
1446 : | if ($syn_id ne $current_syn) { | ||
1447 : | # It's new, so put it in the group table. | ||
1448 : | $loadSynonymGroup->Put($syn_id); | ||
1449 : | $current_syn = $syn_id; | ||
1450 : | } | ||
1451 : | # Connect the synonym to the peg. | ||
1452 : | $loadIsSynonymGroupFor->Put($syn_id, $peg); | ||
1453 : | # Count this feature. | ||
1454 : | $featureCount++; | ||
1455 : | if ($featureCount % 1000 == 0) { | ||
1456 : | Trace("$featureCount features processed.") if T(3); | ||
1457 : | } | ||
1458 : | parrello | 1.43 | } |
1459 : | } | ||
1460 : | } | ||
1461 : | } | ||
1462 : | # Finish the load. | ||
1463 : | my $retVal = $self->_FinishAll(); | ||
1464 : | return $retVal; | ||
1465 : | } | ||
1466 : | |||
1467 : | parrello | 1.60 | =head3 LoadFamilyData |
1468 : | |||
1469 : | C<< my $stats = $spl->LoadFamilyData(); >> | ||
1470 : | |||
1471 : | Load the protein families into Sprout. | ||
1472 : | |||
1473 : | The following relations are loaded by this method. | ||
1474 : | |||
1475 : | Family | ||
1476 : | parrello | 1.63 | IsFamilyForFeature |
1477 : | parrello | 1.60 | |
1478 : | The source information for these relations is taken from the C<families_for_protein>, | ||
1479 : | C<family_function>, and C<sz_family> methods of the B<FIG> object. | ||
1480 : | |||
1481 : | =over 4 | ||
1482 : | |||
1483 : | =item RETURNS | ||
1484 : | |||
1485 : | Returns a statistics object for the loads. | ||
1486 : | |||
1487 : | =back | ||
1488 : | |||
1489 : | =cut | ||
1490 : | #: Return Type $%; | ||
1491 : | sub LoadFamilyData { | ||
1492 : | # Get this object instance. | ||
1493 : | my ($self) = @_; | ||
1494 : | # Get the FIG object. | ||
1495 : | my $fig = $self->{fig}; | ||
1496 : | # Get the genome hash. | ||
1497 : | my $genomeHash = $self->{genomes}; | ||
1498 : | # Create load objects for the tables we're loading. | ||
1499 : | my $loadFamily = $self->_TableLoader('Family'); | ||
1500 : | parrello | 1.63 | my $loadIsFamilyForFeature = $self->_TableLoader('IsFamilyForFeature'); |
1501 : | parrello | 1.60 | if ($self->{options}->{loadOnly}) { |
1502 : | Trace("Loading from existing files.") if T(2); | ||
1503 : | } else { | ||
1504 : | Trace("Generating family data.") if T(2); | ||
1505 : | # Create a hash for the family IDs. | ||
1506 : | my %familyHash = (); | ||
1507 : | # Loop through the genomes. | ||
1508 : | for my $genomeID (sort keys %{$genomeHash}) { | ||
1509 : | Trace("Processing features for $genomeID.") if T(2); | ||
1510 : | # Loop through this genome's PEGs. | ||
1511 : | for my $fid ($fig->all_features($genomeID, "peg")) { | ||
1512 : | parrello | 1.63 | $loadIsFamilyForFeature->Add("features", 1); |
1513 : | parrello | 1.60 | # Get this feature's families. |
1514 : | my @families = $fig->families_for_protein($fid); | ||
1515 : | # Loop through the families, connecting them to the feature. | ||
1516 : | for my $family (@families) { | ||
1517 : | parrello | 1.63 | $loadIsFamilyForFeature->Put($family, $fid); |
1518 : | parrello | 1.60 | # If this is a new family, create a record for it. |
1519 : | if (! exists $familyHash{$family}) { | ||
1520 : | parrello | 1.62 | $familyHash{$family} = 1; |
1521 : | parrello | 1.60 | $loadFamily->Add("families", 1); |
1522 : | my $size = $fig->sz_family($family); | ||
1523 : | my $func = $fig->family_function($family); | ||
1524 : | parrello | 1.61 | $loadFamily->Put($family, $size, $func); |
1525 : | parrello | 1.60 | } |
1526 : | } | ||
1527 : | } | ||
1528 : | } | ||
1529 : | } | ||
1530 : | # Finish the load. | ||
1531 : | my $retVal = $self->_FinishAll(); | ||
1532 : | return $retVal; | ||
1533 : | } | ||
1534 : | parrello | 1.43 | |
1535 : | parrello | 1.76 | =head3 LoadDrugData |
1536 : | |||
1537 : | C<< my $stats = $spl->LoadDrugData(); >> | ||
1538 : | |||
1539 : | Load the drug target data into Sprout. | ||
1540 : | |||
1541 : | The following relations are loaded by this method. | ||
1542 : | |||
1543 : | PDB | ||
1544 : | parrello | 1.83 | DocksWith |
1545 : | IsProteinForFeature | ||
1546 : | parrello | 1.76 | Ligand |
1547 : | |||
1548 : | parrello | 1.83 | The source information for these relations is taken from attributes. The |
1549 : | C<PDB> attribute links a PDB to a feature, and is used to build B<IsProteinForFeature>. | ||
1550 : | The C<zinc_name> attribute describes the ligands. The C<docking_results> | ||
1551 : | attribute contains the information for the B<DocksWith> relationship. It is | ||
1552 : | expected that additional attributes and tables will be added in the future. | ||
1553 : | parrello | 1.76 | |
1554 : | =over 4 | ||
1555 : | |||
1556 : | =item RETURNS | ||
1557 : | |||
1558 : | Returns a statistics object for the loads. | ||
1559 : | |||
1560 : | =back | ||
1561 : | |||
1562 : | =cut | ||
1563 : | #: Return Type $%; | ||
1564 : | sub LoadDrugData { | ||
1565 : | # Get this object instance. | ||
1566 : | my ($self) = @_; | ||
1567 : | # Get the FIG object. | ||
1568 : | my $fig = $self->{fig}; | ||
1569 : | # Get the genome hash. | ||
1570 : | my $genomeHash = $self->{genomes}; | ||
1571 : | # Create load objects for the tables we're loading. | ||
1572 : | my $loadPDB = $self->_TableLoader('PDB'); | ||
1573 : | my $loadLigand = $self->_TableLoader('Ligand'); | ||
1574 : | parrello | 1.83 | my $loadIsProteinForFeature = $self->_TableLoader('IsProteinForFeature'); |
1575 : | my $loadDocksWith = $self->_TableLoader('DocksWith'); | ||
1576 : | parrello | 1.76 | if ($self->{options}->{loadOnly}) { |
1577 : | Trace("Loading from existing files.") if T(2); | ||
1578 : | } else { | ||
1579 : | Trace("Generating drug target data.") if T(2); | ||
1580 : | parrello | 1.83 | # First comes the "DocksWith" relationship. This will give us a list of PDBs. |
1581 : | # We can also encounter PDBs when we process "IsProteinForFeature". To manage | ||
1582 : | # this process, PDB information is collected in a hash table and then | ||
1583 : | # unspooled after both relationships are created. | ||
1584 : | my %pdbHash = (); | ||
1585 : | Trace("Generating docking data.") if T(2); | ||
1586 : | # Get all the docking data. This may cause problems if there are too many PDBs, | ||
1587 : | # at which point we'll need another algorithm. The indicator that this is | ||
1588 : | # happening will be a timeout error in the next statement. | ||
1589 : | my @dockData = $fig->query_attributes('$key = ? AND $value < ?', | ||
1590 : | ['docking_results', $FIG_Config::dockLimit]); | ||
1591 : | Trace(scalar(@dockData) . " rows of docking data found.") if T(3); | ||
1592 : | for my $dockData (@dockData) { | ||
1593 : | # Get the docking data components. | ||
1594 : | my ($pdbID, $docking_key, @valueData) = @{$dockData}; | ||
1595 : | # Fix the PDB ID. It's supposed to be lower-case, but this does not always happen. | ||
1596 : | $pdbID = lc $pdbID; | ||
1597 : | parrello | 1.84 | # Strip off the object type. |
1598 : | $pdbID =~ s/pdb://; | ||
1599 : | parrello | 1.83 | # Extract the ZINC ID from the docking key. Note that there are two possible |
1600 : | # formats. | ||
1601 : | my (undef, $zinc_id) = $docking_key =~ /^docking_results::(ZINC)?(\d+)$/; | ||
1602 : | if (! $zinc_id) { | ||
1603 : | Trace("Invalid docking result key $docking_key for $pdbID.") if T(0); | ||
1604 : | $loadDocksWith->Add("errors"); | ||
1605 : | parrello | 1.76 | } else { |
1606 : | parrello | 1.83 | # Get the pieces of the value and parse the energy. |
1607 : | # Note that we don't care about the rank, since | ||
1608 : | # we can sort on the energy level itself in our database. | ||
1609 : | my ($energy, $tool, $type) = @valueData; | ||
1610 : | my ($rank, $total, $vanderwaals, $electrostatic) = split /\s*;\s*/, $energy; | ||
1611 : | parrello | 1.84 | # Ignore predicted results. |
1612 : | if ($type ne "Predicted") { | ||
1613 : | # Count this docking result. | ||
1614 : | if (! exists $pdbHash{$pdbID}) { | ||
1615 : | $pdbHash{$pdbID} = 1; | ||
1616 : | } else { | ||
1617 : | $pdbHash{$pdbID}++; | ||
1618 : | } | ||
1619 : | # Write the result to the output. | ||
1620 : | $loadDocksWith->Put($pdbID, $zinc_id, $electrostatic, $type, $tool, | ||
1621 : | $total, $vanderwaals); | ||
1622 : | } | ||
1623 : | parrello | 1.83 | } |
1624 : | } | ||
1625 : | Trace("Connecting features.") if T(2); | ||
1626 : | # Loop through the genomes. | ||
1627 : | for my $genome (sort keys %{$genomeHash}) { | ||
1628 : | Trace("Generating PDBs for $genome.") if T(3); | ||
1629 : | # Get all of the PDBs that BLAST against this genome's features. | ||
1630 : | my @attributeData = $fig->get_attributes("fig|$genome%", 'PDB::%'); | ||
1631 : | for my $pdbData (@attributeData) { | ||
1632 : | # The PDB ID is coded as a subkey. | ||
1633 : | parrello | 1.84 | if ($pdbData->[1] !~ /PDB::(.+)/i) { |
1634 : | parrello | 1.83 | Trace("Invalid PDB ID \"$pdbData->[1]\" in attribute table.") if T(0); |
1635 : | $loadPDB->Add("errors"); | ||
1636 : | } else { | ||
1637 : | my $pdbID = $1; | ||
1638 : | # Insure the PDB is in the hash. | ||
1639 : | if (! exists $pdbHash{$pdbID}) { | ||
1640 : | $pdbHash{$pdbID} = 0; | ||
1641 : | parrello | 1.76 | } |
1642 : | parrello | 1.83 | # The score and locations are coded in the attribute value. |
1643 : | if ($pdbData->[2] !~ /^([^;]+)(.*)$/) { | ||
1644 : | Trace("Invalid PDB data for $pdbID and feature $pdbData->[0].") if T(0); | ||
1645 : | $loadIsProteinForFeature->Add("errors"); | ||
1646 : | } else { | ||
1647 : | my ($score, $locData) = ($1,$2); | ||
1648 : | # The location data may not be present, so we have to start with some | ||
1649 : | # defaults and then check. | ||
1650 : | my ($start, $end) = (1, 0); | ||
1651 : | if ($locData) { | ||
1652 : | $locData =~ /(\d+)-(\d+)/; | ||
1653 : | $start = $1; | ||
1654 : | $end = $2; | ||
1655 : | } | ||
1656 : | # If we still don't have the end location, compute it from | ||
1657 : | # the feature length. | ||
1658 : | if (! $end) { | ||
1659 : | # Most features have one location, but we do a list iteration | ||
1660 : | # just in case. | ||
1661 : | my @locations = $fig->feature_location($pdbData->[0]); | ||
1662 : | $end = 0; | ||
1663 : | for my $loc (@locations) { | ||
1664 : | my $locObject = BasicLocation->new($loc); | ||
1665 : | $end += $locObject->Length; | ||
1666 : | parrello | 1.76 | } |
1667 : | } | ||
1668 : | parrello | 1.83 | # Decode the score. |
1669 : | my $realScore = FIGRules::DecodeScore($score); | ||
1670 : | # Connect the PDB to the feature. | ||
1671 : | $loadIsProteinForFeature->Put($pdbData->[0], $pdbID, $start, $realScore, $end); | ||
1672 : | parrello | 1.76 | } |
1673 : | parrello | 1.83 | } |
1674 : | } | ||
1675 : | } | ||
1676 : | # We've got all our PDBs now, so we unspool them from the hash. | ||
1677 : | Trace("Generating PDBs. " . scalar(keys %pdbHash) . " found.") if T(2); | ||
1678 : | my $count = 0; | ||
1679 : | for my $pdbID (sort keys %pdbHash) { | ||
1680 : | $loadPDB->Put($pdbID, $pdbHash{$pdbID}); | ||
1681 : | $count++; | ||
1682 : | Trace("$count PDBs processed.") if T(3) && ($count % 500 == 0); | ||
1683 : | } | ||
1684 : | # Finally we create the ligand table. This information can be found in the | ||
1685 : | # zinc_name attribute. | ||
1686 : | Trace("Loading ligands.") if T(2); | ||
1687 : | # The ligand list is huge, so we have to get it in pieces. We also have to check for duplicates. | ||
1688 : | my $last_zinc_id = ""; | ||
1689 : | my $zinc_id = ""; | ||
1690 : | my $done = 0; | ||
1691 : | while (! $done) { | ||
1692 : | # Get the next 10000 ligands. We insist that the object ID is greater than | ||
1693 : | # the last ID we processed. | ||
1694 : | Trace("Loading batch starting with ZINC:$zinc_id.") if T(3); | ||
1695 : | my @attributeData = $fig->query_attributes('$object > ? AND $key = ? ORDER BY $object LIMIT 10000', | ||
1696 : | ["ZINC:$zinc_id", "zinc_name"]); | ||
1697 : | Trace(scalar(@attributeData) . " attribute rows returned.") if T(3); | ||
1698 : | if (! @attributeData) { | ||
1699 : | # Here there are no attributes left, so we quit the loop. | ||
1700 : | $done = 1; | ||
1701 : | } else { | ||
1702 : | # Process the attribute data we've received. | ||
1703 : | for my $zinc_data (@attributeData) { | ||
1704 : | # The ZINC ID is found in the first return column, prefixed with the word ZINC. | ||
1705 : | if ($zinc_data->[0] =~ /^ZINC:(\d+)$/) { | ||
1706 : | $zinc_id = $1; | ||
1707 : | # Check for a duplicate. | ||
1708 : | if ($zinc_id eq $last_zinc_id) { | ||
1709 : | $loadLigand->Add("duplicate"); | ||
1710 : | } else { | ||
1711 : | # Here it's safe to output the ligand. The ligand name is the attribute value | ||
1712 : | # (third column in the row). | ||
1713 : | $loadLigand->Put($zinc_id, $zinc_data->[2]); | ||
1714 : | # Insure we don't try to add this ID again. | ||
1715 : | $last_zinc_id = $zinc_id; | ||
1716 : | parrello | 1.76 | } |
1717 : | parrello | 1.83 | } else { |
1718 : | Trace("Invalid zinc ID \"$zinc_data->[0]\" in attribute table.") if T(0); | ||
1719 : | $loadLigand->Add("errors"); | ||
1720 : | parrello | 1.76 | } |
1721 : | } | ||
1722 : | } | ||
1723 : | } | ||
1724 : | parrello | 1.83 | Trace("Ligands loaded.") if T(2); |
1725 : | parrello | 1.76 | } |
1726 : | # Finish the load. | ||
1727 : | my $retVal = $self->_FinishAll(); | ||
1728 : | return $retVal; | ||
1729 : | } | ||
1730 : | parrello | 1.69 | |
1731 : | |||
1732 : | parrello | 1.1 | =head2 Internal Utility Methods |
1733 : | |||
1734 : | parrello | 1.76 | =head3 SpecialAttribute |
1735 : | |||
1736 : | parrello | 1.77 | C<< my $count = SproutLoad::SpecialAttribute($id, \@attributes, $idxMatch, \@idxValues, $pattern, $loader); >> |
1737 : | parrello | 1.76 | |
1738 : | Look for special attributes of a given type. A special attribute is found by comparing one of | ||
1739 : | the columns of the incoming attribute list to a search pattern. If a match is found, then | ||
1740 : | parrello | 1.77 | a set of columns is put into an output table connected to the specified ID. |
1741 : | parrello | 1.76 | |
1742 : | For example, when processing features, the attribute list we look at has three columns: attribute | ||
1743 : | name, attribute value, and attribute value HTML. The IEDB attribute exists if the attribute name | ||
1744 : | begins with C<iedb_>. The call signature is therefore | ||
1745 : | |||
1746 : | parrello | 1.77 | my $found = SpecialAttribute($fid, \@attributeList, 0, [0,2], '^iedb_', $loadFeatureIEDB); |
1747 : | parrello | 1.76 | |
1748 : | The pattern is matched against column 0, and if we have a match, then column 2's value is put | ||
1749 : | to the output along with the specified feature ID. | ||
1750 : | |||
1751 : | =over 4 | ||
1752 : | |||
1753 : | =item id | ||
1754 : | |||
1755 : | ID of the object whose special attributes are being loaded. This forms the first column of the | ||
1756 : | output. | ||
1757 : | |||
1758 : | =item attributes | ||
1759 : | |||
1760 : | Reference to a list of tuples. | ||
1761 : | |||
1762 : | =item idxMatch | ||
1763 : | |||
1764 : | Index in each tuple of the column to be matched against the pattern. If the match is | ||
1765 : | successful, an output record will be generated. | ||
1766 : | |||
1767 : | parrello | 1.77 | =item idxValues |
1768 : | parrello | 1.76 | |
1769 : | parrello | 1.77 | Reference to a list containing the indexes in each tuple of the columns to be put as |
1770 : | the second column of the output. | ||
1771 : | parrello | 1.76 | |
1772 : | =item pattern | ||
1773 : | |||
1774 : | Pattern to be matched against the specified column. The match will be case-insensitive. | ||
1775 : | |||
1776 : | =item loader | ||
1777 : | |||
1778 : | An object to which each output record will be put. Usually this is an B<ERDBLoad> object, | ||
1779 : | but technically it could be anything with a C<Put> method. | ||
1780 : | |||
1781 : | =item RETURN | ||
1782 : | |||
1783 : | Returns a count of the matches found. | ||
1784 : | |||
1785 : | =item | ||
1786 : | |||
1787 : | =back | ||
1788 : | |||
1789 : | =cut | ||
1790 : | |||
1791 : | sub SpecialAttribute { | ||
1792 : | # Get the parameters. | ||
1793 : | parrello | 1.77 | my ($id, $attributes, $idxMatch, $idxValues, $pattern, $loader) = @_; |
1794 : | parrello | 1.76 | # Declare the return variable. |
1795 : | my $retVal = 0; | ||
1796 : | # Loop through the attribute rows. | ||
1797 : | for my $row (@{$attributes}) { | ||
1798 : | # Check for a match. | ||
1799 : | if ($row->[$idxMatch] =~ m/$pattern/i) { | ||
1800 : | parrello | 1.77 | # We have a match, so output a row. This is a bit tricky, since we may |
1801 : | # be putting out multiple columns of data from the input. | ||
1802 : | my $value = join(" ", map { $row->[$_] } @{$idxValues}); | ||
1803 : | $loader->Put($id, $value); | ||
1804 : | parrello | 1.76 | $retVal++; |
1805 : | } | ||
1806 : | } | ||
1807 : | Trace("$retVal special attributes found for $id and loader " . $loader->RelName() . ".") if T(4) && $retVal; | ||
1808 : | # Return the number of matches. | ||
1809 : | return $retVal; | ||
1810 : | } | ||
1811 : | |||
1812 : | parrello | 1.1 | =head3 TableLoader |
1813 : | |||
1814 : | Create an ERDBLoad object for the specified table. The object is also added to | ||
1815 : | the internal list in the C<loaders> property of this object. That enables the | ||
1816 : | L</FinishAll> method to terminate all the active loads. | ||
1817 : | |||
1818 : | This is an instance method. | ||
1819 : | |||
1820 : | =over 4 | ||
1821 : | |||
1822 : | =item tableName | ||
1823 : | |||
1824 : | Name of the table (relation) being loaded. | ||
1825 : | |||
1826 : | =item RETURN | ||
1827 : | |||
1828 : | Returns an ERDBLoad object for loading the specified table. | ||
1829 : | |||
1830 : | =back | ||
1831 : | |||
1832 : | =cut | ||
1833 : | |||
1834 : | sub _TableLoader { | ||
1835 : | # Get the parameters. | ||
1836 : | parrello | 1.85 | my ($self, $tableName) = @_; |
1837 : | parrello | 1.1 | # Create the load object. |
1838 : | parrello | 1.85 | my $retVal = ERDBLoad->new($self->{erdb}, $tableName, $self->{loadDirectory}, $self->LoadOnly); |
1839 : | parrello | 1.1 | # Cache it in the loader list. |
1840 : | push @{$self->{loaders}}, $retVal; | ||
1841 : | # Return it to the caller. | ||
1842 : | return $retVal; | ||
1843 : | } | ||
1844 : | |||
1845 : | =head3 FinishAll | ||
1846 : | |||
1847 : | Finish all the active loads on this object. | ||
1848 : | |||
1849 : | When a load is started by L</TableLoader>, the controlling B<ERDBLoad> object is cached in | ||
1850 : | the list pointed to be the C<loaders> property of this object. This method pops the loaders | ||
1851 : | off the list and finishes them to flush out any accumulated residue. | ||
1852 : | |||
1853 : | This is an instance method. | ||
1854 : | |||
1855 : | =over 4 | ||
1856 : | |||
1857 : | =item RETURN | ||
1858 : | |||
1859 : | Returns a statistics object containing the accumulated statistics for the load. | ||
1860 : | |||
1861 : | =back | ||
1862 : | |||
1863 : | =cut | ||
1864 : | |||
1865 : | sub _FinishAll { | ||
1866 : | # Get this object instance. | ||
1867 : | my ($self) = @_; | ||
1868 : | # Create the statistics object. | ||
1869 : | my $retVal = Stats->new(); | ||
1870 : | # Get the loader list. | ||
1871 : | my $loadList = $self->{loaders}; | ||
1872 : | parrello | 1.48 | # Create a hash to hold the statistics objects, keyed on relation name. |
1873 : | my %loaderHash = (); | ||
1874 : | parrello | 1.1 | # Loop through the list, finishing the loads. Note that if the finish fails, we die |
1875 : | parrello | 1.48 | # ignominiously. At some future point, we want to make the loads more restartable. |
1876 : | parrello | 1.1 | while (my $loader = pop @{$loadList}) { |
1877 : | parrello | 1.26 | # Get the relation name. |
1878 : | parrello | 1.19 | my $relName = $loader->RelName; |
1879 : | parrello | 1.26 | # Check the ignore flag. |
1880 : | if ($loader->Ignore) { | ||
1881 : | Trace("Relation $relName not loaded.") if T(2); | ||
1882 : | } else { | ||
1883 : | # Here we really need to finish. | ||
1884 : | Trace("Finishing $relName.") if T(2); | ||
1885 : | my $stats = $loader->Finish(); | ||
1886 : | parrello | 1.48 | $loaderHash{$relName} = $stats; |
1887 : | } | ||
1888 : | } | ||
1889 : | # Now we loop through again, actually loading the tables. We want to finish before | ||
1890 : | # loading so that if something goes wrong at this point, all the load files are usable | ||
1891 : | # and we don't have to redo all that work. | ||
1892 : | for my $relName (sort keys %loaderHash) { | ||
1893 : | # Get the statistics for this relation. | ||
1894 : | my $stats = $loaderHash{$relName}; | ||
1895 : | # Check for a database load. | ||
1896 : | if ($self->{options}->{dbLoad}) { | ||
1897 : | # Here we want to use the load file just created to load the database. | ||
1898 : | Trace("Loading relation $relName.") if T(2); | ||
1899 : | my $newStats = $self->{sprout}->LoadUpdate(1, [$relName]); | ||
1900 : | # Accumulate the statistics from the DB load. | ||
1901 : | $stats->Accumulate($newStats); | ||
1902 : | } | ||
1903 : | $retVal->Accumulate($stats); | ||
1904 : | Trace("Statistics for $relName:\n" . $stats->Show()) if T(2); | ||
1905 : | parrello | 1.1 | } |
1906 : | # Return the load statistics. | ||
1907 : | return $retVal; | ||
1908 : | } | ||
1909 : | parrello | 1.83 | |
1910 : | parrello | 1.80 | =head3 GetGenomeAttributes |
1911 : | |||
1912 : | parrello | 1.85 | C<< my $aHashRef = GetGenomeAttributes($fig, $genomeID, \@fids, \@propKeys); >> |
1913 : | parrello | 1.80 | |
1914 : | parrello | 1.83 | Return a hash of attributes keyed on feature ID. This method gets all the NMPDR-related |
1915 : | attributes for all the features of a genome in a single call, then organizes them into | ||
1916 : | a hash. | ||
1917 : | parrello | 1.80 | |
1918 : | =over 4 | ||
1919 : | |||
1920 : | =item fig | ||
1921 : | |||
1922 : | FIG-like object for accessing attributes. | ||
1923 : | |||
1924 : | =item genomeID | ||
1925 : | |||
1926 : | ID of the genome who's attributes are desired. | ||
1927 : | |||
1928 : | =item fids | ||
1929 : | |||
1930 : | Reference to a list of the feature IDs whose attributes are to be kept. | ||
1931 : | |||
1932 : | parrello | 1.85 | =item propKeys |
1933 : | |||
1934 : | A list of the keys to retrieve. | ||
1935 : | |||
1936 : | parrello | 1.80 | =item RETURN |
1937 : | |||
1938 : | Returns a reference to a hash. The key of the hash is the feature ID. The value is the | ||
1939 : | reference to a list of the feature's attribute tuples. Each tuple contains the feature ID, | ||
1940 : | the attribute key, and one or more attribute values. | ||
1941 : | |||
1942 : | =back | ||
1943 : | |||
1944 : | =cut | ||
1945 : | |||
1946 : | sub GetGenomeAttributes { | ||
1947 : | # Get the parameters. | ||
1948 : | parrello | 1.85 | my ($fig, $genomeID, $fids, $propKeys) = @_; |
1949 : | parrello | 1.80 | # Declare the return variable. |
1950 : | my $retVal = {}; | ||
1951 : | # Initialize the hash. This not only enables us to easily determine which FIDs to | ||
1952 : | # keep, it insures that the caller sees a list reference for every known fid, | ||
1953 : | # simplifying the logic. | ||
1954 : | for my $fid (@{$fids}) { | ||
1955 : | $retVal->{$fid} = []; | ||
1956 : | } | ||
1957 : | parrello | 1.85 | # Get the attributes. If ev_code_cron is running, we may get a timeout error, so |
1958 : | # an eval is used. | ||
1959 : | my @aList = (); | ||
1960 : | eval { | ||
1961 : | @aList = $fig->get_attributes("fig|$genomeID%", $propKeys); | ||
1962 : | Trace(scalar(@aList) . " attributes returned for genome $genomeID.") if T(3); | ||
1963 : | }; | ||
1964 : | # Check for a problem. | ||
1965 : | if ($@) { | ||
1966 : | Trace("Retrying attributes for $genomeID due to error: $@") if T(1); | ||
1967 : | # Our fallback plan is to process the attributes in blocks of 100. This is much slower, | ||
1968 : | # but allows us to continue processing. | ||
1969 : | my $nFids = scalar @{$fids}; | ||
1970 : | for (my $i = 0; $i < $nFids; $i += 100) { | ||
1971 : | # Determine the index of the last feature ID we'll be specifying on this pass. | ||
1972 : | # Normally it's $i + 99, but if we're close to the end it may be less. | ||
1973 : | my $end = ($i + 100 > $nFids ? $nFids - 1 : $i + 99); | ||
1974 : | # Get a slice of the fid list. | ||
1975 : | my @slice = @{$fids}[$i .. $end]; | ||
1976 : | # Get the relevant attributes. | ||
1977 : | Trace("Retrieving attributes for fids $i to $end.") if T(3); | ||
1978 : | my @aShort = $fig->get_attributes(\@slice, $propKeys); | ||
1979 : | Trace(scalar(@aShort) . " attributes returned for fids $i to $end.") if T(3); | ||
1980 : | push @aList, @aShort; | ||
1981 : | } | ||
1982 : | } | ||
1983 : | # Now we should have all the interesting attributes in @aList. Populate the hash with | ||
1984 : | # them. | ||
1985 : | parrello | 1.80 | for my $aListEntry (@aList) { |
1986 : | my $fid = $aListEntry->[0]; | ||
1987 : | if (exists $retVal->{$fid}) { | ||
1988 : | push @{$retVal->{$fid}}, $aListEntry; | ||
1989 : | } | ||
1990 : | } | ||
1991 : | # Return the result. | ||
1992 : | return $retVal; | ||
1993 : | } | ||
1994 : | parrello | 1.1 | |
1995 : | parrello | 1.85 | |
1996 : | parrello | 1.1 | 1; |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |