[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.16 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingGenomeLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 : parrello 1.2 use AliasAnalysis;
29 : parrello 1.3 use base qw(SaplingDataLoader);
30 : parrello 1.1
31 :     =head1 Sapling Genome Loader
32 :    
33 :     This package contains the methods used to load a genome into the Sapling from a genome
34 :     directory. This is not very efficient if you are trying to create a full database,
35 :     but is useful if you wish to add genomes one at a time. The information loaded will
36 :     include the basic genome data, the contigs and the DNA, subsystems, and the features.
37 :    
38 :     =head2 Main Methods
39 :    
40 :     =head3 Load
41 :    
42 :     my $stats = SaplingGenomeLoader::Load($sap, $genome, $directory);
43 :    
44 :     Load a genome from a genome directory into the sapling database.
45 :    
46 :     =over 4
47 :    
48 :     =item sap
49 :    
50 :     L<Sapling> object used to access the target database.
51 :    
52 :     =item genome
53 :    
54 :     ID of the genome being loaded.
55 :    
56 :     =item directory
57 :    
58 :     Name of the directory containing the genome information.
59 :    
60 : olson 1.15 =item disconnected
61 :    
62 :     True if the application is disconnected from the network - do not
63 :     attempt to contact a SAP server for more data.
64 :    
65 : parrello 1.11 =item assignHash
66 :    
67 :     Hash of feature IDs to functional assignments. Deleted features are removed, which
68 :     means only features listed in this hash can be legally inserted into the database.
69 :    
70 : parrello 1.1 =back
71 :    
72 :     =cut
73 :    
74 :     sub Load {
75 :     # Get the parameters.
76 : olson 1.15 my ($sap, $genome, $directory, $disconnected) = @_;
77 : parrello 1.1 # Create the loader object.
78 : olson 1.15 my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory, $disconnected);
79 : parrello 1.1 # Load the contigs.
80 : parrello 1.12 Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
81 : parrello 1.1 $loaderObject->LoadContigs();
82 :     # Load the features.
83 : parrello 1.12 Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
84 : parrello 1.1 $loaderObject->LoadFeatures();
85 : parrello 1.7 # Check for annotation history. If we have it, load the history records into the
86 :     # database.
87 :     if (-f "$directory/annotations") {
88 : parrello 1.12 Trace("Processing annotations.") if T(SaplingDataLoader => 3);
89 : parrello 1.7 $loaderObject->LoadAnnotations("$directory/annotations");
90 :     }
91 : parrello 1.1 # Load the subsystem bindings.
92 : parrello 1.12 Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
93 : parrello 1.1 $loaderObject->LoadSubsystems();
94 :     # Create the Genome record and taxonomy information.
95 : parrello 1.12 Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
96 : parrello 1.1 $loaderObject->CreateGenome();
97 :     # Return the statistics.
98 :     return $loaderObject->{stats};
99 :     }
100 :    
101 :     =head3 ClearGenome
102 :    
103 :     my $stats = SaplingGenomeObject::ClearGenome($sap, $genome);
104 :    
105 :     Delete the specified genome and all the related records from the specified sapling
106 :     database. This method can also be used to clean up after a failed or aborted load.
107 :    
108 :     =over 4
109 :    
110 :     =item sap
111 :    
112 :     L<Sapling> object used to access the target database.
113 :    
114 :     =item genome
115 :    
116 :     ID of the genome to delete.
117 :    
118 :     =item RETURN
119 :    
120 :     Returns a statistics object counting the records deleted.
121 :    
122 :     =back
123 :    
124 :     =cut
125 :    
126 :     sub ClearGenome {
127 :     # Get the parameters.
128 :     my ($sap, $genome) = @_;
129 :     # Create the statistics object.
130 :     my $stats = Stats->new();
131 : parrello 1.9 # Delete the DNA sequences.
132 :     my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
133 :     for my $seq (@seqs) {
134 :     my $delStats = $sap->Delete(DNASequence => $seq);
135 :     $stats->Accumulate($delStats);
136 :     }
137 : parrello 1.1 # Delete the contigs.
138 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
139 : parrello 1.1 # Delete the features.
140 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
141 : parrello 1.1 # Delete the molecular machines.
142 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
143 : parrello 1.1 # Delete the genome itself.
144 :     my $subStats = $sap->Delete(Genome => $genome);
145 :     # Accumulate the statistics from the delete.
146 :     $stats->Accumulate($subStats);
147 :     # Return the statistics object.
148 :     return $stats;
149 :     }
150 :    
151 : parrello 1.7
152 :     =head3 Process
153 :    
154 :     my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
155 :    
156 :     Load genome data from the specified directory. If the genome data already
157 :     exists in the database, it will be deleted first.
158 :    
159 :     =over 4
160 :    
161 :     =item sap
162 :    
163 :     L</Sapling> object for accessing the database.
164 :    
165 :     =item genome
166 :    
167 :     ID of the genome whose data is being loaded.
168 :    
169 :     =item directory
170 :    
171 :     Name of the directory containing the genome data files.
172 :    
173 : olson 1.16 =item disconnected
174 :    
175 :     True if the application is disconnected from the network - do not
176 :     attempt to contact a SAP server for more data.
177 :    
178 : parrello 1.7 =item RETURN
179 :    
180 :     Returns a statistics object describing the activity during the reload.
181 :    
182 :     =back
183 :    
184 :     =cut
185 :    
186 :     sub Process {
187 :     # Get the parameters.
188 : olson 1.16 my ($sap, $genome, $directory, $disconnected) = @_;
189 : parrello 1.7 # Clear the existing data for the specified genome.
190 :     my $stats = ClearGenome($sap, $genome);
191 :     # Load the new expression data from the specified directory.
192 : olson 1.16 my $newStats = Load($sap, $genome, $directory, $disconnected);
193 : parrello 1.7 # Merge the statistics.
194 :     $stats->Accumulate($newStats);
195 :     # Return the result.
196 :     return $stats;
197 :     }
198 :    
199 :    
200 : parrello 1.1 =head2 Loader Object Methods
201 :    
202 :     =head3 new
203 :    
204 : olson 1.15 my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory, $disconnected);
205 : parrello 1.1
206 :     Create a loader object that can be used to facilitate loading genome data from a
207 :     directory.
208 :    
209 :     =over 4
210 :    
211 :     =item sap
212 :    
213 :     L<Sapling> object used to access the target database.
214 :    
215 :     =item genome
216 :    
217 :     ID of the genome being loaded.
218 :    
219 :     =item directory
220 :    
221 :     Name of the directory containing the genome information.
222 :    
223 : olson 1.15 =item disconnected
224 :    
225 :     Set to a true value if the application should be considered to be disconnected
226 :     from the network - that is, do not attempt to connect to a Sapling server
227 :     to load subsystem data.
228 :    
229 : parrello 1.1 =back
230 :    
231 :     The object created contains the following fields.
232 :    
233 :     =over 4
234 :    
235 :     =item directory
236 :    
237 :     Name of the directory containing the genome data.
238 :    
239 :     =item genome
240 :    
241 :     ID of the genome being loaded.
242 :    
243 :     =item supportRecords
244 :    
245 :     A hash of hashes, used to track the support records known to exist in the database.
246 :    
247 :     =item sap
248 :    
249 :     L<Sapling> object used to access the database.
250 :    
251 :     =item stats
252 :    
253 :     L<Stats> object for tracking statistical information about the load.
254 :    
255 :     =back
256 :    
257 :     =cut
258 :    
259 :     sub new {
260 :     # Get the parameters.
261 : olson 1.15 my ($class, $sap, $genome, $directory, $disconnected) = @_;
262 : parrello 1.1 # Create the object.
263 : parrello 1.3 my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
264 :     # Add our specialized data.
265 :     $retVal->{genome} = $genome;
266 :     $retVal->{directory} = $directory;
267 : parrello 1.11 # Leave the assignment hash undefined until we populate it.
268 :     $retVal->{assignHash} = undef;
269 : olson 1.15 $retVal->{disconnected} = defined($disconnected) ? 1 : 0;
270 : parrello 1.3 # Return the result.
271 : parrello 1.1 return $retVal;
272 :     }
273 :    
274 :     =head3 LoadContigs
275 :    
276 :     $loaderObject->LoadContigs();
277 :    
278 :     Load the contig information into the database. This includes the contigs themselves and
279 : parrello 1.14 the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
280 :     number of base pairs as the C<dna> statistic, and the number of GC instances as the
281 :     C<gc_content> statistic.
282 : parrello 1.1
283 :     =cut
284 :    
285 :     sub LoadContigs {
286 :     # Get this object.
287 :     my ($self) = @_;
288 :     # Open the contigs file from the genome directory.
289 :     my $ih = Open(undef, "<$self->{directory}/contigs");
290 :     # Get the length of a DNA segment.
291 :     my $segmentLength = $self->{sap}->TuningParameter('maxSequenceLength');
292 :     # Get the genome ID.
293 :     my $genome = $self->{genome};
294 :     # These variables will contain the current contig ID, the current contig length,
295 :     # the ordinal of the current chunk, the accumulated DNA sequence, and its length.
296 :     my ($contigID, $contigLen, $ordinal, $chunk, $chunkLen) = (undef, 0, 0, '', 0);
297 :     # Loop through the contig file.
298 :     while (! eof $ih) {
299 :     # Get the current record.
300 :     my $line = <$ih>; chomp $line;
301 :     # Is this the start of a new contig?
302 :     if ($line =~ /^>(\S+)/) {
303 :     # Yes. Save the contig ID.
304 :     my $newContigID = $1;
305 :     # Is there a current contig?
306 :     if (defined $contigID) {
307 :     # Yes. Output the current chunk.
308 :     $self->OutputChunk($contigID, $ordinal, $chunk);
309 :     # Output the contig.
310 :     $self->OutputContig($contigID, $contigLen);
311 :     }
312 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
313 :     $self->FixContig(\$newContigID);
314 :     # Initialize the new contig.
315 :     $contigID = $newContigID;
316 :     $contigLen = 0;
317 :     $ordinal = 0;
318 :     $chunk = '';
319 :     $chunkLen = 0;
320 :     } else {
321 :     # Here we have more DNA in the current contig. Are we at the end of
322 :     # the current chunk?
323 :     my $lineLen = length($line);
324 :     my $newChunkLen = $chunkLen + $lineLen;
325 :     if ($newChunkLen < $segmentLength) {
326 :     # No. Add this line to the chunk.
327 :     $chunk .= $line;
328 :     $chunkLen += $lineLen;
329 :     } else {
330 :     # Yes. Create the actual chunk.
331 :     $chunk .= substr($line, 0, $segmentLength - $chunkLen);
332 :     # Write it out.
333 :     $self->OutputChunk($contigID, $ordinal, $chunk);
334 :     # Set up the new chunk.
335 :     $chunk = substr($line, $segmentLength - $chunkLen);
336 :     $ordinal++;
337 :     $chunkLen = length($chunk);
338 :     }
339 :     # Update the contig length.
340 :     $contigLen += $lineLen;
341 :     }
342 :     }
343 :     # Is there a current contig?
344 :     if (defined $contigID) {
345 :     # Yes. Output the residual chunk.
346 :     $self->OutputChunk($contigID, $ordinal, $chunk);
347 :     # Output the contig itself.
348 :     $self->OutputContig($contigID, $contigLen);
349 :     }
350 :     }
351 :    
352 :     =head3 OutputChunk
353 :    
354 :     $loaderObject->OutputChunk($contigID, $ordinal, $chunk);
355 :    
356 :     Output a chunk of DNA for the specified contig.
357 :    
358 :     =over 4
359 :    
360 :     =item contigID
361 :    
362 :     ID of the contig being output.
363 :    
364 :     =item ordinal
365 :    
366 :     Ordinal number of this chunk.
367 :    
368 :     =item chunk
369 :    
370 :     DNA sequence comprising this chunk.
371 :    
372 :     =back
373 :    
374 :     =cut
375 :    
376 :     sub OutputChunk {
377 :     # Get the parameters.
378 :     my ($self, $contigID, $ordinal, $chunk) = @_;
379 :     # Get the sapling object.
380 :     my $sap = $self->{sap};
381 :     # Compute the chunk ID.
382 :     my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
383 :     # Connect this sequence to the contig.
384 : parrello 1.2 $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
385 : parrello 1.1 # Create the DNA sequence.
386 :     $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
387 :     # Record the chunk.
388 :     $self->{stats}->Add(chunks => 1);
389 : parrello 1.14 # Update the GC count.
390 :     $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
391 : parrello 1.1 }
392 :    
393 :     =head3 OutputContig
394 :    
395 :     $loaderObject->OutputContig($contigID, $contigLen);
396 :    
397 :     Write out the current contig.
398 :    
399 :     =over 4
400 :    
401 :     =item contigID
402 :    
403 :     ID of the contig being written out.
404 :    
405 :     =item contigLen
406 :    
407 :     Length of the contig in base pairs.
408 :    
409 :     =back
410 :    
411 :     =cut
412 :    
413 :     sub OutputContig {
414 :     # Get the parameters.
415 :     my ($self, $contigID, $contigLen) = @_;
416 :     # Get the sapling object.
417 :     my $sap = $self->{sap};
418 :     # Connect the contig to the genome.
419 :     $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
420 :     # Output the contig record.
421 :     $sap->InsertObject('Contig', id => $contigID, length => $contigLen);
422 :     # Record the contig.
423 :     $self->{stats}->Add(contigs => 1);
424 :     $self->{stats}->Add(dna => $contigLen);
425 :     }
426 :    
427 :     =head3 LoadFeatures
428 :    
429 :     $loaderObject->LoadFeatures();
430 :    
431 :     Load the feature data into the database. This includes all of the features,
432 :     their protein sequences, and their subsystem information. The number of
433 :     features of each type will be stored in the statistics object, identified by
434 :     the feature type.
435 :    
436 :     =cut
437 :    
438 :     sub LoadFeatures {
439 :     # Get the parameters.
440 :     my ($self) = @_;
441 : parrello 1.7 # Read in the functional assignments.
442 : parrello 1.12 Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
443 : parrello 1.7 my $assignHash = $self->ReadAssignments();
444 : parrello 1.11 $self->{assignHash} = $assignHash;
445 : parrello 1.1 # Get the directory of feature types.
446 :     my $featureDir = "$self->{directory}/Features";
447 :     my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
448 : parrello 1.11 # Check for protein sequences. If we have some, load them into a hash.
449 :     my $protHash = {};
450 :     if (-f "$featureDir/peg/fasta") {
451 : parrello 1.12 Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
452 : parrello 1.11 $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
453 :     }
454 : parrello 1.1 # Create the feature records for the types found.
455 :     for my $type (@types) {
456 :     # Insure this is a genuine feature directory.
457 :     if (-f "$featureDir/$type/tbl") {
458 : parrello 1.11 # Yes. Read in the evidence codes (if any).
459 :     my $evHash = {};
460 :     my $tranFile = "$featureDir/$type/Attributes/transaction_log";
461 :     if (-f $tranFile) {
462 :     $evHash = $self->LoadEvidenceCodes($tranFile);
463 :     }
464 :     # Now load the feature data.
465 :     $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
466 : parrello 1.1 }
467 :     }
468 : parrello 1.11 }
469 :    
470 :     =head3 LoadEvidenceCodes
471 :    
472 :     my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
473 :    
474 :     Load the evidence codes from the specified attribute transaction log file into a
475 :     hash. The log file is in tab-delimited format. The first column contains the
476 :     transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
477 :     the third an attribute name (we'll ignore everything but C<evidence_code>), and
478 :     the fourth the attribute value.
479 :    
480 :     =over 4
481 :    
482 :     =item attributeFile
483 :    
484 :     Name of the attribute transaction log file.
485 :    
486 :     =item RETURN
487 :    
488 :     Returns a reference to a hash mapping each feature ID to a comma-delimited list
489 :     of its evidence codes.
490 :    
491 :     =back
492 :    
493 :     =cut
494 :    
495 :     sub LoadEvidenceCodes {
496 :     # Get the parameters.
497 :     my ($self, $attributeFile) = @_;
498 :     # Get the Sapling database.
499 :     my $sap = $self->{sap};
500 :     # Get the statistics object.
501 :     my $stats = $self->{stats};
502 :     # Get the assignment hash: we use this to filter the feature IDs.
503 :     my $assignHash = $self->{assignHash};
504 :     # Open the attribute log file for input.
505 :     my $ih = Open(undef, "<$attributeFile");
506 :     # This two-dimensional hash will hold the evidence codes for each feature.
507 :     my %retVal;
508 :     # Loop through the attribute log file.
509 :     while (! eof $ih) {
510 :     # Get the current attribute record.
511 :     my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
512 :     $stats->Add(attributeLine => 1);
513 :     # Insure we have all the pieces we need.
514 :     if (! $command || ! $fid || $key ne 'evidence_code') {
515 :     $stats->Add(attributeLineSkipped => 1);
516 :     } elsif (! $assignHash->{$fid}) {
517 :     # Here the attribute is for a deleted feature.
518 :     $stats->Add(attributeFidSkipped => 1);
519 :     } else {
520 :     # Get the sub-hash for this feature.
521 :     if (! exists $retVal{$fid}) {
522 :     $retVal{$fid} = {};
523 :     }
524 :     my $featureSubHash = $retVal{$fid};
525 :     # Process according to the command.
526 :     if ($command eq 'ADD') {
527 :     # Here we are adding a new evidence code.
528 :     $featureSubHash->{$value} = 1;
529 :     $stats->Add(attributeAdd => 1);
530 :     } elsif ($command eq 'DELETE') {
531 :     # Here we are deleting an evidence code.
532 :     delete $featureSubHash->{$value};
533 :     $stats->Add(attributeDelete => 1);
534 :     } else {
535 :     # here we have an unrecognized command.
536 :     $stats->Add(attributeCommandSkip => 1);
537 :     }
538 :     }
539 : parrello 1.1 }
540 : parrello 1.11 # Loop through the hash, converting each sub-hash to a comma-delimited list of
541 :     # evidence codes.
542 :     for my $fid (keys %retVal) {
543 :     $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
544 : parrello 1.6 }
545 : parrello 1.11 # Return the evidence hash.
546 :     return \%retVal;
547 : parrello 1.1 }
548 :    
549 : parrello 1.11
550 : parrello 1.1 =head3 LoadFeatureData
551 :    
552 : parrello 1.11 $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
553 : parrello 1.1
554 :     Load the basic data for each feature into the database. The number of features of
555 :     the type found will be recorded in the statistics object.
556 :    
557 :     =over 4
558 :    
559 :     =item featureDir
560 :    
561 :     Name of the main directory containing all the feature type subdirectories.
562 :    
563 :     =item type
564 :    
565 :     Type of feature to load.
566 :    
567 : parrello 1.11 =item protHash
568 :    
569 :     Reference to a hash mapping each feature ID for a protein-encoding gene to
570 :     its protein sequence.
571 :    
572 :     =item evHash
573 : parrello 1.7
574 : parrello 1.11 Reference to a hash mapping each feature ID to a comma-delimited list of
575 :     its evidence codes (if any).
576 : parrello 1.7
577 : parrello 1.1 =back
578 :    
579 :     =cut
580 :    
581 :     sub LoadFeatureData {
582 :     # Get the parameters.
583 : parrello 1.11 my ($self, $featureDir, $type, $protHash, $evHash) = @_;
584 : parrello 1.1 # Get the sapling database.
585 :     my $sap = $self->{sap};
586 :     # Get the statistics object.
587 :     my $stats = $self->{stats};
588 : parrello 1.11 # Get the assignment hash. This tells us our functional assignments. This method is
589 :     # also where we remove the deleted features from it.
590 :     my $assignHash = $self->{assignHash};
591 : parrello 1.1 # This hash will track the features we've created. If a feature is found a second
592 :     # time, it overwrites the original.
593 : parrello 1.11 my %fidHash;
594 : parrello 1.6 # Finally, we need the timestamp hash. The initial feature population
595 : parrello 1.1 # Insure we have a tbl file for this feature type.
596 :     my $fileName = "$featureDir/$type/tbl";
597 : olson 1.15 my %deleted_features;
598 : parrello 1.1 if (-f $fileName) {
599 : parrello 1.3 # We have one, so we can read through it. First, however, we need to get the list
600 : parrello 1.11 # of deleted features and remove them from the assignment hash. This insures
601 :     # that they are not used by subsequent methods.
602 : parrello 1.3 my $deleteFile = "$featureDir/$type/deleted.features";
603 :     if (-f $deleteFile) {
604 : parrello 1.11 my $dh = Open(undef, "<$deleteFile");
605 :     while (! eof $dh) {
606 :     my ($deletedFid) = Tracer::GetLine($dh);
607 :     if (exists $assignHash->{$deletedFid}) {
608 :     delete $assignHash->{$deletedFid};
609 :     $stats->Add(deletedFid => 1);
610 : olson 1.15 $deleted_features{$deletedFid} = 1;
611 : parrello 1.11 }
612 :     }
613 : parrello 1.3 }
614 :     # Open the main file for input.
615 : parrello 1.12 Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
616 : parrello 1.1 my $ih = Open(undef, "<$fileName");
617 :     while (! eof $ih) {
618 :     # Read this feature's information.
619 :     my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
620 : parrello 1.3 # Only proceed if the feature is NOT deleted.
621 : olson 1.15 if (!$deleted_features{$fid}) {
622 : parrello 1.3 # If the feature already exists, delete it. (This should be extremely rare.)
623 : parrello 1.11 if ($fidHash{$fid}) {
624 : parrello 1.3 $sap->Delete(Feature => $fid);
625 :     $stats->Add(duplicateFid => 1);
626 : parrello 1.1 }
627 : parrello 1.11 # If this is RNA, the alias list is always empty. Sometimes, the functional
628 :     # assignment is found there.
629 :     if ($type eq 'rna') {
630 :     if (! $assignHash->{$fid}) {
631 :     $assignHash->{$fid} = $aliases[0];
632 : parrello 1.2 }
633 : parrello 1.11 @aliases = ();
634 : parrello 1.2 }
635 : parrello 1.11 # Add the feature to the database.
636 : olson 1.16 my $function = $assignHash->{$fid} // "";
637 :     $self->AddFeature($fid, $function, $locations, \@aliases,
638 : parrello 1.11 $protHash->{$fid}, $evHash->{$fid});
639 :     # Denote we've added this feature, so that if a duplicate occurs we're ready.
640 :     $fidHash{$fid} = 1;
641 : parrello 1.2 }
642 : parrello 1.1 }
643 :     }
644 :     }
645 :    
646 : parrello 1.11
647 : parrello 1.1 =head3 LoadProteinData
648 :    
649 : parrello 1.11 my $protHash = $self->LoadProteinData($fileName);
650 : parrello 1.1
651 :     Load the protein sequences from the named FASTA file. The sequences will be stored
652 : parrello 1.11 in a hash by FIG feature ID.
653 : parrello 1.1
654 :     =over 4
655 :    
656 :     =item fileName
657 :    
658 :     Name of the FASTA file containing the protein sequences for this genome.
659 :    
660 : parrello 1.11 =item RETURN
661 :    
662 :     Returns a hash mapping feature IDs to protein sequences.
663 :    
664 : parrello 1.1 =back
665 :    
666 :     =cut
667 :    
668 :     sub LoadProteinData {
669 :     # Get the parameters.
670 :     my ($self, $fileName) = @_;
671 :     # Open the FASTA file for input.
672 :     my $ih = Open(undef, "<$fileName");
673 : parrello 1.11 # Create the return hash.
674 :     my $retVal = {};
675 : parrello 1.1 # We'll track the current protein in here.
676 :     my $fid;
677 :     my $sequence = "";
678 :     # Loop through the input file.
679 :     while (! eof $ih) {
680 :     # Get the current record.
681 :     my $line = <$ih>; chomp $line;
682 :     # Is this a label record.
683 :     if ($line =~ /^>(\S+)/) {
684 :     # Yes. Save the new feature ID.
685 :     my $newFid = $1;
686 :     # Do we have an existing protein?
687 :     if (defined $fid) {
688 : parrello 1.11 # Yes. Store it in the hash.
689 :     $retVal->{$fid} = $sequence;
690 : parrello 1.1 }
691 :     # Initialize for the next protein.
692 :     $fid = $newFid;
693 :     $sequence = "";
694 :     } else {
695 :     # Here we have more letters for the current protein.
696 :     $sequence .= $line;
697 :     }
698 :     }
699 :     # Do we have a residual protein.
700 :     if (defined $fid) {
701 : parrello 1.11 # Yes. Store it in the hash.
702 :     $retVal->{$fid} = $sequence;
703 : parrello 1.1 }
704 : parrello 1.11 # Return the hash.
705 :     return $retVal;
706 : parrello 1.1 }
707 :    
708 : parrello 1.6
709 :     =head3 LoadAnnotations
710 :    
711 :     $loaderObject->LoadAnnotations($fileName);
712 :    
713 :     Read in the annotation history information and use it to create annotation records.
714 :    
715 :     =over 4
716 :    
717 :     =item fileName
718 :    
719 :     Name of the annotation history file. This file is formatted with four fields per
720 :     record. Each field is on a separate line, with a double slash (C<//>) used as the
721 :     line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
722 :     (formatted as an integer), (2) the user name, and (3) the annotation text.
723 :    
724 :     =back
725 :    
726 :     =cut
727 :    
728 :     sub LoadAnnotations {
729 :     # Get the parameters.
730 :     my ($self, $fileName) = @_;
731 : parrello 1.11 # Get the assignment Hash. We use this to filter out deleted features.
732 :     my $assignHash = $self->{assignHash};
733 : parrello 1.6 # Get the Sapling database.
734 :     my $sap = $self->{sap};
735 :     # Get the statistics object.
736 :     my $stats = $self->{stats};
737 :     # Open the input file.
738 :     my $ih = Tracer::Open(undef, "<$fileName");
739 :     # Loop through the input.
740 :     while (! eof $ih) {
741 :     # Read in the peg, timestamp, and user ID.
742 :     my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
743 : parrello 1.11 # Only proceed if the feature is not deleted.
744 :     if ($assignHash->{$fid}) {
745 :     # Add the annotation to this feature.
746 :     $self->MakeAnnotation($fid, $text, $user, $timestamp);
747 : parrello 1.6 }
748 :     }
749 :     }
750 :    
751 :    
752 : parrello 1.1 =head3 WriteProtein
753 :    
754 :     $loaderObject->WriteProtein($fid, $sequence);
755 :    
756 :     Write out the specified protein sequence and associate it with the specified feature.
757 :     This requires checking to see if the protein sequence is already in the database.
758 :    
759 :     =over 4
760 :    
761 :     =item fid
762 :    
763 :     ID of the feature with the specified protein sequence.
764 :    
765 :     =item sequence
766 :    
767 :     Protein sequence for the indicated feature.
768 :    
769 :     =back
770 :    
771 :     =cut
772 :    
773 :     sub WriteProtein {
774 :     # Get the parameters.
775 :     my ($self, $fid, $sequence) = @_;
776 :     # Compute the key of the protein sequence.
777 : parrello 1.8 my $protID = $self->{sap}->ProteinID($sequence);
778 : parrello 1.1 # Insure the protein exists.
779 :     $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
780 :     # Connect the feature to it.
781 :     $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
782 :     }
783 :    
784 :     =head3 LoadSubsystems
785 :    
786 :     $loaderObject->LoadSubsystems();
787 :    
788 :     Load the subsystem data into the database. This includes all of the subsystems,
789 :     their categories, roles, and the bindings that connect them to this genome's features.
790 :    
791 :     =cut
792 :    
793 :     sub LoadSubsystems {
794 :     # Get the parameters.
795 :     my ($self) = @_;
796 : olson 1.15
797 :     #
798 :     # If we are running in disconnected mode, do not actually load subsystems.
799 :     # They rely too much on information from the external sapling.
800 :     #
801 :     if ($self->{disconnected})
802 :     {
803 :     return;
804 :     }
805 :    
806 : parrello 1.1 # Get the sapling object.
807 :     my $sap = $self->{sap};
808 :     # Get the statistics object.
809 :     my $stats = $self->{stats};
810 :     # Get the genome ID.
811 :     my $genome = $self->{genome};
812 :     # Connect to the Sapling server so we can get the subsystem variant and role information.
813 :     my $sapO = SAPserver->new();
814 :     # This hash maps subsystem IDs to molecular machine IDs.
815 :     my %machines;
816 :     # This hash maps subsystem/role pairs to machine role IDs.
817 :     my %machineRoles;
818 :     # The first step is to create the subsystems themselves and connect them to the
819 :     # genome. A list is given in the subsystems file of the Subsystems directory.
820 :     my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
821 :     # Create the field hash for the subsystem query.
822 :     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)
823 :     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)
824 :     Subsystem(version)
825 :     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)
826 :     Includes(sequence)
827 :     Role(id) Role(hypothetical) Role(role-index));
828 :     my %fields = map { $_ => $_ } @fields;
829 :     # Loop through the subsystems in the file, insuring we have them in the database.
830 :     while (! eof $ih) {
831 :     # Get this subsystem.
832 :     my ($subsystem, $variant) = Tracer::GetLine($ih);
833 : parrello 1.12 Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
834 : parrello 1.1 # Normalize the subsystem name.
835 :     $subsystem = $sap->SubsystemID($subsystem);
836 :     # Compute this subsystem's MD5.
837 :     my $subsystemMD5 = ERDB::DigestKey($subsystem);
838 :     # Insure the subsystem is in the database.
839 :     if (! $sap->Exists(Subsystem => $subsystem)) {
840 :     # The subsystem isn't present, so we need to read it. We get the subsystem,
841 :     # its roles, and its classification information. The variant will be added
842 :     # later if we need it.
843 :     my $roleList = $sapO->get(-objects => "Subsystem Includes Role",
844 :     -filter => {"Subsystem(id)" => ["=", $subsystem] },
845 :     -fields => \%fields);
846 :     # Only proceed if we found some roles.
847 :     if (@$roleList > 0) {
848 :     # Get the subsystem information from the first role and create the subsystem.
849 :     my $roleH = $roleList->[0];
850 : parrello 1.3 my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
851 : parrello 1.1 $sap->InsertObject('Subsystem', %subFields);
852 : parrello 1.10 $stats->Add(subsystems => 1);
853 : parrello 1.1 # Now loop through the roles. The Includes records are always inserted, but the
854 :     # roles are only inserted if they don't already exist.
855 :     for $roleH (@$roleList) {
856 :     # Create the Includes record.
857 : parrello 1.3 my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);
858 : parrello 1.1 $sap->InsertObject('Includes', %incFields);
859 :     # Insure we have the role in place.
860 : parrello 1.3 my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);
861 : parrello 1.1 my $roleID = $roleFields{id};
862 :     delete $roleFields{id};
863 :     $self->InsureEntity('Role', $roleID, %roleFields);
864 :     # Compute the machine-role ID.
865 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
866 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
867 : parrello 1.10 $stats->Add(subsystemRoles => 1);
868 : parrello 1.1 }
869 :     }
870 :     } else {
871 :     # We already have the subsystem in the database, but we still need to compute the
872 :     # machine role IDs. We need information from the sapling server for this.
873 :     my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);
874 :     # Loop through the roles.
875 :     my $roleList = $subHash->{$subsystem};
876 :     for my $roleTuple (@$roleList) {
877 :     my ($roleID, $abbr) = @$roleTuple;
878 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
879 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
880 :     }
881 :     }
882 :     # Now we need to connect the variant to the subsystem. First, we compute the real
883 :     # variant code by removing the asterisks.
884 :     my $variantCode = $variant;
885 :     $variantCode =~ s/^\*+//;
886 :     $variantCode =~ s/\*+$//;
887 :     # Compute the variant key.
888 :     my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
889 :     # Insure we have it in the database.
890 : parrello 1.10 if (! $sap->Exists(Variant => $variantKey)) {
891 :     # Get the variant from the sapling server.
892 :     my $variantH = $sapO->get(-objects => "Variant",
893 :     -filter => {"Variant(id)" => ["=", $variantKey]},
894 :     -fields => {"Variant(code)" => "code",
895 :     "Variant(comment)" => "comment",
896 :     "Variant(type)" => "type",
897 :     "Variant(role-rule)" => "role-rule"},
898 :     -firstOnly => 1);
899 :     $self->InsureEntity('Variant', $variantKey, %$variantH);
900 :     # Connect it to the subsystem.
901 :     $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
902 :     $stats->Add(subsystemVariants => 1);
903 :     }
904 : parrello 1.1 # Now we create the molecular machine connecting this genome to the subsystem
905 :     # variant.
906 :     my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
907 :     $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
908 :     $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
909 :     $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID);
910 :     # Remember the machine ID.
911 :     $machines{$subsystem} = $machineID;
912 :     }
913 :     # Now we go through the bindings file. This file connects the subsystem roles to
914 :     # the molecular machines.
915 :     $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
916 :     # Loop through the bindings.
917 :     while (! eof $ih) {
918 :     # Get the binding data.
919 :     my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
920 :     # Normalize the subsystem name.
921 :     $subsystem = $sap->SubsystemID($subsystem);
922 :     # Compute the machine role.
923 :     my $machineRoleID = $machineRoles{$subsystem}{$role};
924 :     # Insure it exists.
925 :     my $created = $self->InsureEntity(MachineRole => $machineRoleID);
926 :     if ($created) {
927 :     # We created the machine role, so connect it to the machine.
928 :     my $machineID = $machines{$subsystem};
929 :     $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID);
930 :     # Connect it to the role, too.
931 :     $sap->InsertObject('IsRoleOf', from_link => $role, to_link => $machineRoleID);
932 :     }
933 :     # Connect the feature.
934 :     $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
935 :     }
936 :     }
937 :    
938 :     =head3 CreateGenome
939 :    
940 :     $loaderObject->CreateGenome();
941 :    
942 :     Create the genome record.
943 :    
944 :     =cut
945 :    
946 :     # This hash maps cache statistics to genome record fields.
947 :     use constant GENOME_FIELDS => {genome_name => 'scientific-name',
948 :     genome_domain => 'domain'};
949 :     # This hash maps domains to the prokaryotic flag.
950 :     use constant PROK_FLAG => {Bacteria => 1, Archaea => 1};
951 :    
952 :     sub CreateGenome {
953 :     # Get the parameters.
954 :     my ($self) = @_;
955 :     # Get the genome directory.
956 :     my $dir = $self->{directory};
957 :     # Get the sapling database.
958 :     my $sap = $self->{sap};
959 :     # We'll put the genome attributes in here.
960 :     my %fields;
961 :     # Check for a basic statistics file.
962 :     my $statsFile = "$dir/cache.basic_statistics";
963 :     if (-f $statsFile) {
964 :     # We have the statistics file, so read the major attributes from it.
965 :     my $ih = Open(undef, "<$statsFile");
966 :     while (! eof $ih) {
967 :     my ($key, $value) = Tracer::GetLine($ih);
968 :     my $fieldKey = GENOME_FIELDS->{$key};
969 :     if ($fieldKey) {
970 :     $fields{$fieldKey} = $value;
971 :     }
972 :     }
973 :     # Translate the domain.
974 :     $fields{domain} = ucfirst $fields{domain};
975 :     # Denote the genome is complete.
976 :     $fields{complete} = 1;
977 :     } else {
978 :     # Check to see if this genome is complete.
979 :     $fields{complete} = (-f "$dir/COMPLETE" ? 1 : 0);
980 :     # Get the genome name.
981 :     my $ih = Open(undef, "<$dir/GENOME");
982 :     my $line = <$ih>; chomp $line;
983 :     $fields{'scientific-name'} = $line;
984 :     # Get the taxonomy and extract the domain from it.
985 :     $ih = Open(undef, "<$dir/TAXONOMY");
986 :     ($fields{domain}) = split /;/, <$ih>, 2;
987 :     }
988 :     # Get the counts from the statistics object.
989 :     my $stats = $self->{stats};
990 :     $fields{contigs} = $stats->Ask('contigs');
991 :     $fields{'dna-size'} = $stats->Ask('dna');
992 :     $fields{pegs} = $stats->Ask('peg');
993 :     $fields{rnas} = $stats->Ask('rna');
994 : parrello 1.14 $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
995 : parrello 1.1 # Get the genetic code. The default is 11.
996 :     $fields{'genetic-code'} = 11;
997 :     my $geneticCodeFile = "$dir/GENETIC_CODE";
998 :     if (-f $geneticCodeFile) {
999 :     # There's a genetic code file, so we need to read it for the code.
1000 :     my $ih = Open(undef, "<$geneticCodeFile");
1001 :     ($fields{'genetic-code'}) = Tracer::GetLine($ih);
1002 :     }
1003 :     # Use the domain to determine whether or not the genome is prokaryotic.
1004 :     $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
1005 :     # Finally, add the genome ID.
1006 :     $fields{id} = $self->{genome};
1007 :     # Create the genome record.
1008 :     $sap->InsertObject('Genome', %fields);
1009 :     }
1010 :    
1011 :     =head3 ReadAssignments
1012 :    
1013 :     my $assignHash = $loaderObject->ReadAssignments();
1014 :    
1015 :     Return a hash mapping each feature ID to its functional assignment. This method
1016 :     essentially reads the B<assigned_functions> file into memory.
1017 :    
1018 :     =cut
1019 :    
1020 :     sub ReadAssignments {
1021 :     # Get the parameters.
1022 :     my ($self) = @_;
1023 :     # Create the return hash.
1024 :     my $retVal = {};
1025 :     # Loop through the assigned-functions file, storing results in the hash. Later
1026 :     # results will override earlier ones.
1027 :     my $ih = Open(undef, "<$self->{directory}/assigned_functions");
1028 :     while (! eof $ih) {
1029 :     my ($fid, $function) = Tracer::GetLine($ih);
1030 :     $retVal->{$fid} = $function;
1031 :     }
1032 :     # Return the hash built.
1033 :     return $retVal;
1034 :     }
1035 :    
1036 :    
1037 :     =head3 FixContig
1038 :    
1039 :     $loaderObject->FixContig(\$contigID);
1040 :    
1041 :     Insure that the specified contig ID contains the genome ID.
1042 :    
1043 :     =cut
1044 :    
1045 :     sub FixContig {
1046 :     my ($self, $contigID) = @_;
1047 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
1048 :     unless ($$contigID =~ /^\d+\.\d+\:/) {
1049 :     $$contigID = "$self->{genome}:$$contigID";
1050 :     }
1051 :     }
1052 :    
1053 :     =head3 ConnectLocation
1054 :    
1055 :     $loaderObject->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
1056 :    
1057 :     Connect the specified feature to the specified contig location.
1058 :    
1059 :     =over 4
1060 :    
1061 :     =item fid
1062 :    
1063 :     ID of the relevant feature.
1064 :    
1065 :     =item contig
1066 :    
1067 :     ID of the contig containing this segment of the feature (normalized).
1068 :    
1069 :     =item segment
1070 :    
1071 :     Ordinal number of this segment.
1072 :    
1073 :     =item left
1074 :    
1075 :     Location of the segment's leftmost base pair.
1076 :    
1077 :     =item dir
1078 :    
1079 :     Direction (strand) of the segment (C<+> or C<->).
1080 :    
1081 :     =item len
1082 :    
1083 :     Length of the segment (which must be no greater than the configured maximum).
1084 :    
1085 :     =back
1086 :    
1087 :     =cut
1088 :    
1089 :     sub ConnectLocation {
1090 :     # Get the parameters.
1091 :     my ($self, $fid, $contig, $segment, $left, $dir, $len) = @_;
1092 :     # Get the sapling database.
1093 :     my $sap = $self->{sap};
1094 :     # Create the relationship.
1095 :     $sap->InsertObject('IsLocatedIn', from_link => $fid, to_link => $contig,
1096 :     begin => $left, dir => $dir, len => $len,
1097 :     ordinal => $segment);
1098 :     # Record it in the statistics.
1099 :     $self->{stats}->Add(segment => 1);
1100 :     }
1101 :    
1102 : parrello 1.6 =head2 Internal Utility Methods
1103 :    
1104 :     =head3 ReadAnnotation
1105 :    
1106 :     my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1107 :    
1108 :     Read the next record from an annotation file. The next record must exist (that is, an
1109 :     end-of-file check should have been performed before calling this method).
1110 :    
1111 :     =over 4
1112 :    
1113 :     =item ih
1114 :    
1115 :     Open file handle for the annotation file.
1116 :    
1117 :     =item RETURN
1118 :    
1119 :     Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1120 :     timestamp, (2) the user ID, and (3) the annotation text.
1121 :    
1122 :     =back
1123 :    
1124 :     =cut
1125 :    
1126 :     sub ReadAnnotation {
1127 :     # Get the parameter.
1128 :     my ($ih) = @_;
1129 :     # Read the three fixed fields.
1130 :     my $fid = <$ih>; chomp $fid;
1131 :     my $timestamp = <$ih>; chomp $timestamp;
1132 :     my $user = <$ih>; chomp $user;
1133 :     # Loop through the lines of the text field.
1134 :     my $text = "";
1135 :     my $line = <$ih>;
1136 : olson 1.13 while (defined($line) && $line ne "//\n") {
1137 : parrello 1.6 $text .= $line;
1138 :     $line = <$ih>;
1139 :     }
1140 :     # Remove the trailing new-line from the text.
1141 :     chomp $text;
1142 :     # Return the fields.
1143 :     return ($fid, $timestamp, $user, $text);
1144 :     }
1145 :    
1146 : olson 1.13 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3