[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.14 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingGenomeLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 : parrello 1.2 use AliasAnalysis;
29 : parrello 1.3 use base qw(SaplingDataLoader);
30 : parrello 1.1
31 :     =head1 Sapling Genome Loader
32 :    
33 :     This package contains the methods used to load a genome into the Sapling from a genome
34 :     directory. This is not very efficient if you are trying to create a full database,
35 :     but is useful if you wish to add genomes one at a time. The information loaded will
36 :     include the basic genome data, the contigs and the DNA, subsystems, and the features.
37 :    
38 :     =head2 Main Methods
39 :    
40 :     =head3 Load
41 :    
42 :     my $stats = SaplingGenomeLoader::Load($sap, $genome, $directory);
43 :    
44 :     Load a genome from a genome directory into the sapling database.
45 :    
46 :     =over 4
47 :    
48 :     =item sap
49 :    
50 :     L<Sapling> object used to access the target database.
51 :    
52 :     =item genome
53 :    
54 :     ID of the genome being loaded.
55 :    
56 :     =item directory
57 :    
58 :     Name of the directory containing the genome information.
59 :    
60 : parrello 1.11 =item assignHash
61 :    
62 :     Hash of feature IDs to functional assignments. Deleted features are removed, which
63 :     means only features listed in this hash can be legally inserted into the database.
64 :    
65 : parrello 1.1 =back
66 :    
67 :     =cut
68 :    
69 :     sub Load {
70 :     # Get the parameters.
71 :     my ($sap, $genome, $directory) = @_;
72 :     # Create the loader object.
73 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
74 :     # Load the contigs.
75 : parrello 1.12 Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
76 : parrello 1.1 $loaderObject->LoadContigs();
77 :     # Load the features.
78 : parrello 1.12 Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
79 : parrello 1.1 $loaderObject->LoadFeatures();
80 : parrello 1.7 # Check for annotation history. If we have it, load the history records into the
81 :     # database.
82 :     if (-f "$directory/annotations") {
83 : parrello 1.12 Trace("Processing annotations.") if T(SaplingDataLoader => 3);
84 : parrello 1.7 $loaderObject->LoadAnnotations("$directory/annotations");
85 :     }
86 : parrello 1.1 # Load the subsystem bindings.
87 : parrello 1.12 Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
88 : parrello 1.1 $loaderObject->LoadSubsystems();
89 :     # Create the Genome record and taxonomy information.
90 : parrello 1.12 Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
91 : parrello 1.1 $loaderObject->CreateGenome();
92 :     # Return the statistics.
93 :     return $loaderObject->{stats};
94 :     }
95 :    
96 :     =head3 ClearGenome
97 :    
98 :     my $stats = SaplingGenomeObject::ClearGenome($sap, $genome);
99 :    
100 :     Delete the specified genome and all the related records from the specified sapling
101 :     database. This method can also be used to clean up after a failed or aborted load.
102 :    
103 :     =over 4
104 :    
105 :     =item sap
106 :    
107 :     L<Sapling> object used to access the target database.
108 :    
109 :     =item genome
110 :    
111 :     ID of the genome to delete.
112 :    
113 :     =item RETURN
114 :    
115 :     Returns a statistics object counting the records deleted.
116 :    
117 :     =back
118 :    
119 :     =cut
120 :    
121 :     sub ClearGenome {
122 :     # Get the parameters.
123 :     my ($sap, $genome) = @_;
124 :     # Create the statistics object.
125 :     my $stats = Stats->new();
126 : parrello 1.9 # Delete the DNA sequences.
127 :     my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
128 :     for my $seq (@seqs) {
129 :     my $delStats = $sap->Delete(DNASequence => $seq);
130 :     $stats->Accumulate($delStats);
131 :     }
132 : parrello 1.1 # Delete the contigs.
133 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
134 : parrello 1.1 # Delete the features.
135 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
136 : parrello 1.1 # Delete the molecular machines.
137 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
138 : parrello 1.1 # Delete the genome itself.
139 :     my $subStats = $sap->Delete(Genome => $genome);
140 :     # Accumulate the statistics from the delete.
141 :     $stats->Accumulate($subStats);
142 :     # Return the statistics object.
143 :     return $stats;
144 :     }
145 :    
146 : parrello 1.7
147 :     =head3 Process
148 :    
149 :     my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
150 :    
151 :     Load genome data from the specified directory. If the genome data already
152 :     exists in the database, it will be deleted first.
153 :    
154 :     =over 4
155 :    
156 :     =item sap
157 :    
158 :     L</Sapling> object for accessing the database.
159 :    
160 :     =item genome
161 :    
162 :     ID of the genome whose data is being loaded.
163 :    
164 :     =item directory
165 :    
166 :     Name of the directory containing the genome data files.
167 :    
168 :     =item RETURN
169 :    
170 :     Returns a statistics object describing the activity during the reload.
171 :    
172 :     =back
173 :    
174 :     =cut
175 :    
176 :     sub Process {
177 :     # Get the parameters.
178 :     my ($sap, $genome, $directory) = @_;
179 :     # Clear the existing data for the specified genome.
180 :     my $stats = ClearGenome($sap, $genome);
181 :     # Load the new expression data from the specified directory.
182 :     my $newStats = Load($sap, $genome, $directory);
183 :     # Merge the statistics.
184 :     $stats->Accumulate($newStats);
185 :     # Return the result.
186 :     return $stats;
187 :     }
188 :    
189 :    
190 : parrello 1.1 =head2 Loader Object Methods
191 :    
192 :     =head3 new
193 :    
194 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
195 :    
196 :     Create a loader object that can be used to facilitate loading genome data from a
197 :     directory.
198 :    
199 :     =over 4
200 :    
201 :     =item sap
202 :    
203 :     L<Sapling> object used to access the target database.
204 :    
205 :     =item genome
206 :    
207 :     ID of the genome being loaded.
208 :    
209 :     =item directory
210 :    
211 :     Name of the directory containing the genome information.
212 :    
213 :     =back
214 :    
215 :     The object created contains the following fields.
216 :    
217 :     =over 4
218 :    
219 :     =item directory
220 :    
221 :     Name of the directory containing the genome data.
222 :    
223 :     =item genome
224 :    
225 :     ID of the genome being loaded.
226 :    
227 :     =item supportRecords
228 :    
229 :     A hash of hashes, used to track the support records known to exist in the database.
230 :    
231 :     =item sap
232 :    
233 :     L<Sapling> object used to access the database.
234 :    
235 :     =item stats
236 :    
237 :     L<Stats> object for tracking statistical information about the load.
238 :    
239 :     =back
240 :    
241 :     =cut
242 :    
243 :     sub new {
244 :     # Get the parameters.
245 :     my ($class, $sap, $genome, $directory) = @_;
246 :     # Create the object.
247 : parrello 1.3 my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
248 :     # Add our specialized data.
249 :     $retVal->{genome} = $genome;
250 :     $retVal->{directory} = $directory;
251 : parrello 1.11 # Leave the assignment hash undefined until we populate it.
252 :     $retVal->{assignHash} = undef;
253 : parrello 1.3 # Return the result.
254 : parrello 1.1 return $retVal;
255 :     }
256 :    
257 :     =head3 LoadContigs
258 :    
259 :     $loaderObject->LoadContigs();
260 :    
261 :     Load the contig information into the database. This includes the contigs themselves and
262 : parrello 1.14 the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
263 :     number of base pairs as the C<dna> statistic, and the number of GC instances as the
264 :     C<gc_content> statistic.
265 : parrello 1.1
266 :     =cut
267 :    
268 :     sub LoadContigs {
269 :     # Get this object.
270 :     my ($self) = @_;
271 :     # Open the contigs file from the genome directory.
272 :     my $ih = Open(undef, "<$self->{directory}/contigs");
273 :     # Get the length of a DNA segment.
274 :     my $segmentLength = $self->{sap}->TuningParameter('maxSequenceLength');
275 :     # Get the genome ID.
276 :     my $genome = $self->{genome};
277 :     # These variables will contain the current contig ID, the current contig length,
278 :     # the ordinal of the current chunk, the accumulated DNA sequence, and its length.
279 :     my ($contigID, $contigLen, $ordinal, $chunk, $chunkLen) = (undef, 0, 0, '', 0);
280 :     # Loop through the contig file.
281 :     while (! eof $ih) {
282 :     # Get the current record.
283 :     my $line = <$ih>; chomp $line;
284 :     # Is this the start of a new contig?
285 :     if ($line =~ /^>(\S+)/) {
286 :     # Yes. Save the contig ID.
287 :     my $newContigID = $1;
288 :     # Is there a current contig?
289 :     if (defined $contigID) {
290 :     # Yes. Output the current chunk.
291 :     $self->OutputChunk($contigID, $ordinal, $chunk);
292 :     # Output the contig.
293 :     $self->OutputContig($contigID, $contigLen);
294 :     }
295 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
296 :     $self->FixContig(\$newContigID);
297 :     # Initialize the new contig.
298 :     $contigID = $newContigID;
299 :     $contigLen = 0;
300 :     $ordinal = 0;
301 :     $chunk = '';
302 :     $chunkLen = 0;
303 :     } else {
304 :     # Here we have more DNA in the current contig. Are we at the end of
305 :     # the current chunk?
306 :     my $lineLen = length($line);
307 :     my $newChunkLen = $chunkLen + $lineLen;
308 :     if ($newChunkLen < $segmentLength) {
309 :     # No. Add this line to the chunk.
310 :     $chunk .= $line;
311 :     $chunkLen += $lineLen;
312 :     } else {
313 :     # Yes. Create the actual chunk.
314 :     $chunk .= substr($line, 0, $segmentLength - $chunkLen);
315 :     # Write it out.
316 :     $self->OutputChunk($contigID, $ordinal, $chunk);
317 :     # Set up the new chunk.
318 :     $chunk = substr($line, $segmentLength - $chunkLen);
319 :     $ordinal++;
320 :     $chunkLen = length($chunk);
321 :     }
322 :     # Update the contig length.
323 :     $contigLen += $lineLen;
324 :     }
325 :     }
326 :     # Is there a current contig?
327 :     if (defined $contigID) {
328 :     # Yes. Output the residual chunk.
329 :     $self->OutputChunk($contigID, $ordinal, $chunk);
330 :     # Output the contig itself.
331 :     $self->OutputContig($contigID, $contigLen);
332 :     }
333 :     }
334 :    
335 :     =head3 OutputChunk
336 :    
337 :     $loaderObject->OutputChunk($contigID, $ordinal, $chunk);
338 :    
339 :     Output a chunk of DNA for the specified contig.
340 :    
341 :     =over 4
342 :    
343 :     =item contigID
344 :    
345 :     ID of the contig being output.
346 :    
347 :     =item ordinal
348 :    
349 :     Ordinal number of this chunk.
350 :    
351 :     =item chunk
352 :    
353 :     DNA sequence comprising this chunk.
354 :    
355 :     =back
356 :    
357 :     =cut
358 :    
359 :     sub OutputChunk {
360 :     # Get the parameters.
361 :     my ($self, $contigID, $ordinal, $chunk) = @_;
362 :     # Get the sapling object.
363 :     my $sap = $self->{sap};
364 :     # Compute the chunk ID.
365 :     my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
366 :     # Connect this sequence to the contig.
367 : parrello 1.2 $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
368 : parrello 1.1 # Create the DNA sequence.
369 :     $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
370 :     # Record the chunk.
371 :     $self->{stats}->Add(chunks => 1);
372 : parrello 1.14 # Update the GC count.
373 :     $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
374 : parrello 1.1 }
375 :    
376 :     =head3 OutputContig
377 :    
378 :     $loaderObject->OutputContig($contigID, $contigLen);
379 :    
380 :     Write out the current contig.
381 :    
382 :     =over 4
383 :    
384 :     =item contigID
385 :    
386 :     ID of the contig being written out.
387 :    
388 :     =item contigLen
389 :    
390 :     Length of the contig in base pairs.
391 :    
392 :     =back
393 :    
394 :     =cut
395 :    
396 :     sub OutputContig {
397 :     # Get the parameters.
398 :     my ($self, $contigID, $contigLen) = @_;
399 :     # Get the sapling object.
400 :     my $sap = $self->{sap};
401 :     # Connect the contig to the genome.
402 :     $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
403 :     # Output the contig record.
404 :     $sap->InsertObject('Contig', id => $contigID, length => $contigLen);
405 :     # Record the contig.
406 :     $self->{stats}->Add(contigs => 1);
407 :     $self->{stats}->Add(dna => $contigLen);
408 :     }
409 :    
410 :     =head3 LoadFeatures
411 :    
412 :     $loaderObject->LoadFeatures();
413 :    
414 :     Load the feature data into the database. This includes all of the features,
415 :     their protein sequences, and their subsystem information. The number of
416 :     features of each type will be stored in the statistics object, identified by
417 :     the feature type.
418 :    
419 :     =cut
420 :    
421 :     sub LoadFeatures {
422 :     # Get the parameters.
423 :     my ($self) = @_;
424 : parrello 1.7 # Read in the functional assignments.
425 : parrello 1.12 Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
426 : parrello 1.7 my $assignHash = $self->ReadAssignments();
427 : parrello 1.11 $self->{assignHash} = $assignHash;
428 : parrello 1.1 # Get the directory of feature types.
429 :     my $featureDir = "$self->{directory}/Features";
430 :     my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
431 : parrello 1.11 # Check for protein sequences. If we have some, load them into a hash.
432 :     my $protHash = {};
433 :     if (-f "$featureDir/peg/fasta") {
434 : parrello 1.12 Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
435 : parrello 1.11 $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
436 :     }
437 : parrello 1.1 # Create the feature records for the types found.
438 :     for my $type (@types) {
439 :     # Insure this is a genuine feature directory.
440 :     if (-f "$featureDir/$type/tbl") {
441 : parrello 1.11 # Yes. Read in the evidence codes (if any).
442 :     my $evHash = {};
443 :     my $tranFile = "$featureDir/$type/Attributes/transaction_log";
444 :     if (-f $tranFile) {
445 :     $evHash = $self->LoadEvidenceCodes($tranFile);
446 :     }
447 :     # Now load the feature data.
448 :     $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
449 : parrello 1.1 }
450 :     }
451 : parrello 1.11 }
452 :    
453 :     =head3 LoadEvidenceCodes
454 :    
455 :     my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
456 :    
457 :     Load the evidence codes from the specified attribute transaction log file into a
458 :     hash. The log file is in tab-delimited format. The first column contains the
459 :     transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
460 :     the third an attribute name (we'll ignore everything but C<evidence_code>), and
461 :     the fourth the attribute value.
462 :    
463 :     =over 4
464 :    
465 :     =item attributeFile
466 :    
467 :     Name of the attribute transaction log file.
468 :    
469 :     =item RETURN
470 :    
471 :     Returns a reference to a hash mapping each feature ID to a comma-delimited list
472 :     of its evidence codes.
473 :    
474 :     =back
475 :    
476 :     =cut
477 :    
478 :     sub LoadEvidenceCodes {
479 :     # Get the parameters.
480 :     my ($self, $attributeFile) = @_;
481 :     # Get the Sapling database.
482 :     my $sap = $self->{sap};
483 :     # Get the statistics object.
484 :     my $stats = $self->{stats};
485 :     # Get the assignment hash: we use this to filter the feature IDs.
486 :     my $assignHash = $self->{assignHash};
487 :     # Open the attribute log file for input.
488 :     my $ih = Open(undef, "<$attributeFile");
489 :     # This two-dimensional hash will hold the evidence codes for each feature.
490 :     my %retVal;
491 :     # Loop through the attribute log file.
492 :     while (! eof $ih) {
493 :     # Get the current attribute record.
494 :     my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
495 :     $stats->Add(attributeLine => 1);
496 :     # Insure we have all the pieces we need.
497 :     if (! $command || ! $fid || $key ne 'evidence_code') {
498 :     $stats->Add(attributeLineSkipped => 1);
499 :     } elsif (! $assignHash->{$fid}) {
500 :     # Here the attribute is for a deleted feature.
501 :     $stats->Add(attributeFidSkipped => 1);
502 :     } else {
503 :     # Get the sub-hash for this feature.
504 :     if (! exists $retVal{$fid}) {
505 :     $retVal{$fid} = {};
506 :     }
507 :     my $featureSubHash = $retVal{$fid};
508 :     # Process according to the command.
509 :     if ($command eq 'ADD') {
510 :     # Here we are adding a new evidence code.
511 :     $featureSubHash->{$value} = 1;
512 :     $stats->Add(attributeAdd => 1);
513 :     } elsif ($command eq 'DELETE') {
514 :     # Here we are deleting an evidence code.
515 :     delete $featureSubHash->{$value};
516 :     $stats->Add(attributeDelete => 1);
517 :     } else {
518 :     # here we have an unrecognized command.
519 :     $stats->Add(attributeCommandSkip => 1);
520 :     }
521 :     }
522 : parrello 1.1 }
523 : parrello 1.11 # Loop through the hash, converting each sub-hash to a comma-delimited list of
524 :     # evidence codes.
525 :     for my $fid (keys %retVal) {
526 :     $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
527 : parrello 1.6 }
528 : parrello 1.11 # Return the evidence hash.
529 :     return \%retVal;
530 : parrello 1.1 }
531 :    
532 : parrello 1.11
533 : parrello 1.1 =head3 LoadFeatureData
534 :    
535 : parrello 1.11 $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
536 : parrello 1.1
537 :     Load the basic data for each feature into the database. The number of features of
538 :     the type found will be recorded in the statistics object.
539 :    
540 :     =over 4
541 :    
542 :     =item featureDir
543 :    
544 :     Name of the main directory containing all the feature type subdirectories.
545 :    
546 :     =item type
547 :    
548 :     Type of feature to load.
549 :    
550 : parrello 1.11 =item protHash
551 :    
552 :     Reference to a hash mapping each feature ID for a protein-encoding gene to
553 :     its protein sequence.
554 :    
555 :     =item evHash
556 : parrello 1.7
557 : parrello 1.11 Reference to a hash mapping each feature ID to a comma-delimited list of
558 :     its evidence codes (if any).
559 : parrello 1.7
560 : parrello 1.1 =back
561 :    
562 :     =cut
563 :    
564 :     sub LoadFeatureData {
565 :     # Get the parameters.
566 : parrello 1.11 my ($self, $featureDir, $type, $protHash, $evHash) = @_;
567 : parrello 1.1 # Get the sapling database.
568 :     my $sap = $self->{sap};
569 :     # Get the statistics object.
570 :     my $stats = $self->{stats};
571 : parrello 1.11 # Get the assignment hash. This tells us our functional assignments. This method is
572 :     # also where we remove the deleted features from it.
573 :     my $assignHash = $self->{assignHash};
574 : parrello 1.1 # This hash will track the features we've created. If a feature is found a second
575 :     # time, it overwrites the original.
576 : parrello 1.11 my %fidHash;
577 : parrello 1.6 # Finally, we need the timestamp hash. The initial feature population
578 : parrello 1.1 # Insure we have a tbl file for this feature type.
579 :     my $fileName = "$featureDir/$type/tbl";
580 :     if (-f $fileName) {
581 : parrello 1.3 # We have one, so we can read through it. First, however, we need to get the list
582 : parrello 1.11 # of deleted features and remove them from the assignment hash. This insures
583 :     # that they are not used by subsequent methods.
584 : parrello 1.3 my $deleteFile = "$featureDir/$type/deleted.features";
585 :     if (-f $deleteFile) {
586 : parrello 1.11 my $dh = Open(undef, "<$deleteFile");
587 :     while (! eof $dh) {
588 :     my ($deletedFid) = Tracer::GetLine($dh);
589 :     if (exists $assignHash->{$deletedFid}) {
590 :     delete $assignHash->{$deletedFid};
591 :     $stats->Add(deletedFid => 1);
592 :     }
593 :     }
594 : parrello 1.3 }
595 :     # Open the main file for input.
596 : parrello 1.12 Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
597 : parrello 1.1 my $ih = Open(undef, "<$fileName");
598 :     while (! eof $ih) {
599 :     # Read this feature's information.
600 :     my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
601 : parrello 1.3 # Only proceed if the feature is NOT deleted.
602 : parrello 1.11 if (exists $assignHash->{$fid}) {
603 : parrello 1.3 # If the feature already exists, delete it. (This should be extremely rare.)
604 : parrello 1.11 if ($fidHash{$fid}) {
605 : parrello 1.3 $sap->Delete(Feature => $fid);
606 :     $stats->Add(duplicateFid => 1);
607 : parrello 1.1 }
608 : parrello 1.11 # If this is RNA, the alias list is always empty. Sometimes, the functional
609 :     # assignment is found there.
610 :     if ($type eq 'rna') {
611 :     if (! $assignHash->{$fid}) {
612 :     $assignHash->{$fid} = $aliases[0];
613 : parrello 1.2 }
614 : parrello 1.11 @aliases = ();
615 : parrello 1.2 }
616 : parrello 1.11 # Add the feature to the database.
617 :     $self->AddFeature($fid, $assignHash->{$fid}, $locations, \@aliases,
618 :     $protHash->{$fid}, $evHash->{$fid});
619 :     # Denote we've added this feature, so that if a duplicate occurs we're ready.
620 :     $fidHash{$fid} = 1;
621 : parrello 1.2 }
622 : parrello 1.1 }
623 :     }
624 :     }
625 :    
626 : parrello 1.11
627 : parrello 1.1 =head3 LoadProteinData
628 :    
629 : parrello 1.11 my $protHash = $self->LoadProteinData($fileName);
630 : parrello 1.1
631 :     Load the protein sequences from the named FASTA file. The sequences will be stored
632 : parrello 1.11 in a hash by FIG feature ID.
633 : parrello 1.1
634 :     =over 4
635 :    
636 :     =item fileName
637 :    
638 :     Name of the FASTA file containing the protein sequences for this genome.
639 :    
640 : parrello 1.11 =item RETURN
641 :    
642 :     Returns a hash mapping feature IDs to protein sequences.
643 :    
644 : parrello 1.1 =back
645 :    
646 :     =cut
647 :    
648 :     sub LoadProteinData {
649 :     # Get the parameters.
650 :     my ($self, $fileName) = @_;
651 :     # Open the FASTA file for input.
652 :     my $ih = Open(undef, "<$fileName");
653 : parrello 1.11 # Create the return hash.
654 :     my $retVal = {};
655 : parrello 1.1 # We'll track the current protein in here.
656 :     my $fid;
657 :     my $sequence = "";
658 :     # Loop through the input file.
659 :     while (! eof $ih) {
660 :     # Get the current record.
661 :     my $line = <$ih>; chomp $line;
662 :     # Is this a label record.
663 :     if ($line =~ /^>(\S+)/) {
664 :     # Yes. Save the new feature ID.
665 :     my $newFid = $1;
666 :     # Do we have an existing protein?
667 :     if (defined $fid) {
668 : parrello 1.11 # Yes. Store it in the hash.
669 :     $retVal->{$fid} = $sequence;
670 : parrello 1.1 }
671 :     # Initialize for the next protein.
672 :     $fid = $newFid;
673 :     $sequence = "";
674 :     } else {
675 :     # Here we have more letters for the current protein.
676 :     $sequence .= $line;
677 :     }
678 :     }
679 :     # Do we have a residual protein.
680 :     if (defined $fid) {
681 : parrello 1.11 # Yes. Store it in the hash.
682 :     $retVal->{$fid} = $sequence;
683 : parrello 1.1 }
684 : parrello 1.11 # Return the hash.
685 :     return $retVal;
686 : parrello 1.1 }
687 :    
688 : parrello 1.6
689 :     =head3 LoadAnnotations
690 :    
691 :     $loaderObject->LoadAnnotations($fileName);
692 :    
693 :     Read in the annotation history information and use it to create annotation records.
694 :    
695 :     =over 4
696 :    
697 :     =item fileName
698 :    
699 :     Name of the annotation history file. This file is formatted with four fields per
700 :     record. Each field is on a separate line, with a double slash (C<//>) used as the
701 :     line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
702 :     (formatted as an integer), (2) the user name, and (3) the annotation text.
703 :    
704 :     =back
705 :    
706 :     =cut
707 :    
708 :     sub LoadAnnotations {
709 :     # Get the parameters.
710 :     my ($self, $fileName) = @_;
711 : parrello 1.11 # Get the assignment Hash. We use this to filter out deleted features.
712 :     my $assignHash = $self->{assignHash};
713 : parrello 1.6 # Get the Sapling database.
714 :     my $sap = $self->{sap};
715 :     # Get the statistics object.
716 :     my $stats = $self->{stats};
717 :     # Open the input file.
718 :     my $ih = Tracer::Open(undef, "<$fileName");
719 :     # Loop through the input.
720 :     while (! eof $ih) {
721 :     # Read in the peg, timestamp, and user ID.
722 :     my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
723 : parrello 1.11 # Only proceed if the feature is not deleted.
724 :     if ($assignHash->{$fid}) {
725 :     # Add the annotation to this feature.
726 :     $self->MakeAnnotation($fid, $text, $user, $timestamp);
727 : parrello 1.6 }
728 :     }
729 :     }
730 :    
731 :    
732 : parrello 1.1 =head3 WriteProtein
733 :    
734 :     $loaderObject->WriteProtein($fid, $sequence);
735 :    
736 :     Write out the specified protein sequence and associate it with the specified feature.
737 :     This requires checking to see if the protein sequence is already in the database.
738 :    
739 :     =over 4
740 :    
741 :     =item fid
742 :    
743 :     ID of the feature with the specified protein sequence.
744 :    
745 :     =item sequence
746 :    
747 :     Protein sequence for the indicated feature.
748 :    
749 :     =back
750 :    
751 :     =cut
752 :    
753 :     sub WriteProtein {
754 :     # Get the parameters.
755 :     my ($self, $fid, $sequence) = @_;
756 :     # Compute the key of the protein sequence.
757 : parrello 1.8 my $protID = $self->{sap}->ProteinID($sequence);
758 : parrello 1.1 # Insure the protein exists.
759 :     $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
760 :     # Connect the feature to it.
761 :     $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
762 :     }
763 :    
764 :     =head3 LoadSubsystems
765 :    
766 :     $loaderObject->LoadSubsystems();
767 :    
768 :     Load the subsystem data into the database. This includes all of the subsystems,
769 :     their categories, roles, and the bindings that connect them to this genome's features.
770 :    
771 :     =cut
772 :    
773 :     sub LoadSubsystems {
774 :     # Get the parameters.
775 :     my ($self) = @_;
776 :     # Get the sapling object.
777 :     my $sap = $self->{sap};
778 :     # Get the statistics object.
779 :     my $stats = $self->{stats};
780 :     # Get the genome ID.
781 :     my $genome = $self->{genome};
782 :     # Connect to the Sapling server so we can get the subsystem variant and role information.
783 :     my $sapO = SAPserver->new();
784 :     # This hash maps subsystem IDs to molecular machine IDs.
785 :     my %machines;
786 :     # This hash maps subsystem/role pairs to machine role IDs.
787 :     my %machineRoles;
788 :     # The first step is to create the subsystems themselves and connect them to the
789 :     # genome. A list is given in the subsystems file of the Subsystems directory.
790 :     my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
791 :     # Create the field hash for the subsystem query.
792 :     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)
793 :     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)
794 :     Subsystem(version)
795 :     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)
796 :     Includes(sequence)
797 :     Role(id) Role(hypothetical) Role(role-index));
798 :     my %fields = map { $_ => $_ } @fields;
799 :     # Loop through the subsystems in the file, insuring we have them in the database.
800 :     while (! eof $ih) {
801 :     # Get this subsystem.
802 :     my ($subsystem, $variant) = Tracer::GetLine($ih);
803 : parrello 1.12 Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
804 : parrello 1.1 # Normalize the subsystem name.
805 :     $subsystem = $sap->SubsystemID($subsystem);
806 :     # Compute this subsystem's MD5.
807 :     my $subsystemMD5 = ERDB::DigestKey($subsystem);
808 :     # Insure the subsystem is in the database.
809 :     if (! $sap->Exists(Subsystem => $subsystem)) {
810 :     # The subsystem isn't present, so we need to read it. We get the subsystem,
811 :     # its roles, and its classification information. The variant will be added
812 :     # later if we need it.
813 :     my $roleList = $sapO->get(-objects => "Subsystem Includes Role",
814 :     -filter => {"Subsystem(id)" => ["=", $subsystem] },
815 :     -fields => \%fields);
816 :     # Only proceed if we found some roles.
817 :     if (@$roleList > 0) {
818 :     # Get the subsystem information from the first role and create the subsystem.
819 :     my $roleH = $roleList->[0];
820 : parrello 1.3 my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
821 : parrello 1.1 $sap->InsertObject('Subsystem', %subFields);
822 : parrello 1.10 $stats->Add(subsystems => 1);
823 : parrello 1.1 # Now loop through the roles. The Includes records are always inserted, but the
824 :     # roles are only inserted if they don't already exist.
825 :     for $roleH (@$roleList) {
826 :     # Create the Includes record.
827 : parrello 1.3 my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);
828 : parrello 1.1 $sap->InsertObject('Includes', %incFields);
829 :     # Insure we have the role in place.
830 : parrello 1.3 my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);
831 : parrello 1.1 my $roleID = $roleFields{id};
832 :     delete $roleFields{id};
833 :     $self->InsureEntity('Role', $roleID, %roleFields);
834 :     # Compute the machine-role ID.
835 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
836 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
837 : parrello 1.10 $stats->Add(subsystemRoles => 1);
838 : parrello 1.1 }
839 :     }
840 :     } else {
841 :     # We already have the subsystem in the database, but we still need to compute the
842 :     # machine role IDs. We need information from the sapling server for this.
843 :     my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);
844 :     # Loop through the roles.
845 :     my $roleList = $subHash->{$subsystem};
846 :     for my $roleTuple (@$roleList) {
847 :     my ($roleID, $abbr) = @$roleTuple;
848 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
849 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
850 :     }
851 :     }
852 :     # Now we need to connect the variant to the subsystem. First, we compute the real
853 :     # variant code by removing the asterisks.
854 :     my $variantCode = $variant;
855 :     $variantCode =~ s/^\*+//;
856 :     $variantCode =~ s/\*+$//;
857 :     # Compute the variant key.
858 :     my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
859 :     # Insure we have it in the database.
860 : parrello 1.10 if (! $sap->Exists(Variant => $variantKey)) {
861 :     # Get the variant from the sapling server.
862 :     my $variantH = $sapO->get(-objects => "Variant",
863 :     -filter => {"Variant(id)" => ["=", $variantKey]},
864 :     -fields => {"Variant(code)" => "code",
865 :     "Variant(comment)" => "comment",
866 :     "Variant(type)" => "type",
867 :     "Variant(role-rule)" => "role-rule"},
868 :     -firstOnly => 1);
869 :     $self->InsureEntity('Variant', $variantKey, %$variantH);
870 :     # Connect it to the subsystem.
871 :     $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
872 :     $stats->Add(subsystemVariants => 1);
873 :     }
874 : parrello 1.1 # Now we create the molecular machine connecting this genome to the subsystem
875 :     # variant.
876 :     my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
877 :     $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
878 :     $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
879 :     $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID);
880 :     # Remember the machine ID.
881 :     $machines{$subsystem} = $machineID;
882 :     }
883 :     # Now we go through the bindings file. This file connects the subsystem roles to
884 :     # the molecular machines.
885 :     $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
886 :     # Loop through the bindings.
887 :     while (! eof $ih) {
888 :     # Get the binding data.
889 :     my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
890 :     # Normalize the subsystem name.
891 :     $subsystem = $sap->SubsystemID($subsystem);
892 :     # Compute the machine role.
893 :     my $machineRoleID = $machineRoles{$subsystem}{$role};
894 :     # Insure it exists.
895 :     my $created = $self->InsureEntity(MachineRole => $machineRoleID);
896 :     if ($created) {
897 :     # We created the machine role, so connect it to the machine.
898 :     my $machineID = $machines{$subsystem};
899 :     $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID);
900 :     # Connect it to the role, too.
901 :     $sap->InsertObject('IsRoleOf', from_link => $role, to_link => $machineRoleID);
902 :     }
903 :     # Connect the feature.
904 :     $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
905 :     }
906 :     }
907 :    
908 :     =head3 CreateGenome
909 :    
910 :     $loaderObject->CreateGenome();
911 :    
912 :     Create the genome record.
913 :    
914 :     =cut
915 :    
916 :     # This hash maps cache statistics to genome record fields.
917 :     use constant GENOME_FIELDS => {genome_name => 'scientific-name',
918 :     genome_domain => 'domain'};
919 :     # This hash maps domains to the prokaryotic flag.
920 :     use constant PROK_FLAG => {Bacteria => 1, Archaea => 1};
921 :    
922 :     sub CreateGenome {
923 :     # Get the parameters.
924 :     my ($self) = @_;
925 :     # Get the genome directory.
926 :     my $dir = $self->{directory};
927 :     # Get the sapling database.
928 :     my $sap = $self->{sap};
929 :     # We'll put the genome attributes in here.
930 :     my %fields;
931 :     # Check for a basic statistics file.
932 :     my $statsFile = "$dir/cache.basic_statistics";
933 :     if (-f $statsFile) {
934 :     # We have the statistics file, so read the major attributes from it.
935 :     my $ih = Open(undef, "<$statsFile");
936 :     while (! eof $ih) {
937 :     my ($key, $value) = Tracer::GetLine($ih);
938 :     my $fieldKey = GENOME_FIELDS->{$key};
939 :     if ($fieldKey) {
940 :     $fields{$fieldKey} = $value;
941 :     }
942 :     }
943 :     # Translate the domain.
944 :     $fields{domain} = ucfirst $fields{domain};
945 :     # Denote the genome is complete.
946 :     $fields{complete} = 1;
947 :     } else {
948 :     # Check to see if this genome is complete.
949 :     $fields{complete} = (-f "$dir/COMPLETE" ? 1 : 0);
950 :     # Get the genome name.
951 :     my $ih = Open(undef, "<$dir/GENOME");
952 :     my $line = <$ih>; chomp $line;
953 :     $fields{'scientific-name'} = $line;
954 :     # Get the taxonomy and extract the domain from it.
955 :     $ih = Open(undef, "<$dir/TAXONOMY");
956 :     ($fields{domain}) = split /;/, <$ih>, 2;
957 :     }
958 :     # Get the counts from the statistics object.
959 :     my $stats = $self->{stats};
960 :     $fields{contigs} = $stats->Ask('contigs');
961 :     $fields{'dna-size'} = $stats->Ask('dna');
962 :     $fields{pegs} = $stats->Ask('peg');
963 :     $fields{rnas} = $stats->Ask('rna');
964 : parrello 1.14 $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
965 : parrello 1.1 # Get the genetic code. The default is 11.
966 :     $fields{'genetic-code'} = 11;
967 :     my $geneticCodeFile = "$dir/GENETIC_CODE";
968 :     if (-f $geneticCodeFile) {
969 :     # There's a genetic code file, so we need to read it for the code.
970 :     my $ih = Open(undef, "<$geneticCodeFile");
971 :     ($fields{'genetic-code'}) = Tracer::GetLine($ih);
972 :     }
973 :     # Use the domain to determine whether or not the genome is prokaryotic.
974 :     $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
975 :     # Finally, add the genome ID.
976 :     $fields{id} = $self->{genome};
977 :     # Create the genome record.
978 :     $sap->InsertObject('Genome', %fields);
979 :     }
980 :    
981 :     =head3 ReadAssignments
982 :    
983 :     my $assignHash = $loaderObject->ReadAssignments();
984 :    
985 :     Return a hash mapping each feature ID to its functional assignment. This method
986 :     essentially reads the B<assigned_functions> file into memory.
987 :    
988 :     =cut
989 :    
990 :     sub ReadAssignments {
991 :     # Get the parameters.
992 :     my ($self) = @_;
993 :     # Create the return hash.
994 :     my $retVal = {};
995 :     # Loop through the assigned-functions file, storing results in the hash. Later
996 :     # results will override earlier ones.
997 :     my $ih = Open(undef, "<$self->{directory}/assigned_functions");
998 :     while (! eof $ih) {
999 :     my ($fid, $function) = Tracer::GetLine($ih);
1000 :     $retVal->{$fid} = $function;
1001 :     }
1002 :     # Return the hash built.
1003 :     return $retVal;
1004 :     }
1005 :    
1006 :    
1007 :     =head3 FixContig
1008 :    
1009 :     $loaderObject->FixContig(\$contigID);
1010 :    
1011 :     Insure that the specified contig ID contains the genome ID.
1012 :    
1013 :     =cut
1014 :    
1015 :     sub FixContig {
1016 :     my ($self, $contigID) = @_;
1017 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
1018 :     unless ($$contigID =~ /^\d+\.\d+\:/) {
1019 :     $$contigID = "$self->{genome}:$$contigID";
1020 :     }
1021 :     }
1022 :    
1023 :     =head3 ConnectLocation
1024 :    
1025 :     $loaderObject->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
1026 :    
1027 :     Connect the specified feature to the specified contig location.
1028 :    
1029 :     =over 4
1030 :    
1031 :     =item fid
1032 :    
1033 :     ID of the relevant feature.
1034 :    
1035 :     =item contig
1036 :    
1037 :     ID of the contig containing this segment of the feature (normalized).
1038 :    
1039 :     =item segment
1040 :    
1041 :     Ordinal number of this segment.
1042 :    
1043 :     =item left
1044 :    
1045 :     Location of the segment's leftmost base pair.
1046 :    
1047 :     =item dir
1048 :    
1049 :     Direction (strand) of the segment (C<+> or C<->).
1050 :    
1051 :     =item len
1052 :    
1053 :     Length of the segment (which must be no greater than the configured maximum).
1054 :    
1055 :     =back
1056 :    
1057 :     =cut
1058 :    
1059 :     sub ConnectLocation {
1060 :     # Get the parameters.
1061 :     my ($self, $fid, $contig, $segment, $left, $dir, $len) = @_;
1062 :     # Get the sapling database.
1063 :     my $sap = $self->{sap};
1064 :     # Create the relationship.
1065 :     $sap->InsertObject('IsLocatedIn', from_link => $fid, to_link => $contig,
1066 :     begin => $left, dir => $dir, len => $len,
1067 :     ordinal => $segment);
1068 :     # Record it in the statistics.
1069 :     $self->{stats}->Add(segment => 1);
1070 :     }
1071 :    
1072 : parrello 1.6 =head2 Internal Utility Methods
1073 :    
1074 :     =head3 ReadAnnotation
1075 :    
1076 :     my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1077 :    
1078 :     Read the next record from an annotation file. The next record must exist (that is, an
1079 :     end-of-file check should have been performed before calling this method).
1080 :    
1081 :     =over 4
1082 :    
1083 :     =item ih
1084 :    
1085 :     Open file handle for the annotation file.
1086 :    
1087 :     =item RETURN
1088 :    
1089 :     Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1090 :     timestamp, (2) the user ID, and (3) the annotation text.
1091 :    
1092 :     =back
1093 :    
1094 :     =cut
1095 :    
1096 :     sub ReadAnnotation {
1097 :     # Get the parameter.
1098 :     my ($ih) = @_;
1099 :     # Read the three fixed fields.
1100 :     my $fid = <$ih>; chomp $fid;
1101 :     my $timestamp = <$ih>; chomp $timestamp;
1102 :     my $user = <$ih>; chomp $user;
1103 :     # Loop through the lines of the text field.
1104 :     my $text = "";
1105 :     my $line = <$ih>;
1106 : olson 1.13 while (defined($line) && $line ne "//\n") {
1107 : parrello 1.6 $text .= $line;
1108 :     $line = <$ih>;
1109 :     }
1110 :     # Remove the trailing new-line from the text.
1111 :     chomp $text;
1112 :     # Return the fields.
1113 :     return ($fid, $timestamp, $user, $text);
1114 :     }
1115 :    
1116 : olson 1.13 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3