[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingGenomeLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 : parrello 1.2 use AliasAnalysis;
29 : parrello 1.3 use base qw(SaplingDataLoader);
30 : parrello 1.1
31 :     =head1 Sapling Genome Loader
32 :    
33 :     This package contains the methods used to load a genome into the Sapling from a genome
34 :     directory. This is not very efficient if you are trying to create a full database,
35 :     but is useful if you wish to add genomes one at a time. The information loaded will
36 :     include the basic genome data, the contigs and the DNA, subsystems, and the features.
37 :    
38 :     =head2 Main Methods
39 :    
40 :     =head3 Load
41 :    
42 :     my $stats = SaplingGenomeLoader::Load($sap, $genome, $directory);
43 :    
44 :     Load a genome from a genome directory into the sapling database.
45 :    
46 :     =over 4
47 :    
48 :     =item sap
49 :    
50 :     L<Sapling> object used to access the target database.
51 :    
52 :     =item genome
53 :    
54 :     ID of the genome being loaded.
55 :    
56 :     =item directory
57 :    
58 :     Name of the directory containing the genome information.
59 :    
60 : parrello 1.11 =item assignHash
61 :    
62 :     Hash of feature IDs to functional assignments. Deleted features are removed, which
63 :     means only features listed in this hash can be legally inserted into the database.
64 :    
65 : parrello 1.1 =back
66 :    
67 :     =cut
68 :    
69 :     sub Load {
70 :     # Get the parameters.
71 :     my ($sap, $genome, $directory) = @_;
72 :     # Create the loader object.
73 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
74 :     # Load the contigs.
75 : parrello 1.2 Trace("Loading contigs for $genome.") if T(2);
76 : parrello 1.1 $loaderObject->LoadContigs();
77 :     # Load the features.
78 : parrello 1.2 Trace("Loading features for $genome.") if T(2);
79 : parrello 1.1 $loaderObject->LoadFeatures();
80 : parrello 1.7 # Check for annotation history. If we have it, load the history records into the
81 :     # database.
82 :     if (-f "$directory/annotations") {
83 :     Trace("Processing annotations.") if T(3);
84 :     $loaderObject->LoadAnnotations("$directory/annotations");
85 :     }
86 : parrello 1.1 # Load the subsystem bindings.
87 : parrello 1.2 Trace("Loading subsystems for $genome.") if T(2);
88 : parrello 1.1 $loaderObject->LoadSubsystems();
89 :     # Create the Genome record and taxonomy information.
90 : parrello 1.2 Trace("Creating root for $genome.") if T(2);
91 : parrello 1.1 $loaderObject->CreateGenome();
92 :     # Return the statistics.
93 :     return $loaderObject->{stats};
94 :     }
95 :    
96 :     =head3 ClearGenome
97 :    
98 :     my $stats = SaplingGenomeObject::ClearGenome($sap, $genome);
99 :    
100 :     Delete the specified genome and all the related records from the specified sapling
101 :     database. This method can also be used to clean up after a failed or aborted load.
102 :    
103 :     =over 4
104 :    
105 :     =item sap
106 :    
107 :     L<Sapling> object used to access the target database.
108 :    
109 :     =item genome
110 :    
111 :     ID of the genome to delete.
112 :    
113 :     =item RETURN
114 :    
115 :     Returns a statistics object counting the records deleted.
116 :    
117 :     =back
118 :    
119 :     =cut
120 :    
121 :     sub ClearGenome {
122 :     # Get the parameters.
123 :     my ($sap, $genome) = @_;
124 :     # Create the statistics object.
125 :     my $stats = Stats->new();
126 : parrello 1.9 # Delete the DNA sequences.
127 :     my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
128 :     for my $seq (@seqs) {
129 :     my $delStats = $sap->Delete(DNASequence => $seq);
130 :     $stats->Accumulate($delStats);
131 :     }
132 : parrello 1.1 # Delete the contigs.
133 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
134 : parrello 1.1 # Delete the features.
135 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
136 : parrello 1.1 # Delete the molecular machines.
137 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
138 : parrello 1.1 # Delete the genome itself.
139 :     my $subStats = $sap->Delete(Genome => $genome);
140 :     # Accumulate the statistics from the delete.
141 :     $stats->Accumulate($subStats);
142 :     # Return the statistics object.
143 :     return $stats;
144 :     }
145 :    
146 : parrello 1.7
147 :     =head3 Process
148 :    
149 :     my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
150 :    
151 :     Load genome data from the specified directory. If the genome data already
152 :     exists in the database, it will be deleted first.
153 :    
154 :     =over 4
155 :    
156 :     =item sap
157 :    
158 :     L</Sapling> object for accessing the database.
159 :    
160 :     =item genome
161 :    
162 :     ID of the genome whose data is being loaded.
163 :    
164 :     =item directory
165 :    
166 :     Name of the directory containing the genome data files.
167 :    
168 :     =item RETURN
169 :    
170 :     Returns a statistics object describing the activity during the reload.
171 :    
172 :     =back
173 :    
174 :     =cut
175 :    
176 :     sub Process {
177 :     # Get the parameters.
178 :     my ($sap, $genome, $directory) = @_;
179 :     # Clear the existing data for the specified genome.
180 :     my $stats = ClearGenome($sap, $genome);
181 :     # Load the new expression data from the specified directory.
182 :     my $newStats = Load($sap, $genome, $directory);
183 :     # Merge the statistics.
184 :     $stats->Accumulate($newStats);
185 :     # Return the result.
186 :     return $stats;
187 :     }
188 :    
189 :    
190 : parrello 1.1 =head2 Loader Object Methods
191 :    
192 :     =head3 new
193 :    
194 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
195 :    
196 :     Create a loader object that can be used to facilitate loading genome data from a
197 :     directory.
198 :    
199 :     =over 4
200 :    
201 :     =item sap
202 :    
203 :     L<Sapling> object used to access the target database.
204 :    
205 :     =item genome
206 :    
207 :     ID of the genome being loaded.
208 :    
209 :     =item directory
210 :    
211 :     Name of the directory containing the genome information.
212 :    
213 :     =back
214 :    
215 :     The object created contains the following fields.
216 :    
217 :     =over 4
218 :    
219 :     =item directory
220 :    
221 :     Name of the directory containing the genome data.
222 :    
223 :     =item genome
224 :    
225 :     ID of the genome being loaded.
226 :    
227 :     =item supportRecords
228 :    
229 :     A hash of hashes, used to track the support records known to exist in the database.
230 :    
231 :     =item sap
232 :    
233 :     L<Sapling> object used to access the database.
234 :    
235 :     =item stats
236 :    
237 :     L<Stats> object for tracking statistical information about the load.
238 :    
239 :     =back
240 :    
241 :     =cut
242 :    
243 :     sub new {
244 :     # Get the parameters.
245 :     my ($class, $sap, $genome, $directory) = @_;
246 :     # Create the object.
247 : parrello 1.3 my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
248 :     # Add our specialized data.
249 :     $retVal->{genome} = $genome;
250 :     $retVal->{directory} = $directory;
251 : parrello 1.11 # Leave the assignment hash undefined until we populate it.
252 :     $retVal->{assignHash} = undef;
253 : parrello 1.3 # Return the result.
254 : parrello 1.1 return $retVal;
255 :     }
256 :    
257 :     =head3 LoadContigs
258 :    
259 :     $loaderObject->LoadContigs();
260 :    
261 :     Load the contig information into the database. This includes the contigs themselves and
262 :     the DNA. The number of contigs will be recorded as the C<contigs> statistic and the
263 :     number of base pairs as the C<dna> statistic.
264 :    
265 :     =cut
266 :    
267 :     sub LoadContigs {
268 :     # Get this object.
269 :     my ($self) = @_;
270 :     # Open the contigs file from the genome directory.
271 :     my $ih = Open(undef, "<$self->{directory}/contigs");
272 :     # Get the length of a DNA segment.
273 :     my $segmentLength = $self->{sap}->TuningParameter('maxSequenceLength');
274 :     # Get the genome ID.
275 :     my $genome = $self->{genome};
276 :     # These variables will contain the current contig ID, the current contig length,
277 :     # the ordinal of the current chunk, the accumulated DNA sequence, and its length.
278 :     my ($contigID, $contigLen, $ordinal, $chunk, $chunkLen) = (undef, 0, 0, '', 0);
279 :     # Loop through the contig file.
280 :     while (! eof $ih) {
281 :     # Get the current record.
282 :     my $line = <$ih>; chomp $line;
283 :     # Is this the start of a new contig?
284 :     if ($line =~ /^>(\S+)/) {
285 :     # Yes. Save the contig ID.
286 :     my $newContigID = $1;
287 :     # Is there a current contig?
288 :     if (defined $contigID) {
289 :     # Yes. Output the current chunk.
290 :     $self->OutputChunk($contigID, $ordinal, $chunk);
291 :     # Output the contig.
292 :     $self->OutputContig($contigID, $contigLen);
293 :     }
294 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
295 :     $self->FixContig(\$newContigID);
296 :     # Initialize the new contig.
297 :     $contigID = $newContigID;
298 :     $contigLen = 0;
299 :     $ordinal = 0;
300 :     $chunk = '';
301 :     $chunkLen = 0;
302 :     } else {
303 :     # Here we have more DNA in the current contig. Are we at the end of
304 :     # the current chunk?
305 :     my $lineLen = length($line);
306 :     my $newChunkLen = $chunkLen + $lineLen;
307 :     if ($newChunkLen < $segmentLength) {
308 :     # No. Add this line to the chunk.
309 :     $chunk .= $line;
310 :     $chunkLen += $lineLen;
311 :     } else {
312 :     # Yes. Create the actual chunk.
313 :     $chunk .= substr($line, 0, $segmentLength - $chunkLen);
314 :     # Write it out.
315 :     $self->OutputChunk($contigID, $ordinal, $chunk);
316 :     # Set up the new chunk.
317 :     $chunk = substr($line, $segmentLength - $chunkLen);
318 :     $ordinal++;
319 :     $chunkLen = length($chunk);
320 :     }
321 :     # Update the contig length.
322 :     $contigLen += $lineLen;
323 :     }
324 :     }
325 :     # Is there a current contig?
326 :     if (defined $contigID) {
327 :     # Yes. Output the residual chunk.
328 :     $self->OutputChunk($contigID, $ordinal, $chunk);
329 :     # Output the contig itself.
330 :     $self->OutputContig($contigID, $contigLen);
331 :     }
332 :     }
333 :    
334 :     =head3 OutputChunk
335 :    
336 :     $loaderObject->OutputChunk($contigID, $ordinal, $chunk);
337 :    
338 :     Output a chunk of DNA for the specified contig.
339 :    
340 :     =over 4
341 :    
342 :     =item contigID
343 :    
344 :     ID of the contig being output.
345 :    
346 :     =item ordinal
347 :    
348 :     Ordinal number of this chunk.
349 :    
350 :     =item chunk
351 :    
352 :     DNA sequence comprising this chunk.
353 :    
354 :     =back
355 :    
356 :     =cut
357 :    
358 :     sub OutputChunk {
359 :     # Get the parameters.
360 :     my ($self, $contigID, $ordinal, $chunk) = @_;
361 :     # Get the sapling object.
362 :     my $sap = $self->{sap};
363 :     # Compute the chunk ID.
364 :     my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
365 :     # Connect this sequence to the contig.
366 : parrello 1.2 $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
367 : parrello 1.1 # Create the DNA sequence.
368 :     $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
369 :     # Record the chunk.
370 :     $self->{stats}->Add(chunks => 1);
371 :     }
372 :    
373 :     =head3 OutputContig
374 :    
375 :     $loaderObject->OutputContig($contigID, $contigLen);
376 :    
377 :     Write out the current contig.
378 :    
379 :     =over 4
380 :    
381 :     =item contigID
382 :    
383 :     ID of the contig being written out.
384 :    
385 :     =item contigLen
386 :    
387 :     Length of the contig in base pairs.
388 :    
389 :     =back
390 :    
391 :     =cut
392 :    
393 :     sub OutputContig {
394 :     # Get the parameters.
395 :     my ($self, $contigID, $contigLen) = @_;
396 :     # Get the sapling object.
397 :     my $sap = $self->{sap};
398 :     # Connect the contig to the genome.
399 :     $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
400 :     # Output the contig record.
401 :     $sap->InsertObject('Contig', id => $contigID, length => $contigLen);
402 :     # Record the contig.
403 :     $self->{stats}->Add(contigs => 1);
404 :     $self->{stats}->Add(dna => $contigLen);
405 :     }
406 :    
407 :     =head3 LoadFeatures
408 :    
409 :     $loaderObject->LoadFeatures();
410 :    
411 :     Load the feature data into the database. This includes all of the features,
412 :     their protein sequences, and their subsystem information. The number of
413 :     features of each type will be stored in the statistics object, identified by
414 :     the feature type.
415 :    
416 :     =cut
417 :    
418 :     sub LoadFeatures {
419 :     # Get the parameters.
420 :     my ($self) = @_;
421 : parrello 1.7 # Read in the functional assignments.
422 :     Trace("Reading functional assignments.") if T(3);
423 :     my $assignHash = $self->ReadAssignments();
424 : parrello 1.11 $self->{assignHash} = $assignHash;
425 : parrello 1.1 # Get the directory of feature types.
426 :     my $featureDir = "$self->{directory}/Features";
427 :     my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
428 : parrello 1.11 # Check for protein sequences. If we have some, load them into a hash.
429 :     my $protHash = {};
430 :     if (-f "$featureDir/peg/fasta") {
431 :     Trace("Processing protein sequences.") if T(3);
432 :     $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
433 :     }
434 : parrello 1.1 # Create the feature records for the types found.
435 :     for my $type (@types) {
436 :     # Insure this is a genuine feature directory.
437 :     if (-f "$featureDir/$type/tbl") {
438 : parrello 1.11 # Yes. Read in the evidence codes (if any).
439 :     my $evHash = {};
440 :     my $tranFile = "$featureDir/$type/Attributes/transaction_log";
441 :     if (-f $tranFile) {
442 :     $evHash = $self->LoadEvidenceCodes($tranFile);
443 :     }
444 :     # Now load the feature data.
445 :     $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
446 : parrello 1.1 }
447 :     }
448 : parrello 1.11 }
449 :    
450 :     =head3 LoadEvidenceCodes
451 :    
452 :     my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
453 :    
454 :     Load the evidence codes from the specified attribute transaction log file into a
455 :     hash. The log file is in tab-delimited format. The first column contains the
456 :     transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
457 :     the third an attribute name (we'll ignore everything but C<evidence_code>), and
458 :     the fourth the attribute value.
459 :    
460 :     =over 4
461 :    
462 :     =item attributeFile
463 :    
464 :     Name of the attribute transaction log file.
465 :    
466 :     =item RETURN
467 :    
468 :     Returns a reference to a hash mapping each feature ID to a comma-delimited list
469 :     of its evidence codes.
470 :    
471 :     =back
472 :    
473 :     =cut
474 :    
475 :     sub LoadEvidenceCodes {
476 :     # Get the parameters.
477 :     my ($self, $attributeFile) = @_;
478 :     # Get the Sapling database.
479 :     my $sap = $self->{sap};
480 :     # Get the statistics object.
481 :     my $stats = $self->{stats};
482 :     # Get the assignment hash: we use this to filter the feature IDs.
483 :     my $assignHash = $self->{assignHash};
484 :     # Open the attribute log file for input.
485 :     my $ih = Open(undef, "<$attributeFile");
486 :     # This two-dimensional hash will hold the evidence codes for each feature.
487 :     my %retVal;
488 :     # Loop through the attribute log file.
489 :     while (! eof $ih) {
490 :     # Get the current attribute record.
491 :     my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
492 :     $stats->Add(attributeLine => 1);
493 :     # Insure we have all the pieces we need.
494 :     if (! $command || ! $fid || $key ne 'evidence_code') {
495 :     $stats->Add(attributeLineSkipped => 1);
496 :     } elsif (! $assignHash->{$fid}) {
497 :     # Here the attribute is for a deleted feature.
498 :     $stats->Add(attributeFidSkipped => 1);
499 :     } else {
500 :     # Get the sub-hash for this feature.
501 :     if (! exists $retVal{$fid}) {
502 :     $retVal{$fid} = {};
503 :     }
504 :     my $featureSubHash = $retVal{$fid};
505 :     # Process according to the command.
506 :     if ($command eq 'ADD') {
507 :     # Here we are adding a new evidence code.
508 :     $featureSubHash->{$value} = 1;
509 :     $stats->Add(attributeAdd => 1);
510 :     } elsif ($command eq 'DELETE') {
511 :     # Here we are deleting an evidence code.
512 :     delete $featureSubHash->{$value};
513 :     $stats->Add(attributeDelete => 1);
514 :     } else {
515 :     # here we have an unrecognized command.
516 :     $stats->Add(attributeCommandSkip => 1);
517 :     }
518 :     }
519 : parrello 1.1 }
520 : parrello 1.11 # Loop through the hash, converting each sub-hash to a comma-delimited list of
521 :     # evidence codes.
522 :     for my $fid (keys %retVal) {
523 :     $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
524 : parrello 1.6 }
525 : parrello 1.11 # Return the evidence hash.
526 :     return \%retVal;
527 : parrello 1.1 }
528 :    
529 : parrello 1.11
530 : parrello 1.1 =head3 LoadFeatureData
531 :    
532 : parrello 1.11 $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
533 : parrello 1.1
534 :     Load the basic data for each feature into the database. The number of features of
535 :     the type found will be recorded in the statistics object.
536 :    
537 :     =over 4
538 :    
539 :     =item featureDir
540 :    
541 :     Name of the main directory containing all the feature type subdirectories.
542 :    
543 :     =item type
544 :    
545 :     Type of feature to load.
546 :    
547 : parrello 1.11 =item protHash
548 :    
549 :     Reference to a hash mapping each feature ID for a protein-encoding gene to
550 :     its protein sequence.
551 :    
552 :     =item evHash
553 : parrello 1.7
554 : parrello 1.11 Reference to a hash mapping each feature ID to a comma-delimited list of
555 :     its evidence codes (if any).
556 : parrello 1.7
557 : parrello 1.1 =back
558 :    
559 :     =cut
560 :    
561 :     sub LoadFeatureData {
562 :     # Get the parameters.
563 : parrello 1.11 my ($self, $featureDir, $type, $protHash, $evHash) = @_;
564 : parrello 1.1 # Get the sapling database.
565 :     my $sap = $self->{sap};
566 :     # Get the statistics object.
567 :     my $stats = $self->{stats};
568 : parrello 1.11 # Get the assignment hash. This tells us our functional assignments. This method is
569 :     # also where we remove the deleted features from it.
570 :     my $assignHash = $self->{assignHash};
571 : parrello 1.1 # This hash will track the features we've created. If a feature is found a second
572 :     # time, it overwrites the original.
573 : parrello 1.11 my %fidHash;
574 : parrello 1.6 # Finally, we need the timestamp hash. The initial feature population
575 : parrello 1.1 # Insure we have a tbl file for this feature type.
576 :     my $fileName = "$featureDir/$type/tbl";
577 :     if (-f $fileName) {
578 : parrello 1.3 # We have one, so we can read through it. First, however, we need to get the list
579 : parrello 1.11 # of deleted features and remove them from the assignment hash. This insures
580 :     # that they are not used by subsequent methods.
581 : parrello 1.3 my $deleteFile = "$featureDir/$type/deleted.features";
582 :     if (-f $deleteFile) {
583 : parrello 1.11 my $dh = Open(undef, "<$deleteFile");
584 :     while (! eof $dh) {
585 :     my ($deletedFid) = Tracer::GetLine($dh);
586 :     if (exists $assignHash->{$deletedFid}) {
587 :     delete $assignHash->{$deletedFid};
588 :     $stats->Add(deletedFid => 1);
589 :     }
590 :     }
591 : parrello 1.3 }
592 :     # Open the main file for input.
593 : parrello 1.7 Trace("Reading features from $fileName.") if T(3);
594 : parrello 1.1 my $ih = Open(undef, "<$fileName");
595 :     while (! eof $ih) {
596 :     # Read this feature's information.
597 :     my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
598 : parrello 1.3 # Only proceed if the feature is NOT deleted.
599 : parrello 1.11 if (exists $assignHash->{$fid}) {
600 : parrello 1.3 # If the feature already exists, delete it. (This should be extremely rare.)
601 : parrello 1.11 if ($fidHash{$fid}) {
602 : parrello 1.3 $sap->Delete(Feature => $fid);
603 :     $stats->Add(duplicateFid => 1);
604 : parrello 1.1 }
605 : parrello 1.11 # If this is RNA, the alias list is always empty. Sometimes, the functional
606 :     # assignment is found there.
607 :     if ($type eq 'rna') {
608 :     if (! $assignHash->{$fid}) {
609 :     $assignHash->{$fid} = $aliases[0];
610 : parrello 1.2 }
611 : parrello 1.11 @aliases = ();
612 : parrello 1.2 }
613 : parrello 1.11 # Add the feature to the database.
614 :     $self->AddFeature($fid, $assignHash->{$fid}, $locations, \@aliases,
615 :     $protHash->{$fid}, $evHash->{$fid});
616 :     # Denote we've added this feature, so that if a duplicate occurs we're ready.
617 :     $fidHash{$fid} = 1;
618 : parrello 1.2 }
619 : parrello 1.1 }
620 :     }
621 :     }
622 :    
623 : parrello 1.11
624 : parrello 1.1 =head3 LoadProteinData
625 :    
626 : parrello 1.11 my $protHash = $self->LoadProteinData($fileName);
627 : parrello 1.1
628 :     Load the protein sequences from the named FASTA file. The sequences will be stored
629 : parrello 1.11 in a hash by FIG feature ID.
630 : parrello 1.1
631 :     =over 4
632 :    
633 :     =item fileName
634 :    
635 :     Name of the FASTA file containing the protein sequences for this genome.
636 :    
637 : parrello 1.11 =item RETURN
638 :    
639 :     Returns a hash mapping feature IDs to protein sequences.
640 :    
641 : parrello 1.1 =back
642 :    
643 :     =cut
644 :    
645 :     sub LoadProteinData {
646 :     # Get the parameters.
647 :     my ($self, $fileName) = @_;
648 :     # Open the FASTA file for input.
649 :     my $ih = Open(undef, "<$fileName");
650 : parrello 1.11 # Create the return hash.
651 :     my $retVal = {};
652 : parrello 1.1 # We'll track the current protein in here.
653 :     my $fid;
654 :     my $sequence = "";
655 :     # Loop through the input file.
656 :     while (! eof $ih) {
657 :     # Get the current record.
658 :     my $line = <$ih>; chomp $line;
659 :     # Is this a label record.
660 :     if ($line =~ /^>(\S+)/) {
661 :     # Yes. Save the new feature ID.
662 :     my $newFid = $1;
663 :     # Do we have an existing protein?
664 :     if (defined $fid) {
665 : parrello 1.11 # Yes. Store it in the hash.
666 :     $retVal->{$fid} = $sequence;
667 : parrello 1.1 }
668 :     # Initialize for the next protein.
669 :     $fid = $newFid;
670 :     $sequence = "";
671 :     } else {
672 :     # Here we have more letters for the current protein.
673 :     $sequence .= $line;
674 :     }
675 :     }
676 :     # Do we have a residual protein.
677 :     if (defined $fid) {
678 : parrello 1.11 # Yes. Store it in the hash.
679 :     $retVal->{$fid} = $sequence;
680 : parrello 1.1 }
681 : parrello 1.11 # Return the hash.
682 :     return $retVal;
683 : parrello 1.1 }
684 :    
685 : parrello 1.6
686 :     =head3 LoadAnnotations
687 :    
688 :     $loaderObject->LoadAnnotations($fileName);
689 :    
690 :     Read in the annotation history information and use it to create annotation records.
691 :    
692 :     =over 4
693 :    
694 :     =item fileName
695 :    
696 :     Name of the annotation history file. This file is formatted with four fields per
697 :     record. Each field is on a separate line, with a double slash (C<//>) used as the
698 :     line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
699 :     (formatted as an integer), (2) the user name, and (3) the annotation text.
700 :    
701 :     =back
702 :    
703 :     =cut
704 :    
705 :     sub LoadAnnotations {
706 :     # Get the parameters.
707 :     my ($self, $fileName) = @_;
708 : parrello 1.11 # Get the assignment Hash. We use this to filter out deleted features.
709 :     my $assignHash = $self->{assignHash};
710 : parrello 1.6 # Get the Sapling database.
711 :     my $sap = $self->{sap};
712 :     # Get the statistics object.
713 :     my $stats = $self->{stats};
714 :     # Open the input file.
715 :     my $ih = Tracer::Open(undef, "<$fileName");
716 :     # Loop through the input.
717 :     while (! eof $ih) {
718 :     # Read in the peg, timestamp, and user ID.
719 :     my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
720 : parrello 1.11 # Only proceed if the feature is not deleted.
721 :     if ($assignHash->{$fid}) {
722 :     # Add the annotation to this feature.
723 :     $self->MakeAnnotation($fid, $text, $user, $timestamp);
724 : parrello 1.6 }
725 :     }
726 :     }
727 :    
728 :    
729 : parrello 1.1 =head3 WriteProtein
730 :    
731 :     $loaderObject->WriteProtein($fid, $sequence);
732 :    
733 :     Write out the specified protein sequence and associate it with the specified feature.
734 :     This requires checking to see if the protein sequence is already in the database.
735 :    
736 :     =over 4
737 :    
738 :     =item fid
739 :    
740 :     ID of the feature with the specified protein sequence.
741 :    
742 :     =item sequence
743 :    
744 :     Protein sequence for the indicated feature.
745 :    
746 :     =back
747 :    
748 :     =cut
749 :    
750 :     sub WriteProtein {
751 :     # Get the parameters.
752 :     my ($self, $fid, $sequence) = @_;
753 :     # Compute the key of the protein sequence.
754 : parrello 1.8 my $protID = $self->{sap}->ProteinID($sequence);
755 : parrello 1.1 # Insure the protein exists.
756 :     $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
757 :     # Connect the feature to it.
758 :     $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
759 :     }
760 :    
761 :     =head3 LoadSubsystems
762 :    
763 :     $loaderObject->LoadSubsystems();
764 :    
765 :     Load the subsystem data into the database. This includes all of the subsystems,
766 :     their categories, roles, and the bindings that connect them to this genome's features.
767 :    
768 :     =cut
769 :    
770 :     sub LoadSubsystems {
771 :     # Get the parameters.
772 :     my ($self) = @_;
773 :     # Get the sapling object.
774 :     my $sap = $self->{sap};
775 :     # Get the statistics object.
776 :     my $stats = $self->{stats};
777 :     # Get the genome ID.
778 :     my $genome = $self->{genome};
779 :     # Connect to the Sapling server so we can get the subsystem variant and role information.
780 :     my $sapO = SAPserver->new();
781 :     # This hash maps subsystem IDs to molecular machine IDs.
782 :     my %machines;
783 :     # This hash maps subsystem/role pairs to machine role IDs.
784 :     my %machineRoles;
785 :     # The first step is to create the subsystems themselves and connect them to the
786 :     # genome. A list is given in the subsystems file of the Subsystems directory.
787 :     my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
788 :     # Create the field hash for the subsystem query.
789 :     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)
790 :     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)
791 :     Subsystem(version)
792 :     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)
793 :     Includes(sequence)
794 :     Role(id) Role(hypothetical) Role(role-index));
795 :     my %fields = map { $_ => $_ } @fields;
796 :     # Loop through the subsystems in the file, insuring we have them in the database.
797 :     while (! eof $ih) {
798 :     # Get this subsystem.
799 :     my ($subsystem, $variant) = Tracer::GetLine($ih);
800 : parrello 1.10 Trace("Processing subsystem $subsystem variant $variant.") if T(3);
801 : parrello 1.1 # Normalize the subsystem name.
802 :     $subsystem = $sap->SubsystemID($subsystem);
803 :     # Compute this subsystem's MD5.
804 :     my $subsystemMD5 = ERDB::DigestKey($subsystem);
805 :     # Insure the subsystem is in the database.
806 :     if (! $sap->Exists(Subsystem => $subsystem)) {
807 :     # The subsystem isn't present, so we need to read it. We get the subsystem,
808 :     # its roles, and its classification information. The variant will be added
809 :     # later if we need it.
810 :     my $roleList = $sapO->get(-objects => "Subsystem Includes Role",
811 :     -filter => {"Subsystem(id)" => ["=", $subsystem] },
812 :     -fields => \%fields);
813 :     # Only proceed if we found some roles.
814 :     if (@$roleList > 0) {
815 :     # Get the subsystem information from the first role and create the subsystem.
816 :     my $roleH = $roleList->[0];
817 : parrello 1.3 my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
818 : parrello 1.1 $sap->InsertObject('Subsystem', %subFields);
819 : parrello 1.10 $stats->Add(subsystems => 1);
820 : parrello 1.1 # Now loop through the roles. The Includes records are always inserted, but the
821 :     # roles are only inserted if they don't already exist.
822 :     for $roleH (@$roleList) {
823 :     # Create the Includes record.
824 : parrello 1.3 my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);
825 : parrello 1.1 $sap->InsertObject('Includes', %incFields);
826 :     # Insure we have the role in place.
827 : parrello 1.3 my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);
828 : parrello 1.1 my $roleID = $roleFields{id};
829 :     delete $roleFields{id};
830 :     $self->InsureEntity('Role', $roleID, %roleFields);
831 :     # Compute the machine-role ID.
832 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
833 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
834 : parrello 1.10 $stats->Add(subsystemRoles => 1);
835 : parrello 1.1 }
836 :     }
837 :     } else {
838 :     # We already have the subsystem in the database, but we still need to compute the
839 :     # machine role IDs. We need information from the sapling server for this.
840 :     my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);
841 :     # Loop through the roles.
842 :     my $roleList = $subHash->{$subsystem};
843 :     for my $roleTuple (@$roleList) {
844 :     my ($roleID, $abbr) = @$roleTuple;
845 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
846 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
847 :     }
848 :     }
849 :     # Now we need to connect the variant to the subsystem. First, we compute the real
850 :     # variant code by removing the asterisks.
851 :     my $variantCode = $variant;
852 :     $variantCode =~ s/^\*+//;
853 :     $variantCode =~ s/\*+$//;
854 :     # Compute the variant key.
855 :     my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
856 :     # Insure we have it in the database.
857 : parrello 1.10 if (! $sap->Exists(Variant => $variantKey)) {
858 :     # Get the variant from the sapling server.
859 :     my $variantH = $sapO->get(-objects => "Variant",
860 :     -filter => {"Variant(id)" => ["=", $variantKey]},
861 :     -fields => {"Variant(code)" => "code",
862 :     "Variant(comment)" => "comment",
863 :     "Variant(type)" => "type",
864 :     "Variant(role-rule)" => "role-rule"},
865 :     -firstOnly => 1);
866 :     $self->InsureEntity('Variant', $variantKey, %$variantH);
867 :     # Connect it to the subsystem.
868 :     $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
869 :     $stats->Add(subsystemVariants => 1);
870 :     }
871 : parrello 1.1 # Now we create the molecular machine connecting this genome to the subsystem
872 :     # variant.
873 :     my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
874 :     $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
875 :     $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
876 :     $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID);
877 :     # Remember the machine ID.
878 :     $machines{$subsystem} = $machineID;
879 :     }
880 :     # Now we go through the bindings file. This file connects the subsystem roles to
881 :     # the molecular machines.
882 :     $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
883 :     # Loop through the bindings.
884 :     while (! eof $ih) {
885 :     # Get the binding data.
886 :     my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
887 :     # Normalize the subsystem name.
888 :     $subsystem = $sap->SubsystemID($subsystem);
889 :     # Compute the machine role.
890 :     my $machineRoleID = $machineRoles{$subsystem}{$role};
891 :     # Insure it exists.
892 :     my $created = $self->InsureEntity(MachineRole => $machineRoleID);
893 :     if ($created) {
894 :     # We created the machine role, so connect it to the machine.
895 :     my $machineID = $machines{$subsystem};
896 :     $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID);
897 :     # Connect it to the role, too.
898 :     $sap->InsertObject('IsRoleOf', from_link => $role, to_link => $machineRoleID);
899 :     }
900 :     # Connect the feature.
901 :     $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
902 :     }
903 :     }
904 :    
905 :     =head3 CreateGenome
906 :    
907 :     $loaderObject->CreateGenome();
908 :    
909 :     Create the genome record.
910 :    
911 :     =cut
912 :    
913 :     # This hash maps cache statistics to genome record fields.
914 :     use constant GENOME_FIELDS => {genome_name => 'scientific-name',
915 :     genome_domain => 'domain'};
916 :     # This hash maps domains to the prokaryotic flag.
917 :     use constant PROK_FLAG => {Bacteria => 1, Archaea => 1};
918 :    
919 :     sub CreateGenome {
920 :     # Get the parameters.
921 :     my ($self) = @_;
922 :     # Get the genome directory.
923 :     my $dir = $self->{directory};
924 :     # Get the sapling database.
925 :     my $sap = $self->{sap};
926 :     # We'll put the genome attributes in here.
927 :     my %fields;
928 :     # Check for a basic statistics file.
929 :     my $statsFile = "$dir/cache.basic_statistics";
930 :     if (-f $statsFile) {
931 :     # We have the statistics file, so read the major attributes from it.
932 :     my $ih = Open(undef, "<$statsFile");
933 :     while (! eof $ih) {
934 :     my ($key, $value) = Tracer::GetLine($ih);
935 :     my $fieldKey = GENOME_FIELDS->{$key};
936 :     if ($fieldKey) {
937 :     $fields{$fieldKey} = $value;
938 :     }
939 :     }
940 :     # Translate the domain.
941 :     $fields{domain} = ucfirst $fields{domain};
942 :     # Denote the genome is complete.
943 :     $fields{complete} = 1;
944 :     } else {
945 :     # Check to see if this genome is complete.
946 :     $fields{complete} = (-f "$dir/COMPLETE" ? 1 : 0);
947 :     # Get the genome name.
948 :     my $ih = Open(undef, "<$dir/GENOME");
949 :     my $line = <$ih>; chomp $line;
950 :     $fields{'scientific-name'} = $line;
951 :     # Get the taxonomy and extract the domain from it.
952 :     $ih = Open(undef, "<$dir/TAXONOMY");
953 :     ($fields{domain}) = split /;/, <$ih>, 2;
954 :     }
955 :     # Get the counts from the statistics object.
956 :     my $stats = $self->{stats};
957 :     $fields{contigs} = $stats->Ask('contigs');
958 :     $fields{'dna-size'} = $stats->Ask('dna');
959 :     $fields{pegs} = $stats->Ask('peg');
960 :     $fields{rnas} = $stats->Ask('rna');
961 :     # Get the genetic code. The default is 11.
962 :     $fields{'genetic-code'} = 11;
963 :     my $geneticCodeFile = "$dir/GENETIC_CODE";
964 :     if (-f $geneticCodeFile) {
965 :     # There's a genetic code file, so we need to read it for the code.
966 :     my $ih = Open(undef, "<$geneticCodeFile");
967 :     ($fields{'genetic-code'}) = Tracer::GetLine($ih);
968 :     }
969 :     # Use the domain to determine whether or not the genome is prokaryotic.
970 :     $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
971 :     # Finally, add the genome ID.
972 :     $fields{id} = $self->{genome};
973 :     # Create the genome record.
974 :     $sap->InsertObject('Genome', %fields);
975 :     }
976 :    
977 :     =head3 ReadAssignments
978 :    
979 :     my $assignHash = $loaderObject->ReadAssignments();
980 :    
981 :     Return a hash mapping each feature ID to its functional assignment. This method
982 :     essentially reads the B<assigned_functions> file into memory.
983 :    
984 :     =cut
985 :    
986 :     sub ReadAssignments {
987 :     # Get the parameters.
988 :     my ($self) = @_;
989 :     # Create the return hash.
990 :     my $retVal = {};
991 :     # Loop through the assigned-functions file, storing results in the hash. Later
992 :     # results will override earlier ones.
993 :     my $ih = Open(undef, "<$self->{directory}/assigned_functions");
994 :     while (! eof $ih) {
995 :     my ($fid, $function) = Tracer::GetLine($ih);
996 :     $retVal->{$fid} = $function;
997 :     }
998 :     # Return the hash built.
999 :     return $retVal;
1000 :     }
1001 :    
1002 :    
1003 :     =head3 FixContig
1004 :    
1005 :     $loaderObject->FixContig(\$contigID);
1006 :    
1007 :     Insure that the specified contig ID contains the genome ID.
1008 :    
1009 :     =cut
1010 :    
1011 :     sub FixContig {
1012 :     my ($self, $contigID) = @_;
1013 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
1014 :     unless ($$contigID =~ /^\d+\.\d+\:/) {
1015 :     $$contigID = "$self->{genome}:$$contigID";
1016 :     }
1017 :     }
1018 :    
1019 :     =head3 ConnectLocation
1020 :    
1021 :     $loaderObject->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
1022 :    
1023 :     Connect the specified feature to the specified contig location.
1024 :    
1025 :     =over 4
1026 :    
1027 :     =item fid
1028 :    
1029 :     ID of the relevant feature.
1030 :    
1031 :     =item contig
1032 :    
1033 :     ID of the contig containing this segment of the feature (normalized).
1034 :    
1035 :     =item segment
1036 :    
1037 :     Ordinal number of this segment.
1038 :    
1039 :     =item left
1040 :    
1041 :     Location of the segment's leftmost base pair.
1042 :    
1043 :     =item dir
1044 :    
1045 :     Direction (strand) of the segment (C<+> or C<->).
1046 :    
1047 :     =item len
1048 :    
1049 :     Length of the segment (which must be no greater than the configured maximum).
1050 :    
1051 :     =back
1052 :    
1053 :     =cut
1054 :    
1055 :     sub ConnectLocation {
1056 :     # Get the parameters.
1057 :     my ($self, $fid, $contig, $segment, $left, $dir, $len) = @_;
1058 :     # Get the sapling database.
1059 :     my $sap = $self->{sap};
1060 :     # Create the relationship.
1061 :     $sap->InsertObject('IsLocatedIn', from_link => $fid, to_link => $contig,
1062 :     begin => $left, dir => $dir, len => $len,
1063 :     ordinal => $segment);
1064 :     # Record it in the statistics.
1065 :     $self->{stats}->Add(segment => 1);
1066 :     }
1067 :    
1068 : parrello 1.6 =head2 Internal Utility Methods
1069 :    
1070 :     =head3 ReadAnnotation
1071 :    
1072 :     my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1073 :    
1074 :     Read the next record from an annotation file. The next record must exist (that is, an
1075 :     end-of-file check should have been performed before calling this method).
1076 :    
1077 :     =over 4
1078 :    
1079 :     =item ih
1080 :    
1081 :     Open file handle for the annotation file.
1082 :    
1083 :     =item RETURN
1084 :    
1085 :     Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1086 :     timestamp, (2) the user ID, and (3) the annotation text.
1087 :    
1088 :     =back
1089 :    
1090 :     =cut
1091 :    
1092 :     sub ReadAnnotation {
1093 :     # Get the parameter.
1094 :     my ($ih) = @_;
1095 :     # Read the three fixed fields.
1096 :     my $fid = <$ih>; chomp $fid;
1097 :     my $timestamp = <$ih>; chomp $timestamp;
1098 :     my $user = <$ih>; chomp $user;
1099 :     # Loop through the lines of the text field.
1100 :     my $text = "";
1101 :     my $line = <$ih>;
1102 :     while ($line ne "//\n") {
1103 :     $text .= $line;
1104 :     $line = <$ih>;
1105 :     }
1106 :     # Remove the trailing new-line from the text.
1107 :     chomp $text;
1108 :     # Return the fields.
1109 :     return ($fid, $timestamp, $user, $text);
1110 :     }
1111 :    
1112 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3