[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.17 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingGenomeLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 : parrello 1.2 use AliasAnalysis;
29 : parrello 1.3 use base qw(SaplingDataLoader);
30 : parrello 1.1
31 :     =head1 Sapling Genome Loader
32 :    
33 :     This package contains the methods used to load a genome into the Sapling from a genome
34 :     directory. This is not very efficient if you are trying to create a full database,
35 :     but is useful if you wish to add genomes one at a time. The information loaded will
36 :     include the basic genome data, the contigs and the DNA, subsystems, and the features.
37 :    
38 :     =head2 Main Methods
39 :    
40 :     =head3 Load
41 :    
42 :     my $stats = SaplingGenomeLoader::Load($sap, $genome, $directory);
43 :    
44 :     Load a genome from a genome directory into the sapling database.
45 :    
46 :     =over 4
47 :    
48 :     =item sap
49 :    
50 :     L<Sapling> object used to access the target database.
51 :    
52 :     =item genome
53 :    
54 :     ID of the genome being loaded.
55 :    
56 :     =item directory
57 :    
58 :     Name of the directory containing the genome information.
59 :    
60 :     =back
61 :    
62 :     =cut
63 :    
64 :     sub Load {
65 :     # Get the parameters.
66 : parrello 1.17 my ($sap, $genome, $directory) = @_;
67 : parrello 1.1 # Create the loader object.
68 : parrello 1.17 my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
69 : parrello 1.1 # Load the contigs.
70 : parrello 1.12 Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
71 : parrello 1.1 $loaderObject->LoadContigs();
72 :     # Load the features.
73 : parrello 1.12 Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
74 : parrello 1.1 $loaderObject->LoadFeatures();
75 : parrello 1.7 # Check for annotation history. If we have it, load the history records into the
76 :     # database.
77 :     if (-f "$directory/annotations") {
78 : parrello 1.12 Trace("Processing annotations.") if T(SaplingDataLoader => 3);
79 : parrello 1.7 $loaderObject->LoadAnnotations("$directory/annotations");
80 :     }
81 : parrello 1.1 # Load the subsystem bindings.
82 : parrello 1.12 Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
83 : parrello 1.1 $loaderObject->LoadSubsystems();
84 :     # Create the Genome record and taxonomy information.
85 : parrello 1.12 Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
86 : parrello 1.1 $loaderObject->CreateGenome();
87 :     # Return the statistics.
88 :     return $loaderObject->{stats};
89 :     }
90 :    
91 :     =head3 ClearGenome
92 :    
93 :     my $stats = SaplingGenomeObject::ClearGenome($sap, $genome);
94 :    
95 :     Delete the specified genome and all the related records from the specified sapling
96 :     database. This method can also be used to clean up after a failed or aborted load.
97 :    
98 :     =over 4
99 :    
100 :     =item sap
101 :    
102 :     L<Sapling> object used to access the target database.
103 :    
104 :     =item genome
105 :    
106 :     ID of the genome to delete.
107 :    
108 :     =item RETURN
109 :    
110 :     Returns a statistics object counting the records deleted.
111 :    
112 :     =back
113 :    
114 :     =cut
115 :    
116 :     sub ClearGenome {
117 :     # Get the parameters.
118 :     my ($sap, $genome) = @_;
119 :     # Create the statistics object.
120 :     my $stats = Stats->new();
121 : parrello 1.9 # Delete the DNA sequences.
122 :     my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
123 :     for my $seq (@seqs) {
124 :     my $delStats = $sap->Delete(DNASequence => $seq);
125 :     $stats->Accumulate($delStats);
126 :     }
127 : parrello 1.1 # Delete the contigs.
128 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
129 : parrello 1.1 # Delete the features.
130 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
131 : parrello 1.1 # Delete the molecular machines.
132 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
133 : parrello 1.1 # Delete the genome itself.
134 :     my $subStats = $sap->Delete(Genome => $genome);
135 :     # Accumulate the statistics from the delete.
136 :     $stats->Accumulate($subStats);
137 :     # Return the statistics object.
138 :     return $stats;
139 :     }
140 :    
141 : parrello 1.7
142 :     =head3 Process
143 :    
144 :     my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
145 :    
146 :     Load genome data from the specified directory. If the genome data already
147 :     exists in the database, it will be deleted first.
148 :    
149 :     =over 4
150 :    
151 :     =item sap
152 :    
153 :     L</Sapling> object for accessing the database.
154 :    
155 :     =item genome
156 :    
157 :     ID of the genome whose data is being loaded.
158 :    
159 :     =item directory
160 :    
161 : parrello 1.17 Name of the directory containing the genome data files. If omitted, the
162 :     genome will be deleted from the database.
163 : olson 1.16
164 : parrello 1.7 =item RETURN
165 :    
166 :     Returns a statistics object describing the activity during the reload.
167 :    
168 :     =back
169 :    
170 :     =cut
171 :    
172 :     sub Process {
173 :     # Get the parameters.
174 : parrello 1.17 my ($sap, $genome, $directory) = @_;
175 : parrello 1.7 # Clear the existing data for the specified genome.
176 :     my $stats = ClearGenome($sap, $genome);
177 : parrello 1.17 # Load the new expression data from the specified directory (if one is
178 :     # specified).
179 :     if ($directory) {
180 :     my $newStats = Load($sap, $genome, $directory);
181 :     # Merge the statistics.
182 :     $stats->Accumulate($newStats);
183 :     }
184 : parrello 1.7 # Return the result.
185 :     return $stats;
186 :     }
187 :    
188 :    
189 : parrello 1.1 =head2 Loader Object Methods
190 :    
191 :     =head3 new
192 :    
193 : parrello 1.17 my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
194 : parrello 1.1
195 :     Create a loader object that can be used to facilitate loading genome data from a
196 :     directory.
197 :    
198 :     =over 4
199 :    
200 :     =item sap
201 :    
202 :     L<Sapling> object used to access the target database.
203 :    
204 :     =item genome
205 :    
206 :     ID of the genome being loaded.
207 :    
208 :     =item directory
209 :    
210 :     Name of the directory containing the genome information.
211 :    
212 :     =back
213 :    
214 :     The object created contains the following fields.
215 :    
216 :     =over 4
217 :    
218 :     =item directory
219 :    
220 :     Name of the directory containing the genome data.
221 :    
222 :     =item genome
223 :    
224 :     ID of the genome being loaded.
225 :    
226 :     =item supportRecords
227 :    
228 :     A hash of hashes, used to track the support records known to exist in the database.
229 :    
230 :     =item sap
231 :    
232 :     L<Sapling> object used to access the database.
233 :    
234 :     =item stats
235 :    
236 :     L<Stats> object for tracking statistical information about the load.
237 :    
238 :     =back
239 :    
240 :     =cut
241 :    
242 :     sub new {
243 :     # Get the parameters.
244 : parrello 1.17 my ($class, $sap, $genome, $directory) = @_;
245 : parrello 1.1 # Create the object.
246 : parrello 1.3 my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
247 :     # Add our specialized data.
248 :     $retVal->{genome} = $genome;
249 :     $retVal->{directory} = $directory;
250 : parrello 1.11 # Leave the assignment hash undefined until we populate it.
251 :     $retVal->{assignHash} = undef;
252 : parrello 1.3 # Return the result.
253 : parrello 1.1 return $retVal;
254 :     }
255 :    
256 :     =head3 LoadContigs
257 :    
258 :     $loaderObject->LoadContigs();
259 :    
260 :     Load the contig information into the database. This includes the contigs themselves and
261 : parrello 1.14 the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
262 :     number of base pairs as the C<dna> statistic, and the number of GC instances as the
263 :     C<gc_content> statistic.
264 : parrello 1.1
265 :     =cut
266 :    
267 :     sub LoadContigs {
268 :     # Get this object.
269 :     my ($self) = @_;
270 :     # Open the contigs file from the genome directory.
271 :     my $ih = Open(undef, "<$self->{directory}/contigs");
272 :     # Get the length of a DNA segment.
273 :     my $segmentLength = $self->{sap}->TuningParameter('maxSequenceLength');
274 :     # Get the genome ID.
275 :     my $genome = $self->{genome};
276 :     # These variables will contain the current contig ID, the current contig length,
277 :     # the ordinal of the current chunk, the accumulated DNA sequence, and its length.
278 :     my ($contigID, $contigLen, $ordinal, $chunk, $chunkLen) = (undef, 0, 0, '', 0);
279 :     # Loop through the contig file.
280 :     while (! eof $ih) {
281 :     # Get the current record.
282 :     my $line = <$ih>; chomp $line;
283 :     # Is this the start of a new contig?
284 :     if ($line =~ /^>(\S+)/) {
285 :     # Yes. Save the contig ID.
286 :     my $newContigID = $1;
287 :     # Is there a current contig?
288 :     if (defined $contigID) {
289 :     # Yes. Output the current chunk.
290 :     $self->OutputChunk($contigID, $ordinal, $chunk);
291 :     # Output the contig.
292 :     $self->OutputContig($contigID, $contigLen);
293 :     }
294 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
295 :     $self->FixContig(\$newContigID);
296 :     # Initialize the new contig.
297 :     $contigID = $newContigID;
298 :     $contigLen = 0;
299 :     $ordinal = 0;
300 :     $chunk = '';
301 :     $chunkLen = 0;
302 :     } else {
303 :     # Here we have more DNA in the current contig. Are we at the end of
304 :     # the current chunk?
305 :     my $lineLen = length($line);
306 :     my $newChunkLen = $chunkLen + $lineLen;
307 :     if ($newChunkLen < $segmentLength) {
308 :     # No. Add this line to the chunk.
309 :     $chunk .= $line;
310 :     $chunkLen += $lineLen;
311 :     } else {
312 :     # Yes. Create the actual chunk.
313 :     $chunk .= substr($line, 0, $segmentLength - $chunkLen);
314 :     # Write it out.
315 :     $self->OutputChunk($contigID, $ordinal, $chunk);
316 :     # Set up the new chunk.
317 :     $chunk = substr($line, $segmentLength - $chunkLen);
318 :     $ordinal++;
319 :     $chunkLen = length($chunk);
320 :     }
321 :     # Update the contig length.
322 :     $contigLen += $lineLen;
323 :     }
324 :     }
325 :     # Is there a current contig?
326 :     if (defined $contigID) {
327 :     # Yes. Output the residual chunk.
328 :     $self->OutputChunk($contigID, $ordinal, $chunk);
329 :     # Output the contig itself.
330 :     $self->OutputContig($contigID, $contigLen);
331 :     }
332 :     }
333 :    
334 :     =head3 OutputChunk
335 :    
336 :     $loaderObject->OutputChunk($contigID, $ordinal, $chunk);
337 :    
338 :     Output a chunk of DNA for the specified contig.
339 :    
340 :     =over 4
341 :    
342 :     =item contigID
343 :    
344 :     ID of the contig being output.
345 :    
346 :     =item ordinal
347 :    
348 :     Ordinal number of this chunk.
349 :    
350 :     =item chunk
351 :    
352 :     DNA sequence comprising this chunk.
353 :    
354 :     =back
355 :    
356 :     =cut
357 :    
358 :     sub OutputChunk {
359 :     # Get the parameters.
360 :     my ($self, $contigID, $ordinal, $chunk) = @_;
361 :     # Get the sapling object.
362 :     my $sap = $self->{sap};
363 :     # Compute the chunk ID.
364 :     my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
365 :     # Connect this sequence to the contig.
366 : parrello 1.2 $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
367 : parrello 1.1 # Create the DNA sequence.
368 :     $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
369 :     # Record the chunk.
370 :     $self->{stats}->Add(chunks => 1);
371 : parrello 1.14 # Update the GC count.
372 :     $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
373 : parrello 1.1 }
374 :    
375 :     =head3 OutputContig
376 :    
377 :     $loaderObject->OutputContig($contigID, $contigLen);
378 :    
379 :     Write out the current contig.
380 :    
381 :     =over 4
382 :    
383 :     =item contigID
384 :    
385 :     ID of the contig being written out.
386 :    
387 :     =item contigLen
388 :    
389 :     Length of the contig in base pairs.
390 :    
391 :     =back
392 :    
393 :     =cut
394 :    
395 :     sub OutputContig {
396 :     # Get the parameters.
397 :     my ($self, $contigID, $contigLen) = @_;
398 :     # Get the sapling object.
399 :     my $sap = $self->{sap};
400 :     # Connect the contig to the genome.
401 :     $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
402 :     # Output the contig record.
403 :     $sap->InsertObject('Contig', id => $contigID, length => $contigLen);
404 :     # Record the contig.
405 :     $self->{stats}->Add(contigs => 1);
406 :     $self->{stats}->Add(dna => $contigLen);
407 :     }
408 :    
409 :     =head3 LoadFeatures
410 :    
411 :     $loaderObject->LoadFeatures();
412 :    
413 :     Load the feature data into the database. This includes all of the features,
414 :     their protein sequences, and their subsystem information. The number of
415 :     features of each type will be stored in the statistics object, identified by
416 :     the feature type.
417 :    
418 :     =cut
419 :    
420 :     sub LoadFeatures {
421 :     # Get the parameters.
422 :     my ($self) = @_;
423 : parrello 1.7 # Read in the functional assignments.
424 : parrello 1.12 Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
425 : parrello 1.7 my $assignHash = $self->ReadAssignments();
426 : parrello 1.11 $self->{assignHash} = $assignHash;
427 : parrello 1.1 # Get the directory of feature types.
428 :     my $featureDir = "$self->{directory}/Features";
429 :     my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
430 : parrello 1.11 # Check for protein sequences. If we have some, load them into a hash.
431 :     my $protHash = {};
432 :     if (-f "$featureDir/peg/fasta") {
433 : parrello 1.12 Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
434 : parrello 1.11 $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
435 :     }
436 : parrello 1.1 # Create the feature records for the types found.
437 :     for my $type (@types) {
438 :     # Insure this is a genuine feature directory.
439 :     if (-f "$featureDir/$type/tbl") {
440 : parrello 1.11 # Yes. Read in the evidence codes (if any).
441 :     my $evHash = {};
442 :     my $tranFile = "$featureDir/$type/Attributes/transaction_log";
443 :     if (-f $tranFile) {
444 :     $evHash = $self->LoadEvidenceCodes($tranFile);
445 :     }
446 :     # Now load the feature data.
447 :     $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
448 : parrello 1.1 }
449 :     }
450 : parrello 1.11 }
451 :    
452 :     =head3 LoadEvidenceCodes
453 :    
454 :     my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
455 :    
456 :     Load the evidence codes from the specified attribute transaction log file into a
457 :     hash. The log file is in tab-delimited format. The first column contains the
458 :     transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
459 :     the third an attribute name (we'll ignore everything but C<evidence_code>), and
460 :     the fourth the attribute value.
461 :    
462 :     =over 4
463 :    
464 :     =item attributeFile
465 :    
466 :     Name of the attribute transaction log file.
467 :    
468 :     =item RETURN
469 :    
470 :     Returns a reference to a hash mapping each feature ID to a comma-delimited list
471 :     of its evidence codes.
472 :    
473 :     =back
474 :    
475 :     =cut
476 :    
477 :     sub LoadEvidenceCodes {
478 :     # Get the parameters.
479 :     my ($self, $attributeFile) = @_;
480 :     # Get the Sapling database.
481 :     my $sap = $self->{sap};
482 :     # Get the statistics object.
483 :     my $stats = $self->{stats};
484 :     # Get the assignment hash: we use this to filter the feature IDs.
485 :     my $assignHash = $self->{assignHash};
486 :     # Open the attribute log file for input.
487 :     my $ih = Open(undef, "<$attributeFile");
488 :     # This two-dimensional hash will hold the evidence codes for each feature.
489 :     my %retVal;
490 :     # Loop through the attribute log file.
491 :     while (! eof $ih) {
492 :     # Get the current attribute record.
493 :     my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
494 :     $stats->Add(attributeLine => 1);
495 :     # Insure we have all the pieces we need.
496 :     if (! $command || ! $fid || $key ne 'evidence_code') {
497 :     $stats->Add(attributeLineSkipped => 1);
498 :     } elsif (! $assignHash->{$fid}) {
499 :     # Here the attribute is for a deleted feature.
500 :     $stats->Add(attributeFidSkipped => 1);
501 :     } else {
502 :     # Get the sub-hash for this feature.
503 :     if (! exists $retVal{$fid}) {
504 :     $retVal{$fid} = {};
505 :     }
506 :     my $featureSubHash = $retVal{$fid};
507 :     # Process according to the command.
508 :     if ($command eq 'ADD') {
509 :     # Here we are adding a new evidence code.
510 :     $featureSubHash->{$value} = 1;
511 :     $stats->Add(attributeAdd => 1);
512 :     } elsif ($command eq 'DELETE') {
513 :     # Here we are deleting an evidence code.
514 :     delete $featureSubHash->{$value};
515 :     $stats->Add(attributeDelete => 1);
516 :     } else {
517 :     # here we have an unrecognized command.
518 :     $stats->Add(attributeCommandSkip => 1);
519 :     }
520 :     }
521 : parrello 1.1 }
522 : parrello 1.11 # Loop through the hash, converting each sub-hash to a comma-delimited list of
523 :     # evidence codes.
524 :     for my $fid (keys %retVal) {
525 :     $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
526 : parrello 1.6 }
527 : parrello 1.11 # Return the evidence hash.
528 :     return \%retVal;
529 : parrello 1.1 }
530 :    
531 : parrello 1.11
532 : parrello 1.1 =head3 LoadFeatureData
533 :    
534 : parrello 1.11 $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
535 : parrello 1.1
536 :     Load the basic data for each feature into the database. The number of features of
537 :     the type found will be recorded in the statistics object.
538 :    
539 :     =over 4
540 :    
541 :     =item featureDir
542 :    
543 :     Name of the main directory containing all the feature type subdirectories.
544 :    
545 :     =item type
546 :    
547 :     Type of feature to load.
548 :    
549 : parrello 1.11 =item protHash
550 :    
551 :     Reference to a hash mapping each feature ID for a protein-encoding gene to
552 :     its protein sequence.
553 :    
554 :     =item evHash
555 : parrello 1.7
556 : parrello 1.11 Reference to a hash mapping each feature ID to a comma-delimited list of
557 :     its evidence codes (if any).
558 : parrello 1.7
559 : parrello 1.1 =back
560 :    
561 :     =cut
562 :    
563 :     sub LoadFeatureData {
564 :     # Get the parameters.
565 : parrello 1.11 my ($self, $featureDir, $type, $protHash, $evHash) = @_;
566 : parrello 1.1 # Get the sapling database.
567 :     my $sap = $self->{sap};
568 :     # Get the statistics object.
569 :     my $stats = $self->{stats};
570 : parrello 1.11 # Get the assignment hash. This tells us our functional assignments. This method is
571 :     # also where we remove the deleted features from it.
572 :     my $assignHash = $self->{assignHash};
573 : parrello 1.1 # This hash will track the features we've created. If a feature is found a second
574 :     # time, it overwrites the original.
575 : parrello 1.11 my %fidHash;
576 : parrello 1.17 # This hash tracks the deleted features. We don't want to update these.
577 :     my %deleted_features;
578 : parrello 1.1 # Insure we have a tbl file for this feature type.
579 :     my $fileName = "$featureDir/$type/tbl";
580 :     if (-f $fileName) {
581 : parrello 1.3 # We have one, so we can read through it. First, however, we need to get the list
582 : parrello 1.11 # of deleted features and remove them from the assignment hash. This insures
583 :     # that they are not used by subsequent methods.
584 : parrello 1.3 my $deleteFile = "$featureDir/$type/deleted.features";
585 :     if (-f $deleteFile) {
586 : parrello 1.11 my $dh = Open(undef, "<$deleteFile");
587 :     while (! eof $dh) {
588 :     my ($deletedFid) = Tracer::GetLine($dh);
589 :     if (exists $assignHash->{$deletedFid}) {
590 :     delete $assignHash->{$deletedFid};
591 :     $stats->Add(deletedFid => 1);
592 : parrello 1.17 $deleted_features{$deletedFid} = 1;
593 : parrello 1.11 }
594 :     }
595 : parrello 1.3 }
596 :     # Open the main file for input.
597 : parrello 1.12 Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
598 : parrello 1.1 my $ih = Open(undef, "<$fileName");
599 :     while (! eof $ih) {
600 :     # Read this feature's information.
601 :     my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
602 : parrello 1.3 # Only proceed if the feature is NOT deleted.
603 : olson 1.15 if (!$deleted_features{$fid}) {
604 : parrello 1.3 # If the feature already exists, delete it. (This should be extremely rare.)
605 : parrello 1.11 if ($fidHash{$fid}) {
606 : parrello 1.3 $sap->Delete(Feature => $fid);
607 :     $stats->Add(duplicateFid => 1);
608 : parrello 1.1 }
609 : parrello 1.11 # If this is RNA, the alias list is always empty. Sometimes, the functional
610 :     # assignment is found there.
611 :     if ($type eq 'rna') {
612 :     if (! $assignHash->{$fid}) {
613 :     $assignHash->{$fid} = $aliases[0];
614 : parrello 1.2 }
615 : parrello 1.11 @aliases = ();
616 : parrello 1.2 }
617 : parrello 1.11 # Add the feature to the database.
618 : parrello 1.17 my $function = $assignHash->{$fid} || "";
619 : olson 1.16 $self->AddFeature($fid, $function, $locations, \@aliases,
620 : parrello 1.11 $protHash->{$fid}, $evHash->{$fid});
621 :     # Denote we've added this feature, so that if a duplicate occurs we're ready.
622 :     $fidHash{$fid} = 1;
623 : parrello 1.2 }
624 : parrello 1.1 }
625 :     }
626 :     }
627 :    
628 : parrello 1.11
629 : parrello 1.1 =head3 LoadProteinData
630 :    
631 : parrello 1.11 my $protHash = $self->LoadProteinData($fileName);
632 : parrello 1.1
633 :     Load the protein sequences from the named FASTA file. The sequences will be stored
634 : parrello 1.11 in a hash by FIG feature ID.
635 : parrello 1.1
636 :     =over 4
637 :    
638 :     =item fileName
639 :    
640 :     Name of the FASTA file containing the protein sequences for this genome.
641 :    
642 : parrello 1.11 =item RETURN
643 :    
644 :     Returns a hash mapping feature IDs to protein sequences.
645 :    
646 : parrello 1.1 =back
647 :    
648 :     =cut
649 :    
650 :     sub LoadProteinData {
651 :     # Get the parameters.
652 :     my ($self, $fileName) = @_;
653 :     # Open the FASTA file for input.
654 :     my $ih = Open(undef, "<$fileName");
655 : parrello 1.11 # Create the return hash.
656 :     my $retVal = {};
657 : parrello 1.1 # We'll track the current protein in here.
658 :     my $fid;
659 :     my $sequence = "";
660 :     # Loop through the input file.
661 :     while (! eof $ih) {
662 :     # Get the current record.
663 :     my $line = <$ih>; chomp $line;
664 :     # Is this a label record.
665 :     if ($line =~ /^>(\S+)/) {
666 :     # Yes. Save the new feature ID.
667 :     my $newFid = $1;
668 :     # Do we have an existing protein?
669 :     if (defined $fid) {
670 : parrello 1.11 # Yes. Store it in the hash.
671 :     $retVal->{$fid} = $sequence;
672 : parrello 1.1 }
673 :     # Initialize for the next protein.
674 :     $fid = $newFid;
675 :     $sequence = "";
676 :     } else {
677 :     # Here we have more letters for the current protein.
678 :     $sequence .= $line;
679 :     }
680 :     }
681 :     # Do we have a residual protein.
682 :     if (defined $fid) {
683 : parrello 1.11 # Yes. Store it in the hash.
684 :     $retVal->{$fid} = $sequence;
685 : parrello 1.1 }
686 : parrello 1.11 # Return the hash.
687 :     return $retVal;
688 : parrello 1.1 }
689 :    
690 : parrello 1.6
691 :     =head3 LoadAnnotations
692 :    
693 :     $loaderObject->LoadAnnotations($fileName);
694 :    
695 :     Read in the annotation history information and use it to create annotation records.
696 :    
697 :     =over 4
698 :    
699 :     =item fileName
700 :    
701 :     Name of the annotation history file. This file is formatted with four fields per
702 :     record. Each field is on a separate line, with a double slash (C<//>) used as the
703 :     line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
704 :     (formatted as an integer), (2) the user name, and (3) the annotation text.
705 :    
706 :     =back
707 :    
708 :     =cut
709 :    
710 :     sub LoadAnnotations {
711 :     # Get the parameters.
712 :     my ($self, $fileName) = @_;
713 : parrello 1.11 # Get the assignment Hash. We use this to filter out deleted features.
714 :     my $assignHash = $self->{assignHash};
715 : parrello 1.6 # Get the Sapling database.
716 :     my $sap = $self->{sap};
717 :     # Get the statistics object.
718 :     my $stats = $self->{stats};
719 :     # Open the input file.
720 :     my $ih = Tracer::Open(undef, "<$fileName");
721 :     # Loop through the input.
722 :     while (! eof $ih) {
723 :     # Read in the peg, timestamp, and user ID.
724 :     my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
725 : parrello 1.11 # Only proceed if the feature is not deleted.
726 :     if ($assignHash->{$fid}) {
727 :     # Add the annotation to this feature.
728 :     $self->MakeAnnotation($fid, $text, $user, $timestamp);
729 : parrello 1.6 }
730 :     }
731 :     }
732 :    
733 :    
734 : parrello 1.1 =head3 WriteProtein
735 :    
736 :     $loaderObject->WriteProtein($fid, $sequence);
737 :    
738 :     Write out the specified protein sequence and associate it with the specified feature.
739 :     This requires checking to see if the protein sequence is already in the database.
740 :    
741 :     =over 4
742 :    
743 :     =item fid
744 :    
745 :     ID of the feature with the specified protein sequence.
746 :    
747 :     =item sequence
748 :    
749 :     Protein sequence for the indicated feature.
750 :    
751 :     =back
752 :    
753 :     =cut
754 :    
755 :     sub WriteProtein {
756 :     # Get the parameters.
757 :     my ($self, $fid, $sequence) = @_;
758 :     # Compute the key of the protein sequence.
759 : parrello 1.8 my $protID = $self->{sap}->ProteinID($sequence);
760 : parrello 1.1 # Insure the protein exists.
761 :     $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
762 :     # Connect the feature to it.
763 :     $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
764 :     }
765 :    
766 :     =head3 LoadSubsystems
767 :    
768 : parrello 1.17 $loaderObject->LoadSubsystems($subsysList);
769 :    
770 :     Load the subsystem data into the database. This requires looking through the
771 :     bindings and using them to connect to the subsystems that already exist in the
772 :     database. If the subsystem does not exist, its bindings will not be loaded.
773 :    
774 :     =over 4
775 :    
776 :     =item subsysList
777 :    
778 :     Reference to a list of subsystem IDs. If specified, only the subsystems in the
779 :     list will be processed. This is useful when a particular set of subsystems
780 :     has been replaced.
781 : parrello 1.1
782 : parrello 1.17 =back
783 : parrello 1.1
784 :     =cut
785 :    
786 :     sub LoadSubsystems {
787 :     # Get the parameters.
788 : parrello 1.17 my ($self, $subsysList) = @_;
789 : parrello 1.1 # Get the sapling object.
790 :     my $sap = $self->{sap};
791 :     # Get the statistics object.
792 :     my $stats = $self->{stats};
793 :     # Get the genome ID.
794 :     my $genome = $self->{genome};
795 :     # This hash maps subsystem IDs to molecular machine IDs.
796 :     my %machines;
797 :     # This hash maps subsystem/role pairs to machine role IDs.
798 :     my %machineRoles;
799 : parrello 1.17 # This hash will contain the list of subsystems found in the database.
800 :     my %subsystems;
801 :     # We loop through the subsystems, looking for the ones already in the
802 :     # database. The list is given in the subsystems file of the Subsystems
803 :     # directory.
804 : parrello 1.1 my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
805 :     # Loop through the subsystems in the file, insuring we have them in the database.
806 :     while (! eof $ih) {
807 :     # Get this subsystem.
808 :     my ($subsystem, $variant) = Tracer::GetLine($ih);
809 : parrello 1.12 Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
810 : parrello 1.1 # Normalize the subsystem name.
811 :     $subsystem = $sap->SubsystemID($subsystem);
812 : parrello 1.17 # Insure the subsystem is in the database and is one we're interested in.
813 :     if ((! $subsysList || (grep { $_ eq $subsystem } @$subsysList)) &&
814 :     $sap->Exists(Subsystem => $subsystem)) {
815 :     # We have the subsystem in the database. We need to compute the machine
816 :     # role IDs and create the molecular machine. First, we need to remember
817 :     # the subsystem.
818 :     $subsystems{$subsystem} = 1;
819 :     # Compute this subsystem's MD5.
820 :     my $subsystemMD5 = ERDB::DigestKey($subsystem);
821 : parrello 1.1 # Loop through the roles.
822 : parrello 1.17 my @roleList = $sap->GetAll('Includes', 'Includes(from-link) = ?',
823 :     [$subsystem], "to-link abbreviation");
824 :     for my $roleTuple (@roleList) {
825 : parrello 1.1 my ($roleID, $abbr) = @$roleTuple;
826 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
827 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
828 :     }
829 : parrello 1.17 # Next we need the variant code and key.
830 :     my $variantCode = BaseSaplingLoader::Starless($variant);
831 :     my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
832 :     # Now we create the molecular machine connecting this genome to the
833 :     # subsystem variant.
834 :     my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
835 :     $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
836 :     $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
837 :     $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID);
838 :     # Remember the machine ID.
839 :     $machines{$subsystem} = $machineID;
840 : parrello 1.1 }
841 :     }
842 : parrello 1.17 # Now we go through the bindings file. This file connects the subsystem
843 :     # roles to the molecular machines.
844 : parrello 1.1 $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
845 :     # Loop through the bindings.
846 :     while (! eof $ih) {
847 :     # Get the binding data.
848 :     my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
849 :     # Normalize the subsystem name.
850 :     $subsystem = $sap->SubsystemID($subsystem);
851 : parrello 1.17 # Insure the subsystem is in the database.
852 :     if ($subsystems{$subsystem}) {
853 :     # Compute the machine role.
854 :     my $machineRoleID = $machineRoles{$subsystem}{$role};
855 :     # Insure it exists.
856 :     my $created = $self->InsureEntity(MachineRole => $machineRoleID);
857 :     if ($created) {
858 :     # We created the machine role, so connect it to the machine.
859 :     my $machineID = $machines{$subsystem};
860 :     $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID);
861 :     # Connect it to the role, too.
862 :     $sap->InsertObject('IsRoleOf', from_link => $role, to_link => $machineRoleID);
863 :     }
864 :     # Connect the feature.
865 :     $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
866 : parrello 1.1 }
867 :     }
868 :     }
869 :    
870 :     =head3 CreateGenome
871 :    
872 :     $loaderObject->CreateGenome();
873 :    
874 :     Create the genome record.
875 :    
876 :     =cut
877 :    
878 :     # This hash maps cache statistics to genome record fields.
879 :     use constant GENOME_FIELDS => {genome_name => 'scientific-name',
880 :     genome_domain => 'domain'};
881 :     # This hash maps domains to the prokaryotic flag.
882 :     use constant PROK_FLAG => {Bacteria => 1, Archaea => 1};
883 :    
884 :     sub CreateGenome {
885 :     # Get the parameters.
886 :     my ($self) = @_;
887 :     # Get the genome directory.
888 :     my $dir = $self->{directory};
889 :     # Get the sapling database.
890 :     my $sap = $self->{sap};
891 :     # We'll put the genome attributes in here.
892 :     my %fields;
893 :     # Check for a basic statistics file.
894 :     my $statsFile = "$dir/cache.basic_statistics";
895 :     if (-f $statsFile) {
896 :     # We have the statistics file, so read the major attributes from it.
897 :     my $ih = Open(undef, "<$statsFile");
898 :     while (! eof $ih) {
899 :     my ($key, $value) = Tracer::GetLine($ih);
900 :     my $fieldKey = GENOME_FIELDS->{$key};
901 :     if ($fieldKey) {
902 :     $fields{$fieldKey} = $value;
903 :     }
904 :     }
905 :     # Translate the domain.
906 :     $fields{domain} = ucfirst $fields{domain};
907 :     # Denote the genome is complete.
908 :     $fields{complete} = 1;
909 :     } else {
910 :     # Check to see if this genome is complete.
911 :     $fields{complete} = (-f "$dir/COMPLETE" ? 1 : 0);
912 :     # Get the genome name.
913 :     my $ih = Open(undef, "<$dir/GENOME");
914 :     my $line = <$ih>; chomp $line;
915 :     $fields{'scientific-name'} = $line;
916 :     # Get the taxonomy and extract the domain from it.
917 :     $ih = Open(undef, "<$dir/TAXONOMY");
918 :     ($fields{domain}) = split /;/, <$ih>, 2;
919 :     }
920 :     # Get the counts from the statistics object.
921 :     my $stats = $self->{stats};
922 :     $fields{contigs} = $stats->Ask('contigs');
923 :     $fields{'dna-size'} = $stats->Ask('dna');
924 :     $fields{pegs} = $stats->Ask('peg');
925 :     $fields{rnas} = $stats->Ask('rna');
926 : parrello 1.14 $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
927 : parrello 1.1 # Get the genetic code. The default is 11.
928 :     $fields{'genetic-code'} = 11;
929 :     my $geneticCodeFile = "$dir/GENETIC_CODE";
930 :     if (-f $geneticCodeFile) {
931 :     # There's a genetic code file, so we need to read it for the code.
932 :     my $ih = Open(undef, "<$geneticCodeFile");
933 :     ($fields{'genetic-code'}) = Tracer::GetLine($ih);
934 :     }
935 :     # Use the domain to determine whether or not the genome is prokaryotic.
936 :     $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
937 :     # Finally, add the genome ID.
938 :     $fields{id} = $self->{genome};
939 :     # Create the genome record.
940 :     $sap->InsertObject('Genome', %fields);
941 :     }
942 :    
943 :     =head3 ReadAssignments
944 :    
945 :     my $assignHash = $loaderObject->ReadAssignments();
946 :    
947 :     Return a hash mapping each feature ID to its functional assignment. This method
948 :     essentially reads the B<assigned_functions> file into memory.
949 :    
950 :     =cut
951 :    
952 :     sub ReadAssignments {
953 :     # Get the parameters.
954 :     my ($self) = @_;
955 :     # Create the return hash.
956 :     my $retVal = {};
957 :     # Loop through the assigned-functions file, storing results in the hash. Later
958 :     # results will override earlier ones.
959 :     my $ih = Open(undef, "<$self->{directory}/assigned_functions");
960 :     while (! eof $ih) {
961 :     my ($fid, $function) = Tracer::GetLine($ih);
962 :     $retVal->{$fid} = $function;
963 :     }
964 :     # Return the hash built.
965 :     return $retVal;
966 :     }
967 :    
968 :    
969 :     =head3 FixContig
970 :    
971 :     $loaderObject->FixContig(\$contigID);
972 :    
973 :     Insure that the specified contig ID contains the genome ID.
974 :    
975 :     =cut
976 :    
977 :     sub FixContig {
978 :     my ($self, $contigID) = @_;
979 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
980 :     unless ($$contigID =~ /^\d+\.\d+\:/) {
981 :     $$contigID = "$self->{genome}:$$contigID";
982 :     }
983 :     }
984 :    
985 :     =head3 ConnectLocation
986 :    
987 :     $loaderObject->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
988 :    
989 :     Connect the specified feature to the specified contig location.
990 :    
991 :     =over 4
992 :    
993 :     =item fid
994 :    
995 :     ID of the relevant feature.
996 :    
997 :     =item contig
998 :    
999 :     ID of the contig containing this segment of the feature (normalized).
1000 :    
1001 :     =item segment
1002 :    
1003 :     Ordinal number of this segment.
1004 :    
1005 :     =item left
1006 :    
1007 :     Location of the segment's leftmost base pair.
1008 :    
1009 :     =item dir
1010 :    
1011 :     Direction (strand) of the segment (C<+> or C<->).
1012 :    
1013 :     =item len
1014 :    
1015 :     Length of the segment (which must be no greater than the configured maximum).
1016 :    
1017 :     =back
1018 :    
1019 :     =cut
1020 :    
1021 :     sub ConnectLocation {
1022 :     # Get the parameters.
1023 :     my ($self, $fid, $contig, $segment, $left, $dir, $len) = @_;
1024 :     # Get the sapling database.
1025 :     my $sap = $self->{sap};
1026 :     # Create the relationship.
1027 :     $sap->InsertObject('IsLocatedIn', from_link => $fid, to_link => $contig,
1028 :     begin => $left, dir => $dir, len => $len,
1029 :     ordinal => $segment);
1030 :     # Record it in the statistics.
1031 :     $self->{stats}->Add(segment => 1);
1032 :     }
1033 :    
1034 : parrello 1.6 =head2 Internal Utility Methods
1035 :    
1036 :     =head3 ReadAnnotation
1037 :    
1038 :     my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1039 :    
1040 :     Read the next record from an annotation file. The next record must exist (that is, an
1041 :     end-of-file check should have been performed before calling this method).
1042 :    
1043 :     =over 4
1044 :    
1045 :     =item ih
1046 :    
1047 :     Open file handle for the annotation file.
1048 :    
1049 :     =item RETURN
1050 :    
1051 :     Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1052 :     timestamp, (2) the user ID, and (3) the annotation text.
1053 :    
1054 :     =back
1055 :    
1056 :     =cut
1057 :    
1058 :     sub ReadAnnotation {
1059 :     # Get the parameter.
1060 :     my ($ih) = @_;
1061 :     # Read the three fixed fields.
1062 :     my $fid = <$ih>; chomp $fid;
1063 :     my $timestamp = <$ih>; chomp $timestamp;
1064 :     my $user = <$ih>; chomp $user;
1065 :     # Loop through the lines of the text field.
1066 :     my $text = "";
1067 :     my $line = <$ih>;
1068 : olson 1.13 while (defined($line) && $line ne "//\n") {
1069 : parrello 1.6 $text .= $line;
1070 :     $line = <$ih>;
1071 :     }
1072 :     # Remove the trailing new-line from the text.
1073 :     chomp $text;
1074 :     # Return the fields.
1075 :     return ($fid, $timestamp, $user, $text);
1076 :     }
1077 :    
1078 : olson 1.13 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3