[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.19 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingGenomeLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 : parrello 1.2 use AliasAnalysis;
29 : parrello 1.18 use BaseSaplingLoader;
30 :     use MD5Computer;
31 : parrello 1.3 use base qw(SaplingDataLoader);
32 : parrello 1.1
33 :     =head1 Sapling Genome Loader
34 :    
35 :     This package contains the methods used to load a genome into the Sapling from a genome
36 :     directory. This is not very efficient if you are trying to create a full database,
37 :     but is useful if you wish to add genomes one at a time. The information loaded will
38 :     include the basic genome data, the contigs and the DNA, subsystems, and the features.
39 :    
40 :     =head2 Main Methods
41 :    
42 :     =head3 Load
43 :    
44 :     my $stats = SaplingGenomeLoader::Load($sap, $genome, $directory);
45 :    
46 :     Load a genome from a genome directory into the sapling database.
47 :    
48 :     =over 4
49 :    
50 :     =item sap
51 :    
52 :     L<Sapling> object used to access the target database.
53 :    
54 :     =item genome
55 :    
56 :     ID of the genome being loaded.
57 :    
58 :     =item directory
59 :    
60 :     Name of the directory containing the genome information.
61 :    
62 :     =back
63 :    
64 :     =cut
65 :    
66 :     sub Load {
67 :     # Get the parameters.
68 : parrello 1.17 my ($sap, $genome, $directory) = @_;
69 : parrello 1.1 # Create the loader object.
70 : parrello 1.17 my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
71 : parrello 1.1 # Load the contigs.
72 : parrello 1.12 Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
73 : parrello 1.1 $loaderObject->LoadContigs();
74 :     # Load the features.
75 : parrello 1.12 Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
76 : parrello 1.1 $loaderObject->LoadFeatures();
77 : parrello 1.7 # Check for annotation history. If we have it, load the history records into the
78 :     # database.
79 :     if (-f "$directory/annotations") {
80 : parrello 1.12 Trace("Processing annotations.") if T(SaplingDataLoader => 3);
81 : parrello 1.7 $loaderObject->LoadAnnotations("$directory/annotations");
82 :     }
83 : parrello 1.1 # Load the subsystem bindings.
84 : parrello 1.12 Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
85 : parrello 1.1 $loaderObject->LoadSubsystems();
86 :     # Create the Genome record and taxonomy information.
87 : parrello 1.12 Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
88 : parrello 1.1 $loaderObject->CreateGenome();
89 :     # Return the statistics.
90 :     return $loaderObject->{stats};
91 :     }
92 :    
93 :     =head3 ClearGenome
94 :    
95 :     my $stats = SaplingGenomeObject::ClearGenome($sap, $genome);
96 :    
97 :     Delete the specified genome and all the related records from the specified sapling
98 :     database. This method can also be used to clean up after a failed or aborted load.
99 :    
100 :     =over 4
101 :    
102 :     =item sap
103 :    
104 :     L<Sapling> object used to access the target database.
105 :    
106 :     =item genome
107 :    
108 :     ID of the genome to delete.
109 :    
110 :     =item RETURN
111 :    
112 :     Returns a statistics object counting the records deleted.
113 :    
114 :     =back
115 :    
116 :     =cut
117 :    
118 :     sub ClearGenome {
119 :     # Get the parameters.
120 :     my ($sap, $genome) = @_;
121 :     # Create the statistics object.
122 :     my $stats = Stats->new();
123 : parrello 1.9 # Delete the DNA sequences.
124 :     my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
125 :     for my $seq (@seqs) {
126 :     my $delStats = $sap->Delete(DNASequence => $seq);
127 :     $stats->Accumulate($delStats);
128 :     }
129 : parrello 1.1 # Delete the contigs.
130 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
131 : parrello 1.1 # Delete the features.
132 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
133 : parrello 1.1 # Delete the molecular machines.
134 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
135 : parrello 1.1 # Delete the genome itself.
136 :     my $subStats = $sap->Delete(Genome => $genome);
137 :     # Accumulate the statistics from the delete.
138 :     $stats->Accumulate($subStats);
139 : parrello 1.18 Trace("Statistics for delete of $genome:\n" . $stats->Show()) if T(SaplingDataLoader => 3);
140 : parrello 1.1 # Return the statistics object.
141 :     return $stats;
142 :     }
143 :    
144 : parrello 1.7
145 :     =head3 Process
146 :    
147 :     my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
148 :    
149 :     Load genome data from the specified directory. If the genome data already
150 :     exists in the database, it will be deleted first.
151 :    
152 :     =over 4
153 :    
154 :     =item sap
155 :    
156 :     L</Sapling> object for accessing the database.
157 :    
158 :     =item genome
159 :    
160 :     ID of the genome whose data is being loaded.
161 :    
162 :     =item directory
163 :    
164 : parrello 1.17 Name of the directory containing the genome data files. If omitted, the
165 :     genome will be deleted from the database.
166 : olson 1.16
167 : parrello 1.7 =item RETURN
168 :    
169 :     Returns a statistics object describing the activity during the reload.
170 :    
171 :     =back
172 :    
173 :     =cut
174 :    
175 :     sub Process {
176 :     # Get the parameters.
177 : parrello 1.17 my ($sap, $genome, $directory) = @_;
178 : parrello 1.7 # Clear the existing data for the specified genome.
179 :     my $stats = ClearGenome($sap, $genome);
180 : parrello 1.17 # Load the new expression data from the specified directory (if one is
181 :     # specified).
182 :     if ($directory) {
183 :     my $newStats = Load($sap, $genome, $directory);
184 :     # Merge the statistics.
185 :     $stats->Accumulate($newStats);
186 :     }
187 : parrello 1.7 # Return the result.
188 :     return $stats;
189 :     }
190 :    
191 :    
192 : parrello 1.1 =head2 Loader Object Methods
193 :    
194 :     =head3 new
195 :    
196 : parrello 1.17 my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
197 : parrello 1.1
198 :     Create a loader object that can be used to facilitate loading genome data from a
199 :     directory.
200 :    
201 :     =over 4
202 :    
203 :     =item sap
204 :    
205 :     L<Sapling> object used to access the target database.
206 :    
207 :     =item genome
208 :    
209 :     ID of the genome being loaded.
210 :    
211 :     =item directory
212 :    
213 :     Name of the directory containing the genome information.
214 :    
215 :     =back
216 :    
217 :     The object created contains the following fields.
218 :    
219 :     =over 4
220 :    
221 :     =item directory
222 :    
223 :     Name of the directory containing the genome data.
224 :    
225 :     =item genome
226 :    
227 :     ID of the genome being loaded.
228 :    
229 :     =item supportRecords
230 :    
231 :     A hash of hashes, used to track the support records known to exist in the database.
232 :    
233 :     =item sap
234 :    
235 :     L<Sapling> object used to access the database.
236 :    
237 :     =item stats
238 :    
239 :     L<Stats> object for tracking statistical information about the load.
240 :    
241 :     =back
242 :    
243 :     =cut
244 :    
245 :     sub new {
246 :     # Get the parameters.
247 : parrello 1.17 my ($class, $sap, $genome, $directory) = @_;
248 : parrello 1.1 # Create the object.
249 : parrello 1.3 my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
250 :     # Add our specialized data.
251 :     $retVal->{genome} = $genome;
252 :     $retVal->{directory} = $directory;
253 : parrello 1.11 # Leave the assignment hash undefined until we populate it.
254 :     $retVal->{assignHash} = undef;
255 : parrello 1.18 # Create an MD5 Computer for this genome.
256 :     $retVal->{md5} = MD5Computer->new();
257 : parrello 1.3 # Return the result.
258 : parrello 1.1 return $retVal;
259 :     }
260 :    
261 :     =head3 LoadContigs
262 :    
263 :     $loaderObject->LoadContigs();
264 :    
265 :     Load the contig information into the database. This includes the contigs themselves and
266 : parrello 1.14 the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
267 :     number of base pairs as the C<dna> statistic, and the number of GC instances as the
268 :     C<gc_content> statistic.
269 : parrello 1.1
270 :     =cut
271 :    
272 :     sub LoadContigs {
273 :     # Get this object.
274 :     my ($self) = @_;
275 :     # Open the contigs file from the genome directory.
276 :     my $ih = Open(undef, "<$self->{directory}/contigs");
277 :     # Get the length of a DNA segment.
278 :     my $segmentLength = $self->{sap}->TuningParameter('maxSequenceLength');
279 :     # Get the genome ID.
280 :     my $genome = $self->{genome};
281 :     # These variables will contain the current contig ID, the current contig length,
282 : parrello 1.18 # the accumulated DNA sequence for the current chunk, and its length.
283 :     my ($contigID, $contigLen, $chunk, $chunkLen) = (undef, 0, '', 0);
284 :     # This variable contains a list of the chunks, for use in computing the MD5.
285 :     my @chunks;
286 : parrello 1.1 # Loop through the contig file.
287 :     while (! eof $ih) {
288 :     # Get the current record.
289 :     my $line = <$ih>; chomp $line;
290 :     # Is this the start of a new contig?
291 :     if ($line =~ /^>(\S+)/) {
292 :     # Yes. Save the contig ID.
293 :     my $newContigID = $1;
294 :     # Is there a current contig?
295 :     if (defined $contigID) {
296 : parrello 1.18 # Yes. Output the contig.
297 :     $self->OutputContig($contigID, $contigLen, $chunk, \@chunks);
298 : parrello 1.1 }
299 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
300 :     $self->FixContig(\$newContigID);
301 :     # Initialize the new contig.
302 :     $contigID = $newContigID;
303 :     $contigLen = 0;
304 :     $chunk = '';
305 :     $chunkLen = 0;
306 : parrello 1.18 @chunks = ();
307 : parrello 1.1 } else {
308 :     # Here we have more DNA in the current contig. Are we at the end of
309 :     # the current chunk?
310 :     my $lineLen = length($line);
311 :     my $newChunkLen = $chunkLen + $lineLen;
312 :     if ($newChunkLen < $segmentLength) {
313 :     # No. Add this line to the chunk.
314 :     $chunk .= $line;
315 :     $chunkLen += $lineLen;
316 :     } else {
317 :     # Yes. Create the actual chunk.
318 :     $chunk .= substr($line, 0, $segmentLength - $chunkLen);
319 :     # Write it out.
320 : parrello 1.18 $self->OutputChunk($contigID, scalar @chunks, $chunk);
321 : parrello 1.1 # Set up the new chunk.
322 :     $chunk = substr($line, $segmentLength - $chunkLen);
323 :     $chunkLen = length($chunk);
324 : parrello 1.18 push @chunks, $chunk;
325 : parrello 1.1 }
326 :     # Update the contig length.
327 :     $contigLen += $lineLen;
328 :     }
329 :     }
330 :     # Is there a current contig?
331 :     if (defined $contigID) {
332 : parrello 1.18 # Yes. Output the contig itself.
333 :     $self->OutputContig($contigID, $contigLen, $chunk, \@chunks);
334 : parrello 1.1 }
335 :     }
336 :    
337 :     =head3 OutputChunk
338 :    
339 :     $loaderObject->OutputChunk($contigID, $ordinal, $chunk);
340 :    
341 :     Output a chunk of DNA for the specified contig.
342 :    
343 :     =over 4
344 :    
345 :     =item contigID
346 :    
347 :     ID of the contig being output.
348 :    
349 :     =item ordinal
350 :    
351 :     Ordinal number of this chunk.
352 :    
353 :     =item chunk
354 :    
355 :     DNA sequence comprising this chunk.
356 :    
357 :     =back
358 :    
359 :     =cut
360 :    
361 :     sub OutputChunk {
362 :     # Get the parameters.
363 :     my ($self, $contigID, $ordinal, $chunk) = @_;
364 :     # Get the sapling object.
365 :     my $sap = $self->{sap};
366 :     # Compute the chunk ID.
367 :     my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
368 :     # Connect this sequence to the contig.
369 : parrello 1.2 $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
370 : parrello 1.1 # Create the DNA sequence.
371 :     $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
372 :     # Record the chunk.
373 :     $self->{stats}->Add(chunks => 1);
374 : parrello 1.14 # Update the GC count.
375 :     $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
376 : parrello 1.1 }
377 :    
378 :     =head3 OutputContig
379 :    
380 : parrello 1.18 $loaderObject->OutputContig($contigID, $contigLen, $chunk, \@chunks);
381 : parrello 1.1
382 :     Write out the current contig.
383 :    
384 :     =over 4
385 :    
386 :     =item contigID
387 :    
388 :     ID of the contig being written out.
389 :    
390 :     =item contigLen
391 :    
392 :     Length of the contig in base pairs.
393 :    
394 : parrello 1.18 =item chunk
395 :    
396 :     Last DNA chunk of the contig.
397 :    
398 :     =item chunks
399 :    
400 :     Reference to a list of the DNA chunks up to (but not including) the last one.
401 :    
402 : parrello 1.1 =back
403 :    
404 :     =cut
405 :    
406 :     sub OutputContig {
407 :     # Get the parameters.
408 : parrello 1.18 my ($self, $contigID, $contigLen, $chunk, $chunks) = @_;
409 : parrello 1.1 # Get the sapling object.
410 :     my $sap = $self->{sap};
411 : parrello 1.18 # Get the MD5 computer.
412 :     my $md5C = $self->{md5};
413 :     # Output the last chunk.
414 :     $self->OutputChunk($contigID, scalar @$chunks, $chunk);
415 : parrello 1.1 # Connect the contig to the genome.
416 :     $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
417 : parrello 1.18 # Compute the MD5.
418 :     push @$chunks, $chunk;
419 :     my $contigMD5 = $md5C->ProcessContig($contigID, $chunks);
420 : parrello 1.1 # Output the contig record.
421 : parrello 1.18 $sap->InsertObject('Contig', id => $contigID, length => $contigLen,
422 :     md5_identifier => $contigMD5);
423 : parrello 1.1 # Record the contig.
424 :     $self->{stats}->Add(contigs => 1);
425 :     $self->{stats}->Add(dna => $contigLen);
426 :     }
427 :    
428 :     =head3 LoadFeatures
429 :    
430 :     $loaderObject->LoadFeatures();
431 :    
432 :     Load the feature data into the database. This includes all of the features,
433 :     their protein sequences, and their subsystem information. The number of
434 :     features of each type will be stored in the statistics object, identified by
435 :     the feature type.
436 :    
437 :     =cut
438 :    
439 :     sub LoadFeatures {
440 :     # Get the parameters.
441 :     my ($self) = @_;
442 : parrello 1.7 # Read in the functional assignments.
443 : parrello 1.12 Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
444 : parrello 1.7 my $assignHash = $self->ReadAssignments();
445 : parrello 1.11 $self->{assignHash} = $assignHash;
446 : parrello 1.1 # Get the directory of feature types.
447 :     my $featureDir = "$self->{directory}/Features";
448 :     my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
449 : parrello 1.11 # Check for protein sequences. If we have some, load them into a hash.
450 :     my $protHash = {};
451 :     if (-f "$featureDir/peg/fasta") {
452 : parrello 1.12 Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
453 : parrello 1.11 $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
454 :     }
455 : parrello 1.1 # Create the feature records for the types found.
456 :     for my $type (@types) {
457 :     # Insure this is a genuine feature directory.
458 :     if (-f "$featureDir/$type/tbl") {
459 : parrello 1.11 # Yes. Read in the evidence codes (if any).
460 :     my $evHash = {};
461 :     my $tranFile = "$featureDir/$type/Attributes/transaction_log";
462 :     if (-f $tranFile) {
463 :     $evHash = $self->LoadEvidenceCodes($tranFile);
464 :     }
465 :     # Now load the feature data.
466 :     $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
467 : parrello 1.1 }
468 :     }
469 : parrello 1.11 }
470 :    
471 :     =head3 LoadEvidenceCodes
472 :    
473 :     my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
474 :    
475 :     Load the evidence codes from the specified attribute transaction log file into a
476 :     hash. The log file is in tab-delimited format. The first column contains the
477 :     transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
478 :     the third an attribute name (we'll ignore everything but C<evidence_code>), and
479 :     the fourth the attribute value.
480 :    
481 :     =over 4
482 :    
483 :     =item attributeFile
484 :    
485 :     Name of the attribute transaction log file.
486 :    
487 :     =item RETURN
488 :    
489 :     Returns a reference to a hash mapping each feature ID to a comma-delimited list
490 :     of its evidence codes.
491 :    
492 :     =back
493 :    
494 :     =cut
495 :    
496 :     sub LoadEvidenceCodes {
497 :     # Get the parameters.
498 :     my ($self, $attributeFile) = @_;
499 :     # Get the Sapling database.
500 :     my $sap = $self->{sap};
501 :     # Get the statistics object.
502 :     my $stats = $self->{stats};
503 :     # Get the assignment hash: we use this to filter the feature IDs.
504 :     my $assignHash = $self->{assignHash};
505 :     # Open the attribute log file for input.
506 :     my $ih = Open(undef, "<$attributeFile");
507 :     # This two-dimensional hash will hold the evidence codes for each feature.
508 :     my %retVal;
509 :     # Loop through the attribute log file.
510 :     while (! eof $ih) {
511 :     # Get the current attribute record.
512 :     my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
513 :     $stats->Add(attributeLine => 1);
514 :     # Insure we have all the pieces we need.
515 :     if (! $command || ! $fid || $key ne 'evidence_code') {
516 :     $stats->Add(attributeLineSkipped => 1);
517 :     } elsif (! $assignHash->{$fid}) {
518 :     # Here the attribute is for a deleted feature.
519 :     $stats->Add(attributeFidSkipped => 1);
520 :     } else {
521 :     # Get the sub-hash for this feature.
522 :     if (! exists $retVal{$fid}) {
523 :     $retVal{$fid} = {};
524 :     }
525 :     my $featureSubHash = $retVal{$fid};
526 :     # Process according to the command.
527 :     if ($command eq 'ADD') {
528 :     # Here we are adding a new evidence code.
529 :     $featureSubHash->{$value} = 1;
530 :     $stats->Add(attributeAdd => 1);
531 :     } elsif ($command eq 'DELETE') {
532 :     # Here we are deleting an evidence code.
533 :     delete $featureSubHash->{$value};
534 :     $stats->Add(attributeDelete => 1);
535 :     } else {
536 :     # here we have an unrecognized command.
537 :     $stats->Add(attributeCommandSkip => 1);
538 :     }
539 :     }
540 : parrello 1.1 }
541 : parrello 1.11 # Loop through the hash, converting each sub-hash to a comma-delimited list of
542 :     # evidence codes.
543 :     for my $fid (keys %retVal) {
544 :     $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
545 : parrello 1.6 }
546 : parrello 1.11 # Return the evidence hash.
547 :     return \%retVal;
548 : parrello 1.1 }
549 :    
550 : parrello 1.11
551 : parrello 1.1 =head3 LoadFeatureData
552 :    
553 : parrello 1.11 $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
554 : parrello 1.1
555 :     Load the basic data for each feature into the database. The number of features of
556 :     the type found will be recorded in the statistics object.
557 :    
558 :     =over 4
559 :    
560 :     =item featureDir
561 :    
562 :     Name of the main directory containing all the feature type subdirectories.
563 :    
564 :     =item type
565 :    
566 :     Type of feature to load.
567 :    
568 : parrello 1.11 =item protHash
569 :    
570 :     Reference to a hash mapping each feature ID for a protein-encoding gene to
571 :     its protein sequence.
572 :    
573 :     =item evHash
574 : parrello 1.7
575 : parrello 1.11 Reference to a hash mapping each feature ID to a comma-delimited list of
576 :     its evidence codes (if any).
577 : parrello 1.7
578 : parrello 1.1 =back
579 :    
580 :     =cut
581 :    
582 :     sub LoadFeatureData {
583 :     # Get the parameters.
584 : parrello 1.11 my ($self, $featureDir, $type, $protHash, $evHash) = @_;
585 : parrello 1.1 # Get the sapling database.
586 :     my $sap = $self->{sap};
587 :     # Get the statistics object.
588 :     my $stats = $self->{stats};
589 : parrello 1.11 # Get the assignment hash. This tells us our functional assignments. This method is
590 :     # also where we remove the deleted features from it.
591 :     my $assignHash = $self->{assignHash};
592 : parrello 1.1 # This hash will track the features we've created. If a feature is found a second
593 :     # time, it overwrites the original.
594 : parrello 1.11 my %fidHash;
595 : parrello 1.17 # This hash tracks the deleted features. We don't want to update these.
596 :     my %deleted_features;
597 : parrello 1.1 # Insure we have a tbl file for this feature type.
598 :     my $fileName = "$featureDir/$type/tbl";
599 :     if (-f $fileName) {
600 : parrello 1.3 # We have one, so we can read through it. First, however, we need to get the list
601 : parrello 1.11 # of deleted features and remove them from the assignment hash. This insures
602 :     # that they are not used by subsequent methods.
603 : parrello 1.3 my $deleteFile = "$featureDir/$type/deleted.features";
604 :     if (-f $deleteFile) {
605 : parrello 1.11 my $dh = Open(undef, "<$deleteFile");
606 :     while (! eof $dh) {
607 :     my ($deletedFid) = Tracer::GetLine($dh);
608 :     if (exists $assignHash->{$deletedFid}) {
609 :     delete $assignHash->{$deletedFid};
610 :     $stats->Add(deletedFid => 1);
611 : parrello 1.17 $deleted_features{$deletedFid} = 1;
612 : parrello 1.11 }
613 :     }
614 : parrello 1.3 }
615 :     # Open the main file for input.
616 : parrello 1.12 Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
617 : parrello 1.1 my $ih = Open(undef, "<$fileName");
618 :     while (! eof $ih) {
619 :     # Read this feature's information.
620 :     my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
621 : parrello 1.3 # Only proceed if the feature is NOT deleted.
622 : olson 1.15 if (!$deleted_features{$fid}) {
623 : parrello 1.3 # If the feature already exists, delete it. (This should be extremely rare.)
624 : parrello 1.11 if ($fidHash{$fid}) {
625 : parrello 1.3 $sap->Delete(Feature => $fid);
626 :     $stats->Add(duplicateFid => 1);
627 : parrello 1.18 } else {
628 :     # It doesn't exist, so record it in the statistics.
629 :     $stats->Add($type => 1);
630 : parrello 1.1 }
631 : parrello 1.11 # If this is RNA, the alias list is always empty. Sometimes, the functional
632 :     # assignment is found there.
633 :     if ($type eq 'rna') {
634 :     if (! $assignHash->{$fid}) {
635 :     $assignHash->{$fid} = $aliases[0];
636 : parrello 1.2 }
637 : parrello 1.11 @aliases = ();
638 : parrello 1.2 }
639 : parrello 1.11 # Add the feature to the database.
640 : parrello 1.17 my $function = $assignHash->{$fid} || "";
641 : olson 1.16 $self->AddFeature($fid, $function, $locations, \@aliases,
642 : parrello 1.11 $protHash->{$fid}, $evHash->{$fid});
643 :     # Denote we've added this feature, so that if a duplicate occurs we're ready.
644 :     $fidHash{$fid} = 1;
645 : parrello 1.2 }
646 : parrello 1.1 }
647 :     }
648 :     }
649 :    
650 : parrello 1.11
651 : parrello 1.1 =head3 LoadProteinData
652 :    
653 : parrello 1.11 my $protHash = $self->LoadProteinData($fileName);
654 : parrello 1.1
655 :     Load the protein sequences from the named FASTA file. The sequences will be stored
656 : parrello 1.11 in a hash by FIG feature ID.
657 : parrello 1.1
658 :     =over 4
659 :    
660 :     =item fileName
661 :    
662 :     Name of the FASTA file containing the protein sequences for this genome.
663 :    
664 : parrello 1.11 =item RETURN
665 :    
666 :     Returns a hash mapping feature IDs to protein sequences.
667 :    
668 : parrello 1.1 =back
669 :    
670 :     =cut
671 :    
672 :     sub LoadProteinData {
673 :     # Get the parameters.
674 :     my ($self, $fileName) = @_;
675 :     # Open the FASTA file for input.
676 :     my $ih = Open(undef, "<$fileName");
677 : parrello 1.11 # Create the return hash.
678 :     my $retVal = {};
679 : parrello 1.1 # We'll track the current protein in here.
680 :     my $fid;
681 :     my $sequence = "";
682 :     # Loop through the input file.
683 :     while (! eof $ih) {
684 :     # Get the current record.
685 :     my $line = <$ih>; chomp $line;
686 :     # Is this a label record.
687 :     if ($line =~ /^>(\S+)/) {
688 :     # Yes. Save the new feature ID.
689 :     my $newFid = $1;
690 :     # Do we have an existing protein?
691 :     if (defined $fid) {
692 : parrello 1.11 # Yes. Store it in the hash.
693 :     $retVal->{$fid} = $sequence;
694 : parrello 1.1 }
695 :     # Initialize for the next protein.
696 :     $fid = $newFid;
697 :     $sequence = "";
698 :     } else {
699 :     # Here we have more letters for the current protein.
700 :     $sequence .= $line;
701 :     }
702 :     }
703 :     # Do we have a residual protein.
704 :     if (defined $fid) {
705 : parrello 1.11 # Yes. Store it in the hash.
706 :     $retVal->{$fid} = $sequence;
707 : parrello 1.1 }
708 : parrello 1.11 # Return the hash.
709 :     return $retVal;
710 : parrello 1.1 }
711 :    
712 : parrello 1.6
713 :     =head3 LoadAnnotations
714 :    
715 :     $loaderObject->LoadAnnotations($fileName);
716 :    
717 :     Read in the annotation history information and use it to create annotation records.
718 :    
719 :     =over 4
720 :    
721 :     =item fileName
722 :    
723 :     Name of the annotation history file. This file is formatted with four fields per
724 :     record. Each field is on a separate line, with a double slash (C<//>) used as the
725 :     line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
726 :     (formatted as an integer), (2) the user name, and (3) the annotation text.
727 :    
728 :     =back
729 :    
730 :     =cut
731 :    
732 :     sub LoadAnnotations {
733 :     # Get the parameters.
734 :     my ($self, $fileName) = @_;
735 : parrello 1.11 # Get the assignment Hash. We use this to filter out deleted features.
736 :     my $assignHash = $self->{assignHash};
737 : parrello 1.6 # Get the Sapling database.
738 :     my $sap = $self->{sap};
739 :     # Get the statistics object.
740 :     my $stats = $self->{stats};
741 :     # Open the input file.
742 :     my $ih = Tracer::Open(undef, "<$fileName");
743 :     # Loop through the input.
744 :     while (! eof $ih) {
745 :     # Read in the peg, timestamp, and user ID.
746 :     my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
747 : parrello 1.11 # Only proceed if the feature is not deleted.
748 :     if ($assignHash->{$fid}) {
749 :     # Add the annotation to this feature.
750 :     $self->MakeAnnotation($fid, $text, $user, $timestamp);
751 : parrello 1.6 }
752 :     }
753 :     }
754 :    
755 :    
756 : parrello 1.1 =head3 WriteProtein
757 :    
758 :     $loaderObject->WriteProtein($fid, $sequence);
759 :    
760 :     Write out the specified protein sequence and associate it with the specified feature.
761 :     This requires checking to see if the protein sequence is already in the database.
762 :    
763 :     =over 4
764 :    
765 :     =item fid
766 :    
767 :     ID of the feature with the specified protein sequence.
768 :    
769 :     =item sequence
770 :    
771 :     Protein sequence for the indicated feature.
772 :    
773 :     =back
774 :    
775 :     =cut
776 :    
777 :     sub WriteProtein {
778 :     # Get the parameters.
779 :     my ($self, $fid, $sequence) = @_;
780 :     # Compute the key of the protein sequence.
781 : parrello 1.8 my $protID = $self->{sap}->ProteinID($sequence);
782 : parrello 1.1 # Insure the protein exists.
783 :     $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
784 :     # Connect the feature to it.
785 :     $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
786 :     }
787 :    
788 :     =head3 LoadSubsystems
789 :    
790 : parrello 1.17 $loaderObject->LoadSubsystems($subsysList);
791 :    
792 :     Load the subsystem data into the database. This requires looking through the
793 :     bindings and using them to connect to the subsystems that already exist in the
794 :     database. If the subsystem does not exist, its bindings will not be loaded.
795 :    
796 :     =over 4
797 :    
798 :     =item subsysList
799 :    
800 :     Reference to a list of subsystem IDs. If specified, only the subsystems in the
801 :     list will be processed. This is useful when a particular set of subsystems
802 :     has been replaced.
803 : parrello 1.1
804 : parrello 1.17 =back
805 : parrello 1.1
806 :     =cut
807 :    
808 :     sub LoadSubsystems {
809 :     # Get the parameters.
810 : parrello 1.17 my ($self, $subsysList) = @_;
811 : parrello 1.1 # Get the sapling object.
812 :     my $sap = $self->{sap};
813 :     # Get the statistics object.
814 :     my $stats = $self->{stats};
815 :     # Get the genome ID.
816 :     my $genome = $self->{genome};
817 : parrello 1.18 # Compute the subsystem and binding file names.
818 :     my $subFileName = "$self->{directory}/Subsystems/subsystems";
819 :     my $bindFileName = "$self->{directory}/Subsystems/bindings";
820 : parrello 1.19 # Get a hash of the molecular machines already connected to this genome.
821 :     my %machinesOld = map { $_ => 1 } $sap->GetFlat('Uses', 'Uses(from-link) = ?',
822 :     [$genome], 'to-link');
823 : parrello 1.18 # Only proceed if both exist.
824 :     if (! -f $subFileName || ! -f $bindFileName) {
825 :     Trace("Missing subsystem data for $genome.") if T(1);
826 :     $stats->Add(noSubsystems => 1);
827 :     } else {
828 :     # This hash maps subsystem IDs to molecular machine IDs.
829 :     my %machines;
830 :     # This hash maps subsystem/role pairs to machine role IDs.
831 :     my %machineRoles;
832 :     # This hash will contain the list of subsystems found in the database.
833 :     my %subsystems;
834 :     # We loop through the subsystems, looking for the ones already in the
835 :     # database. The list is given in the subsystems file of the Subsystems
836 :     # directory.
837 :     my $ih = Open(undef, "<$subFileName");
838 :     # Loop through the subsystems in the file, insuring we have them in the database.
839 :     while (! eof $ih) {
840 :     # Get this subsystem.
841 :     my ($subsystem, $variant) = Tracer::GetLine($ih);
842 :     Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
843 :     # Normalize the subsystem name.
844 :     $subsystem = $sap->SubsystemID($subsystem);
845 :     # Insure the subsystem is in the database and is one we're interested in.
846 :     if ((! $subsysList || (grep { $_ eq $subsystem } @$subsysList)) &&
847 :     $sap->Exists(Subsystem => $subsystem)) {
848 :     # We have the subsystem in the database. We need to compute the machine
849 :     # role IDs and create the molecular machine. First, we need to remember
850 :     # the subsystem.
851 :     $subsystems{$subsystem} = 1;
852 :     # Compute this subsystem's MD5.
853 :     my $subsystemMD5 = ERDB::DigestKey($subsystem);
854 :     my $rolePrefix = "$subsystemMD5:$genome";
855 :     # Loop through the roles.
856 :     my @roleList = $sap->GetAll('Includes', 'Includes(from-link) = ?',
857 :     [$subsystem], "to-link abbreviation");
858 :     for my $roleTuple (@roleList) {
859 :     my ($roleID, $abbr) = @$roleTuple;
860 :     my $machineRoleID = $rolePrefix . '::' . $abbr;
861 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
862 :     }
863 :     # Next we need the variant code and key.
864 :     my $variantCode = BaseSaplingLoader::Starless($variant);
865 :     my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
866 :     # Now we create the molecular machine connecting this genome to the
867 :     # subsystem variant.
868 :     my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
869 : parrello 1.19 # Does it already exist?
870 :     if ($machinesOld{$machineID}) {
871 :     # Yes. Output a warning.
872 :     Trace("Machine $machineID already found for $subsystem and genome $genome.") if T(1);
873 :     $stats->Add(duplicateMachine => 1);
874 :     } else {
875 :     $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
876 :     $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
877 :     $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID);
878 :     }
879 : parrello 1.18 # Remember the machine ID.
880 :     $machines{$subsystem} = $machineID;
881 : parrello 1.1 }
882 :     }
883 : parrello 1.18 # Now we go through the bindings file. This file connects the subsystem
884 :     # roles to the molecular machines.
885 :     $ih = Open(undef, "<$bindFileName");
886 :     # Loop through the bindings.
887 :     while (! eof $ih) {
888 :     # Get the binding data.
889 :     my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
890 :     # Normalize the subsystem name.
891 :     $subsystem = $sap->SubsystemID($subsystem);
892 :     # Insure the subsystem is in the database.
893 :     if ($subsystems{$subsystem}) {
894 :     # Compute the machine role.
895 :     my $machineRoleID = $machineRoles{$subsystem}{$role};
896 :     # Insure it exists.
897 :     my $created = $self->InsureEntity(MachineRole => $machineRoleID);
898 :     if ($created) {
899 :     # We created the machine role, so connect it to the machine.
900 :     my $machineID = $machines{$subsystem};
901 :     $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID);
902 :     # Connect it to the role, too.
903 :     $sap->InsertObject('IsRoleOf', from_link => $role, to_link => $machineRoleID);
904 :     }
905 :     # Connect the feature.
906 :     $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
907 : parrello 1.17 }
908 : parrello 1.1 }
909 :     }
910 :     }
911 :    
912 :     =head3 CreateGenome
913 :    
914 :     $loaderObject->CreateGenome();
915 :    
916 :     Create the genome record.
917 :    
918 :     =cut
919 :    
920 :     # This hash maps cache statistics to genome record fields.
921 :     use constant GENOME_FIELDS => {genome_name => 'scientific-name',
922 :     genome_domain => 'domain'};
923 :     # This hash maps domains to the prokaryotic flag.
924 :     use constant PROK_FLAG => {Bacteria => 1, Archaea => 1};
925 :    
926 :     sub CreateGenome {
927 :     # Get the parameters.
928 :     my ($self) = @_;
929 :     # Get the genome directory.
930 :     my $dir = $self->{directory};
931 :     # Get the sapling database.
932 :     my $sap = $self->{sap};
933 : parrello 1.18 # Get the MD5 computer.
934 :     my $md5C = $self->{md5};
935 : parrello 1.1 # We'll put the genome attributes in here.
936 :     my %fields;
937 :     # Check for a basic statistics file.
938 :     my $statsFile = "$dir/cache.basic_statistics";
939 :     if (-f $statsFile) {
940 :     # We have the statistics file, so read the major attributes from it.
941 :     my $ih = Open(undef, "<$statsFile");
942 :     while (! eof $ih) {
943 :     my ($key, $value) = Tracer::GetLine($ih);
944 :     my $fieldKey = GENOME_FIELDS->{$key};
945 :     if ($fieldKey) {
946 :     $fields{$fieldKey} = $value;
947 :     }
948 :     }
949 :     # Translate the domain.
950 :     $fields{domain} = ucfirst $fields{domain};
951 :     # Denote the genome is complete.
952 :     $fields{complete} = 1;
953 :     } else {
954 :     # Check to see if this genome is complete.
955 :     $fields{complete} = (-f "$dir/COMPLETE" ? 1 : 0);
956 :     # Get the genome name.
957 :     my $ih = Open(undef, "<$dir/GENOME");
958 :     my $line = <$ih>; chomp $line;
959 :     $fields{'scientific-name'} = $line;
960 :     # Get the taxonomy and extract the domain from it.
961 :     $ih = Open(undef, "<$dir/TAXONOMY");
962 : parrello 1.18 ($fields{domain}) = split m/;/, <$ih>, 2;
963 : parrello 1.1 }
964 :     # Get the counts from the statistics object.
965 :     my $stats = $self->{stats};
966 :     $fields{contigs} = $stats->Ask('contigs');
967 :     $fields{'dna-size'} = $stats->Ask('dna');
968 :     $fields{pegs} = $stats->Ask('peg');
969 :     $fields{rnas} = $stats->Ask('rna');
970 : parrello 1.14 $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
971 : parrello 1.1 # Get the genetic code. The default is 11.
972 :     $fields{'genetic-code'} = 11;
973 :     my $geneticCodeFile = "$dir/GENETIC_CODE";
974 :     if (-f $geneticCodeFile) {
975 :     # There's a genetic code file, so we need to read it for the code.
976 :     my $ih = Open(undef, "<$geneticCodeFile");
977 :     ($fields{'genetic-code'}) = Tracer::GetLine($ih);
978 :     }
979 :     # Use the domain to determine whether or not the genome is prokaryotic.
980 :     $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
981 : parrello 1.18 # Compute the genome MD5.
982 :     $fields{md5_identifier} = $md5C->CloseGenome();
983 : parrello 1.1 # Finally, add the genome ID.
984 :     $fields{id} = $self->{genome};
985 :     # Create the genome record.
986 :     $sap->InsertObject('Genome', %fields);
987 :     }
988 :    
989 :     =head3 ReadAssignments
990 :    
991 :     my $assignHash = $loaderObject->ReadAssignments();
992 :    
993 :     Return a hash mapping each feature ID to its functional assignment. This method
994 :     essentially reads the B<assigned_functions> file into memory.
995 :    
996 :     =cut
997 :    
998 :     sub ReadAssignments {
999 :     # Get the parameters.
1000 :     my ($self) = @_;
1001 :     # Create the return hash.
1002 :     my $retVal = {};
1003 :     # Loop through the assigned-functions file, storing results in the hash. Later
1004 :     # results will override earlier ones.
1005 :     my $ih = Open(undef, "<$self->{directory}/assigned_functions");
1006 :     while (! eof $ih) {
1007 :     my ($fid, $function) = Tracer::GetLine($ih);
1008 :     $retVal->{$fid} = $function;
1009 :     }
1010 :     # Return the hash built.
1011 :     return $retVal;
1012 :     }
1013 :    
1014 :    
1015 :     =head3 FixContig
1016 :    
1017 :     $loaderObject->FixContig(\$contigID);
1018 :    
1019 :     Insure that the specified contig ID contains the genome ID.
1020 :    
1021 :     =cut
1022 :    
1023 :     sub FixContig {
1024 :     my ($self, $contigID) = @_;
1025 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
1026 :     unless ($$contigID =~ /^\d+\.\d+\:/) {
1027 :     $$contigID = "$self->{genome}:$$contigID";
1028 :     }
1029 :     }
1030 :    
1031 :     =head3 ConnectLocation
1032 :    
1033 :     $loaderObject->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
1034 :    
1035 :     Connect the specified feature to the specified contig location.
1036 :    
1037 :     =over 4
1038 :    
1039 :     =item fid
1040 :    
1041 :     ID of the relevant feature.
1042 :    
1043 :     =item contig
1044 :    
1045 :     ID of the contig containing this segment of the feature (normalized).
1046 :    
1047 :     =item segment
1048 :    
1049 :     Ordinal number of this segment.
1050 :    
1051 :     =item left
1052 :    
1053 :     Location of the segment's leftmost base pair.
1054 :    
1055 :     =item dir
1056 :    
1057 :     Direction (strand) of the segment (C<+> or C<->).
1058 :    
1059 :     =item len
1060 :    
1061 :     Length of the segment (which must be no greater than the configured maximum).
1062 :    
1063 :     =back
1064 :    
1065 :     =cut
1066 :    
1067 :     sub ConnectLocation {
1068 :     # Get the parameters.
1069 :     my ($self, $fid, $contig, $segment, $left, $dir, $len) = @_;
1070 :     # Get the sapling database.
1071 :     my $sap = $self->{sap};
1072 :     # Create the relationship.
1073 :     $sap->InsertObject('IsLocatedIn', from_link => $fid, to_link => $contig,
1074 :     begin => $left, dir => $dir, len => $len,
1075 :     ordinal => $segment);
1076 :     # Record it in the statistics.
1077 :     $self->{stats}->Add(segment => 1);
1078 :     }
1079 :    
1080 : parrello 1.6 =head2 Internal Utility Methods
1081 :    
1082 :     =head3 ReadAnnotation
1083 :    
1084 :     my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1085 :    
1086 :     Read the next record from an annotation file. The next record must exist (that is, an
1087 :     end-of-file check should have been performed before calling this method).
1088 :    
1089 :     =over 4
1090 :    
1091 :     =item ih
1092 :    
1093 :     Open file handle for the annotation file.
1094 :    
1095 :     =item RETURN
1096 :    
1097 :     Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1098 :     timestamp, (2) the user ID, and (3) the annotation text.
1099 :    
1100 :     =back
1101 :    
1102 :     =cut
1103 :    
1104 :     sub ReadAnnotation {
1105 :     # Get the parameter.
1106 :     my ($ih) = @_;
1107 :     # Read the three fixed fields.
1108 :     my $fid = <$ih>; chomp $fid;
1109 :     my $timestamp = <$ih>; chomp $timestamp;
1110 :     my $user = <$ih>; chomp $user;
1111 :     # Loop through the lines of the text field.
1112 :     my $text = "";
1113 :     my $line = <$ih>;
1114 : olson 1.13 while (defined($line) && $line ne "//\n") {
1115 : parrello 1.6 $text .= $line;
1116 :     $line = <$ih>;
1117 :     }
1118 :     # Remove the trailing new-line from the text.
1119 :     chomp $text;
1120 :     # Return the fields.
1121 :     return ($fid, $timestamp, $user, $text);
1122 :     }
1123 :    
1124 : olson 1.13 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3