[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingGenomeLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 : parrello 1.2 use AliasAnalysis;
29 : parrello 1.3 use base qw(SaplingDataLoader);
30 : parrello 1.1
31 :     =head1 Sapling Genome Loader
32 :    
33 :     This package contains the methods used to load a genome into the Sapling from a genome
34 :     directory. This is not very efficient if you are trying to create a full database,
35 :     but is useful if you wish to add genomes one at a time. The information loaded will
36 :     include the basic genome data, the contigs and the DNA, subsystems, and the features.
37 :    
38 :     =head2 Main Methods
39 :    
40 :     =head3 Load
41 :    
42 :     my $stats = SaplingGenomeLoader::Load($sap, $genome, $directory);
43 :    
44 :     Load a genome from a genome directory into the sapling database.
45 :    
46 :     =over 4
47 :    
48 :     =item sap
49 :    
50 :     L<Sapling> object used to access the target database.
51 :    
52 :     =item genome
53 :    
54 :     ID of the genome being loaded.
55 :    
56 :     =item directory
57 :    
58 :     Name of the directory containing the genome information.
59 :    
60 : olson 1.15 =item disconnected
61 :    
62 :     True if the application is disconnected from the network - do not
63 :     attempt to contact a SAP server for more data.
64 :    
65 : parrello 1.11 =item assignHash
66 :    
67 :     Hash of feature IDs to functional assignments. Deleted features are removed, which
68 :     means only features listed in this hash can be legally inserted into the database.
69 :    
70 : parrello 1.1 =back
71 :    
72 :     =cut
73 :    
74 :     sub Load {
75 :     # Get the parameters.
76 : olson 1.15 my ($sap, $genome, $directory, $disconnected) = @_;
77 : parrello 1.1 # Create the loader object.
78 : olson 1.15 my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory, $disconnected);
79 : parrello 1.1 # Load the contigs.
80 : parrello 1.12 Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
81 : parrello 1.1 $loaderObject->LoadContigs();
82 :     # Load the features.
83 : parrello 1.12 Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
84 : parrello 1.1 $loaderObject->LoadFeatures();
85 : parrello 1.7 # Check for annotation history. If we have it, load the history records into the
86 :     # database.
87 :     if (-f "$directory/annotations") {
88 : parrello 1.12 Trace("Processing annotations.") if T(SaplingDataLoader => 3);
89 : parrello 1.7 $loaderObject->LoadAnnotations("$directory/annotations");
90 :     }
91 : parrello 1.1 # Load the subsystem bindings.
92 : parrello 1.12 Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
93 : parrello 1.1 $loaderObject->LoadSubsystems();
94 :     # Create the Genome record and taxonomy information.
95 : parrello 1.12 Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
96 : parrello 1.1 $loaderObject->CreateGenome();
97 :     # Return the statistics.
98 :     return $loaderObject->{stats};
99 :     }
100 :    
101 :     =head3 ClearGenome
102 :    
103 :     my $stats = SaplingGenomeObject::ClearGenome($sap, $genome);
104 :    
105 :     Delete the specified genome and all the related records from the specified sapling
106 :     database. This method can also be used to clean up after a failed or aborted load.
107 :    
108 :     =over 4
109 :    
110 :     =item sap
111 :    
112 :     L<Sapling> object used to access the target database.
113 :    
114 :     =item genome
115 :    
116 :     ID of the genome to delete.
117 :    
118 :     =item RETURN
119 :    
120 :     Returns a statistics object counting the records deleted.
121 :    
122 :     =back
123 :    
124 :     =cut
125 :    
126 :     sub ClearGenome {
127 :     # Get the parameters.
128 :     my ($sap, $genome) = @_;
129 :     # Create the statistics object.
130 :     my $stats = Stats->new();
131 : parrello 1.9 # Delete the DNA sequences.
132 :     my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
133 :     for my $seq (@seqs) {
134 :     my $delStats = $sap->Delete(DNASequence => $seq);
135 :     $stats->Accumulate($delStats);
136 :     }
137 : parrello 1.1 # Delete the contigs.
138 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
139 : parrello 1.1 # Delete the features.
140 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
141 : parrello 1.1 # Delete the molecular machines.
142 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
143 : parrello 1.1 # Delete the genome itself.
144 :     my $subStats = $sap->Delete(Genome => $genome);
145 :     # Accumulate the statistics from the delete.
146 :     $stats->Accumulate($subStats);
147 :     # Return the statistics object.
148 :     return $stats;
149 :     }
150 :    
151 : parrello 1.7
152 :     =head3 Process
153 :    
154 :     my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
155 :    
156 :     Load genome data from the specified directory. If the genome data already
157 :     exists in the database, it will be deleted first.
158 :    
159 :     =over 4
160 :    
161 :     =item sap
162 :    
163 :     L</Sapling> object for accessing the database.
164 :    
165 :     =item genome
166 :    
167 :     ID of the genome whose data is being loaded.
168 :    
169 :     =item directory
170 :    
171 :     Name of the directory containing the genome data files.
172 :    
173 :     =item RETURN
174 :    
175 :     Returns a statistics object describing the activity during the reload.
176 :    
177 :     =back
178 :    
179 :     =cut
180 :    
181 :     sub Process {
182 :     # Get the parameters.
183 :     my ($sap, $genome, $directory) = @_;
184 :     # Clear the existing data for the specified genome.
185 :     my $stats = ClearGenome($sap, $genome);
186 :     # Load the new expression data from the specified directory.
187 :     my $newStats = Load($sap, $genome, $directory);
188 :     # Merge the statistics.
189 :     $stats->Accumulate($newStats);
190 :     # Return the result.
191 :     return $stats;
192 :     }
193 :    
194 :    
195 : parrello 1.1 =head2 Loader Object Methods
196 :    
197 :     =head3 new
198 :    
199 : olson 1.15 my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory, $disconnected);
200 : parrello 1.1
201 :     Create a loader object that can be used to facilitate loading genome data from a
202 :     directory.
203 :    
204 :     =over 4
205 :    
206 :     =item sap
207 :    
208 :     L<Sapling> object used to access the target database.
209 :    
210 :     =item genome
211 :    
212 :     ID of the genome being loaded.
213 :    
214 :     =item directory
215 :    
216 :     Name of the directory containing the genome information.
217 :    
218 : olson 1.15 =item disconnected
219 :    
220 :     Set to a true value if the application should be considered to be disconnected
221 :     from the network - that is, do not attempt to connect to a Sapling server
222 :     to load subsystem data.
223 :    
224 : parrello 1.1 =back
225 :    
226 :     The object created contains the following fields.
227 :    
228 :     =over 4
229 :    
230 :     =item directory
231 :    
232 :     Name of the directory containing the genome data.
233 :    
234 :     =item genome
235 :    
236 :     ID of the genome being loaded.
237 :    
238 :     =item supportRecords
239 :    
240 :     A hash of hashes, used to track the support records known to exist in the database.
241 :    
242 :     =item sap
243 :    
244 :     L<Sapling> object used to access the database.
245 :    
246 :     =item stats
247 :    
248 :     L<Stats> object for tracking statistical information about the load.
249 :    
250 :     =back
251 :    
252 :     =cut
253 :    
254 :     sub new {
255 :     # Get the parameters.
256 : olson 1.15 my ($class, $sap, $genome, $directory, $disconnected) = @_;
257 : parrello 1.1 # Create the object.
258 : parrello 1.3 my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
259 :     # Add our specialized data.
260 :     $retVal->{genome} = $genome;
261 :     $retVal->{directory} = $directory;
262 : parrello 1.11 # Leave the assignment hash undefined until we populate it.
263 :     $retVal->{assignHash} = undef;
264 : olson 1.15 $retVal->{disconnected} = defined($disconnected) ? 1 : 0;
265 : parrello 1.3 # Return the result.
266 : parrello 1.1 return $retVal;
267 :     }
268 :    
269 :     =head3 LoadContigs
270 :    
271 :     $loaderObject->LoadContigs();
272 :    
273 :     Load the contig information into the database. This includes the contigs themselves and
274 : parrello 1.14 the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
275 :     number of base pairs as the C<dna> statistic, and the number of GC instances as the
276 :     C<gc_content> statistic.
277 : parrello 1.1
278 :     =cut
279 :    
280 :     sub LoadContigs {
281 :     # Get this object.
282 :     my ($self) = @_;
283 :     # Open the contigs file from the genome directory.
284 :     my $ih = Open(undef, "<$self->{directory}/contigs");
285 :     # Get the length of a DNA segment.
286 :     my $segmentLength = $self->{sap}->TuningParameter('maxSequenceLength');
287 :     # Get the genome ID.
288 :     my $genome = $self->{genome};
289 :     # These variables will contain the current contig ID, the current contig length,
290 :     # the ordinal of the current chunk, the accumulated DNA sequence, and its length.
291 :     my ($contigID, $contigLen, $ordinal, $chunk, $chunkLen) = (undef, 0, 0, '', 0);
292 :     # Loop through the contig file.
293 :     while (! eof $ih) {
294 :     # Get the current record.
295 :     my $line = <$ih>; chomp $line;
296 :     # Is this the start of a new contig?
297 :     if ($line =~ /^>(\S+)/) {
298 :     # Yes. Save the contig ID.
299 :     my $newContigID = $1;
300 :     # Is there a current contig?
301 :     if (defined $contigID) {
302 :     # Yes. Output the current chunk.
303 :     $self->OutputChunk($contigID, $ordinal, $chunk);
304 :     # Output the contig.
305 :     $self->OutputContig($contigID, $contigLen);
306 :     }
307 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
308 :     $self->FixContig(\$newContigID);
309 :     # Initialize the new contig.
310 :     $contigID = $newContigID;
311 :     $contigLen = 0;
312 :     $ordinal = 0;
313 :     $chunk = '';
314 :     $chunkLen = 0;
315 :     } else {
316 :     # Here we have more DNA in the current contig. Are we at the end of
317 :     # the current chunk?
318 :     my $lineLen = length($line);
319 :     my $newChunkLen = $chunkLen + $lineLen;
320 :     if ($newChunkLen < $segmentLength) {
321 :     # No. Add this line to the chunk.
322 :     $chunk .= $line;
323 :     $chunkLen += $lineLen;
324 :     } else {
325 :     # Yes. Create the actual chunk.
326 :     $chunk .= substr($line, 0, $segmentLength - $chunkLen);
327 :     # Write it out.
328 :     $self->OutputChunk($contigID, $ordinal, $chunk);
329 :     # Set up the new chunk.
330 :     $chunk = substr($line, $segmentLength - $chunkLen);
331 :     $ordinal++;
332 :     $chunkLen = length($chunk);
333 :     }
334 :     # Update the contig length.
335 :     $contigLen += $lineLen;
336 :     }
337 :     }
338 :     # Is there a current contig?
339 :     if (defined $contigID) {
340 :     # Yes. Output the residual chunk.
341 :     $self->OutputChunk($contigID, $ordinal, $chunk);
342 :     # Output the contig itself.
343 :     $self->OutputContig($contigID, $contigLen);
344 :     }
345 :     }
346 :    
347 :     =head3 OutputChunk
348 :    
349 :     $loaderObject->OutputChunk($contigID, $ordinal, $chunk);
350 :    
351 :     Output a chunk of DNA for the specified contig.
352 :    
353 :     =over 4
354 :    
355 :     =item contigID
356 :    
357 :     ID of the contig being output.
358 :    
359 :     =item ordinal
360 :    
361 :     Ordinal number of this chunk.
362 :    
363 :     =item chunk
364 :    
365 :     DNA sequence comprising this chunk.
366 :    
367 :     =back
368 :    
369 :     =cut
370 :    
371 :     sub OutputChunk {
372 :     # Get the parameters.
373 :     my ($self, $contigID, $ordinal, $chunk) = @_;
374 :     # Get the sapling object.
375 :     my $sap = $self->{sap};
376 :     # Compute the chunk ID.
377 :     my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
378 :     # Connect this sequence to the contig.
379 : parrello 1.2 $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
380 : parrello 1.1 # Create the DNA sequence.
381 :     $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
382 :     # Record the chunk.
383 :     $self->{stats}->Add(chunks => 1);
384 : parrello 1.14 # Update the GC count.
385 :     $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
386 : parrello 1.1 }
387 :    
388 :     =head3 OutputContig
389 :    
390 :     $loaderObject->OutputContig($contigID, $contigLen);
391 :    
392 :     Write out the current contig.
393 :    
394 :     =over 4
395 :    
396 :     =item contigID
397 :    
398 :     ID of the contig being written out.
399 :    
400 :     =item contigLen
401 :    
402 :     Length of the contig in base pairs.
403 :    
404 :     =back
405 :    
406 :     =cut
407 :    
408 :     sub OutputContig {
409 :     # Get the parameters.
410 :     my ($self, $contigID, $contigLen) = @_;
411 :     # Get the sapling object.
412 :     my $sap = $self->{sap};
413 :     # Connect the contig to the genome.
414 :     $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
415 :     # Output the contig record.
416 :     $sap->InsertObject('Contig', id => $contigID, length => $contigLen);
417 :     # Record the contig.
418 :     $self->{stats}->Add(contigs => 1);
419 :     $self->{stats}->Add(dna => $contigLen);
420 :     }
421 :    
422 :     =head3 LoadFeatures
423 :    
424 :     $loaderObject->LoadFeatures();
425 :    
426 :     Load the feature data into the database. This includes all of the features,
427 :     their protein sequences, and their subsystem information. The number of
428 :     features of each type will be stored in the statistics object, identified by
429 :     the feature type.
430 :    
431 :     =cut
432 :    
433 :     sub LoadFeatures {
434 :     # Get the parameters.
435 :     my ($self) = @_;
436 : parrello 1.7 # Read in the functional assignments.
437 : parrello 1.12 Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
438 : parrello 1.7 my $assignHash = $self->ReadAssignments();
439 : parrello 1.11 $self->{assignHash} = $assignHash;
440 : parrello 1.1 # Get the directory of feature types.
441 :     my $featureDir = "$self->{directory}/Features";
442 :     my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
443 : parrello 1.11 # Check for protein sequences. If we have some, load them into a hash.
444 :     my $protHash = {};
445 :     if (-f "$featureDir/peg/fasta") {
446 : parrello 1.12 Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
447 : parrello 1.11 $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
448 :     }
449 : parrello 1.1 # Create the feature records for the types found.
450 :     for my $type (@types) {
451 :     # Insure this is a genuine feature directory.
452 :     if (-f "$featureDir/$type/tbl") {
453 : parrello 1.11 # Yes. Read in the evidence codes (if any).
454 :     my $evHash = {};
455 :     my $tranFile = "$featureDir/$type/Attributes/transaction_log";
456 :     if (-f $tranFile) {
457 :     $evHash = $self->LoadEvidenceCodes($tranFile);
458 :     }
459 :     # Now load the feature data.
460 :     $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
461 : parrello 1.1 }
462 :     }
463 : parrello 1.11 }
464 :    
465 :     =head3 LoadEvidenceCodes
466 :    
467 :     my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
468 :    
469 :     Load the evidence codes from the specified attribute transaction log file into a
470 :     hash. The log file is in tab-delimited format. The first column contains the
471 :     transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
472 :     the third an attribute name (we'll ignore everything but C<evidence_code>), and
473 :     the fourth the attribute value.
474 :    
475 :     =over 4
476 :    
477 :     =item attributeFile
478 :    
479 :     Name of the attribute transaction log file.
480 :    
481 :     =item RETURN
482 :    
483 :     Returns a reference to a hash mapping each feature ID to a comma-delimited list
484 :     of its evidence codes.
485 :    
486 :     =back
487 :    
488 :     =cut
489 :    
490 :     sub LoadEvidenceCodes {
491 :     # Get the parameters.
492 :     my ($self, $attributeFile) = @_;
493 :     # Get the Sapling database.
494 :     my $sap = $self->{sap};
495 :     # Get the statistics object.
496 :     my $stats = $self->{stats};
497 :     # Get the assignment hash: we use this to filter the feature IDs.
498 :     my $assignHash = $self->{assignHash};
499 :     # Open the attribute log file for input.
500 :     my $ih = Open(undef, "<$attributeFile");
501 :     # This two-dimensional hash will hold the evidence codes for each feature.
502 :     my %retVal;
503 :     # Loop through the attribute log file.
504 :     while (! eof $ih) {
505 :     # Get the current attribute record.
506 :     my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
507 :     $stats->Add(attributeLine => 1);
508 :     # Insure we have all the pieces we need.
509 :     if (! $command || ! $fid || $key ne 'evidence_code') {
510 :     $stats->Add(attributeLineSkipped => 1);
511 :     } elsif (! $assignHash->{$fid}) {
512 :     # Here the attribute is for a deleted feature.
513 :     $stats->Add(attributeFidSkipped => 1);
514 :     } else {
515 :     # Get the sub-hash for this feature.
516 :     if (! exists $retVal{$fid}) {
517 :     $retVal{$fid} = {};
518 :     }
519 :     my $featureSubHash = $retVal{$fid};
520 :     # Process according to the command.
521 :     if ($command eq 'ADD') {
522 :     # Here we are adding a new evidence code.
523 :     $featureSubHash->{$value} = 1;
524 :     $stats->Add(attributeAdd => 1);
525 :     } elsif ($command eq 'DELETE') {
526 :     # Here we are deleting an evidence code.
527 :     delete $featureSubHash->{$value};
528 :     $stats->Add(attributeDelete => 1);
529 :     } else {
530 :     # here we have an unrecognized command.
531 :     $stats->Add(attributeCommandSkip => 1);
532 :     }
533 :     }
534 : parrello 1.1 }
535 : parrello 1.11 # Loop through the hash, converting each sub-hash to a comma-delimited list of
536 :     # evidence codes.
537 :     for my $fid (keys %retVal) {
538 :     $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
539 : parrello 1.6 }
540 : parrello 1.11 # Return the evidence hash.
541 :     return \%retVal;
542 : parrello 1.1 }
543 :    
544 : parrello 1.11
545 : parrello 1.1 =head3 LoadFeatureData
546 :    
547 : parrello 1.11 $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
548 : parrello 1.1
549 :     Load the basic data for each feature into the database. The number of features of
550 :     the type found will be recorded in the statistics object.
551 :    
552 :     =over 4
553 :    
554 :     =item featureDir
555 :    
556 :     Name of the main directory containing all the feature type subdirectories.
557 :    
558 :     =item type
559 :    
560 :     Type of feature to load.
561 :    
562 : parrello 1.11 =item protHash
563 :    
564 :     Reference to a hash mapping each feature ID for a protein-encoding gene to
565 :     its protein sequence.
566 :    
567 :     =item evHash
568 : parrello 1.7
569 : parrello 1.11 Reference to a hash mapping each feature ID to a comma-delimited list of
570 :     its evidence codes (if any).
571 : parrello 1.7
572 : parrello 1.1 =back
573 :    
574 :     =cut
575 :    
576 :     sub LoadFeatureData {
577 :     # Get the parameters.
578 : parrello 1.11 my ($self, $featureDir, $type, $protHash, $evHash) = @_;
579 : parrello 1.1 # Get the sapling database.
580 :     my $sap = $self->{sap};
581 :     # Get the statistics object.
582 :     my $stats = $self->{stats};
583 : parrello 1.11 # Get the assignment hash. This tells us our functional assignments. This method is
584 :     # also where we remove the deleted features from it.
585 :     my $assignHash = $self->{assignHash};
586 : parrello 1.1 # This hash will track the features we've created. If a feature is found a second
587 :     # time, it overwrites the original.
588 : parrello 1.11 my %fidHash;
589 : parrello 1.6 # Finally, we need the timestamp hash. The initial feature population
590 : parrello 1.1 # Insure we have a tbl file for this feature type.
591 :     my $fileName = "$featureDir/$type/tbl";
592 : olson 1.15 my %deleted_features;
593 : parrello 1.1 if (-f $fileName) {
594 : parrello 1.3 # We have one, so we can read through it. First, however, we need to get the list
595 : parrello 1.11 # of deleted features and remove them from the assignment hash. This insures
596 :     # that they are not used by subsequent methods.
597 : parrello 1.3 my $deleteFile = "$featureDir/$type/deleted.features";
598 :     if (-f $deleteFile) {
599 : parrello 1.11 my $dh = Open(undef, "<$deleteFile");
600 :     while (! eof $dh) {
601 :     my ($deletedFid) = Tracer::GetLine($dh);
602 :     if (exists $assignHash->{$deletedFid}) {
603 :     delete $assignHash->{$deletedFid};
604 :     $stats->Add(deletedFid => 1);
605 : olson 1.15 $deleted_features{$deletedFid} = 1;
606 : parrello 1.11 }
607 :     }
608 : parrello 1.3 }
609 :     # Open the main file for input.
610 : parrello 1.12 Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
611 : parrello 1.1 my $ih = Open(undef, "<$fileName");
612 :     while (! eof $ih) {
613 :     # Read this feature's information.
614 :     my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
615 : parrello 1.3 # Only proceed if the feature is NOT deleted.
616 : olson 1.15 if (!$deleted_features{$fid}) {
617 : parrello 1.3 # If the feature already exists, delete it. (This should be extremely rare.)
618 : parrello 1.11 if ($fidHash{$fid}) {
619 : parrello 1.3 $sap->Delete(Feature => $fid);
620 :     $stats->Add(duplicateFid => 1);
621 : parrello 1.1 }
622 : parrello 1.11 # If this is RNA, the alias list is always empty. Sometimes, the functional
623 :     # assignment is found there.
624 :     if ($type eq 'rna') {
625 :     if (! $assignHash->{$fid}) {
626 :     $assignHash->{$fid} = $aliases[0];
627 : parrello 1.2 }
628 : parrello 1.11 @aliases = ();
629 : parrello 1.2 }
630 : parrello 1.11 # Add the feature to the database.
631 :     $self->AddFeature($fid, $assignHash->{$fid}, $locations, \@aliases,
632 :     $protHash->{$fid}, $evHash->{$fid});
633 :     # Denote we've added this feature, so that if a duplicate occurs we're ready.
634 :     $fidHash{$fid} = 1;
635 : parrello 1.2 }
636 : parrello 1.1 }
637 :     }
638 :     }
639 :    
640 : parrello 1.11
641 : parrello 1.1 =head3 LoadProteinData
642 :    
643 : parrello 1.11 my $protHash = $self->LoadProteinData($fileName);
644 : parrello 1.1
645 :     Load the protein sequences from the named FASTA file. The sequences will be stored
646 : parrello 1.11 in a hash by FIG feature ID.
647 : parrello 1.1
648 :     =over 4
649 :    
650 :     =item fileName
651 :    
652 :     Name of the FASTA file containing the protein sequences for this genome.
653 :    
654 : parrello 1.11 =item RETURN
655 :    
656 :     Returns a hash mapping feature IDs to protein sequences.
657 :    
658 : parrello 1.1 =back
659 :    
660 :     =cut
661 :    
662 :     sub LoadProteinData {
663 :     # Get the parameters.
664 :     my ($self, $fileName) = @_;
665 :     # Open the FASTA file for input.
666 :     my $ih = Open(undef, "<$fileName");
667 : parrello 1.11 # Create the return hash.
668 :     my $retVal = {};
669 : parrello 1.1 # We'll track the current protein in here.
670 :     my $fid;
671 :     my $sequence = "";
672 :     # Loop through the input file.
673 :     while (! eof $ih) {
674 :     # Get the current record.
675 :     my $line = <$ih>; chomp $line;
676 :     # Is this a label record.
677 :     if ($line =~ /^>(\S+)/) {
678 :     # Yes. Save the new feature ID.
679 :     my $newFid = $1;
680 :     # Do we have an existing protein?
681 :     if (defined $fid) {
682 : parrello 1.11 # Yes. Store it in the hash.
683 :     $retVal->{$fid} = $sequence;
684 : parrello 1.1 }
685 :     # Initialize for the next protein.
686 :     $fid = $newFid;
687 :     $sequence = "";
688 :     } else {
689 :     # Here we have more letters for the current protein.
690 :     $sequence .= $line;
691 :     }
692 :     }
693 :     # Do we have a residual protein.
694 :     if (defined $fid) {
695 : parrello 1.11 # Yes. Store it in the hash.
696 :     $retVal->{$fid} = $sequence;
697 : parrello 1.1 }
698 : parrello 1.11 # Return the hash.
699 :     return $retVal;
700 : parrello 1.1 }
701 :    
702 : parrello 1.6
703 :     =head3 LoadAnnotations
704 :    
705 :     $loaderObject->LoadAnnotations($fileName);
706 :    
707 :     Read in the annotation history information and use it to create annotation records.
708 :    
709 :     =over 4
710 :    
711 :     =item fileName
712 :    
713 :     Name of the annotation history file. This file is formatted with four fields per
714 :     record. Each field is on a separate line, with a double slash (C<//>) used as the
715 :     line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
716 :     (formatted as an integer), (2) the user name, and (3) the annotation text.
717 :    
718 :     =back
719 :    
720 :     =cut
721 :    
722 :     sub LoadAnnotations {
723 :     # Get the parameters.
724 :     my ($self, $fileName) = @_;
725 : parrello 1.11 # Get the assignment Hash. We use this to filter out deleted features.
726 :     my $assignHash = $self->{assignHash};
727 : parrello 1.6 # Get the Sapling database.
728 :     my $sap = $self->{sap};
729 :     # Get the statistics object.
730 :     my $stats = $self->{stats};
731 :     # Open the input file.
732 :     my $ih = Tracer::Open(undef, "<$fileName");
733 :     # Loop through the input.
734 :     while (! eof $ih) {
735 :     # Read in the peg, timestamp, and user ID.
736 :     my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
737 : parrello 1.11 # Only proceed if the feature is not deleted.
738 :     if ($assignHash->{$fid}) {
739 :     # Add the annotation to this feature.
740 :     $self->MakeAnnotation($fid, $text, $user, $timestamp);
741 : parrello 1.6 }
742 :     }
743 :     }
744 :    
745 :    
746 : parrello 1.1 =head3 WriteProtein
747 :    
748 :     $loaderObject->WriteProtein($fid, $sequence);
749 :    
750 :     Write out the specified protein sequence and associate it with the specified feature.
751 :     This requires checking to see if the protein sequence is already in the database.
752 :    
753 :     =over 4
754 :    
755 :     =item fid
756 :    
757 :     ID of the feature with the specified protein sequence.
758 :    
759 :     =item sequence
760 :    
761 :     Protein sequence for the indicated feature.
762 :    
763 :     =back
764 :    
765 :     =cut
766 :    
767 :     sub WriteProtein {
768 :     # Get the parameters.
769 :     my ($self, $fid, $sequence) = @_;
770 :     # Compute the key of the protein sequence.
771 : parrello 1.8 my $protID = $self->{sap}->ProteinID($sequence);
772 : parrello 1.1 # Insure the protein exists.
773 :     $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
774 :     # Connect the feature to it.
775 :     $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
776 :     }
777 :    
778 :     =head3 LoadSubsystems
779 :    
780 :     $loaderObject->LoadSubsystems();
781 :    
782 :     Load the subsystem data into the database. This includes all of the subsystems,
783 :     their categories, roles, and the bindings that connect them to this genome's features.
784 :    
785 :     =cut
786 :    
787 :     sub LoadSubsystems {
788 :     # Get the parameters.
789 :     my ($self) = @_;
790 : olson 1.15
791 :     #
792 :     # If we are running in disconnected mode, do not actually load subsystems.
793 :     # They rely too much on information from the external sapling.
794 :     #
795 :     if ($self->{disconnected})
796 :     {
797 :     return;
798 :     }
799 :    
800 : parrello 1.1 # Get the sapling object.
801 :     my $sap = $self->{sap};
802 :     # Get the statistics object.
803 :     my $stats = $self->{stats};
804 :     # Get the genome ID.
805 :     my $genome = $self->{genome};
806 :     # Connect to the Sapling server so we can get the subsystem variant and role information.
807 :     my $sapO = SAPserver->new();
808 :     # This hash maps subsystem IDs to molecular machine IDs.
809 :     my %machines;
810 :     # This hash maps subsystem/role pairs to machine role IDs.
811 :     my %machineRoles;
812 :     # The first step is to create the subsystems themselves and connect them to the
813 :     # genome. A list is given in the subsystems file of the Subsystems directory.
814 :     my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
815 :     # Create the field hash for the subsystem query.
816 :     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)
817 :     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)
818 :     Subsystem(version)
819 :     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)
820 :     Includes(sequence)
821 :     Role(id) Role(hypothetical) Role(role-index));
822 :     my %fields = map { $_ => $_ } @fields;
823 :     # Loop through the subsystems in the file, insuring we have them in the database.
824 :     while (! eof $ih) {
825 :     # Get this subsystem.
826 :     my ($subsystem, $variant) = Tracer::GetLine($ih);
827 : parrello 1.12 Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
828 : parrello 1.1 # Normalize the subsystem name.
829 :     $subsystem = $sap->SubsystemID($subsystem);
830 :     # Compute this subsystem's MD5.
831 :     my $subsystemMD5 = ERDB::DigestKey($subsystem);
832 :     # Insure the subsystem is in the database.
833 :     if (! $sap->Exists(Subsystem => $subsystem)) {
834 :     # The subsystem isn't present, so we need to read it. We get the subsystem,
835 :     # its roles, and its classification information. The variant will be added
836 :     # later if we need it.
837 :     my $roleList = $sapO->get(-objects => "Subsystem Includes Role",
838 :     -filter => {"Subsystem(id)" => ["=", $subsystem] },
839 :     -fields => \%fields);
840 :     # Only proceed if we found some roles.
841 :     if (@$roleList > 0) {
842 :     # Get the subsystem information from the first role and create the subsystem.
843 :     my $roleH = $roleList->[0];
844 : parrello 1.3 my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
845 : parrello 1.1 $sap->InsertObject('Subsystem', %subFields);
846 : parrello 1.10 $stats->Add(subsystems => 1);
847 : parrello 1.1 # Now loop through the roles. The Includes records are always inserted, but the
848 :     # roles are only inserted if they don't already exist.
849 :     for $roleH (@$roleList) {
850 :     # Create the Includes record.
851 : parrello 1.3 my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);
852 : parrello 1.1 $sap->InsertObject('Includes', %incFields);
853 :     # Insure we have the role in place.
854 : parrello 1.3 my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);
855 : parrello 1.1 my $roleID = $roleFields{id};
856 :     delete $roleFields{id};
857 :     $self->InsureEntity('Role', $roleID, %roleFields);
858 :     # Compute the machine-role ID.
859 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
860 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
861 : parrello 1.10 $stats->Add(subsystemRoles => 1);
862 : parrello 1.1 }
863 :     }
864 :     } else {
865 :     # We already have the subsystem in the database, but we still need to compute the
866 :     # machine role IDs. We need information from the sapling server for this.
867 :     my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);
868 :     # Loop through the roles.
869 :     my $roleList = $subHash->{$subsystem};
870 :     for my $roleTuple (@$roleList) {
871 :     my ($roleID, $abbr) = @$roleTuple;
872 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
873 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
874 :     }
875 :     }
876 :     # Now we need to connect the variant to the subsystem. First, we compute the real
877 :     # variant code by removing the asterisks.
878 :     my $variantCode = $variant;
879 :     $variantCode =~ s/^\*+//;
880 :     $variantCode =~ s/\*+$//;
881 :     # Compute the variant key.
882 :     my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
883 :     # Insure we have it in the database.
884 : parrello 1.10 if (! $sap->Exists(Variant => $variantKey)) {
885 :     # Get the variant from the sapling server.
886 :     my $variantH = $sapO->get(-objects => "Variant",
887 :     -filter => {"Variant(id)" => ["=", $variantKey]},
888 :     -fields => {"Variant(code)" => "code",
889 :     "Variant(comment)" => "comment",
890 :     "Variant(type)" => "type",
891 :     "Variant(role-rule)" => "role-rule"},
892 :     -firstOnly => 1);
893 :     $self->InsureEntity('Variant', $variantKey, %$variantH);
894 :     # Connect it to the subsystem.
895 :     $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
896 :     $stats->Add(subsystemVariants => 1);
897 :     }
898 : parrello 1.1 # Now we create the molecular machine connecting this genome to the subsystem
899 :     # variant.
900 :     my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
901 :     $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
902 :     $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
903 :     $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID);
904 :     # Remember the machine ID.
905 :     $machines{$subsystem} = $machineID;
906 :     }
907 :     # Now we go through the bindings file. This file connects the subsystem roles to
908 :     # the molecular machines.
909 :     $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
910 :     # Loop through the bindings.
911 :     while (! eof $ih) {
912 :     # Get the binding data.
913 :     my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
914 :     # Normalize the subsystem name.
915 :     $subsystem = $sap->SubsystemID($subsystem);
916 :     # Compute the machine role.
917 :     my $machineRoleID = $machineRoles{$subsystem}{$role};
918 :     # Insure it exists.
919 :     my $created = $self->InsureEntity(MachineRole => $machineRoleID);
920 :     if ($created) {
921 :     # We created the machine role, so connect it to the machine.
922 :     my $machineID = $machines{$subsystem};
923 :     $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID);
924 :     # Connect it to the role, too.
925 :     $sap->InsertObject('IsRoleOf', from_link => $role, to_link => $machineRoleID);
926 :     }
927 :     # Connect the feature.
928 :     $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
929 :     }
930 :     }
931 :    
932 :     =head3 CreateGenome
933 :    
934 :     $loaderObject->CreateGenome();
935 :    
936 :     Create the genome record.
937 :    
938 :     =cut
939 :    
940 :     # This hash maps cache statistics to genome record fields.
941 :     use constant GENOME_FIELDS => {genome_name => 'scientific-name',
942 :     genome_domain => 'domain'};
943 :     # This hash maps domains to the prokaryotic flag.
944 :     use constant PROK_FLAG => {Bacteria => 1, Archaea => 1};
945 :    
946 :     sub CreateGenome {
947 :     # Get the parameters.
948 :     my ($self) = @_;
949 :     # Get the genome directory.
950 :     my $dir = $self->{directory};
951 :     # Get the sapling database.
952 :     my $sap = $self->{sap};
953 :     # We'll put the genome attributes in here.
954 :     my %fields;
955 :     # Check for a basic statistics file.
956 :     my $statsFile = "$dir/cache.basic_statistics";
957 :     if (-f $statsFile) {
958 :     # We have the statistics file, so read the major attributes from it.
959 :     my $ih = Open(undef, "<$statsFile");
960 :     while (! eof $ih) {
961 :     my ($key, $value) = Tracer::GetLine($ih);
962 :     my $fieldKey = GENOME_FIELDS->{$key};
963 :     if ($fieldKey) {
964 :     $fields{$fieldKey} = $value;
965 :     }
966 :     }
967 :     # Translate the domain.
968 :     $fields{domain} = ucfirst $fields{domain};
969 :     # Denote the genome is complete.
970 :     $fields{complete} = 1;
971 :     } else {
972 :     # Check to see if this genome is complete.
973 :     $fields{complete} = (-f "$dir/COMPLETE" ? 1 : 0);
974 :     # Get the genome name.
975 :     my $ih = Open(undef, "<$dir/GENOME");
976 :     my $line = <$ih>; chomp $line;
977 :     $fields{'scientific-name'} = $line;
978 :     # Get the taxonomy and extract the domain from it.
979 :     $ih = Open(undef, "<$dir/TAXONOMY");
980 :     ($fields{domain}) = split /;/, <$ih>, 2;
981 :     }
982 :     # Get the counts from the statistics object.
983 :     my $stats = $self->{stats};
984 :     $fields{contigs} = $stats->Ask('contigs');
985 :     $fields{'dna-size'} = $stats->Ask('dna');
986 :     $fields{pegs} = $stats->Ask('peg');
987 :     $fields{rnas} = $stats->Ask('rna');
988 : parrello 1.14 $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
989 : parrello 1.1 # Get the genetic code. The default is 11.
990 :     $fields{'genetic-code'} = 11;
991 :     my $geneticCodeFile = "$dir/GENETIC_CODE";
992 :     if (-f $geneticCodeFile) {
993 :     # There's a genetic code file, so we need to read it for the code.
994 :     my $ih = Open(undef, "<$geneticCodeFile");
995 :     ($fields{'genetic-code'}) = Tracer::GetLine($ih);
996 :     }
997 :     # Use the domain to determine whether or not the genome is prokaryotic.
998 :     $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
999 :     # Finally, add the genome ID.
1000 :     $fields{id} = $self->{genome};
1001 :     # Create the genome record.
1002 :     $sap->InsertObject('Genome', %fields);
1003 :     }
1004 :    
1005 :     =head3 ReadAssignments
1006 :    
1007 :     my $assignHash = $loaderObject->ReadAssignments();
1008 :    
1009 :     Return a hash mapping each feature ID to its functional assignment. This method
1010 :     essentially reads the B<assigned_functions> file into memory.
1011 :    
1012 :     =cut
1013 :    
1014 :     sub ReadAssignments {
1015 :     # Get the parameters.
1016 :     my ($self) = @_;
1017 :     # Create the return hash.
1018 :     my $retVal = {};
1019 :     # Loop through the assigned-functions file, storing results in the hash. Later
1020 :     # results will override earlier ones.
1021 :     my $ih = Open(undef, "<$self->{directory}/assigned_functions");
1022 :     while (! eof $ih) {
1023 :     my ($fid, $function) = Tracer::GetLine($ih);
1024 :     $retVal->{$fid} = $function;
1025 :     }
1026 :     # Return the hash built.
1027 :     return $retVal;
1028 :     }
1029 :    
1030 :    
1031 :     =head3 FixContig
1032 :    
1033 :     $loaderObject->FixContig(\$contigID);
1034 :    
1035 :     Insure that the specified contig ID contains the genome ID.
1036 :    
1037 :     =cut
1038 :    
1039 :     sub FixContig {
1040 :     my ($self, $contigID) = @_;
1041 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
1042 :     unless ($$contigID =~ /^\d+\.\d+\:/) {
1043 :     $$contigID = "$self->{genome}:$$contigID";
1044 :     }
1045 :     }
1046 :    
1047 :     =head3 ConnectLocation
1048 :    
1049 :     $loaderObject->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
1050 :    
1051 :     Connect the specified feature to the specified contig location.
1052 :    
1053 :     =over 4
1054 :    
1055 :     =item fid
1056 :    
1057 :     ID of the relevant feature.
1058 :    
1059 :     =item contig
1060 :    
1061 :     ID of the contig containing this segment of the feature (normalized).
1062 :    
1063 :     =item segment
1064 :    
1065 :     Ordinal number of this segment.
1066 :    
1067 :     =item left
1068 :    
1069 :     Location of the segment's leftmost base pair.
1070 :    
1071 :     =item dir
1072 :    
1073 :     Direction (strand) of the segment (C<+> or C<->).
1074 :    
1075 :     =item len
1076 :    
1077 :     Length of the segment (which must be no greater than the configured maximum).
1078 :    
1079 :     =back
1080 :    
1081 :     =cut
1082 :    
1083 :     sub ConnectLocation {
1084 :     # Get the parameters.
1085 :     my ($self, $fid, $contig, $segment, $left, $dir, $len) = @_;
1086 :     # Get the sapling database.
1087 :     my $sap = $self->{sap};
1088 :     # Create the relationship.
1089 :     $sap->InsertObject('IsLocatedIn', from_link => $fid, to_link => $contig,
1090 :     begin => $left, dir => $dir, len => $len,
1091 :     ordinal => $segment);
1092 :     # Record it in the statistics.
1093 :     $self->{stats}->Add(segment => 1);
1094 :     }
1095 :    
1096 : parrello 1.6 =head2 Internal Utility Methods
1097 :    
1098 :     =head3 ReadAnnotation
1099 :    
1100 :     my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1101 :    
1102 :     Read the next record from an annotation file. The next record must exist (that is, an
1103 :     end-of-file check should have been performed before calling this method).
1104 :    
1105 :     =over 4
1106 :    
1107 :     =item ih
1108 :    
1109 :     Open file handle for the annotation file.
1110 :    
1111 :     =item RETURN
1112 :    
1113 :     Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1114 :     timestamp, (2) the user ID, and (3) the annotation text.
1115 :    
1116 :     =back
1117 :    
1118 :     =cut
1119 :    
1120 :     sub ReadAnnotation {
1121 :     # Get the parameter.
1122 :     my ($ih) = @_;
1123 :     # Read the three fixed fields.
1124 :     my $fid = <$ih>; chomp $fid;
1125 :     my $timestamp = <$ih>; chomp $timestamp;
1126 :     my $user = <$ih>; chomp $user;
1127 :     # Loop through the lines of the text field.
1128 :     my $text = "";
1129 :     my $line = <$ih>;
1130 : olson 1.13 while (defined($line) && $line ne "//\n") {
1131 : parrello 1.6 $text .= $line;
1132 :     $line = <$ih>;
1133 :     }
1134 :     # Remove the trailing new-line from the text.
1135 :     chomp $text;
1136 :     # Return the fields.
1137 :     return ($fid, $timestamp, $user, $text);
1138 :     }
1139 :    
1140 : olson 1.13 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3