[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingGenomeLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 : parrello 1.2 use AliasAnalysis;
29 : parrello 1.3 use base qw(SaplingDataLoader);
30 : parrello 1.1
31 :     =head1 Sapling Genome Loader
32 :    
33 :     This package contains the methods used to load a genome into the Sapling from a genome
34 :     directory. This is not very efficient if you are trying to create a full database,
35 :     but is useful if you wish to add genomes one at a time. The information loaded will
36 :     include the basic genome data, the contigs and the DNA, subsystems, and the features.
37 :    
38 :     =head2 Main Methods
39 :    
40 :     =head3 Load
41 :    
42 :     my $stats = SaplingGenomeLoader::Load($sap, $genome, $directory);
43 :    
44 :     Load a genome from a genome directory into the sapling database.
45 :    
46 :     =over 4
47 :    
48 :     =item sap
49 :    
50 :     L<Sapling> object used to access the target database.
51 :    
52 :     =item genome
53 :    
54 :     ID of the genome being loaded.
55 :    
56 :     =item directory
57 :    
58 :     Name of the directory containing the genome information.
59 :    
60 :     =back
61 :    
62 :     =cut
63 :    
64 :     sub Load {
65 :     # Get the parameters.
66 :     my ($sap, $genome, $directory) = @_;
67 :     # Create the loader object.
68 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
69 :     # Load the contigs.
70 : parrello 1.2 Trace("Loading contigs for $genome.") if T(2);
71 : parrello 1.1 $loaderObject->LoadContigs();
72 :     # Load the features.
73 : parrello 1.2 Trace("Loading features for $genome.") if T(2);
74 : parrello 1.1 $loaderObject->LoadFeatures();
75 : parrello 1.7 # Check for annotation history. If we have it, load the history records into the
76 :     # database.
77 :     if (-f "$directory/annotations") {
78 :     Trace("Processing annotations.") if T(3);
79 :     $loaderObject->LoadAnnotations("$directory/annotations");
80 :     }
81 : parrello 1.1 # Load the subsystem bindings.
82 : parrello 1.2 Trace("Loading subsystems for $genome.") if T(2);
83 : parrello 1.1 $loaderObject->LoadSubsystems();
84 :     # Create the Genome record and taxonomy information.
85 : parrello 1.2 Trace("Creating root for $genome.") if T(2);
86 : parrello 1.1 $loaderObject->CreateGenome();
87 :     # Return the statistics.
88 :     return $loaderObject->{stats};
89 :     }
90 :    
91 :     =head3 ClearGenome
92 :    
93 :     my $stats = SaplingGenomeObject::ClearGenome($sap, $genome);
94 :    
95 :     Delete the specified genome and all the related records from the specified sapling
96 :     database. This method can also be used to clean up after a failed or aborted load.
97 :    
98 :     =over 4
99 :    
100 :     =item sap
101 :    
102 :     L<Sapling> object used to access the target database.
103 :    
104 :     =item genome
105 :    
106 :     ID of the genome to delete.
107 :    
108 :     =item RETURN
109 :    
110 :     Returns a statistics object counting the records deleted.
111 :    
112 :     =back
113 :    
114 :     =cut
115 :    
116 :     sub ClearGenome {
117 :     # Get the parameters.
118 :     my ($sap, $genome) = @_;
119 :     # Create the statistics object.
120 :     my $stats = Stats->new();
121 :     # Delete the DNA.
122 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');
123 : parrello 1.1 # Delete the contigs.
124 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
125 : parrello 1.1 # Delete the features.
126 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
127 : parrello 1.1 # Delete the molecular machines.
128 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
129 : parrello 1.1 # Delete the genome itself.
130 :     my $subStats = $sap->Delete(Genome => $genome);
131 :     # Accumulate the statistics from the delete.
132 :     $stats->Accumulate($subStats);
133 :     # Return the statistics object.
134 :     return $stats;
135 :     }
136 :    
137 : parrello 1.7
138 :     =head3 Process
139 :    
140 :     my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
141 :    
142 :     Load genome data from the specified directory. If the genome data already
143 :     exists in the database, it will be deleted first.
144 :    
145 :     =over 4
146 :    
147 :     =item sap
148 :    
149 :     L</Sapling> object for accessing the database.
150 :    
151 :     =item genome
152 :    
153 :     ID of the genome whose data is being loaded.
154 :    
155 :     =item directory
156 :    
157 :     Name of the directory containing the genome data files.
158 :    
159 :     =item RETURN
160 :    
161 :     Returns a statistics object describing the activity during the reload.
162 :    
163 :     =back
164 :    
165 :     =cut
166 :    
167 :     sub Process {
168 :     # Get the parameters.
169 :     my ($sap, $genome, $directory) = @_;
170 :     # Clear the existing data for the specified genome.
171 :     my $stats = ClearGenome($sap, $genome);
172 :     # Load the new expression data from the specified directory.
173 :     my $newStats = Load($sap, $genome, $directory);
174 :     # Merge the statistics.
175 :     $stats->Accumulate($newStats);
176 :     # Return the result.
177 :     return $stats;
178 :     }
179 :    
180 :    
181 : parrello 1.1 =head2 Loader Object Methods
182 :    
183 :     =head3 new
184 :    
185 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
186 :    
187 :     Create a loader object that can be used to facilitate loading genome data from a
188 :     directory.
189 :    
190 :     =over 4
191 :    
192 :     =item sap
193 :    
194 :     L<Sapling> object used to access the target database.
195 :    
196 :     =item genome
197 :    
198 :     ID of the genome being loaded.
199 :    
200 :     =item directory
201 :    
202 :     Name of the directory containing the genome information.
203 :    
204 :     =back
205 :    
206 :     The object created contains the following fields.
207 :    
208 :     =over 4
209 :    
210 :     =item directory
211 :    
212 :     Name of the directory containing the genome data.
213 :    
214 :     =item genome
215 :    
216 :     ID of the genome being loaded.
217 :    
218 :     =item supportRecords
219 :    
220 :     A hash of hashes, used to track the support records known to exist in the database.
221 :    
222 :     =item sap
223 :    
224 :     L<Sapling> object used to access the database.
225 :    
226 :     =item stats
227 :    
228 :     L<Stats> object for tracking statistical information about the load.
229 :    
230 : parrello 1.6 =item timestamps
231 :    
232 :     A hash of hashes, keyed by feature ID. The sub-hashes are keyed by annotation timestamp,
233 :     and used to prevent duplicate timestamps.
234 :    
235 : parrello 1.1 =back
236 :    
237 :     =cut
238 :    
239 :     sub new {
240 :     # Get the parameters.
241 :     my ($class, $sap, $genome, $directory) = @_;
242 :     # Create the object.
243 : parrello 1.3 my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
244 :     # Add our specialized data.
245 :     $retVal->{genome} = $genome;
246 :     $retVal->{directory} = $directory;
247 : parrello 1.6 $retVal->{timestamps} = {};
248 : parrello 1.3 # Return the result.
249 : parrello 1.1 return $retVal;
250 :     }
251 :    
252 :     =head3 LoadContigs
253 :    
254 :     $loaderObject->LoadContigs();
255 :    
256 :     Load the contig information into the database. This includes the contigs themselves and
257 :     the DNA. The number of contigs will be recorded as the C<contigs> statistic and the
258 :     number of base pairs as the C<dna> statistic.
259 :    
260 :     =cut
261 :    
262 :     sub LoadContigs {
263 :     # Get this object.
264 :     my ($self) = @_;
265 :     # Open the contigs file from the genome directory.
266 :     my $ih = Open(undef, "<$self->{directory}/contigs");
267 :     # Get the length of a DNA segment.
268 :     my $segmentLength = $self->{sap}->TuningParameter('maxSequenceLength');
269 :     # Get the genome ID.
270 :     my $genome = $self->{genome};
271 :     # These variables will contain the current contig ID, the current contig length,
272 :     # the ordinal of the current chunk, the accumulated DNA sequence, and its length.
273 :     my ($contigID, $contigLen, $ordinal, $chunk, $chunkLen) = (undef, 0, 0, '', 0);
274 :     # Loop through the contig file.
275 :     while (! eof $ih) {
276 :     # Get the current record.
277 :     my $line = <$ih>; chomp $line;
278 :     # Is this the start of a new contig?
279 :     if ($line =~ /^>(\S+)/) {
280 :     # Yes. Save the contig ID.
281 :     my $newContigID = $1;
282 :     # Is there a current contig?
283 :     if (defined $contigID) {
284 :     # Yes. Output the current chunk.
285 :     $self->OutputChunk($contigID, $ordinal, $chunk);
286 :     # Output the contig.
287 :     $self->OutputContig($contigID, $contigLen);
288 :     }
289 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
290 :     $self->FixContig(\$newContigID);
291 :     # Initialize the new contig.
292 :     $contigID = $newContigID;
293 :     $contigLen = 0;
294 :     $ordinal = 0;
295 :     $chunk = '';
296 :     $chunkLen = 0;
297 :     } else {
298 :     # Here we have more DNA in the current contig. Are we at the end of
299 :     # the current chunk?
300 :     my $lineLen = length($line);
301 :     my $newChunkLen = $chunkLen + $lineLen;
302 :     if ($newChunkLen < $segmentLength) {
303 :     # No. Add this line to the chunk.
304 :     $chunk .= $line;
305 :     $chunkLen += $lineLen;
306 :     } else {
307 :     # Yes. Create the actual chunk.
308 :     $chunk .= substr($line, 0, $segmentLength - $chunkLen);
309 :     # Write it out.
310 :     $self->OutputChunk($contigID, $ordinal, $chunk);
311 :     # Set up the new chunk.
312 :     $chunk = substr($line, $segmentLength - $chunkLen);
313 :     $ordinal++;
314 :     $chunkLen = length($chunk);
315 :     }
316 :     # Update the contig length.
317 :     $contigLen += $lineLen;
318 :     }
319 :     }
320 :     # Is there a current contig?
321 :     if (defined $contigID) {
322 :     # Yes. Output the residual chunk.
323 :     $self->OutputChunk($contigID, $ordinal, $chunk);
324 :     # Output the contig itself.
325 :     $self->OutputContig($contigID, $contigLen);
326 :     }
327 :     }
328 :    
329 :     =head3 OutputChunk
330 :    
331 :     $loaderObject->OutputChunk($contigID, $ordinal, $chunk);
332 :    
333 :     Output a chunk of DNA for the specified contig.
334 :    
335 :     =over 4
336 :    
337 :     =item contigID
338 :    
339 :     ID of the contig being output.
340 :    
341 :     =item ordinal
342 :    
343 :     Ordinal number of this chunk.
344 :    
345 :     =item chunk
346 :    
347 :     DNA sequence comprising this chunk.
348 :    
349 :     =back
350 :    
351 :     =cut
352 :    
353 :     sub OutputChunk {
354 :     # Get the parameters.
355 :     my ($self, $contigID, $ordinal, $chunk) = @_;
356 :     # Get the sapling object.
357 :     my $sap = $self->{sap};
358 :     # Compute the chunk ID.
359 :     my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
360 :     # Connect this sequence to the contig.
361 : parrello 1.2 $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
362 : parrello 1.1 # Create the DNA sequence.
363 :     $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
364 :     # Record the chunk.
365 :     $self->{stats}->Add(chunks => 1);
366 :     }
367 :    
368 :     =head3 OutputContig
369 :    
370 :     $loaderObject->OutputContig($contigID, $contigLen);
371 :    
372 :     Write out the current contig.
373 :    
374 :     =over 4
375 :    
376 :     =item contigID
377 :    
378 :     ID of the contig being written out.
379 :    
380 :     =item contigLen
381 :    
382 :     Length of the contig in base pairs.
383 :    
384 :     =back
385 :    
386 :     =cut
387 :    
388 :     sub OutputContig {
389 :     # Get the parameters.
390 :     my ($self, $contigID, $contigLen) = @_;
391 :     # Get the sapling object.
392 :     my $sap = $self->{sap};
393 :     # Connect the contig to the genome.
394 :     $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
395 :     # Output the contig record.
396 :     $sap->InsertObject('Contig', id => $contigID, length => $contigLen);
397 :     # Record the contig.
398 :     $self->{stats}->Add(contigs => 1);
399 :     $self->{stats}->Add(dna => $contigLen);
400 :     }
401 :    
402 :     =head3 LoadFeatures
403 :    
404 :     $loaderObject->LoadFeatures();
405 :    
406 :     Load the feature data into the database. This includes all of the features,
407 :     their protein sequences, and their subsystem information. The number of
408 :     features of each type will be stored in the statistics object, identified by
409 :     the feature type.
410 :    
411 :     =cut
412 :    
413 :     sub LoadFeatures {
414 :     # Get the parameters.
415 :     my ($self) = @_;
416 : parrello 1.7 # Read in the functional assignments.
417 :     Trace("Reading functional assignments.") if T(3);
418 :     my $assignHash = $self->ReadAssignments();
419 : parrello 1.1 # Get the directory of feature types.
420 :     my $featureDir = "$self->{directory}/Features";
421 :     my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
422 :     # Create the feature records for the types found.
423 :     for my $type (@types) {
424 :     # Insure this is a genuine feature directory.
425 :     if (-f "$featureDir/$type/tbl") {
426 :     # Yes, load the feature data.
427 : parrello 1.7 $self->LoadFeatureData($featureDir, $type, $assignHash);
428 : parrello 1.1 }
429 :     }
430 :     # Check for protein sequences. If we have some, load them into the database.
431 :     if (-f "$featureDir/peg/fasta") {
432 : parrello 1.7 Trace("Processing protein sequences.") if T(3);
433 : parrello 1.1 $self->LoadProteinData("$featureDir/peg/fasta");
434 :     }
435 : parrello 1.7 # Now loop through the features, connecting them to their roles. Note that deleted
436 :     # features will not be in the assignment hash.
437 :     Trace("Connecting features to roles.") if T(3);
438 :     for my $fid (keys %$assignHash) {
439 :     $self->ConnectFunctionRoles($fid, $assignHash->{$fid});
440 : parrello 1.6 }
441 : parrello 1.1 }
442 :    
443 :     =head3 LoadFeatureData
444 :    
445 : parrello 1.7 $loaderObject->LoadFeatureData($featureDir, $type, $assignHash);
446 : parrello 1.1
447 :     Load the basic data for each feature into the database. The number of features of
448 :     the type found will be recorded in the statistics object.
449 :    
450 :     =over 4
451 :    
452 :     =item featureDir
453 :    
454 :     Name of the main directory containing all the feature type subdirectories.
455 :    
456 :     =item type
457 :    
458 :     Type of feature to load.
459 :    
460 : parrello 1.7 =item assignHash
461 :    
462 :     Reference to a hash mapping each feature ID to its functional assignment.
463 :    
464 : parrello 1.1 =back
465 :    
466 :     =cut
467 :    
468 :     sub LoadFeatureData {
469 :     # Get the parameters.
470 : parrello 1.7 my ($self, $featureDir, $type, $assignHash) = @_;
471 : parrello 1.1 # Get the sapling database.
472 :     my $sap = $self->{sap};
473 :     # Get the maximum location segment length. We'll need this later.
474 :     my $maxLength = $sap->TuningParameter('maxLocationLength');
475 :     # Get the statistics object.
476 :     my $stats = $self->{stats};
477 :     # This hash will track the features we've created. If a feature is found a second
478 :     # time, it overwrites the original.
479 : parrello 1.6 my $fidHash = $self->{timestamps};
480 :     # Finally, we need the timestamp hash. The initial feature population
481 : parrello 1.1 # Insure we have a tbl file for this feature type.
482 :     my $fileName = "$featureDir/$type/tbl";
483 :     if (-f $fileName) {
484 : parrello 1.3 # We have one, so we can read through it. First, however, we need to get the list
485 :     # of deleted features.
486 :     my %deletedFids;
487 :     my $deleteFile = "$featureDir/$type/deleted.features";
488 :     if (-f $deleteFile) {
489 :     %deletedFids = map { $_ => 1 } Tracer::GetFile($deleteFile);
490 :     }
491 :     # Open the main file for input.
492 : parrello 1.7 Trace("Reading features from $fileName.") if T(3);
493 : parrello 1.1 my $ih = Open(undef, "<$fileName");
494 :     while (! eof $ih) {
495 :     # Read this feature's information.
496 :     my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
497 : parrello 1.3 # Only proceed if the feature is NOT deleted.
498 :     if (! exists $deletedFids{$fid}) {
499 :     # If the feature already exists, delete it. (This should be extremely rare.)
500 : parrello 1.6 if (exists $fidHash->{$fid}) {
501 : parrello 1.3 $sap->Delete(Feature => $fid);
502 :     $stats->Add(duplicateFid => 1);
503 : parrello 1.1 }
504 : parrello 1.3 # Otherwise connect this feature to the genome.
505 :     $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);
506 :     # Now we must parse the locations. This will contain a list of the location
507 :     # data 4-tuples (contig, start, dir, len).
508 :     my @locData;
509 :     # This will contain the total sequence length.
510 :     my $length = 0;
511 :     # Loop through the locations.
512 :     for my $loc (split /\s*,\s*/, $locations) {
513 :     # Get this location's components.
514 :     my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);
515 :     # Normalize the contig.
516 :     $self->FixContig(\$contig);
517 :     # Compute the direction.
518 :     if ($start <= $stop) {
519 :     # Here we have the forward strand.
520 :     my $len = $stop + 1 - $start;
521 :     push @locData, [$contig, $start, '+', $len];
522 :     $length += $len;
523 : parrello 1.1 } else {
524 : parrello 1.3 # Here we have the reverse strand.
525 :     my $len = $start + 1 - $stop;
526 :     push @locData, [$contig, $stop, '-', $len];
527 :     $length += $len;
528 : parrello 1.1 }
529 :     }
530 : parrello 1.3 # Now we can create the feature record.
531 :     $sap->InsertObject('Feature', id => $fid, feature_type => $type,
532 :     function => $assignHash->{$fid}, locked => 0,
533 :     sequence_length => $length);
534 : parrello 1.6 $fidHash->{$fid} = {};
535 : parrello 1.3 $stats->Add($type => 1);
536 :     # The next step is to connect the feature record to its locations. This
537 :     # involves dividing the location into segments. The following variable
538 :     # will count the total number of segments.
539 :     my $segment = 0;
540 :     # Loop through the locations.
541 :     for my $loc (@locData) {
542 :     # Get the current location's information.
543 :     my ($contig, $left, $dir, $len) = @$loc;
544 :     # Peel off any segments.
545 :     while ($len > $maxLength) {
546 :     # Process according to the direction.
547 :     if ($dir eq '+') {
548 :     # Forward strand: peel from the beginning.
549 :     $self->ConnectLocation($fid, $contig, $segment, $left, $dir,
550 :     $maxLength);
551 :     $left += $maxLength;
552 :     } else {
553 :     # Reverse strand: peel from the end.
554 :     $self->ConnectLocation($fid, $contig, $segment,
555 :     $left + $len - $maxLength, $dir,
556 :     $maxLength);
557 :     }
558 :     # Denote we've used up a segment.
559 :     $len -= $maxLength;
560 :     $segment++;
561 :     }
562 :     # Output the last segment.
563 :     $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
564 :     }
565 :     # Now we process the aliases and create the identifiers. We don't do this
566 :     # for RNA, because the RNA function is stored in the aliases.
567 :     if ($type ne 'rna') {
568 :     for my $alias (@aliases) {
569 :     my $normalized;
570 :     # Determine the type.
571 :     my $aliasType = AliasAnalysis::TypeOf($alias);
572 :     $stats->Add(aliasAll => 1);
573 :     # Is this a recognized type?
574 :     if ($aliasType) {
575 :     $stats->Add(aliasNormal => 1);
576 :     # Yes. Write it normally.
577 :     $self->CreateIdentifier($alias, B => $aliasType, $fid);
578 :     } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {
579 :     # No, but this is a specially-marked locus tag.
580 :     $normalized = $1;
581 :     $stats->Add(aliasLocus => 1);
582 :     $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
583 :     } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {
584 :     # No, but this is a natural locus tag.
585 :     $stats->Add(aliasLocus => 1);
586 :     $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
587 :     } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {
588 :     # No, but this is a natural gene name.
589 :     $stats->Add(aliasGene => 1);
590 :     $self->CreateIdentifier($normalized, B => 'GENE', $fid);
591 :     } elsif ($alias =~ /^\d+$/) {
592 :     # Here it's a naked number, which means it's a GI number
593 :     # of some sort.
594 :     $stats->Add(aliasGI => 1);
595 :     $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);
596 :     } elsif ($alias =~ /^protein_id\|(.+)/) {
597 :     # Here we have a REFSEQ protein ID. Right now we don't have a way to
598 :     # handle that, because we don't know the feature's protein ID here.
599 :     $stats->Add(aliasProtein => 1);
600 :     } elsif ($alias =~ /[:|]/) {
601 :     # Here it's an alias of an unknown type, so we skip it.
602 :     $stats->Add(aliasUnknown => 1);
603 :     } else {
604 :     # Here it's a miscellaneous type.
605 :     $stats->Add(aliasMisc => 1);
606 :     $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);
607 :     }
608 : parrello 1.2 }
609 :     }
610 :     }
611 : parrello 1.1 }
612 :     }
613 :     }
614 :    
615 :     =head3 LoadProteinData
616 :    
617 :     $self->LoadProteinData($fileName);
618 :    
619 :     Load the protein sequences from the named FASTA file. The sequences will be stored
620 :     in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature
621 :     records themselves will not be created (it is presumed they are already there).
622 :    
623 :     =over 4
624 :    
625 :     =item fileName
626 :    
627 :     Name of the FASTA file containing the protein sequences for this genome.
628 :    
629 :     =back
630 :    
631 :     =cut
632 :    
633 :     sub LoadProteinData {
634 :     # Get the parameters.
635 :     my ($self, $fileName) = @_;
636 :     # Open the FASTA file for input.
637 :     my $ih = Open(undef, "<$fileName");
638 :     # We'll track the current protein in here.
639 :     my $fid;
640 :     my $sequence = "";
641 :     # Loop through the input file.
642 :     while (! eof $ih) {
643 :     # Get the current record.
644 :     my $line = <$ih>; chomp $line;
645 :     # Is this a label record.
646 :     if ($line =~ /^>(\S+)/) {
647 :     # Yes. Save the new feature ID.
648 :     my $newFid = $1;
649 :     # Do we have an existing protein?
650 :     if (defined $fid) {
651 :     # Yes. Write it out.
652 :     $self->WriteProtein($fid, $sequence);
653 :     }
654 :     # Initialize for the next protein.
655 :     $fid = $newFid;
656 :     $sequence = "";
657 :     } else {
658 :     # Here we have more letters for the current protein.
659 :     $sequence .= $line;
660 :     }
661 :     }
662 :     # Do we have a residual protein.
663 :     if (defined $fid) {
664 :     # Yes. Write it out.
665 :     $self->WriteProtein($fid, $sequence);
666 :     }
667 :     }
668 :    
669 : parrello 1.6
670 :     =head3 LoadAnnotations
671 :    
672 :     $loaderObject->LoadAnnotations($fileName);
673 :    
674 :     Read in the annotation history information and use it to create annotation records.
675 :    
676 :     =over 4
677 :    
678 :     =item fileName
679 :    
680 :     Name of the annotation history file. This file is formatted with four fields per
681 :     record. Each field is on a separate line, with a double slash (C<//>) used as the
682 :     line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
683 :     (formatted as an integer), (2) the user name, and (3) the annotation text.
684 :    
685 :     =back
686 :    
687 :     =cut
688 :    
689 :     sub LoadAnnotations {
690 :     # Get the parameters.
691 :     my ($self, $fileName) = @_;
692 :     # Get the timestamp hash.
693 :     my $timeHash = $self->{timestamps};
694 :     # Get the Sapling database.
695 :     my $sap = $self->{sap};
696 :     # Get the statistics object.
697 :     my $stats = $self->{stats};
698 :     # Open the input file.
699 :     my $ih = Tracer::Open(undef, "<$fileName");
700 :     # Loop through the input.
701 :     while (! eof $ih) {
702 :     # Read in the peg, timestamp, and user ID.
703 :     my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
704 :     # Only proceed if the feature exists.
705 :     if (! exists $timeHash->{$fid}) {
706 :     $stats->Add(skippedAnnotation => 1);
707 :     } else {
708 :     # Change assignments by the master user to FIG assignments.
709 :     $text =~ s/Set master function/Set FIG function/s;
710 :     # Insure the time stamp is valid.
711 :     if ($timestamp =~ /^\d+$/) {
712 :     # Here it's a number. We need to insure the one we use to form
713 :     # the key is unique.
714 :     my $keyStamp = $timestamp;
715 :     while ($timeHash->{$fid}{$keyStamp}) {
716 :     $keyStamp++;
717 :     $stats->Add(skippedStamp => 1);
718 :     }
719 :     # Form the annotation ID.
720 : parrello 1.7 my $annotationID = SaplingDataLoader::ComputeAnnotationID($fid, $keyStamp);
721 : parrello 1.6 $timeHash->{$fid}{$keyStamp} = 1;
722 :     # Generate the annotation.
723 :     $sap->InsertObject('IsAnnotatedBy', from_link => $fid, to_link => $annotationID);
724 :     $sap->InsertObject('Annotation', id => $annotationID, annotation_time => $timestamp,
725 :     comment => $text, annotator => $user);
726 :     } else {
727 :     # Here we have an invalid time stamp.
728 :     Trace("Invalid time stamp \"$timestamp\" in annotations for $fid.") if T(1);
729 :     }
730 :     }
731 :     }
732 :     }
733 :    
734 :    
735 : parrello 1.1 =head3 WriteProtein
736 :    
737 :     $loaderObject->WriteProtein($fid, $sequence);
738 :    
739 :     Write out the specified protein sequence and associate it with the specified feature.
740 :     This requires checking to see if the protein sequence is already in the database.
741 :    
742 :     =over 4
743 :    
744 :     =item fid
745 :    
746 :     ID of the feature with the specified protein sequence.
747 :    
748 :     =item sequence
749 :    
750 :     Protein sequence for the indicated feature.
751 :    
752 :     =back
753 :    
754 :     =cut
755 :    
756 :     sub WriteProtein {
757 :     # Get the parameters.
758 :     my ($self, $fid, $sequence) = @_;
759 :     # Compute the key of the protein sequence.
760 : parrello 1.8 my $protID = $self->{sap}->ProteinID($sequence);
761 : parrello 1.1 # Insure the protein exists.
762 :     $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
763 :     # Connect the feature to it.
764 :     $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
765 :     }
766 :    
767 :     =head3 LoadSubsystems
768 :    
769 :     $loaderObject->LoadSubsystems();
770 :    
771 :     Load the subsystem data into the database. This includes all of the subsystems,
772 :     their categories, roles, and the bindings that connect them to this genome's features.
773 :    
774 :     =cut
775 :    
776 :     sub LoadSubsystems {
777 :     # Get the parameters.
778 :     my ($self) = @_;
779 :     # Get the sapling object.
780 :     my $sap = $self->{sap};
781 :     # Get the statistics object.
782 :     my $stats = $self->{stats};
783 :     # Get the genome ID.
784 :     my $genome = $self->{genome};
785 :     # Connect to the Sapling server so we can get the subsystem variant and role information.
786 :     my $sapO = SAPserver->new();
787 :     # This hash maps subsystem IDs to molecular machine IDs.
788 :     my %machines;
789 :     # This hash maps subsystem/role pairs to machine role IDs.
790 :     my %machineRoles;
791 :     # The first step is to create the subsystems themselves and connect them to the
792 :     # genome. A list is given in the subsystems file of the Subsystems directory.
793 :     my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
794 :     # Create the field hash for the subsystem query.
795 :     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)
796 :     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)
797 :     Subsystem(version)
798 :     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)
799 :     Includes(sequence)
800 :     Role(id) Role(hypothetical) Role(role-index));
801 :     my %fields = map { $_ => $_ } @fields;
802 :     # Loop through the subsystems in the file, insuring we have them in the database.
803 :     while (! eof $ih) {
804 :     # Get this subsystem.
805 :     my ($subsystem, $variant) = Tracer::GetLine($ih);
806 :     # Normalize the subsystem name.
807 :     $subsystem = $sap->SubsystemID($subsystem);
808 :     # Compute this subsystem's MD5.
809 :     my $subsystemMD5 = ERDB::DigestKey($subsystem);
810 :     # Insure the subsystem is in the database.
811 :     if (! $sap->Exists(Subsystem => $subsystem)) {
812 :     # The subsystem isn't present, so we need to read it. We get the subsystem,
813 :     # its roles, and its classification information. The variant will be added
814 :     # later if we need it.
815 :     my $roleList = $sapO->get(-objects => "Subsystem Includes Role",
816 :     -filter => {"Subsystem(id)" => ["=", $subsystem] },
817 :     -fields => \%fields);
818 :     # Only proceed if we found some roles.
819 :     if (@$roleList > 0) {
820 :     # Get the subsystem information from the first role and create the subsystem.
821 :     my $roleH = $roleList->[0];
822 : parrello 1.3 my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
823 : parrello 1.1 $sap->InsertObject('Subsystem', %subFields);
824 :     # Now loop through the roles. The Includes records are always inserted, but the
825 :     # roles are only inserted if they don't already exist.
826 :     for $roleH (@$roleList) {
827 :     # Create the Includes record.
828 : parrello 1.3 my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);
829 : parrello 1.1 $sap->InsertObject('Includes', %incFields);
830 :     # Insure we have the role in place.
831 : parrello 1.3 my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);
832 : parrello 1.1 my $roleID = $roleFields{id};
833 :     delete $roleFields{id};
834 :     $self->InsureEntity('Role', $roleID, %roleFields);
835 :     # Compute the machine-role ID.
836 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
837 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
838 :     }
839 :     }
840 :     } else {
841 :     # We already have the subsystem in the database, but we still need to compute the
842 :     # machine role IDs. We need information from the sapling server for this.
843 :     my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);
844 :     # Loop through the roles.
845 :     my $roleList = $subHash->{$subsystem};
846 :     for my $roleTuple (@$roleList) {
847 :     my ($roleID, $abbr) = @$roleTuple;
848 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
849 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
850 :     }
851 :     }
852 :     # Now we need to connect the variant to the subsystem. First, we compute the real
853 :     # variant code by removing the asterisks.
854 :     my $variantCode = $variant;
855 :     $variantCode =~ s/^\*+//;
856 :     $variantCode =~ s/\*+$//;
857 :     # Compute the variant key.
858 :     my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
859 :     # Get the variant from the sapling server.
860 :     my $variantH = $sapO->get(-objects => "Variant",
861 :     -filter => {"Variant(id)" => ["=", $variantKey]},
862 :     -fields => {"Variant(code)" => "code",
863 :     "Variant(comment)" => "comment",
864 :     "Variant(type)" => "type",
865 :     "Variant(role-rule)" => "role-rule"},
866 :     -firstOnly => 1);
867 :     # Insure we have it in the database.
868 :     $self->InsureEntity('Variant', $variantKey, %$variantH);
869 :     $stats->Add(subsystems => 1);
870 :     # Connect it to the subsystem.
871 :     $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
872 :     # Now we create the molecular machine connecting this genome to the subsystem
873 :     # variant.
874 :     my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
875 :     $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
876 :     $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
877 :     $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID);
878 :     # Remember the machine ID.
879 :     $machines{$subsystem} = $machineID;
880 :     }
881 :     # Now we go through the bindings file. This file connects the subsystem roles to
882 :     # the molecular machines.
883 :     $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
884 :     # Loop through the bindings.
885 :     while (! eof $ih) {
886 :     # Get the binding data.
887 :     my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
888 :     # Normalize the subsystem name.
889 :     $subsystem = $sap->SubsystemID($subsystem);
890 :     # Compute the machine role.
891 :     my $machineRoleID = $machineRoles{$subsystem}{$role};
892 :     # Insure it exists.
893 :     my $created = $self->InsureEntity(MachineRole => $machineRoleID);
894 :     if ($created) {
895 :     # We created the machine role, so connect it to the machine.
896 :     my $machineID = $machines{$subsystem};
897 :     $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID);
898 :     # Connect it to the role, too.
899 :     $sap->InsertObject('IsRoleOf', from_link => $role, to_link => $machineRoleID);
900 :     }
901 :     # Connect the feature.
902 :     $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
903 :     }
904 :     }
905 :    
906 :     =head3 CreateGenome
907 :    
908 :     $loaderObject->CreateGenome();
909 :    
910 :     Create the genome record.
911 :    
912 :     =cut
913 :    
914 :     # This hash maps cache statistics to genome record fields.
915 :     use constant GENOME_FIELDS => {genome_name => 'scientific-name',
916 :     genome_domain => 'domain'};
917 :     # This hash maps domains to the prokaryotic flag.
918 :     use constant PROK_FLAG => {Bacteria => 1, Archaea => 1};
919 :    
920 :     sub CreateGenome {
921 :     # Get the parameters.
922 :     my ($self) = @_;
923 :     # Get the genome directory.
924 :     my $dir = $self->{directory};
925 :     # Get the sapling database.
926 :     my $sap = $self->{sap};
927 :     # We'll put the genome attributes in here.
928 :     my %fields;
929 :     # Check for a basic statistics file.
930 :     my $statsFile = "$dir/cache.basic_statistics";
931 :     if (-f $statsFile) {
932 :     # We have the statistics file, so read the major attributes from it.
933 :     my $ih = Open(undef, "<$statsFile");
934 :     while (! eof $ih) {
935 :     my ($key, $value) = Tracer::GetLine($ih);
936 :     my $fieldKey = GENOME_FIELDS->{$key};
937 :     if ($fieldKey) {
938 :     $fields{$fieldKey} = $value;
939 :     }
940 :     }
941 :     # Translate the domain.
942 :     $fields{domain} = ucfirst $fields{domain};
943 :     # Denote the genome is complete.
944 :     $fields{complete} = 1;
945 :     } else {
946 :     # Check to see if this genome is complete.
947 :     $fields{complete} = (-f "$dir/COMPLETE" ? 1 : 0);
948 :     # Get the genome name.
949 :     my $ih = Open(undef, "<$dir/GENOME");
950 :     my $line = <$ih>; chomp $line;
951 :     $fields{'scientific-name'} = $line;
952 :     # Get the taxonomy and extract the domain from it.
953 :     $ih = Open(undef, "<$dir/TAXONOMY");
954 :     ($fields{domain}) = split /;/, <$ih>, 2;
955 :     }
956 :     # Get the counts from the statistics object.
957 :     my $stats = $self->{stats};
958 :     $fields{contigs} = $stats->Ask('contigs');
959 :     $fields{'dna-size'} = $stats->Ask('dna');
960 :     $fields{pegs} = $stats->Ask('peg');
961 :     $fields{rnas} = $stats->Ask('rna');
962 :     # Get the genetic code. The default is 11.
963 :     $fields{'genetic-code'} = 11;
964 :     my $geneticCodeFile = "$dir/GENETIC_CODE";
965 :     if (-f $geneticCodeFile) {
966 :     # There's a genetic code file, so we need to read it for the code.
967 :     my $ih = Open(undef, "<$geneticCodeFile");
968 :     ($fields{'genetic-code'}) = Tracer::GetLine($ih);
969 :     }
970 :     # Use the domain to determine whether or not the genome is prokaryotic.
971 :     $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
972 :     # Finally, add the genome ID.
973 :     $fields{id} = $self->{genome};
974 :     # Create the genome record.
975 :     $sap->InsertObject('Genome', %fields);
976 :     }
977 :    
978 :     =head3 ReadAssignments
979 :    
980 :     my $assignHash = $loaderObject->ReadAssignments();
981 :    
982 :     Return a hash mapping each feature ID to its functional assignment. This method
983 :     essentially reads the B<assigned_functions> file into memory.
984 :    
985 :     =cut
986 :    
987 :     sub ReadAssignments {
988 :     # Get the parameters.
989 :     my ($self) = @_;
990 :     # Create the return hash.
991 :     my $retVal = {};
992 :     # Loop through the assigned-functions file, storing results in the hash. Later
993 :     # results will override earlier ones.
994 :     my $ih = Open(undef, "<$self->{directory}/assigned_functions");
995 :     while (! eof $ih) {
996 :     my ($fid, $function) = Tracer::GetLine($ih);
997 :     $retVal->{$fid} = $function;
998 :     }
999 :     # Return the hash built.
1000 :     return $retVal;
1001 :     }
1002 :    
1003 :    
1004 :     =head3 FixContig
1005 :    
1006 :     $loaderObject->FixContig(\$contigID);
1007 :    
1008 :     Insure that the specified contig ID contains the genome ID.
1009 :    
1010 :     =cut
1011 :    
1012 :     sub FixContig {
1013 :     my ($self, $contigID) = @_;
1014 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
1015 :     unless ($$contigID =~ /^\d+\.\d+\:/) {
1016 :     $$contigID = "$self->{genome}:$$contigID";
1017 :     }
1018 :     }
1019 :    
1020 :     =head3 ConnectLocation
1021 :    
1022 :     $loaderObject->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
1023 :    
1024 :     Connect the specified feature to the specified contig location.
1025 :    
1026 :     =over 4
1027 :    
1028 :     =item fid
1029 :    
1030 :     ID of the relevant feature.
1031 :    
1032 :     =item contig
1033 :    
1034 :     ID of the contig containing this segment of the feature (normalized).
1035 :    
1036 :     =item segment
1037 :    
1038 :     Ordinal number of this segment.
1039 :    
1040 :     =item left
1041 :    
1042 :     Location of the segment's leftmost base pair.
1043 :    
1044 :     =item dir
1045 :    
1046 :     Direction (strand) of the segment (C<+> or C<->).
1047 :    
1048 :     =item len
1049 :    
1050 :     Length of the segment (which must be no greater than the configured maximum).
1051 :    
1052 :     =back
1053 :    
1054 :     =cut
1055 :    
1056 :     sub ConnectLocation {
1057 :     # Get the parameters.
1058 :     my ($self, $fid, $contig, $segment, $left, $dir, $len) = @_;
1059 :     # Get the sapling database.
1060 :     my $sap = $self->{sap};
1061 :     # Create the relationship.
1062 :     $sap->InsertObject('IsLocatedIn', from_link => $fid, to_link => $contig,
1063 :     begin => $left, dir => $dir, len => $len,
1064 :     ordinal => $segment);
1065 :     # Record it in the statistics.
1066 :     $self->{stats}->Add(segment => 1);
1067 :     }
1068 :    
1069 : parrello 1.2 =head3 CreateIdentifier
1070 :    
1071 :     $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);
1072 :    
1073 :     Link an identifier to a feature. The identifier is presented in prefixed form and is of the
1074 :     specified type and the specified confidence level.
1075 :    
1076 :     =over 4
1077 :    
1078 :     =item alias
1079 :    
1080 :     Identifier to connect to the feature.
1081 :    
1082 :     =item conf
1083 :    
1084 :     Confidence level (C<A> curated, C<B> normal, C<C> protein only).
1085 :    
1086 :     =item aliasType
1087 :    
1088 :     Type of alias (e.g. C<NCBI>, C<LocusTag>).
1089 :    
1090 :     =item fid
1091 :    
1092 :     ID of the relevant feature.
1093 :    
1094 :     =back
1095 :    
1096 :     =cut
1097 :    
1098 :     sub CreateIdentifier {
1099 :     # Get the parameters.
1100 :     my ($self, $alias, $conf, $aliasType, $fid) = @_;
1101 :     # Get the Sapling object.
1102 :     my $sap = $self->{sap};
1103 :     # Compute the identifier's natural form.
1104 :     my $natural = $alias;
1105 :     if ($natural =~ /[:|](.+)/) {
1106 :     $natural = $1;
1107 :     }
1108 :     # Insure the identifier exists in the database.
1109 :     $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);
1110 :     # Connect the identifier to the feature.
1111 : parrello 1.5 $sap->InsertObject('IsIdentifiedBy', to_link => $alias, from_link => $fid, conf => $conf);
1112 : parrello 1.2 }
1113 :    
1114 : parrello 1.6 =head2 Internal Utility Methods
1115 :    
1116 :     =head3 ReadAnnotation
1117 :    
1118 :     my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1119 :    
1120 :     Read the next record from an annotation file. The next record must exist (that is, an
1121 :     end-of-file check should have been performed before calling this method).
1122 :    
1123 :     =over 4
1124 :    
1125 :     =item ih
1126 :    
1127 :     Open file handle for the annotation file.
1128 :    
1129 :     =item RETURN
1130 :    
1131 :     Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1132 :     timestamp, (2) the user ID, and (3) the annotation text.
1133 :    
1134 :     =back
1135 :    
1136 :     =cut
1137 :    
1138 :     sub ReadAnnotation {
1139 :     # Get the parameter.
1140 :     my ($ih) = @_;
1141 :     # Read the three fixed fields.
1142 :     my $fid = <$ih>; chomp $fid;
1143 :     my $timestamp = <$ih>; chomp $timestamp;
1144 :     my $user = <$ih>; chomp $user;
1145 :     # Loop through the lines of the text field.
1146 :     my $text = "";
1147 :     my $line = <$ih>;
1148 :     while ($line ne "//\n") {
1149 :     $text .= $line;
1150 :     $line = <$ih>;
1151 :     }
1152 :     # Remove the trailing new-line from the text.
1153 :     chomp $text;
1154 :     # Return the fields.
1155 :     return ($fid, $timestamp, $user, $text);
1156 :     }
1157 :    
1158 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3