[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingGenomeLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 : parrello 1.2 use AliasAnalysis;
29 : parrello 1.3 use base qw(SaplingDataLoader);
30 : parrello 1.1
31 :     =head1 Sapling Genome Loader
32 :    
33 :     This package contains the methods used to load a genome into the Sapling from a genome
34 :     directory. This is not very efficient if you are trying to create a full database,
35 :     but is useful if you wish to add genomes one at a time. The information loaded will
36 :     include the basic genome data, the contigs and the DNA, subsystems, and the features.
37 :    
38 :     =head2 Main Methods
39 :    
40 :     =head3 Load
41 :    
42 :     my $stats = SaplingGenomeLoader::Load($sap, $genome, $directory);
43 :    
44 :     Load a genome from a genome directory into the sapling database.
45 :    
46 :     =over 4
47 :    
48 :     =item sap
49 :    
50 :     L<Sapling> object used to access the target database.
51 :    
52 :     =item genome
53 :    
54 :     ID of the genome being loaded.
55 :    
56 :     =item directory
57 :    
58 :     Name of the directory containing the genome information.
59 :    
60 :     =back
61 :    
62 :     =cut
63 :    
64 :     sub Load {
65 :     # Get the parameters.
66 :     my ($sap, $genome, $directory) = @_;
67 :     # Create the loader object.
68 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
69 :     # Load the contigs.
70 : parrello 1.2 Trace("Loading contigs for $genome.") if T(2);
71 : parrello 1.1 $loaderObject->LoadContigs();
72 :     # Load the features.
73 : parrello 1.2 Trace("Loading features for $genome.") if T(2);
74 : parrello 1.1 $loaderObject->LoadFeatures();
75 :     # Load the subsystem bindings.
76 : parrello 1.2 Trace("Loading subsystems for $genome.") if T(2);
77 : parrello 1.1 $loaderObject->LoadSubsystems();
78 :     # Create the Genome record and taxonomy information.
79 : parrello 1.2 Trace("Creating root for $genome.") if T(2);
80 : parrello 1.1 $loaderObject->CreateGenome();
81 :     # Return the statistics.
82 :     return $loaderObject->{stats};
83 :     }
84 :    
85 :     =head3 ClearGenome
86 :    
87 :     my $stats = SaplingGenomeObject::ClearGenome($sap, $genome);
88 :    
89 :     Delete the specified genome and all the related records from the specified sapling
90 :     database. This method can also be used to clean up after a failed or aborted load.
91 :    
92 :     =over 4
93 :    
94 :     =item sap
95 :    
96 :     L<Sapling> object used to access the target database.
97 :    
98 :     =item genome
99 :    
100 :     ID of the genome to delete.
101 :    
102 :     =item RETURN
103 :    
104 :     Returns a statistics object counting the records deleted.
105 :    
106 :     =back
107 :    
108 :     =cut
109 :    
110 :     sub ClearGenome {
111 :     # Get the parameters.
112 :     my ($sap, $genome) = @_;
113 :     # Create the statistics object.
114 :     my $stats = Stats->new();
115 :     # Delete the DNA.
116 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');
117 : parrello 1.1 # Delete the contigs.
118 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
119 : parrello 1.1 # Delete the features.
120 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
121 : parrello 1.1 # Delete the molecular machines.
122 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
123 : parrello 1.1 # Delete the genome itself.
124 :     my $subStats = $sap->Delete(Genome => $genome);
125 :     # Accumulate the statistics from the delete.
126 :     $stats->Accumulate($subStats);
127 :     # Return the statistics object.
128 :     return $stats;
129 :     }
130 :    
131 :     =head2 Loader Object Methods
132 :    
133 :     =head3 new
134 :    
135 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
136 :    
137 :     Create a loader object that can be used to facilitate loading genome data from a
138 :     directory.
139 :    
140 :     =over 4
141 :    
142 :     =item sap
143 :    
144 :     L<Sapling> object used to access the target database.
145 :    
146 :     =item genome
147 :    
148 :     ID of the genome being loaded.
149 :    
150 :     =item directory
151 :    
152 :     Name of the directory containing the genome information.
153 :    
154 :     =back
155 :    
156 :     The object created contains the following fields.
157 :    
158 :     =over 4
159 :    
160 :     =item directory
161 :    
162 :     Name of the directory containing the genome data.
163 :    
164 :     =item genome
165 :    
166 :     ID of the genome being loaded.
167 :    
168 :     =item supportRecords
169 :    
170 :     A hash of hashes, used to track the support records known to exist in the database.
171 :    
172 :     =item sap
173 :    
174 :     L<Sapling> object used to access the database.
175 :    
176 :     =item stats
177 :    
178 :     L<Stats> object for tracking statistical information about the load.
179 :    
180 : parrello 1.6 =item timestamps
181 :    
182 :     A hash of hashes, keyed by feature ID. The sub-hashes are keyed by annotation timestamp,
183 :     and used to prevent duplicate timestamps.
184 :    
185 : parrello 1.1 =back
186 :    
187 :     =cut
188 :    
189 :     sub new {
190 :     # Get the parameters.
191 :     my ($class, $sap, $genome, $directory) = @_;
192 :     # Create the object.
193 : parrello 1.3 my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
194 :     # Add our specialized data.
195 :     $retVal->{genome} = $genome;
196 :     $retVal->{directory} = $directory;
197 : parrello 1.6 $retVal->{timestamps} = {};
198 : parrello 1.3 # Return the result.
199 : parrello 1.1 return $retVal;
200 :     }
201 :    
202 :     =head3 LoadContigs
203 :    
204 :     $loaderObject->LoadContigs();
205 :    
206 :     Load the contig information into the database. This includes the contigs themselves and
207 :     the DNA. The number of contigs will be recorded as the C<contigs> statistic and the
208 :     number of base pairs as the C<dna> statistic.
209 :    
210 :     =cut
211 :    
212 :     sub LoadContigs {
213 :     # Get this object.
214 :     my ($self) = @_;
215 :     # Open the contigs file from the genome directory.
216 :     my $ih = Open(undef, "<$self->{directory}/contigs");
217 :     # Get the length of a DNA segment.
218 :     my $segmentLength = $self->{sap}->TuningParameter('maxSequenceLength');
219 :     # Get the genome ID.
220 :     my $genome = $self->{genome};
221 :     # These variables will contain the current contig ID, the current contig length,
222 :     # the ordinal of the current chunk, the accumulated DNA sequence, and its length.
223 :     my ($contigID, $contigLen, $ordinal, $chunk, $chunkLen) = (undef, 0, 0, '', 0);
224 :     # Loop through the contig file.
225 :     while (! eof $ih) {
226 :     # Get the current record.
227 :     my $line = <$ih>; chomp $line;
228 :     # Is this the start of a new contig?
229 :     if ($line =~ /^>(\S+)/) {
230 :     # Yes. Save the contig ID.
231 :     my $newContigID = $1;
232 :     # Is there a current contig?
233 :     if (defined $contigID) {
234 :     # Yes. Output the current chunk.
235 :     $self->OutputChunk($contigID, $ordinal, $chunk);
236 :     # Output the contig.
237 :     $self->OutputContig($contigID, $contigLen);
238 :     }
239 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
240 :     $self->FixContig(\$newContigID);
241 :     # Initialize the new contig.
242 :     $contigID = $newContigID;
243 :     $contigLen = 0;
244 :     $ordinal = 0;
245 :     $chunk = '';
246 :     $chunkLen = 0;
247 :     } else {
248 :     # Here we have more DNA in the current contig. Are we at the end of
249 :     # the current chunk?
250 :     my $lineLen = length($line);
251 :     my $newChunkLen = $chunkLen + $lineLen;
252 :     if ($newChunkLen < $segmentLength) {
253 :     # No. Add this line to the chunk.
254 :     $chunk .= $line;
255 :     $chunkLen += $lineLen;
256 :     } else {
257 :     # Yes. Create the actual chunk.
258 :     $chunk .= substr($line, 0, $segmentLength - $chunkLen);
259 :     # Write it out.
260 :     $self->OutputChunk($contigID, $ordinal, $chunk);
261 :     # Set up the new chunk.
262 :     $chunk = substr($line, $segmentLength - $chunkLen);
263 :     $ordinal++;
264 :     $chunkLen = length($chunk);
265 :     }
266 :     # Update the contig length.
267 :     $contigLen += $lineLen;
268 :     }
269 :     }
270 :     # Is there a current contig?
271 :     if (defined $contigID) {
272 :     # Yes. Output the residual chunk.
273 :     $self->OutputChunk($contigID, $ordinal, $chunk);
274 :     # Output the contig itself.
275 :     $self->OutputContig($contigID, $contigLen);
276 :     }
277 :     }
278 :    
279 :     =head3 OutputChunk
280 :    
281 :     $loaderObject->OutputChunk($contigID, $ordinal, $chunk);
282 :    
283 :     Output a chunk of DNA for the specified contig.
284 :    
285 :     =over 4
286 :    
287 :     =item contigID
288 :    
289 :     ID of the contig being output.
290 :    
291 :     =item ordinal
292 :    
293 :     Ordinal number of this chunk.
294 :    
295 :     =item chunk
296 :    
297 :     DNA sequence comprising this chunk.
298 :    
299 :     =back
300 :    
301 :     =cut
302 :    
303 :     sub OutputChunk {
304 :     # Get the parameters.
305 :     my ($self, $contigID, $ordinal, $chunk) = @_;
306 :     # Get the sapling object.
307 :     my $sap = $self->{sap};
308 :     # Compute the chunk ID.
309 :     my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
310 :     # Connect this sequence to the contig.
311 : parrello 1.2 $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
312 : parrello 1.1 # Create the DNA sequence.
313 :     $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
314 :     # Record the chunk.
315 :     $self->{stats}->Add(chunks => 1);
316 :     }
317 :    
318 :     =head3 OutputContig
319 :    
320 :     $loaderObject->OutputContig($contigID, $contigLen);
321 :    
322 :     Write out the current contig.
323 :    
324 :     =over 4
325 :    
326 :     =item contigID
327 :    
328 :     ID of the contig being written out.
329 :    
330 :     =item contigLen
331 :    
332 :     Length of the contig in base pairs.
333 :    
334 :     =back
335 :    
336 :     =cut
337 :    
338 :     sub OutputContig {
339 :     # Get the parameters.
340 :     my ($self, $contigID, $contigLen) = @_;
341 :     # Get the sapling object.
342 :     my $sap = $self->{sap};
343 :     # Connect the contig to the genome.
344 :     $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
345 :     # Output the contig record.
346 :     $sap->InsertObject('Contig', id => $contigID, length => $contigLen);
347 :     # Record the contig.
348 :     $self->{stats}->Add(contigs => 1);
349 :     $self->{stats}->Add(dna => $contigLen);
350 :     }
351 :    
352 :     =head3 LoadFeatures
353 :    
354 :     $loaderObject->LoadFeatures();
355 :    
356 :     Load the feature data into the database. This includes all of the features,
357 :     their protein sequences, and their subsystem information. The number of
358 :     features of each type will be stored in the statistics object, identified by
359 :     the feature type.
360 :    
361 :     =cut
362 :    
363 :     sub LoadFeatures {
364 :     # Get the parameters.
365 :     my ($self) = @_;
366 :     # Get the directory of feature types.
367 :     my $featureDir = "$self->{directory}/Features";
368 :     my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
369 :     # Create the feature records for the types found.
370 :     for my $type (@types) {
371 :     # Insure this is a genuine feature directory.
372 :     if (-f "$featureDir/$type/tbl") {
373 :     # Yes, load the feature data.
374 :     $self->LoadFeatureData($featureDir, $type);
375 :     }
376 :     }
377 :     # Check for protein sequences. If we have some, load them into the database.
378 :     if (-f "$featureDir/peg/fasta") {
379 :     $self->LoadProteinData("$featureDir/peg/fasta");
380 :     }
381 : parrello 1.6 # Check for annotation history. If we have it, load the history records into the
382 :     # database.
383 :     if (-f "$featureDir/annotations") {
384 :     $self->LoadAnnotations("$featureDir/annotations");
385 :     }
386 : parrello 1.1 }
387 :    
388 :     =head3 LoadFeatureData
389 :    
390 : parrello 1.3 $loaderObject->LoadFeatureData($featureDir, $type);
391 : parrello 1.1
392 :     Load the basic data for each feature into the database. The number of features of
393 :     the type found will be recorded in the statistics object.
394 :    
395 :     =over 4
396 :    
397 :     =item featureDir
398 :    
399 :     Name of the main directory containing all the feature type subdirectories.
400 :    
401 :     =item type
402 :    
403 :     Type of feature to load.
404 :    
405 :     =back
406 :    
407 :     =cut
408 :    
409 :     sub LoadFeatureData {
410 :     # Get the parameters.
411 :     my ($self, $featureDir, $type) = @_;
412 :     # Get the sapling database.
413 :     my $sap = $self->{sap};
414 :     # Get the maximum location segment length. We'll need this later.
415 :     my $maxLength = $sap->TuningParameter('maxLocationLength');
416 :     # Get the statistics object.
417 :     my $stats = $self->{stats};
418 :     # Read in the functional assignments.
419 :     my $assignHash = $self->ReadAssignments();
420 :     # This hash will track the features we've created. If a feature is found a second
421 :     # time, it overwrites the original.
422 : parrello 1.6 my $fidHash = $self->{timestamps};
423 :     # Finally, we need the timestamp hash. The initial feature population
424 : parrello 1.1 # Insure we have a tbl file for this feature type.
425 :     my $fileName = "$featureDir/$type/tbl";
426 :     if (-f $fileName) {
427 : parrello 1.3 # We have one, so we can read through it. First, however, we need to get the list
428 :     # of deleted features.
429 :     my %deletedFids;
430 :     my $deleteFile = "$featureDir/$type/deleted.features";
431 :     if (-f $deleteFile) {
432 :     %deletedFids = map { $_ => 1 } Tracer::GetFile($deleteFile);
433 :     }
434 :     # Open the main file for input.
435 : parrello 1.1 my $ih = Open(undef, "<$fileName");
436 :     while (! eof $ih) {
437 :     # Read this feature's information.
438 :     my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
439 : parrello 1.3 # Only proceed if the feature is NOT deleted.
440 :     if (! exists $deletedFids{$fid}) {
441 :     # If the feature already exists, delete it. (This should be extremely rare.)
442 : parrello 1.6 if (exists $fidHash->{$fid}) {
443 : parrello 1.3 $sap->Delete(Feature => $fid);
444 :     $stats->Add(duplicateFid => 1);
445 : parrello 1.1 }
446 : parrello 1.3 # Otherwise connect this feature to the genome.
447 :     $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);
448 :     # Now we must parse the locations. This will contain a list of the location
449 :     # data 4-tuples (contig, start, dir, len).
450 :     my @locData;
451 :     # This will contain the total sequence length.
452 :     my $length = 0;
453 :     # Loop through the locations.
454 :     for my $loc (split /\s*,\s*/, $locations) {
455 :     # Get this location's components.
456 :     my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);
457 :     # Normalize the contig.
458 :     $self->FixContig(\$contig);
459 :     # Compute the direction.
460 :     if ($start <= $stop) {
461 :     # Here we have the forward strand.
462 :     my $len = $stop + 1 - $start;
463 :     push @locData, [$contig, $start, '+', $len];
464 :     $length += $len;
465 : parrello 1.1 } else {
466 : parrello 1.3 # Here we have the reverse strand.
467 :     my $len = $start + 1 - $stop;
468 :     push @locData, [$contig, $stop, '-', $len];
469 :     $length += $len;
470 : parrello 1.1 }
471 :     }
472 : parrello 1.3 # Now we can create the feature record.
473 :     $sap->InsertObject('Feature', id => $fid, feature_type => $type,
474 :     function => $assignHash->{$fid}, locked => 0,
475 :     sequence_length => $length);
476 : parrello 1.6 $fidHash->{$fid} = {};
477 : parrello 1.3 $stats->Add($type => 1);
478 :     # The next step is to connect the feature record to its locations. This
479 :     # involves dividing the location into segments. The following variable
480 :     # will count the total number of segments.
481 :     my $segment = 0;
482 :     # Loop through the locations.
483 :     for my $loc (@locData) {
484 :     # Get the current location's information.
485 :     my ($contig, $left, $dir, $len) = @$loc;
486 :     # Peel off any segments.
487 :     while ($len > $maxLength) {
488 :     # Process according to the direction.
489 :     if ($dir eq '+') {
490 :     # Forward strand: peel from the beginning.
491 :     $self->ConnectLocation($fid, $contig, $segment, $left, $dir,
492 :     $maxLength);
493 :     $left += $maxLength;
494 :     } else {
495 :     # Reverse strand: peel from the end.
496 :     $self->ConnectLocation($fid, $contig, $segment,
497 :     $left + $len - $maxLength, $dir,
498 :     $maxLength);
499 :     }
500 :     # Denote we've used up a segment.
501 :     $len -= $maxLength;
502 :     $segment++;
503 :     }
504 :     # Output the last segment.
505 :     $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
506 :     }
507 :     # Now we process the aliases and create the identifiers. We don't do this
508 :     # for RNA, because the RNA function is stored in the aliases.
509 :     if ($type ne 'rna') {
510 :     for my $alias (@aliases) {
511 :     my $normalized;
512 :     # Determine the type.
513 :     my $aliasType = AliasAnalysis::TypeOf($alias);
514 :     $stats->Add(aliasAll => 1);
515 :     # Is this a recognized type?
516 :     if ($aliasType) {
517 :     $stats->Add(aliasNormal => 1);
518 :     # Yes. Write it normally.
519 :     $self->CreateIdentifier($alias, B => $aliasType, $fid);
520 :     } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {
521 :     # No, but this is a specially-marked locus tag.
522 :     $normalized = $1;
523 :     $stats->Add(aliasLocus => 1);
524 :     $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
525 :     } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {
526 :     # No, but this is a natural locus tag.
527 :     $stats->Add(aliasLocus => 1);
528 :     $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
529 :     } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {
530 :     # No, but this is a natural gene name.
531 :     $stats->Add(aliasGene => 1);
532 :     $self->CreateIdentifier($normalized, B => 'GENE', $fid);
533 :     } elsif ($alias =~ /^\d+$/) {
534 :     # Here it's a naked number, which means it's a GI number
535 :     # of some sort.
536 :     $stats->Add(aliasGI => 1);
537 :     $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);
538 :     } elsif ($alias =~ /^protein_id\|(.+)/) {
539 :     # Here we have a REFSEQ protein ID. Right now we don't have a way to
540 :     # handle that, because we don't know the feature's protein ID here.
541 :     $stats->Add(aliasProtein => 1);
542 :     } elsif ($alias =~ /[:|]/) {
543 :     # Here it's an alias of an unknown type, so we skip it.
544 :     $stats->Add(aliasUnknown => 1);
545 :     } else {
546 :     # Here it's a miscellaneous type.
547 :     $stats->Add(aliasMisc => 1);
548 :     $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);
549 :     }
550 : parrello 1.2 }
551 :     }
552 :     }
553 : parrello 1.1 }
554 :     }
555 : parrello 1.3 # Now loop through the features, connecting them to their roles. Note that deleted
556 :     # features will not be in the assignment hash.
557 :     for my $fid (keys %$assignHash) {
558 :     # Get the roles and the error count.
559 :     my ($roles, $errors) = SeedUtils::roles_for_loading($assignHash->{$fid});
560 :     # Accumulate the errors in the stats object.
561 :     $stats->Add(roleErrors => $errors);
562 :     # Is this a suspicious function?
563 :     if (! defined $roles) {
564 :     # Yes, so track it.
565 :     $stats->Add(badFunction => 1);
566 :     } else {
567 :     # No, connect the roles.
568 :     for my $role (@$roles) {
569 :     # Insure this role exists.
570 :     my $hypo = hypo($role);
571 :     $self->InsureEntity(Role => $role, hypothetical => $hypo);
572 :     # Connect it to the feature.
573 :     $sap->InsertObject('IsFunctionalIn', from_link => $role, to_link => $fid);
574 :     }
575 :     }
576 :     }
577 : parrello 1.1 }
578 :    
579 :     =head3 LoadProteinData
580 :    
581 :     $self->LoadProteinData($fileName);
582 :    
583 :     Load the protein sequences from the named FASTA file. The sequences will be stored
584 :     in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature
585 :     records themselves will not be created (it is presumed they are already there).
586 :    
587 :     =over 4
588 :    
589 :     =item fileName
590 :    
591 :     Name of the FASTA file containing the protein sequences for this genome.
592 :    
593 :     =back
594 :    
595 :     =cut
596 :    
597 :     sub LoadProteinData {
598 :     # Get the parameters.
599 :     my ($self, $fileName) = @_;
600 :     # Open the FASTA file for input.
601 :     my $ih = Open(undef, "<$fileName");
602 :     # We'll track the current protein in here.
603 :     my $fid;
604 :     my $sequence = "";
605 :     # Loop through the input file.
606 :     while (! eof $ih) {
607 :     # Get the current record.
608 :     my $line = <$ih>; chomp $line;
609 :     # Is this a label record.
610 :     if ($line =~ /^>(\S+)/) {
611 :     # Yes. Save the new feature ID.
612 :     my $newFid = $1;
613 :     # Do we have an existing protein?
614 :     if (defined $fid) {
615 :     # Yes. Write it out.
616 :     $self->WriteProtein($fid, $sequence);
617 :     }
618 :     # Initialize for the next protein.
619 :     $fid = $newFid;
620 :     $sequence = "";
621 :     } else {
622 :     # Here we have more letters for the current protein.
623 :     $sequence .= $line;
624 :     }
625 :     }
626 :     # Do we have a residual protein.
627 :     if (defined $fid) {
628 :     # Yes. Write it out.
629 :     $self->WriteProtein($fid, $sequence);
630 :     }
631 :     }
632 :    
633 : parrello 1.6
634 :     =head3 LoadAnnotations
635 :    
636 :     $loaderObject->LoadAnnotations($fileName);
637 :    
638 :     Read in the annotation history information and use it to create annotation records.
639 :    
640 :     =over 4
641 :    
642 :     =item fileName
643 :    
644 :     Name of the annotation history file. This file is formatted with four fields per
645 :     record. Each field is on a separate line, with a double slash (C<//>) used as the
646 :     line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
647 :     (formatted as an integer), (2) the user name, and (3) the annotation text.
648 :    
649 :     =back
650 :    
651 :     =cut
652 :    
653 :     sub LoadAnnotations {
654 :     # Get the parameters.
655 :     my ($self, $fileName) = @_;
656 :     # Get the timestamp hash.
657 :     my $timeHash = $self->{timestamps};
658 :     # Get the Sapling database.
659 :     my $sap = $self->{sap};
660 :     # Get the statistics object.
661 :     my $stats = $self->{stats};
662 :     # Open the input file.
663 :     my $ih = Tracer::Open(undef, "<$fileName");
664 :     # Loop through the input.
665 :     while (! eof $ih) {
666 :     # Read in the peg, timestamp, and user ID.
667 :     my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
668 :     # Only proceed if the feature exists.
669 :     if (! exists $timeHash->{$fid}) {
670 :     $stats->Add(skippedAnnotation => 1);
671 :     } else {
672 :     # Change assignments by the master user to FIG assignments.
673 :     $text =~ s/Set master function/Set FIG function/s;
674 :     # Insure the time stamp is valid.
675 :     if ($timestamp =~ /^\d+$/) {
676 :     # Here it's a number. We need to insure the one we use to form
677 :     # the key is unique.
678 :     my $keyStamp = $timestamp;
679 :     while ($timeHash->{$fid}{$keyStamp}) {
680 :     $keyStamp++;
681 :     $stats->Add(skippedStamp => 1);
682 :     }
683 :     # Form the annotation ID.
684 :     my $annotationID = "$fid:" . Tracer::Pad(9999999999 - $keyStamp, 10,
685 :     1, "0");
686 :     $timeHash->{$fid}{$keyStamp} = 1;
687 :     # Generate the annotation.
688 :     $sap->InsertObject('IsAnnotatedBy', from_link => $fid, to_link => $annotationID);
689 :     $sap->InsertObject('Annotation', id => $annotationID, annotation_time => $timestamp,
690 :     comment => $text, annotator => $user);
691 :     } else {
692 :     # Here we have an invalid time stamp.
693 :     Trace("Invalid time stamp \"$timestamp\" in annotations for $fid.") if T(1);
694 :     }
695 :     }
696 :     }
697 :     }
698 :    
699 :    
700 : parrello 1.1 =head3 WriteProtein
701 :    
702 :     $loaderObject->WriteProtein($fid, $sequence);
703 :    
704 :     Write out the specified protein sequence and associate it with the specified feature.
705 :     This requires checking to see if the protein sequence is already in the database.
706 :    
707 :     =over 4
708 :    
709 :     =item fid
710 :    
711 :     ID of the feature with the specified protein sequence.
712 :    
713 :     =item sequence
714 :    
715 :     Protein sequence for the indicated feature.
716 :    
717 :     =back
718 :    
719 :     =cut
720 :    
721 :     sub WriteProtein {
722 :     # Get the parameters.
723 :     my ($self, $fid, $sequence) = @_;
724 :     # Compute the key of the protein sequence.
725 :     my $protID = ERDB::DigestKey($sequence);
726 :     # Insure the protein exists.
727 :     $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
728 :     # Connect the feature to it.
729 :     $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
730 :     }
731 :    
732 :     =head3 LoadSubsystems
733 :    
734 :     $loaderObject->LoadSubsystems();
735 :    
736 :     Load the subsystem data into the database. This includes all of the subsystems,
737 :     their categories, roles, and the bindings that connect them to this genome's features.
738 :    
739 :     =cut
740 :    
741 :     sub LoadSubsystems {
742 :     # Get the parameters.
743 :     my ($self) = @_;
744 :     # Get the sapling object.
745 :     my $sap = $self->{sap};
746 :     # Get the statistics object.
747 :     my $stats = $self->{stats};
748 :     # Get the genome ID.
749 :     my $genome = $self->{genome};
750 :     # Connect to the Sapling server so we can get the subsystem variant and role information.
751 :     my $sapO = SAPserver->new();
752 :     # This hash maps subsystem IDs to molecular machine IDs.
753 :     my %machines;
754 :     # This hash maps subsystem/role pairs to machine role IDs.
755 :     my %machineRoles;
756 :     # The first step is to create the subsystems themselves and connect them to the
757 :     # genome. A list is given in the subsystems file of the Subsystems directory.
758 :     my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
759 :     # Create the field hash for the subsystem query.
760 :     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)
761 :     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)
762 :     Subsystem(version)
763 :     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)
764 :     Includes(sequence)
765 :     Role(id) Role(hypothetical) Role(role-index));
766 :     my %fields = map { $_ => $_ } @fields;
767 :     # Loop through the subsystems in the file, insuring we have them in the database.
768 :     while (! eof $ih) {
769 :     # Get this subsystem.
770 :     my ($subsystem, $variant) = Tracer::GetLine($ih);
771 :     # Normalize the subsystem name.
772 :     $subsystem = $sap->SubsystemID($subsystem);
773 :     # Compute this subsystem's MD5.
774 :     my $subsystemMD5 = ERDB::DigestKey($subsystem);
775 :     # Insure the subsystem is in the database.
776 :     if (! $sap->Exists(Subsystem => $subsystem)) {
777 :     # The subsystem isn't present, so we need to read it. We get the subsystem,
778 :     # its roles, and its classification information. The variant will be added
779 :     # later if we need it.
780 :     my $roleList = $sapO->get(-objects => "Subsystem Includes Role",
781 :     -filter => {"Subsystem(id)" => ["=", $subsystem] },
782 :     -fields => \%fields);
783 :     # Only proceed if we found some roles.
784 :     if (@$roleList > 0) {
785 :     # Get the subsystem information from the first role and create the subsystem.
786 :     my $roleH = $roleList->[0];
787 : parrello 1.3 my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
788 : parrello 1.1 $sap->InsertObject('Subsystem', %subFields);
789 :     # Now loop through the roles. The Includes records are always inserted, but the
790 :     # roles are only inserted if they don't already exist.
791 :     for $roleH (@$roleList) {
792 :     # Create the Includes record.
793 : parrello 1.3 my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);
794 : parrello 1.1 $sap->InsertObject('Includes', %incFields);
795 :     # Insure we have the role in place.
796 : parrello 1.3 my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);
797 : parrello 1.1 my $roleID = $roleFields{id};
798 :     delete $roleFields{id};
799 :     $self->InsureEntity('Role', $roleID, %roleFields);
800 :     # Compute the machine-role ID.
801 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
802 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
803 :     }
804 :     }
805 :     } else {
806 :     # We already have the subsystem in the database, but we still need to compute the
807 :     # machine role IDs. We need information from the sapling server for this.
808 :     my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);
809 :     # Loop through the roles.
810 :     my $roleList = $subHash->{$subsystem};
811 :     for my $roleTuple (@$roleList) {
812 :     my ($roleID, $abbr) = @$roleTuple;
813 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
814 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
815 :     }
816 :     }
817 :     # Now we need to connect the variant to the subsystem. First, we compute the real
818 :     # variant code by removing the asterisks.
819 :     my $variantCode = $variant;
820 :     $variantCode =~ s/^\*+//;
821 :     $variantCode =~ s/\*+$//;
822 :     # Compute the variant key.
823 :     my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
824 :     # Get the variant from the sapling server.
825 :     my $variantH = $sapO->get(-objects => "Variant",
826 :     -filter => {"Variant(id)" => ["=", $variantKey]},
827 :     -fields => {"Variant(code)" => "code",
828 :     "Variant(comment)" => "comment",
829 :     "Variant(type)" => "type",
830 :     "Variant(role-rule)" => "role-rule"},
831 :     -firstOnly => 1);
832 :     # Insure we have it in the database.
833 :     $self->InsureEntity('Variant', $variantKey, %$variantH);
834 :     $stats->Add(subsystems => 1);
835 :     # Connect it to the subsystem.
836 :     $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
837 :     # Now we create the molecular machine connecting this genome to the subsystem
838 :     # variant.
839 :     my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
840 :     $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
841 :     $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
842 :     $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID);
843 :     # Remember the machine ID.
844 :     $machines{$subsystem} = $machineID;
845 :     }
846 :     # Now we go through the bindings file. This file connects the subsystem roles to
847 :     # the molecular machines.
848 :     $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
849 :     # Loop through the bindings.
850 :     while (! eof $ih) {
851 :     # Get the binding data.
852 :     my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
853 :     # Normalize the subsystem name.
854 :     $subsystem = $sap->SubsystemID($subsystem);
855 :     # Compute the machine role.
856 :     my $machineRoleID = $machineRoles{$subsystem}{$role};
857 :     # Insure it exists.
858 :     my $created = $self->InsureEntity(MachineRole => $machineRoleID);
859 :     if ($created) {
860 :     # We created the machine role, so connect it to the machine.
861 :     my $machineID = $machines{$subsystem};
862 :     $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID);
863 :     # Connect it to the role, too.
864 :     $sap->InsertObject('IsRoleOf', from_link => $role, to_link => $machineRoleID);
865 :     }
866 :     # Connect the feature.
867 :     $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
868 :     }
869 :     }
870 :    
871 :     =head3 CreateGenome
872 :    
873 :     $loaderObject->CreateGenome();
874 :    
875 :     Create the genome record.
876 :    
877 :     =cut
878 :    
879 :     # This hash maps cache statistics to genome record fields.
880 :     use constant GENOME_FIELDS => {genome_name => 'scientific-name',
881 :     genome_domain => 'domain'};
882 :     # This hash maps domains to the prokaryotic flag.
883 :     use constant PROK_FLAG => {Bacteria => 1, Archaea => 1};
884 :    
885 :     sub CreateGenome {
886 :     # Get the parameters.
887 :     my ($self) = @_;
888 :     # Get the genome directory.
889 :     my $dir = $self->{directory};
890 :     # Get the sapling database.
891 :     my $sap = $self->{sap};
892 :     # We'll put the genome attributes in here.
893 :     my %fields;
894 :     # Check for a basic statistics file.
895 :     my $statsFile = "$dir/cache.basic_statistics";
896 :     if (-f $statsFile) {
897 :     # We have the statistics file, so read the major attributes from it.
898 :     my $ih = Open(undef, "<$statsFile");
899 :     while (! eof $ih) {
900 :     my ($key, $value) = Tracer::GetLine($ih);
901 :     my $fieldKey = GENOME_FIELDS->{$key};
902 :     if ($fieldKey) {
903 :     $fields{$fieldKey} = $value;
904 :     }
905 :     }
906 :     # Translate the domain.
907 :     $fields{domain} = ucfirst $fields{domain};
908 :     # Denote the genome is complete.
909 :     $fields{complete} = 1;
910 :     } else {
911 :     # Check to see if this genome is complete.
912 :     $fields{complete} = (-f "$dir/COMPLETE" ? 1 : 0);
913 :     # Get the genome name.
914 :     my $ih = Open(undef, "<$dir/GENOME");
915 :     my $line = <$ih>; chomp $line;
916 :     $fields{'scientific-name'} = $line;
917 :     # Get the taxonomy and extract the domain from it.
918 :     $ih = Open(undef, "<$dir/TAXONOMY");
919 :     ($fields{domain}) = split /;/, <$ih>, 2;
920 :     }
921 :     # Get the counts from the statistics object.
922 :     my $stats = $self->{stats};
923 :     $fields{contigs} = $stats->Ask('contigs');
924 :     $fields{'dna-size'} = $stats->Ask('dna');
925 :     $fields{pegs} = $stats->Ask('peg');
926 :     $fields{rnas} = $stats->Ask('rna');
927 :     # Get the genetic code. The default is 11.
928 :     $fields{'genetic-code'} = 11;
929 :     my $geneticCodeFile = "$dir/GENETIC_CODE";
930 :     if (-f $geneticCodeFile) {
931 :     # There's a genetic code file, so we need to read it for the code.
932 :     my $ih = Open(undef, "<$geneticCodeFile");
933 :     ($fields{'genetic-code'}) = Tracer::GetLine($ih);
934 :     }
935 :     # Use the domain to determine whether or not the genome is prokaryotic.
936 :     $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
937 :     # Finally, add the genome ID.
938 :     $fields{id} = $self->{genome};
939 :     # Create the genome record.
940 :     $sap->InsertObject('Genome', %fields);
941 :     }
942 :    
943 :     =head3 ReadAssignments
944 :    
945 :     my $assignHash = $loaderObject->ReadAssignments();
946 :    
947 :     Return a hash mapping each feature ID to its functional assignment. This method
948 :     essentially reads the B<assigned_functions> file into memory.
949 :    
950 :     =cut
951 :    
952 :     sub ReadAssignments {
953 :     # Get the parameters.
954 :     my ($self) = @_;
955 :     # Create the return hash.
956 :     my $retVal = {};
957 :     # Loop through the assigned-functions file, storing results in the hash. Later
958 :     # results will override earlier ones.
959 :     my $ih = Open(undef, "<$self->{directory}/assigned_functions");
960 :     while (! eof $ih) {
961 :     my ($fid, $function) = Tracer::GetLine($ih);
962 :     $retVal->{$fid} = $function;
963 :     }
964 :     # Return the hash built.
965 :     return $retVal;
966 :     }
967 :    
968 :    
969 :     =head3 FixContig
970 :    
971 :     $loaderObject->FixContig(\$contigID);
972 :    
973 :     Insure that the specified contig ID contains the genome ID.
974 :    
975 :     =cut
976 :    
977 :     sub FixContig {
978 :     my ($self, $contigID) = @_;
979 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
980 :     unless ($$contigID =~ /^\d+\.\d+\:/) {
981 :     $$contigID = "$self->{genome}:$$contigID";
982 :     }
983 :     }
984 :    
985 :     =head3 ConnectLocation
986 :    
987 :     $loaderObject->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
988 :    
989 :     Connect the specified feature to the specified contig location.
990 :    
991 :     =over 4
992 :    
993 :     =item fid
994 :    
995 :     ID of the relevant feature.
996 :    
997 :     =item contig
998 :    
999 :     ID of the contig containing this segment of the feature (normalized).
1000 :    
1001 :     =item segment
1002 :    
1003 :     Ordinal number of this segment.
1004 :    
1005 :     =item left
1006 :    
1007 :     Location of the segment's leftmost base pair.
1008 :    
1009 :     =item dir
1010 :    
1011 :     Direction (strand) of the segment (C<+> or C<->).
1012 :    
1013 :     =item len
1014 :    
1015 :     Length of the segment (which must be no greater than the configured maximum).
1016 :    
1017 :     =back
1018 :    
1019 :     =cut
1020 :    
1021 :     sub ConnectLocation {
1022 :     # Get the parameters.
1023 :     my ($self, $fid, $contig, $segment, $left, $dir, $len) = @_;
1024 :     # Get the sapling database.
1025 :     my $sap = $self->{sap};
1026 :     # Create the relationship.
1027 :     $sap->InsertObject('IsLocatedIn', from_link => $fid, to_link => $contig,
1028 :     begin => $left, dir => $dir, len => $len,
1029 :     ordinal => $segment);
1030 :     # Record it in the statistics.
1031 :     $self->{stats}->Add(segment => 1);
1032 :     }
1033 :    
1034 : parrello 1.2 =head3 CreateIdentifier
1035 :    
1036 :     $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);
1037 :    
1038 :     Link an identifier to a feature. The identifier is presented in prefixed form and is of the
1039 :     specified type and the specified confidence level.
1040 :    
1041 :     =over 4
1042 :    
1043 :     =item alias
1044 :    
1045 :     Identifier to connect to the feature.
1046 :    
1047 :     =item conf
1048 :    
1049 :     Confidence level (C<A> curated, C<B> normal, C<C> protein only).
1050 :    
1051 :     =item aliasType
1052 :    
1053 :     Type of alias (e.g. C<NCBI>, C<LocusTag>).
1054 :    
1055 :     =item fid
1056 :    
1057 :     ID of the relevant feature.
1058 :    
1059 :     =back
1060 :    
1061 :     =cut
1062 :    
1063 :     sub CreateIdentifier {
1064 :     # Get the parameters.
1065 :     my ($self, $alias, $conf, $aliasType, $fid) = @_;
1066 :     # Get the Sapling object.
1067 :     my $sap = $self->{sap};
1068 :     # Compute the identifier's natural form.
1069 :     my $natural = $alias;
1070 :     if ($natural =~ /[:|](.+)/) {
1071 :     $natural = $1;
1072 :     }
1073 :     # Insure the identifier exists in the database.
1074 :     $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);
1075 :     # Connect the identifier to the feature.
1076 : parrello 1.5 $sap->InsertObject('IsIdentifiedBy', to_link => $alias, from_link => $fid, conf => $conf);
1077 : parrello 1.2 }
1078 :    
1079 : parrello 1.6 =head2 Internal Utility Methods
1080 :    
1081 :     =head3 ReadAnnotation
1082 :    
1083 :     my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1084 :    
1085 :     Read the next record from an annotation file. The next record must exist (that is, an
1086 :     end-of-file check should have been performed before calling this method).
1087 :    
1088 :     =over 4
1089 :    
1090 :     =item ih
1091 :    
1092 :     Open file handle for the annotation file.
1093 :    
1094 :     =item RETURN
1095 :    
1096 :     Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1097 :     timestamp, (2) the user ID, and (3) the annotation text.
1098 :    
1099 :     =back
1100 :    
1101 :     =cut
1102 :    
1103 :     sub ReadAnnotation {
1104 :     # Get the parameter.
1105 :     my ($ih) = @_;
1106 :     # Read the three fixed fields.
1107 :     my $fid = <$ih>; chomp $fid;
1108 :     my $timestamp = <$ih>; chomp $timestamp;
1109 :     my $user = <$ih>; chomp $user;
1110 :     # Loop through the lines of the text field.
1111 :     my $text = "";
1112 :     my $line = <$ih>;
1113 :     while ($line ne "//\n") {
1114 :     $text .= $line;
1115 :     $line = <$ih>;
1116 :     }
1117 :     # Remove the trailing new-line from the text.
1118 :     chomp $text;
1119 :     # Return the fields.
1120 :     return ($fid, $timestamp, $user, $text);
1121 :     }
1122 :    
1123 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3