[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingGenomeLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 : parrello 1.2 use AliasAnalysis;
29 : parrello 1.3 use base qw(SaplingDataLoader);
30 : parrello 1.1
31 :     =head1 Sapling Genome Loader
32 :    
33 :     This package contains the methods used to load a genome into the Sapling from a genome
34 :     directory. This is not very efficient if you are trying to create a full database,
35 :     but is useful if you wish to add genomes one at a time. The information loaded will
36 :     include the basic genome data, the contigs and the DNA, subsystems, and the features.
37 :    
38 :     =head2 Main Methods
39 :    
40 :     =head3 Load
41 :    
42 :     my $stats = SaplingGenomeLoader::Load($sap, $genome, $directory);
43 :    
44 :     Load a genome from a genome directory into the sapling database.
45 :    
46 :     =over 4
47 :    
48 :     =item sap
49 :    
50 :     L<Sapling> object used to access the target database.
51 :    
52 :     =item genome
53 :    
54 :     ID of the genome being loaded.
55 :    
56 :     =item directory
57 :    
58 :     Name of the directory containing the genome information.
59 :    
60 :     =back
61 :    
62 :     =cut
63 :    
64 :     sub Load {
65 :     # Get the parameters.
66 :     my ($sap, $genome, $directory) = @_;
67 :     # Create the loader object.
68 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
69 :     # Load the contigs.
70 : parrello 1.2 Trace("Loading contigs for $genome.") if T(2);
71 : parrello 1.1 $loaderObject->LoadContigs();
72 :     # Load the features.
73 : parrello 1.2 Trace("Loading features for $genome.") if T(2);
74 : parrello 1.1 $loaderObject->LoadFeatures();
75 :     # Load the subsystem bindings.
76 : parrello 1.2 Trace("Loading subsystems for $genome.") if T(2);
77 : parrello 1.1 $loaderObject->LoadSubsystems();
78 :     # Create the Genome record and taxonomy information.
79 : parrello 1.2 Trace("Creating root for $genome.") if T(2);
80 : parrello 1.1 $loaderObject->CreateGenome();
81 :     # Return the statistics.
82 :     return $loaderObject->{stats};
83 :     }
84 :    
85 :     =head3 ClearGenome
86 :    
87 :     my $stats = SaplingGenomeObject::ClearGenome($sap, $genome);
88 :    
89 :     Delete the specified genome and all the related records from the specified sapling
90 :     database. This method can also be used to clean up after a failed or aborted load.
91 :    
92 :     =over 4
93 :    
94 :     =item sap
95 :    
96 :     L<Sapling> object used to access the target database.
97 :    
98 :     =item genome
99 :    
100 :     ID of the genome to delete.
101 :    
102 :     =item RETURN
103 :    
104 :     Returns a statistics object counting the records deleted.
105 :    
106 :     =back
107 :    
108 :     =cut
109 :    
110 :     sub ClearGenome {
111 :     # Get the parameters.
112 :     my ($sap, $genome) = @_;
113 :     # Create the statistics object.
114 :     my $stats = Stats->new();
115 :     # Delete the DNA.
116 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');
117 : parrello 1.1 # Delete the contigs.
118 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
119 : parrello 1.1 # Delete the features.
120 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
121 : parrello 1.1 # Delete the molecular machines.
122 : parrello 1.4 SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
123 : parrello 1.1 # Delete the genome itself.
124 :     my $subStats = $sap->Delete(Genome => $genome);
125 :     # Accumulate the statistics from the delete.
126 :     $stats->Accumulate($subStats);
127 :     # Return the statistics object.
128 :     return $stats;
129 :     }
130 :    
131 :     =head2 Loader Object Methods
132 :    
133 :     =head3 new
134 :    
135 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
136 :    
137 :     Create a loader object that can be used to facilitate loading genome data from a
138 :     directory.
139 :    
140 :     =over 4
141 :    
142 :     =item sap
143 :    
144 :     L<Sapling> object used to access the target database.
145 :    
146 :     =item genome
147 :    
148 :     ID of the genome being loaded.
149 :    
150 :     =item directory
151 :    
152 :     Name of the directory containing the genome information.
153 :    
154 :     =back
155 :    
156 :     The object created contains the following fields.
157 :    
158 :     =over 4
159 :    
160 :     =item directory
161 :    
162 :     Name of the directory containing the genome data.
163 :    
164 :     =item genome
165 :    
166 :     ID of the genome being loaded.
167 :    
168 :     =item supportRecords
169 :    
170 :     A hash of hashes, used to track the support records known to exist in the database.
171 :    
172 :     =item sap
173 :    
174 :     L<Sapling> object used to access the database.
175 :    
176 :     =item stats
177 :    
178 :     L<Stats> object for tracking statistical information about the load.
179 :    
180 :     =back
181 :    
182 :     =cut
183 :    
184 :     sub new {
185 :     # Get the parameters.
186 :     my ($class, $sap, $genome, $directory) = @_;
187 :     # Create the object.
188 : parrello 1.3 my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
189 :     # Add our specialized data.
190 :     $retVal->{genome} = $genome;
191 :     $retVal->{directory} = $directory;
192 :     # Return the result.
193 : parrello 1.1 return $retVal;
194 :     }
195 :    
196 :     =head3 LoadContigs
197 :    
198 :     $loaderObject->LoadContigs();
199 :    
200 :     Load the contig information into the database. This includes the contigs themselves and
201 :     the DNA. The number of contigs will be recorded as the C<contigs> statistic and the
202 :     number of base pairs as the C<dna> statistic.
203 :    
204 :     =cut
205 :    
206 :     sub LoadContigs {
207 :     # Get this object.
208 :     my ($self) = @_;
209 :     # Open the contigs file from the genome directory.
210 :     my $ih = Open(undef, "<$self->{directory}/contigs");
211 :     # Get the length of a DNA segment.
212 :     my $segmentLength = $self->{sap}->TuningParameter('maxSequenceLength');
213 :     # Get the genome ID.
214 :     my $genome = $self->{genome};
215 :     # These variables will contain the current contig ID, the current contig length,
216 :     # the ordinal of the current chunk, the accumulated DNA sequence, and its length.
217 :     my ($contigID, $contigLen, $ordinal, $chunk, $chunkLen) = (undef, 0, 0, '', 0);
218 :     # Loop through the contig file.
219 :     while (! eof $ih) {
220 :     # Get the current record.
221 :     my $line = <$ih>; chomp $line;
222 :     # Is this the start of a new contig?
223 :     if ($line =~ /^>(\S+)/) {
224 :     # Yes. Save the contig ID.
225 :     my $newContigID = $1;
226 :     # Is there a current contig?
227 :     if (defined $contigID) {
228 :     # Yes. Output the current chunk.
229 :     $self->OutputChunk($contigID, $ordinal, $chunk);
230 :     # Output the contig.
231 :     $self->OutputContig($contigID, $contigLen);
232 :     }
233 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
234 :     $self->FixContig(\$newContigID);
235 :     # Initialize the new contig.
236 :     $contigID = $newContigID;
237 :     $contigLen = 0;
238 :     $ordinal = 0;
239 :     $chunk = '';
240 :     $chunkLen = 0;
241 :     } else {
242 :     # Here we have more DNA in the current contig. Are we at the end of
243 :     # the current chunk?
244 :     my $lineLen = length($line);
245 :     my $newChunkLen = $chunkLen + $lineLen;
246 :     if ($newChunkLen < $segmentLength) {
247 :     # No. Add this line to the chunk.
248 :     $chunk .= $line;
249 :     $chunkLen += $lineLen;
250 :     } else {
251 :     # Yes. Create the actual chunk.
252 :     $chunk .= substr($line, 0, $segmentLength - $chunkLen);
253 :     # Write it out.
254 :     $self->OutputChunk($contigID, $ordinal, $chunk);
255 :     # Set up the new chunk.
256 :     $chunk = substr($line, $segmentLength - $chunkLen);
257 :     $ordinal++;
258 :     $chunkLen = length($chunk);
259 :     }
260 :     # Update the contig length.
261 :     $contigLen += $lineLen;
262 :     }
263 :     }
264 :     # Is there a current contig?
265 :     if (defined $contigID) {
266 :     # Yes. Output the residual chunk.
267 :     $self->OutputChunk($contigID, $ordinal, $chunk);
268 :     # Output the contig itself.
269 :     $self->OutputContig($contigID, $contigLen);
270 :     }
271 :     }
272 :    
273 :     =head3 OutputChunk
274 :    
275 :     $loaderObject->OutputChunk($contigID, $ordinal, $chunk);
276 :    
277 :     Output a chunk of DNA for the specified contig.
278 :    
279 :     =over 4
280 :    
281 :     =item contigID
282 :    
283 :     ID of the contig being output.
284 :    
285 :     =item ordinal
286 :    
287 :     Ordinal number of this chunk.
288 :    
289 :     =item chunk
290 :    
291 :     DNA sequence comprising this chunk.
292 :    
293 :     =back
294 :    
295 :     =cut
296 :    
297 :     sub OutputChunk {
298 :     # Get the parameters.
299 :     my ($self, $contigID, $ordinal, $chunk) = @_;
300 :     # Get the sapling object.
301 :     my $sap = $self->{sap};
302 :     # Compute the chunk ID.
303 :     my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
304 :     # Connect this sequence to the contig.
305 : parrello 1.2 $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
306 : parrello 1.1 # Create the DNA sequence.
307 :     $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
308 :     # Record the chunk.
309 :     $self->{stats}->Add(chunks => 1);
310 :     }
311 :    
312 :     =head3 OutputContig
313 :    
314 :     $loaderObject->OutputContig($contigID, $contigLen);
315 :    
316 :     Write out the current contig.
317 :    
318 :     =over 4
319 :    
320 :     =item contigID
321 :    
322 :     ID of the contig being written out.
323 :    
324 :     =item contigLen
325 :    
326 :     Length of the contig in base pairs.
327 :    
328 :     =back
329 :    
330 :     =cut
331 :    
332 :     sub OutputContig {
333 :     # Get the parameters.
334 :     my ($self, $contigID, $contigLen) = @_;
335 :     # Get the sapling object.
336 :     my $sap = $self->{sap};
337 :     # Connect the contig to the genome.
338 :     $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
339 :     # Output the contig record.
340 :     $sap->InsertObject('Contig', id => $contigID, length => $contigLen);
341 :     # Record the contig.
342 :     $self->{stats}->Add(contigs => 1);
343 :     $self->{stats}->Add(dna => $contigLen);
344 :     }
345 :    
346 :     =head3 LoadFeatures
347 :    
348 :     $loaderObject->LoadFeatures();
349 :    
350 :     Load the feature data into the database. This includes all of the features,
351 :     their protein sequences, and their subsystem information. The number of
352 :     features of each type will be stored in the statistics object, identified by
353 :     the feature type.
354 :    
355 :     =cut
356 :    
357 :     sub LoadFeatures {
358 :     # Get the parameters.
359 :     my ($self) = @_;
360 :     # Get the directory of feature types.
361 :     my $featureDir = "$self->{directory}/Features";
362 :     my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
363 :     # Create the feature records for the types found.
364 :     for my $type (@types) {
365 :     # Insure this is a genuine feature directory.
366 :     if (-f "$featureDir/$type/tbl") {
367 :     # Yes, load the feature data.
368 :     $self->LoadFeatureData($featureDir, $type);
369 :     }
370 :     }
371 :     # Check for protein sequences. If we have some, load them into the database.
372 :     if (-f "$featureDir/peg/fasta") {
373 :     $self->LoadProteinData("$featureDir/peg/fasta");
374 :     }
375 :     }
376 :    
377 :     =head3 LoadFeatureData
378 :    
379 : parrello 1.3 $loaderObject->LoadFeatureData($featureDir, $type);
380 : parrello 1.1
381 :     Load the basic data for each feature into the database. The number of features of
382 :     the type found will be recorded in the statistics object.
383 :    
384 :     =over 4
385 :    
386 :     =item featureDir
387 :    
388 :     Name of the main directory containing all the feature type subdirectories.
389 :    
390 :     =item type
391 :    
392 :     Type of feature to load.
393 :    
394 :     =back
395 :    
396 :     =cut
397 :    
398 :     sub LoadFeatureData {
399 :     # Get the parameters.
400 :     my ($self, $featureDir, $type) = @_;
401 :     # Get the sapling database.
402 :     my $sap = $self->{sap};
403 :     # Get the maximum location segment length. We'll need this later.
404 :     my $maxLength = $sap->TuningParameter('maxLocationLength');
405 :     # Get the statistics object.
406 :     my $stats = $self->{stats};
407 :     # Read in the functional assignments.
408 :     my $assignHash = $self->ReadAssignments();
409 :     # This hash will track the features we've created. If a feature is found a second
410 :     # time, it overwrites the original.
411 :     my %fids;
412 :     # Insure we have a tbl file for this feature type.
413 :     my $fileName = "$featureDir/$type/tbl";
414 :     if (-f $fileName) {
415 : parrello 1.3 # We have one, so we can read through it. First, however, we need to get the list
416 :     # of deleted features.
417 :     my %deletedFids;
418 :     my $deleteFile = "$featureDir/$type/deleted.features";
419 :     if (-f $deleteFile) {
420 :     %deletedFids = map { $_ => 1 } Tracer::GetFile($deleteFile);
421 :     }
422 :     # Open the main file for input.
423 : parrello 1.1 my $ih = Open(undef, "<$fileName");
424 :     while (! eof $ih) {
425 :     # Read this feature's information.
426 :     my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
427 : parrello 1.3 # Only proceed if the feature is NOT deleted.
428 :     if (! exists $deletedFids{$fid}) {
429 :     # If the feature already exists, delete it. (This should be extremely rare.)
430 :     if (exists $fids{$fid}) {
431 :     $sap->Delete(Feature => $fid);
432 :     $stats->Add(duplicateFid => 1);
433 : parrello 1.1 }
434 : parrello 1.3 # Otherwise connect this feature to the genome.
435 :     $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);
436 :     # Now we must parse the locations. This will contain a list of the location
437 :     # data 4-tuples (contig, start, dir, len).
438 :     my @locData;
439 :     # This will contain the total sequence length.
440 :     my $length = 0;
441 :     # Loop through the locations.
442 :     for my $loc (split /\s*,\s*/, $locations) {
443 :     # Get this location's components.
444 :     my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);
445 :     # Normalize the contig.
446 :     $self->FixContig(\$contig);
447 :     # Compute the direction.
448 :     if ($start <= $stop) {
449 :     # Here we have the forward strand.
450 :     my $len = $stop + 1 - $start;
451 :     push @locData, [$contig, $start, '+', $len];
452 :     $length += $len;
453 : parrello 1.1 } else {
454 : parrello 1.3 # Here we have the reverse strand.
455 :     my $len = $start + 1 - $stop;
456 :     push @locData, [$contig, $stop, '-', $len];
457 :     $length += $len;
458 : parrello 1.1 }
459 :     }
460 : parrello 1.3 # Now we can create the feature record.
461 :     $sap->InsertObject('Feature', id => $fid, feature_type => $type,
462 :     function => $assignHash->{$fid}, locked => 0,
463 :     sequence_length => $length);
464 :     $fids{$fid} = 1;
465 :     $stats->Add($type => 1);
466 :     # The next step is to connect the feature record to its locations. This
467 :     # involves dividing the location into segments. The following variable
468 :     # will count the total number of segments.
469 :     my $segment = 0;
470 :     # Loop through the locations.
471 :     for my $loc (@locData) {
472 :     # Get the current location's information.
473 :     my ($contig, $left, $dir, $len) = @$loc;
474 :     # Peel off any segments.
475 :     while ($len > $maxLength) {
476 :     # Process according to the direction.
477 :     if ($dir eq '+') {
478 :     # Forward strand: peel from the beginning.
479 :     $self->ConnectLocation($fid, $contig, $segment, $left, $dir,
480 :     $maxLength);
481 :     $left += $maxLength;
482 :     } else {
483 :     # Reverse strand: peel from the end.
484 :     $self->ConnectLocation($fid, $contig, $segment,
485 :     $left + $len - $maxLength, $dir,
486 :     $maxLength);
487 :     }
488 :     # Denote we've used up a segment.
489 :     $len -= $maxLength;
490 :     $segment++;
491 :     }
492 :     # Output the last segment.
493 :     $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
494 :     }
495 :     # Now we process the aliases and create the identifiers. We don't do this
496 :     # for RNA, because the RNA function is stored in the aliases.
497 :     if ($type ne 'rna') {
498 :     for my $alias (@aliases) {
499 :     my $normalized;
500 :     # Determine the type.
501 :     my $aliasType = AliasAnalysis::TypeOf($alias);
502 :     $stats->Add(aliasAll => 1);
503 :     # Is this a recognized type?
504 :     if ($aliasType) {
505 :     $stats->Add(aliasNormal => 1);
506 :     # Yes. Write it normally.
507 :     $self->CreateIdentifier($alias, B => $aliasType, $fid);
508 :     } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {
509 :     # No, but this is a specially-marked locus tag.
510 :     $normalized = $1;
511 :     $stats->Add(aliasLocus => 1);
512 :     $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
513 :     } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {
514 :     # No, but this is a natural locus tag.
515 :     $stats->Add(aliasLocus => 1);
516 :     $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
517 :     } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {
518 :     # No, but this is a natural gene name.
519 :     $stats->Add(aliasGene => 1);
520 :     $self->CreateIdentifier($normalized, B => 'GENE', $fid);
521 :     } elsif ($alias =~ /^\d+$/) {
522 :     # Here it's a naked number, which means it's a GI number
523 :     # of some sort.
524 :     $stats->Add(aliasGI => 1);
525 :     $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);
526 :     } elsif ($alias =~ /^protein_id\|(.+)/) {
527 :     # Here we have a REFSEQ protein ID. Right now we don't have a way to
528 :     # handle that, because we don't know the feature's protein ID here.
529 :     $stats->Add(aliasProtein => 1);
530 :     } elsif ($alias =~ /[:|]/) {
531 :     # Here it's an alias of an unknown type, so we skip it.
532 :     $stats->Add(aliasUnknown => 1);
533 :     } else {
534 :     # Here it's a miscellaneous type.
535 :     $stats->Add(aliasMisc => 1);
536 :     $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);
537 :     }
538 : parrello 1.2 }
539 :     }
540 :     }
541 : parrello 1.1 }
542 :     }
543 : parrello 1.3 # Now loop through the features, connecting them to their roles. Note that deleted
544 :     # features will not be in the assignment hash.
545 :     for my $fid (keys %$assignHash) {
546 :     # Get the roles and the error count.
547 :     my ($roles, $errors) = SeedUtils::roles_for_loading($assignHash->{$fid});
548 :     # Accumulate the errors in the stats object.
549 :     $stats->Add(roleErrors => $errors);
550 :     # Is this a suspicious function?
551 :     if (! defined $roles) {
552 :     # Yes, so track it.
553 :     $stats->Add(badFunction => 1);
554 :     } else {
555 :     # No, connect the roles.
556 :     for my $role (@$roles) {
557 :     # Insure this role exists.
558 :     my $hypo = hypo($role);
559 :     $self->InsureEntity(Role => $role, hypothetical => $hypo);
560 :     # Connect it to the feature.
561 :     $sap->InsertObject('IsFunctionalIn', from_link => $role, to_link => $fid);
562 :     }
563 :     }
564 :     }
565 : parrello 1.1 }
566 :    
567 :     =head3 LoadProteinData
568 :    
569 :     $self->LoadProteinData($fileName);
570 :    
571 :     Load the protein sequences from the named FASTA file. The sequences will be stored
572 :     in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature
573 :     records themselves will not be created (it is presumed they are already there).
574 :    
575 :     =over 4
576 :    
577 :     =item fileName
578 :    
579 :     Name of the FASTA file containing the protein sequences for this genome.
580 :    
581 :     =back
582 :    
583 :     =cut
584 :    
585 :     sub LoadProteinData {
586 :     # Get the parameters.
587 :     my ($self, $fileName) = @_;
588 :     # Open the FASTA file for input.
589 :     my $ih = Open(undef, "<$fileName");
590 :     # We'll track the current protein in here.
591 :     my $fid;
592 :     my $sequence = "";
593 :     # Loop through the input file.
594 :     while (! eof $ih) {
595 :     # Get the current record.
596 :     my $line = <$ih>; chomp $line;
597 :     # Is this a label record.
598 :     if ($line =~ /^>(\S+)/) {
599 :     # Yes. Save the new feature ID.
600 :     my $newFid = $1;
601 :     # Do we have an existing protein?
602 :     if (defined $fid) {
603 :     # Yes. Write it out.
604 :     $self->WriteProtein($fid, $sequence);
605 :     }
606 :     # Initialize for the next protein.
607 :     $fid = $newFid;
608 :     $sequence = "";
609 :     } else {
610 :     # Here we have more letters for the current protein.
611 :     $sequence .= $line;
612 :     }
613 :     }
614 :     # Do we have a residual protein.
615 :     if (defined $fid) {
616 :     # Yes. Write it out.
617 :     $self->WriteProtein($fid, $sequence);
618 :     }
619 :     }
620 :    
621 :     =head3 WriteProtein
622 :    
623 :     $loaderObject->WriteProtein($fid, $sequence);
624 :    
625 :     Write out the specified protein sequence and associate it with the specified feature.
626 :     This requires checking to see if the protein sequence is already in the database.
627 :    
628 :     =over 4
629 :    
630 :     =item fid
631 :    
632 :     ID of the feature with the specified protein sequence.
633 :    
634 :     =item sequence
635 :    
636 :     Protein sequence for the indicated feature.
637 :    
638 :     =back
639 :    
640 :     =cut
641 :    
642 :     sub WriteProtein {
643 :     # Get the parameters.
644 :     my ($self, $fid, $sequence) = @_;
645 :     # Compute the key of the protein sequence.
646 :     my $protID = ERDB::DigestKey($sequence);
647 :     # Insure the protein exists.
648 :     $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
649 :     # Connect the feature to it.
650 :     $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
651 :     }
652 :    
653 :     =head3 LoadSubsystems
654 :    
655 :     $loaderObject->LoadSubsystems();
656 :    
657 :     Load the subsystem data into the database. This includes all of the subsystems,
658 :     their categories, roles, and the bindings that connect them to this genome's features.
659 :    
660 :     =cut
661 :    
662 :     sub LoadSubsystems {
663 :     # Get the parameters.
664 :     my ($self) = @_;
665 :     # Get the sapling object.
666 :     my $sap = $self->{sap};
667 :     # Get the statistics object.
668 :     my $stats = $self->{stats};
669 :     # Get the genome ID.
670 :     my $genome = $self->{genome};
671 :     # Connect to the Sapling server so we can get the subsystem variant and role information.
672 :     my $sapO = SAPserver->new();
673 :     # This hash maps subsystem IDs to molecular machine IDs.
674 :     my %machines;
675 :     # This hash maps subsystem/role pairs to machine role IDs.
676 :     my %machineRoles;
677 :     # The first step is to create the subsystems themselves and connect them to the
678 :     # genome. A list is given in the subsystems file of the Subsystems directory.
679 :     my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
680 :     # Create the field hash for the subsystem query.
681 :     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)
682 :     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)
683 :     Subsystem(version)
684 :     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)
685 :     Includes(sequence)
686 :     Role(id) Role(hypothetical) Role(role-index));
687 :     my %fields = map { $_ => $_ } @fields;
688 :     # Loop through the subsystems in the file, insuring we have them in the database.
689 :     while (! eof $ih) {
690 :     # Get this subsystem.
691 :     my ($subsystem, $variant) = Tracer::GetLine($ih);
692 :     # Normalize the subsystem name.
693 :     $subsystem = $sap->SubsystemID($subsystem);
694 :     # Compute this subsystem's MD5.
695 :     my $subsystemMD5 = ERDB::DigestKey($subsystem);
696 :     # Insure the subsystem is in the database.
697 :     if (! $sap->Exists(Subsystem => $subsystem)) {
698 :     # The subsystem isn't present, so we need to read it. We get the subsystem,
699 :     # its roles, and its classification information. The variant will be added
700 :     # later if we need it.
701 :     my $roleList = $sapO->get(-objects => "Subsystem Includes Role",
702 :     -filter => {"Subsystem(id)" => ["=", $subsystem] },
703 :     -fields => \%fields);
704 :     # Only proceed if we found some roles.
705 :     if (@$roleList > 0) {
706 :     # Get the subsystem information from the first role and create the subsystem.
707 :     my $roleH = $roleList->[0];
708 : parrello 1.3 my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
709 : parrello 1.1 $sap->InsertObject('Subsystem', %subFields);
710 :     # Now loop through the roles. The Includes records are always inserted, but the
711 :     # roles are only inserted if they don't already exist.
712 :     for $roleH (@$roleList) {
713 :     # Create the Includes record.
714 : parrello 1.3 my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);
715 : parrello 1.1 $sap->InsertObject('Includes', %incFields);
716 :     # Insure we have the role in place.
717 : parrello 1.3 my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);
718 : parrello 1.1 my $roleID = $roleFields{id};
719 :     delete $roleFields{id};
720 :     $self->InsureEntity('Role', $roleID, %roleFields);
721 :     # Compute the machine-role ID.
722 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
723 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
724 :     }
725 :     }
726 :     } else {
727 :     # We already have the subsystem in the database, but we still need to compute the
728 :     # machine role IDs. We need information from the sapling server for this.
729 :     my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);
730 :     # Loop through the roles.
731 :     my $roleList = $subHash->{$subsystem};
732 :     for my $roleTuple (@$roleList) {
733 :     my ($roleID, $abbr) = @$roleTuple;
734 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
735 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
736 :     }
737 :     }
738 :     # Now we need to connect the variant to the subsystem. First, we compute the real
739 :     # variant code by removing the asterisks.
740 :     my $variantCode = $variant;
741 :     $variantCode =~ s/^\*+//;
742 :     $variantCode =~ s/\*+$//;
743 :     # Compute the variant key.
744 :     my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
745 :     # Get the variant from the sapling server.
746 :     my $variantH = $sapO->get(-objects => "Variant",
747 :     -filter => {"Variant(id)" => ["=", $variantKey]},
748 :     -fields => {"Variant(code)" => "code",
749 :     "Variant(comment)" => "comment",
750 :     "Variant(type)" => "type",
751 :     "Variant(role-rule)" => "role-rule"},
752 :     -firstOnly => 1);
753 :     # Insure we have it in the database.
754 :     $self->InsureEntity('Variant', $variantKey, %$variantH);
755 :     $stats->Add(subsystems => 1);
756 :     # Connect it to the subsystem.
757 :     $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
758 :     # Now we create the molecular machine connecting this genome to the subsystem
759 :     # variant.
760 :     my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
761 :     $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
762 :     $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
763 :     $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID);
764 :     # Remember the machine ID.
765 :     $machines{$subsystem} = $machineID;
766 :     }
767 :     # Now we go through the bindings file. This file connects the subsystem roles to
768 :     # the molecular machines.
769 :     $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
770 :     # Loop through the bindings.
771 :     while (! eof $ih) {
772 :     # Get the binding data.
773 :     my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
774 :     # Normalize the subsystem name.
775 :     $subsystem = $sap->SubsystemID($subsystem);
776 :     # Compute the machine role.
777 :     my $machineRoleID = $machineRoles{$subsystem}{$role};
778 :     # Insure it exists.
779 :     my $created = $self->InsureEntity(MachineRole => $machineRoleID);
780 :     if ($created) {
781 :     # We created the machine role, so connect it to the machine.
782 :     my $machineID = $machines{$subsystem};
783 :     $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID);
784 :     # Connect it to the role, too.
785 :     $sap->InsertObject('IsRoleOf', from_link => $role, to_link => $machineRoleID);
786 :     }
787 :     # Connect the feature.
788 :     $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
789 :     }
790 :     }
791 :    
792 :     =head3 CreateGenome
793 :    
794 :     $loaderObject->CreateGenome();
795 :    
796 :     Create the genome record.
797 :    
798 :     =cut
799 :    
800 :     # This hash maps cache statistics to genome record fields.
801 :     use constant GENOME_FIELDS => {genome_name => 'scientific-name',
802 :     genome_domain => 'domain'};
803 :     # This hash maps domains to the prokaryotic flag.
804 :     use constant PROK_FLAG => {Bacteria => 1, Archaea => 1};
805 :    
806 :     sub CreateGenome {
807 :     # Get the parameters.
808 :     my ($self) = @_;
809 :     # Get the genome directory.
810 :     my $dir = $self->{directory};
811 :     # Get the sapling database.
812 :     my $sap = $self->{sap};
813 :     # We'll put the genome attributes in here.
814 :     my %fields;
815 :     # Check for a basic statistics file.
816 :     my $statsFile = "$dir/cache.basic_statistics";
817 :     if (-f $statsFile) {
818 :     # We have the statistics file, so read the major attributes from it.
819 :     my $ih = Open(undef, "<$statsFile");
820 :     while (! eof $ih) {
821 :     my ($key, $value) = Tracer::GetLine($ih);
822 :     my $fieldKey = GENOME_FIELDS->{$key};
823 :     if ($fieldKey) {
824 :     $fields{$fieldKey} = $value;
825 :     }
826 :     }
827 :     # Translate the domain.
828 :     $fields{domain} = ucfirst $fields{domain};
829 :     # Denote the genome is complete.
830 :     $fields{complete} = 1;
831 :     } else {
832 :     # Check to see if this genome is complete.
833 :     $fields{complete} = (-f "$dir/COMPLETE" ? 1 : 0);
834 :     # Get the genome name.
835 :     my $ih = Open(undef, "<$dir/GENOME");
836 :     my $line = <$ih>; chomp $line;
837 :     $fields{'scientific-name'} = $line;
838 :     # Get the taxonomy and extract the domain from it.
839 :     $ih = Open(undef, "<$dir/TAXONOMY");
840 :     ($fields{domain}) = split /;/, <$ih>, 2;
841 :     }
842 :     # Get the counts from the statistics object.
843 :     my $stats = $self->{stats};
844 :     $fields{contigs} = $stats->Ask('contigs');
845 :     $fields{'dna-size'} = $stats->Ask('dna');
846 :     $fields{pegs} = $stats->Ask('peg');
847 :     $fields{rnas} = $stats->Ask('rna');
848 :     # Get the genetic code. The default is 11.
849 :     $fields{'genetic-code'} = 11;
850 :     my $geneticCodeFile = "$dir/GENETIC_CODE";
851 :     if (-f $geneticCodeFile) {
852 :     # There's a genetic code file, so we need to read it for the code.
853 :     my $ih = Open(undef, "<$geneticCodeFile");
854 :     ($fields{'genetic-code'}) = Tracer::GetLine($ih);
855 :     }
856 :     # Use the domain to determine whether or not the genome is prokaryotic.
857 :     $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
858 :     # Finally, add the genome ID.
859 :     $fields{id} = $self->{genome};
860 :     # Create the genome record.
861 :     $sap->InsertObject('Genome', %fields);
862 :     }
863 :    
864 :     =head3 ReadAssignments
865 :    
866 :     my $assignHash = $loaderObject->ReadAssignments();
867 :    
868 :     Return a hash mapping each feature ID to its functional assignment. This method
869 :     essentially reads the B<assigned_functions> file into memory.
870 :    
871 :     =cut
872 :    
873 :     sub ReadAssignments {
874 :     # Get the parameters.
875 :     my ($self) = @_;
876 :     # Create the return hash.
877 :     my $retVal = {};
878 :     # Loop through the assigned-functions file, storing results in the hash. Later
879 :     # results will override earlier ones.
880 :     my $ih = Open(undef, "<$self->{directory}/assigned_functions");
881 :     while (! eof $ih) {
882 :     my ($fid, $function) = Tracer::GetLine($ih);
883 :     $retVal->{$fid} = $function;
884 :     }
885 :     # Return the hash built.
886 :     return $retVal;
887 :     }
888 :    
889 :    
890 :     =head3 FixContig
891 :    
892 :     $loaderObject->FixContig(\$contigID);
893 :    
894 :     Insure that the specified contig ID contains the genome ID.
895 :    
896 :     =cut
897 :    
898 :     sub FixContig {
899 :     my ($self, $contigID) = @_;
900 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
901 :     unless ($$contigID =~ /^\d+\.\d+\:/) {
902 :     $$contigID = "$self->{genome}:$$contigID";
903 :     }
904 :     }
905 :    
906 :     =head3 ConnectLocation
907 :    
908 :     $loaderObject->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
909 :    
910 :     Connect the specified feature to the specified contig location.
911 :    
912 :     =over 4
913 :    
914 :     =item fid
915 :    
916 :     ID of the relevant feature.
917 :    
918 :     =item contig
919 :    
920 :     ID of the contig containing this segment of the feature (normalized).
921 :    
922 :     =item segment
923 :    
924 :     Ordinal number of this segment.
925 :    
926 :     =item left
927 :    
928 :     Location of the segment's leftmost base pair.
929 :    
930 :     =item dir
931 :    
932 :     Direction (strand) of the segment (C<+> or C<->).
933 :    
934 :     =item len
935 :    
936 :     Length of the segment (which must be no greater than the configured maximum).
937 :    
938 :     =back
939 :    
940 :     =cut
941 :    
942 :     sub ConnectLocation {
943 :     # Get the parameters.
944 :     my ($self, $fid, $contig, $segment, $left, $dir, $len) = @_;
945 :     # Get the sapling database.
946 :     my $sap = $self->{sap};
947 :     # Create the relationship.
948 :     $sap->InsertObject('IsLocatedIn', from_link => $fid, to_link => $contig,
949 :     begin => $left, dir => $dir, len => $len,
950 :     ordinal => $segment);
951 :     # Record it in the statistics.
952 :     $self->{stats}->Add(segment => 1);
953 :     }
954 :    
955 :     =head2 Internal Utility Methods
956 :    
957 : parrello 1.2 =head3 CreateIdentifier
958 :    
959 :     $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);
960 :    
961 :     Link an identifier to a feature. The identifier is presented in prefixed form and is of the
962 :     specified type and the specified confidence level.
963 :    
964 :     =over 4
965 :    
966 :     =item alias
967 :    
968 :     Identifier to connect to the feature.
969 :    
970 :     =item conf
971 :    
972 :     Confidence level (C<A> curated, C<B> normal, C<C> protein only).
973 :    
974 :     =item aliasType
975 :    
976 :     Type of alias (e.g. C<NCBI>, C<LocusTag>).
977 :    
978 :     =item fid
979 :    
980 :     ID of the relevant feature.
981 :    
982 :     =back
983 :    
984 :     =cut
985 :    
986 :     sub CreateIdentifier {
987 :     # Get the parameters.
988 :     my ($self, $alias, $conf, $aliasType, $fid) = @_;
989 :     # Get the Sapling object.
990 :     my $sap = $self->{sap};
991 :     # Compute the identifier's natural form.
992 :     my $natural = $alias;
993 :     if ($natural =~ /[:|](.+)/) {
994 :     $natural = $1;
995 :     }
996 :     # Insure the identifier exists in the database.
997 :     $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);
998 :     # Connect the identifier to the feature.
999 : parrello 1.5 $sap->InsertObject('IsIdentifiedBy', to_link => $alias, from_link => $fid, conf => $conf);
1000 : parrello 1.2 }
1001 :    
1002 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3