[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingGenomeLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 :    
29 :     =head1 Sapling Genome Loader
30 :    
31 :     This package contains the methods used to load a genome into the Sapling from a genome
32 :     directory. This is not very efficient if you are trying to create a full database,
33 :     but is useful if you wish to add genomes one at a time. The information loaded will
34 :     include the basic genome data, the contigs and the DNA, subsystems, and the features.
35 :    
36 :     =head2 Main Methods
37 :    
38 :     =head3 Load
39 :    
40 :     my $stats = SaplingGenomeLoader::Load($sap, $genome, $directory);
41 :    
42 :     Load a genome from a genome directory into the sapling database.
43 :    
44 :     =over 4
45 :    
46 :     =item sap
47 :    
48 :     L<Sapling> object used to access the target database.
49 :    
50 :     =item genome
51 :    
52 :     ID of the genome being loaded.
53 :    
54 :     =item directory
55 :    
56 :     Name of the directory containing the genome information.
57 :    
58 :     =back
59 :    
60 :     =cut
61 :    
62 :     sub Load {
63 :     # Get the parameters.
64 :     my ($sap, $genome, $directory) = @_;
65 :     # Create the loader object.
66 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
67 :     # Load the contigs.
68 :     $loaderObject->LoadContigs();
69 :     # Load the features.
70 :     $loaderObject->LoadFeatures();
71 :     # Load the subsystem bindings.
72 :     $loaderObject->LoadSubsystems();
73 :     # Create the Genome record and taxonomy information.
74 :     $loaderObject->CreateGenome();
75 :     # Return the statistics.
76 :     return $loaderObject->{stats};
77 :     }
78 :    
79 :     =head3 ClearGenome
80 :    
81 :     my $stats = SaplingGenomeObject::ClearGenome($sap, $genome);
82 :    
83 :     Delete the specified genome and all the related records from the specified sapling
84 :     database. This method can also be used to clean up after a failed or aborted load.
85 :    
86 :     =over 4
87 :    
88 :     =item sap
89 :    
90 :     L<Sapling> object used to access the target database.
91 :    
92 :     =item genome
93 :    
94 :     ID of the genome to delete.
95 :    
96 :     =item RETURN
97 :    
98 :     Returns a statistics object counting the records deleted.
99 :    
100 :     =back
101 :    
102 :     =cut
103 :    
104 :     sub ClearGenome {
105 :     # Get the parameters.
106 :     my ($sap, $genome) = @_;
107 :     # Create the statistics object.
108 :     my $stats = Stats->new();
109 :     # Delete the DNA.
110 :     DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');
111 :     # Delete the contigs.
112 :     DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
113 :     # Delete the features.
114 :     DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
115 :     # Delete the molecular machines.
116 :     DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
117 :     # Delete the genome itself.
118 :     my $subStats = $sap->Delete(Genome => $genome);
119 :     # Accumulate the statistics from the delete.
120 :     $stats->Accumulate($subStats);
121 :     # Return the statistics object.
122 :     return $stats;
123 :     }
124 :    
125 :     =head2 Loader Object Methods
126 :    
127 :     =head3 new
128 :    
129 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
130 :    
131 :     Create a loader object that can be used to facilitate loading genome data from a
132 :     directory.
133 :    
134 :     =over 4
135 :    
136 :     =item sap
137 :    
138 :     L<Sapling> object used to access the target database.
139 :    
140 :     =item genome
141 :    
142 :     ID of the genome being loaded.
143 :    
144 :     =item directory
145 :    
146 :     Name of the directory containing the genome information.
147 :    
148 :     =back
149 :    
150 :     The object created contains the following fields.
151 :    
152 :     =over 4
153 :    
154 :     =item directory
155 :    
156 :     Name of the directory containing the genome data.
157 :    
158 :     =item genome
159 :    
160 :     ID of the genome being loaded.
161 :    
162 :     =item supportRecords
163 :    
164 :     A hash of hashes, used to track the support records known to exist in the database.
165 :    
166 :     =item sap
167 :    
168 :     L<Sapling> object used to access the database.
169 :    
170 :     =item stats
171 :    
172 :     L<Stats> object for tracking statistical information about the load.
173 :    
174 :     =back
175 :    
176 :     =cut
177 :    
178 :     sub new {
179 :     # Get the parameters.
180 :     my ($class, $sap, $genome, $directory) = @_;
181 :     # Create the object.
182 :     my $retVal = {
183 :     sap => $sap,
184 :     genome => $genome,
185 :     directory => $directory,
186 :     stats => Stats->new(qw(contigs dna pegs rnas)),
187 :     supportRecords => {}
188 :     };
189 :     # Bless and return it.
190 :     bless $retVal, $class;
191 :     return $retVal;
192 :     }
193 :    
194 :     =head3 LoadContigs
195 :    
196 :     $loaderObject->LoadContigs();
197 :    
198 :     Load the contig information into the database. This includes the contigs themselves and
199 :     the DNA. The number of contigs will be recorded as the C<contigs> statistic and the
200 :     number of base pairs as the C<dna> statistic.
201 :    
202 :     =cut
203 :    
204 :     sub LoadContigs {
205 :     # Get this object.
206 :     my ($self) = @_;
207 :     # Open the contigs file from the genome directory.
208 :     my $ih = Open(undef, "<$self->{directory}/contigs");
209 :     # Get the length of a DNA segment.
210 :     my $segmentLength = $self->{sap}->TuningParameter('maxSequenceLength');
211 :     # Get the genome ID.
212 :     my $genome = $self->{genome};
213 :     # These variables will contain the current contig ID, the current contig length,
214 :     # the ordinal of the current chunk, the accumulated DNA sequence, and its length.
215 :     my ($contigID, $contigLen, $ordinal, $chunk, $chunkLen) = (undef, 0, 0, '', 0);
216 :     # Loop through the contig file.
217 :     while (! eof $ih) {
218 :     # Get the current record.
219 :     my $line = <$ih>; chomp $line;
220 :     # Is this the start of a new contig?
221 :     if ($line =~ /^>(\S+)/) {
222 :     # Yes. Save the contig ID.
223 :     my $newContigID = $1;
224 :     # Is there a current contig?
225 :     if (defined $contigID) {
226 :     # Yes. Output the current chunk.
227 :     $self->OutputChunk($contigID, $ordinal, $chunk);
228 :     # Output the contig.
229 :     $self->OutputContig($contigID, $contigLen);
230 :     }
231 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
232 :     $self->FixContig(\$newContigID);
233 :     # Initialize the new contig.
234 :     $contigID = $newContigID;
235 :     $contigLen = 0;
236 :     $ordinal = 0;
237 :     $chunk = '';
238 :     $chunkLen = 0;
239 :     } else {
240 :     # Here we have more DNA in the current contig. Are we at the end of
241 :     # the current chunk?
242 :     my $lineLen = length($line);
243 :     my $newChunkLen = $chunkLen + $lineLen;
244 :     if ($newChunkLen < $segmentLength) {
245 :     # No. Add this line to the chunk.
246 :     $chunk .= $line;
247 :     $chunkLen += $lineLen;
248 :     } else {
249 :     # Yes. Create the actual chunk.
250 :     $chunk .= substr($line, 0, $segmentLength - $chunkLen);
251 :     # Write it out.
252 :     $self->OutputChunk($contigID, $ordinal, $chunk);
253 :     # Set up the new chunk.
254 :     $chunk = substr($line, $segmentLength - $chunkLen);
255 :     $ordinal++;
256 :     $chunkLen = length($chunk);
257 :     }
258 :     # Update the contig length.
259 :     $contigLen += $lineLen;
260 :     }
261 :     }
262 :     # Is there a current contig?
263 :     if (defined $contigID) {
264 :     # Yes. Output the residual chunk.
265 :     $self->OutputChunk($contigID, $ordinal, $chunk);
266 :     # Output the contig itself.
267 :     $self->OutputContig($contigID, $contigLen);
268 :     }
269 :     }
270 :    
271 :     =head3 OutputChunk
272 :    
273 :     $loaderObject->OutputChunk($contigID, $ordinal, $chunk);
274 :    
275 :     Output a chunk of DNA for the specified contig.
276 :    
277 :     =over 4
278 :    
279 :     =item contigID
280 :    
281 :     ID of the contig being output.
282 :    
283 :     =item ordinal
284 :    
285 :     Ordinal number of this chunk.
286 :    
287 :     =item chunk
288 :    
289 :     DNA sequence comprising this chunk.
290 :    
291 :     =back
292 :    
293 :     =cut
294 :    
295 :     sub OutputChunk {
296 :     # Get the parameters.
297 :     my ($self, $contigID, $ordinal, $chunk) = @_;
298 :     # Get the sapling object.
299 :     my $sap = $self->{sap};
300 :     # Compute the chunk ID.
301 :     my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
302 :     # Connect this sequence to the contig.
303 :     $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunk);
304 :     # Create the DNA sequence.
305 :     $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
306 :     # Record the chunk.
307 :     $self->{stats}->Add(chunks => 1);
308 :     }
309 :    
310 :     =head3 OutputContig
311 :    
312 :     $loaderObject->OutputContig($contigID, $contigLen);
313 :    
314 :     Write out the current contig.
315 :    
316 :     =over 4
317 :    
318 :     =item contigID
319 :    
320 :     ID of the contig being written out.
321 :    
322 :     =item contigLen
323 :    
324 :     Length of the contig in base pairs.
325 :    
326 :     =back
327 :    
328 :     =cut
329 :    
330 :     sub OutputContig {
331 :     # Get the parameters.
332 :     my ($self, $contigID, $contigLen) = @_;
333 :     # Get the sapling object.
334 :     my $sap = $self->{sap};
335 :     # Connect the contig to the genome.
336 :     $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
337 :     # Output the contig record.
338 :     $sap->InsertObject('Contig', id => $contigID, length => $contigLen);
339 :     # Record the contig.
340 :     $self->{stats}->Add(contigs => 1);
341 :     $self->{stats}->Add(dna => $contigLen);
342 :     }
343 :    
344 :     =head3 LoadFeatures
345 :    
346 :     $loaderObject->LoadFeatures();
347 :    
348 :     Load the feature data into the database. This includes all of the features,
349 :     their protein sequences, and their subsystem information. The number of
350 :     features of each type will be stored in the statistics object, identified by
351 :     the feature type.
352 :    
353 :     =cut
354 :    
355 :     sub LoadFeatures {
356 :     # Get the parameters.
357 :     my ($self) = @_;
358 :     # Get the directory of feature types.
359 :     my $featureDir = "$self->{directory}/Features";
360 :     my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
361 :     # Create the feature records for the types found.
362 :     for my $type (@types) {
363 :     # Insure this is a genuine feature directory.
364 :     if (-f "$featureDir/$type/tbl") {
365 :     # Yes, load the feature data.
366 :     $self->LoadFeatureData($featureDir, $type);
367 :     }
368 :     }
369 :     # Check for protein sequences. If we have some, load them into the database.
370 :     if (-f "$featureDir/peg/fasta") {
371 :     $self->LoadProteinData("$featureDir/peg/fasta");
372 :     }
373 :     }
374 :    
375 :     =head3 LoadFeatureData
376 :    
377 :     $self->LoadFeatureData($featureDir, $type);
378 :    
379 :     Load the basic data for each feature into the database. The number of features of
380 :     the type found will be recorded in the statistics object.
381 :    
382 :     =over 4
383 :    
384 :     =item featureDir
385 :    
386 :     Name of the main directory containing all the feature type subdirectories.
387 :    
388 :     =item type
389 :    
390 :     Type of feature to load.
391 :    
392 :     =back
393 :    
394 :     =cut
395 :    
396 :     sub LoadFeatureData {
397 :     # Get the parameters.
398 :     my ($self, $featureDir, $type) = @_;
399 :     # Get the sapling database.
400 :     my $sap = $self->{sap};
401 :     # Get the maximum location segment length. We'll need this later.
402 :     my $maxLength = $sap->TuningParameter('maxLocationLength');
403 :     # Get the statistics object.
404 :     my $stats = $self->{stats};
405 :     # Read in the functional assignments.
406 :     my $assignHash = $self->ReadAssignments();
407 :     # This hash will track the features we've created. If a feature is found a second
408 :     # time, it overwrites the original.
409 :     my %fids;
410 :     # Insure we have a tbl file for this feature type.
411 :     my $fileName = "$featureDir/$type/tbl";
412 :     if (-f $fileName) {
413 :     # We have one, so open it for input.
414 :     my $ih = Open(undef, "<$fileName");
415 :     while (! eof $ih) {
416 :     # Read this feature's information.
417 :     my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
418 :     # If the feature already exists, delete it.
419 :     if (exists $fids{$fid}) {
420 :     $sap->Delete(Feature => $fid);
421 :     $stats->Add(duplicateFid => 1);
422 :     } else {
423 :     # Otherwise connect it to the genome.
424 :     $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);
425 :     }
426 :     # Now we must parse the locations. This will contain a list of the location
427 :     # data 4-tuples (contig, start, dir, len).
428 :     my @locData;
429 :     # This will contain the total sequence length.
430 :     my $length = 0;
431 :     # Loop through the locations.
432 :     for my $loc (split /\s*,\s*/, $locations) {
433 :     # Get this location's components.
434 :     my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);
435 :     # Normalize the contig.
436 :     $self->FixContig(\$contig);
437 :     # Compute the direction.
438 :     if ($start <= $stop) {
439 :     # Here we have the forward strand.
440 :     my $len = $stop + 1 - $start;
441 :     push @locData, [$contig, $start, '+', $len];
442 :     $length += $len;
443 :     } else {
444 :     # Here we have the reverse strand.
445 :     my $len = $start + 1 - $stop;
446 :     push @locData, [$contig, $stop, '-', $len];
447 :     $length += $len;
448 :     }
449 :     }
450 :     # Now we can create the feature record.
451 :     $sap->InsertObject('Feature', id => $fid, feature_type => $type,
452 :     function => $assignHash->{$fid}, locked => 0,
453 :     sequence_length => $length);
454 :     $fids{$fid} = 1;
455 :     $stats->Add($type => 1);
456 :     # The next step is to connect the feature record to its locations. This
457 :     # involves dividing the location into segments. The following variable
458 :     # will count the total number of segments.
459 :     my $segment = 0;
460 :     # Loop through the locations.
461 :     for my $loc (@locData) {
462 :     # Get the current location's information.
463 :     my ($contig, $left, $dir, $len) = @$loc;
464 :     # Peel off any segments.
465 :     while ($len > $maxLength) {
466 :     # Process according to the direction.
467 :     if ($dir eq '+') {
468 :     # Forward strand: peel from the beginning.
469 :     $self->ConnectLocation($fid, $contig, $segment, $left, $dir,
470 :     $maxLength);
471 :     $left += $maxLength;
472 :     } else {
473 :     # Reverse strand: peel from the end.
474 :     $self->ConnectLocation($fid, $contig, $segment,
475 :     $left + $len - $maxLength, $dir,
476 :     $maxLength);
477 :     }
478 :     # Denote we've used up a segment.
479 :     $len -= $maxLength;
480 :     $segment++;
481 :     }
482 :     # Output the last segment.
483 :     $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
484 :     }
485 :     }
486 :     }
487 :     }
488 :    
489 :     =head3 LoadProteinData
490 :    
491 :     $self->LoadProteinData($fileName);
492 :    
493 :     Load the protein sequences from the named FASTA file. The sequences will be stored
494 :     in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature
495 :     records themselves will not be created (it is presumed they are already there).
496 :    
497 :     =over 4
498 :    
499 :     =item fileName
500 :    
501 :     Name of the FASTA file containing the protein sequences for this genome.
502 :    
503 :     =back
504 :    
505 :     =cut
506 :    
507 :     sub LoadProteinData {
508 :     # Get the parameters.
509 :     my ($self, $fileName) = @_;
510 :     # Open the FASTA file for input.
511 :     my $ih = Open(undef, "<$fileName");
512 :     # We'll track the current protein in here.
513 :     my $fid;
514 :     my $sequence = "";
515 :     # Loop through the input file.
516 :     while (! eof $ih) {
517 :     # Get the current record.
518 :     my $line = <$ih>; chomp $line;
519 :     # Is this a label record.
520 :     if ($line =~ /^>(\S+)/) {
521 :     # Yes. Save the new feature ID.
522 :     my $newFid = $1;
523 :     # Do we have an existing protein?
524 :     if (defined $fid) {
525 :     # Yes. Write it out.
526 :     $self->WriteProtein($fid, $sequence);
527 :     }
528 :     # Initialize for the next protein.
529 :     $fid = $newFid;
530 :     $sequence = "";
531 :     } else {
532 :     # Here we have more letters for the current protein.
533 :     $sequence .= $line;
534 :     }
535 :     }
536 :     # Do we have a residual protein.
537 :     if (defined $fid) {
538 :     # Yes. Write it out.
539 :     $self->WriteProtein($fid, $sequence);
540 :     }
541 :     }
542 :    
543 :     =head3 WriteProtein
544 :    
545 :     $loaderObject->WriteProtein($fid, $sequence);
546 :    
547 :     Write out the specified protein sequence and associate it with the specified feature.
548 :     This requires checking to see if the protein sequence is already in the database.
549 :    
550 :     =over 4
551 :    
552 :     =item fid
553 :    
554 :     ID of the feature with the specified protein sequence.
555 :    
556 :     =item sequence
557 :    
558 :     Protein sequence for the indicated feature.
559 :    
560 :     =back
561 :    
562 :     =cut
563 :    
564 :     sub WriteProtein {
565 :     # Get the parameters.
566 :     my ($self, $fid, $sequence) = @_;
567 :     # Compute the key of the protein sequence.
568 :     my $protID = ERDB::DigestKey($sequence);
569 :     # Insure the protein exists.
570 :     $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
571 :     # Connect the feature to it.
572 :     $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
573 :     }
574 :    
575 :     =head3 InsureEntity
576 :    
577 :     my $createdFlag = $loaderObject->InsureEntity($entityType => $id, %fields);
578 :    
579 :     Insure that the specified record exists in the database. If no record is found of the
580 :     specified type with the specified ID, one will be created with the indicated fields.
581 :    
582 :     =over 4
583 :    
584 :     =item $entityType
585 :    
586 :     Type of entity to check.
587 :    
588 :     =item id
589 :    
590 :     ID of the entity instance in question.
591 :    
592 :     =item fields
593 :    
594 :     Hash mapping field names to values for all the fields in the desired entity record except
595 :     for the ID.
596 :    
597 :     =item RETURN
598 :    
599 :     Returns TRUE if a new object was created, FALSE if it already existed.
600 :    
601 :     =back
602 :    
603 :     =cut
604 :    
605 :     sub InsureEntity {
606 :     # Get the parameters.
607 :     my ($self, $entityType, $id, %fields) = @_;
608 :     # Get the database.
609 :     my $sap = $self->{sap};
610 :     # Get the support record ID hash.
611 :     my $supportHash = $self->{supportRecords};
612 :     # Denote we haven't created a new record.
613 :     my $retVal = 0;
614 :     # Get the sub-hash for this entity type.
615 :     my $entityHash = $supportHash->{$entityType};
616 :     if (! defined $entityHash) {
617 :     $entityHash = {};
618 :     $supportHash->{$entityType} = $entityHash;
619 :     }
620 :     # Check for this instance.
621 :     if (! $entityHash->{$id}) {
622 :     # It's not found. Check the database.
623 :     if (! $sap->Exists($entityType => $id)) {
624 :     # It's not in the database either, so create it.
625 :     $sap->InsertObject($entityType, id => $id, %fields);
626 :     $self->{stats}->Add(insertSupport => 1);
627 :     $retVal = 1;
628 :     }
629 :     # Mark the record in the hash so we know we have it.
630 :     $entityHash->{$id} = 1;
631 :     }
632 :     # Return the insertion indicator.
633 :     return $retVal;
634 :     }
635 :    
636 :     =head3 LoadSubsystems
637 :    
638 :     $loaderObject->LoadSubsystems();
639 :    
640 :     Load the subsystem data into the database. This includes all of the subsystems,
641 :     their categories, roles, and the bindings that connect them to this genome's features.
642 :    
643 :     =cut
644 :    
645 :     sub LoadSubsystems {
646 :     # Get the parameters.
647 :     my ($self) = @_;
648 :     # Get the sapling object.
649 :     my $sap = $self->{sap};
650 :     # Get the statistics object.
651 :     my $stats = $self->{stats};
652 :     # Get the genome ID.
653 :     my $genome = $self->{genome};
654 :     # Connect to the Sapling server so we can get the subsystem variant and role information.
655 :     my $sapO = SAPserver->new();
656 :     # This hash maps subsystem IDs to molecular machine IDs.
657 :     my %machines;
658 :     # This hash maps subsystem/role pairs to machine role IDs.
659 :     my %machineRoles;
660 :     # The first step is to create the subsystems themselves and connect them to the
661 :     # genome. A list is given in the subsystems file of the Subsystems directory.
662 :     my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
663 :     # Create the field hash for the subsystem query.
664 :     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)
665 :     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)
666 :     Subsystem(version)
667 :     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)
668 :     Includes(sequence)
669 :     Role(id) Role(hypothetical) Role(role-index));
670 :     my %fields = map { $_ => $_ } @fields;
671 :     # Loop through the subsystems in the file, insuring we have them in the database.
672 :     while (! eof $ih) {
673 :     # Get this subsystem.
674 :     my ($subsystem, $variant) = Tracer::GetLine($ih);
675 :     # Normalize the subsystem name.
676 :     $subsystem = $sap->SubsystemID($subsystem);
677 :     # Compute this subsystem's MD5.
678 :     my $subsystemMD5 = ERDB::DigestKey($subsystem);
679 :     # Insure the subsystem is in the database.
680 :     if (! $sap->Exists(Subsystem => $subsystem)) {
681 :     # The subsystem isn't present, so we need to read it. We get the subsystem,
682 :     # its roles, and its classification information. The variant will be added
683 :     # later if we need it.
684 :     my $roleList = $sapO->get(-objects => "Subsystem Includes Role",
685 :     -filter => {"Subsystem(id)" => ["=", $subsystem] },
686 :     -fields => \%fields);
687 :     # Only proceed if we found some roles.
688 :     if (@$roleList > 0) {
689 :     # Get the subsystem information from the first role and create the subsystem.
690 :     my $roleH = $roleList->[0];
691 :     my %subFields = ExtractFields(Subsystem => $roleH);
692 :     $sap->InsertObject('Subsystem', %subFields);
693 :     # Now loop through the roles. The Includes records are always inserted, but the
694 :     # roles are only inserted if they don't already exist.
695 :     for $roleH (@$roleList) {
696 :     # Create the Includes record.
697 :     my %incFields = ExtractFields(Includes => $roleH);
698 :     $sap->InsertObject('Includes', %incFields);
699 :     # Insure we have the role in place.
700 :     my %roleFields = ExtractFields(Role => $roleH);
701 :     my $roleID = $roleFields{id};
702 :     delete $roleFields{id};
703 :     $self->InsureEntity('Role', $roleID, %roleFields);
704 :     # Compute the machine-role ID.
705 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
706 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
707 :     }
708 :     }
709 :     } else {
710 :     # We already have the subsystem in the database, but we still need to compute the
711 :     # machine role IDs. We need information from the sapling server for this.
712 :     my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);
713 :     # Loop through the roles.
714 :     my $roleList = $subHash->{$subsystem};
715 :     for my $roleTuple (@$roleList) {
716 :     my ($roleID, $abbr) = @$roleTuple;
717 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
718 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
719 :     }
720 :     }
721 :     # Now we need to connect the variant to the subsystem. First, we compute the real
722 :     # variant code by removing the asterisks.
723 :     my $variantCode = $variant;
724 :     $variantCode =~ s/^\*+//;
725 :     $variantCode =~ s/\*+$//;
726 :     # Compute the variant key.
727 :     my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
728 :     # Get the variant from the sapling server.
729 :     my $variantH = $sapO->get(-objects => "Variant",
730 :     -filter => {"Variant(id)" => ["=", $variantKey]},
731 :     -fields => {"Variant(code)" => "code",
732 :     "Variant(comment)" => "comment",
733 :     "Variant(type)" => "type",
734 :     "Variant(role-rule)" => "role-rule"},
735 :     -firstOnly => 1);
736 :     # Insure we have it in the database.
737 :     $self->InsureEntity('Variant', $variantKey, %$variantH);
738 :     $stats->Add(subsystems => 1);
739 :     # Connect it to the subsystem.
740 :     $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
741 :     # Now we create the molecular machine connecting this genome to the subsystem
742 :     # variant.
743 :     my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
744 :     $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
745 :     $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
746 :     $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID);
747 :     # Remember the machine ID.
748 :     $machines{$subsystem} = $machineID;
749 :     }
750 :     # Now we go through the bindings file. This file connects the subsystem roles to
751 :     # the molecular machines.
752 :     $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
753 :     # Loop through the bindings.
754 :     while (! eof $ih) {
755 :     # Get the binding data.
756 :     my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
757 :     # Normalize the subsystem name.
758 :     $subsystem = $sap->SubsystemID($subsystem);
759 :     # Compute the machine role.
760 :     my $machineRoleID = $machineRoles{$subsystem}{$role};
761 :     # Insure it exists.
762 :     my $created = $self->InsureEntity(MachineRole => $machineRoleID);
763 :     if ($created) {
764 :     # We created the machine role, so connect it to the machine.
765 :     my $machineID = $machines{$subsystem};
766 :     $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID);
767 :     # Connect it to the role, too.
768 :     $sap->InsertObject('IsRoleOf', from_link => $role, to_link => $machineRoleID);
769 :     }
770 :     # Connect the feature.
771 :     $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
772 :     }
773 :     }
774 :    
775 :     =head3 CreateGenome
776 :    
777 :     $loaderObject->CreateGenome();
778 :    
779 :     Create the genome record.
780 :    
781 :     =cut
782 :    
783 :     # This hash maps cache statistics to genome record fields.
784 :     use constant GENOME_FIELDS => {genome_name => 'scientific-name',
785 :     genome_domain => 'domain'};
786 :     # This hash maps domains to the prokaryotic flag.
787 :     use constant PROK_FLAG => {Bacteria => 1, Archaea => 1};
788 :    
789 :     sub CreateGenome {
790 :     # Get the parameters.
791 :     my ($self) = @_;
792 :     # Get the genome directory.
793 :     my $dir = $self->{directory};
794 :     # Get the sapling database.
795 :     my $sap = $self->{sap};
796 :     # We'll put the genome attributes in here.
797 :     my %fields;
798 :     # Check for a basic statistics file.
799 :     my $statsFile = "$dir/cache.basic_statistics";
800 :     if (-f $statsFile) {
801 :     # We have the statistics file, so read the major attributes from it.
802 :     my $ih = Open(undef, "<$statsFile");
803 :     while (! eof $ih) {
804 :     my ($key, $value) = Tracer::GetLine($ih);
805 :     my $fieldKey = GENOME_FIELDS->{$key};
806 :     if ($fieldKey) {
807 :     $fields{$fieldKey} = $value;
808 :     }
809 :     }
810 :     # Translate the domain.
811 :     $fields{domain} = ucfirst $fields{domain};
812 :     # Denote the genome is complete.
813 :     $fields{complete} = 1;
814 :     } else {
815 :     # Check to see if this genome is complete.
816 :     $fields{complete} = (-f "$dir/COMPLETE" ? 1 : 0);
817 :     # Get the genome name.
818 :     my $ih = Open(undef, "<$dir/GENOME");
819 :     my $line = <$ih>; chomp $line;
820 :     $fields{'scientific-name'} = $line;
821 :     # Get the taxonomy and extract the domain from it.
822 :     $ih = Open(undef, "<$dir/TAXONOMY");
823 :     ($fields{domain}) = split /;/, <$ih>, 2;
824 :     }
825 :     # Get the counts from the statistics object.
826 :     my $stats = $self->{stats};
827 :     $fields{contigs} = $stats->Ask('contigs');
828 :     $fields{'dna-size'} = $stats->Ask('dna');
829 :     $fields{pegs} = $stats->Ask('peg');
830 :     $fields{rnas} = $stats->Ask('rna');
831 :     # Get the genetic code. The default is 11.
832 :     $fields{'genetic-code'} = 11;
833 :     my $geneticCodeFile = "$dir/GENETIC_CODE";
834 :     if (-f $geneticCodeFile) {
835 :     # There's a genetic code file, so we need to read it for the code.
836 :     my $ih = Open(undef, "<$geneticCodeFile");
837 :     ($fields{'genetic-code'}) = Tracer::GetLine($ih);
838 :     }
839 :     # Use the domain to determine whether or not the genome is prokaryotic.
840 :     $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
841 :     # Finally, add the genome ID.
842 :     $fields{id} = $self->{genome};
843 :     # Create the genome record.
844 :     $sap->InsertObject('Genome', %fields);
845 :     }
846 :    
847 :     =head3 ReadAssignments
848 :    
849 :     my $assignHash = $loaderObject->ReadAssignments();
850 :    
851 :     Return a hash mapping each feature ID to its functional assignment. This method
852 :     essentially reads the B<assigned_functions> file into memory.
853 :    
854 :     =cut
855 :    
856 :     sub ReadAssignments {
857 :     # Get the parameters.
858 :     my ($self) = @_;
859 :     # Create the return hash.
860 :     my $retVal = {};
861 :     # Loop through the assigned-functions file, storing results in the hash. Later
862 :     # results will override earlier ones.
863 :     my $ih = Open(undef, "<$self->{directory}/assigned_functions");
864 :     while (! eof $ih) {
865 :     my ($fid, $function) = Tracer::GetLine($ih);
866 :     $retVal->{$fid} = $function;
867 :     }
868 :     # Return the hash built.
869 :     return $retVal;
870 :     }
871 :    
872 :    
873 :     =head3 FixContig
874 :    
875 :     $loaderObject->FixContig(\$contigID);
876 :    
877 :     Insure that the specified contig ID contains the genome ID.
878 :    
879 :     =cut
880 :    
881 :     sub FixContig {
882 :     my ($self, $contigID) = @_;
883 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
884 :     unless ($$contigID =~ /^\d+\.\d+\:/) {
885 :     $$contigID = "$self->{genome}:$$contigID";
886 :     }
887 :     }
888 :    
889 :     =head3 ConnectLocation
890 :    
891 :     $loaderObject->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
892 :    
893 :     Connect the specified feature to the specified contig location.
894 :    
895 :     =over 4
896 :    
897 :     =item fid
898 :    
899 :     ID of the relevant feature.
900 :    
901 :     =item contig
902 :    
903 :     ID of the contig containing this segment of the feature (normalized).
904 :    
905 :     =item segment
906 :    
907 :     Ordinal number of this segment.
908 :    
909 :     =item left
910 :    
911 :     Location of the segment's leftmost base pair.
912 :    
913 :     =item dir
914 :    
915 :     Direction (strand) of the segment (C<+> or C<->).
916 :    
917 :     =item len
918 :    
919 :     Length of the segment (which must be no greater than the configured maximum).
920 :    
921 :     =back
922 :    
923 :     =cut
924 :    
925 :     sub ConnectLocation {
926 :     # Get the parameters.
927 :     my ($self, $fid, $contig, $segment, $left, $dir, $len) = @_;
928 :     # Get the sapling database.
929 :     my $sap = $self->{sap};
930 :     # Create the relationship.
931 :     $sap->InsertObject('IsLocatedIn', from_link => $fid, to_link => $contig,
932 :     begin => $left, dir => $dir, len => $len,
933 :     ordinal => $segment);
934 :     # Record it in the statistics.
935 :     $self->{stats}->Add(segment => 1);
936 :     }
937 :    
938 :     =head2 Internal Utility Methods
939 :    
940 :     =head3 DeleteRelatedRecords
941 :    
942 :     DeleteRelatedRecords($sap, $genome, $stats, $relName, $entityName);
943 :    
944 :     Delete all the records in the named entity and relationship relating to the
945 :     specified genome and roll up the statistics in the specified statistics object.
946 :    
947 :     =over 4
948 :    
949 :     =item sap
950 :    
951 :     L<Sapling> object for accessing the database.
952 :    
953 :     =item genome
954 :    
955 :     ID of the relevant genome.
956 :    
957 :     =item stats
958 :    
959 :     L<Stats> object for tracking the delete activity.
960 :    
961 :     =item relName
962 :    
963 :     Name of a relationship from the B<Genome> table.
964 :    
965 :     =item entityName
966 :    
967 :     Name of the entity on the other side of the relationship.
968 :    
969 :     =back
970 :    
971 :     =cut
972 :    
973 :     sub DeleteRelatedRecords {
974 :     # Get the parameters.
975 :     my ($sap, $genome, $stats, $relName, $entityName) = @_;
976 :     # Get all the relationship records.
977 :     my (@targets) = $sap->GetFlat($relName, "$relName(from-link) = ?", [$genome],
978 :     "to-link");
979 :     # Loop through the relationship records, deleting them and the target entity
980 :     # records.
981 :     for my $target (@targets) {
982 :     # Delete the relationship instance.
983 :     $sap->DeleteRow($relName, $genome, $target);
984 :     $stats->Add($relName => 1);
985 :     # Delete the entity instance.
986 :     my $subStats = $sap->Delete($entityName, $target);
987 :     # Roll up the statistics.
988 :     $stats->Accumulate($subStats);
989 :     }
990 :     }
991 :    
992 :     =head3 ExtractFields
993 :    
994 :     my %fieldHash = SaplingGenomeLoader::ExtractFields($tableName, $dataHash);
995 :    
996 :     Extract from the incoming hash the field names and values from the specified table.
997 :    
998 :     =over 4
999 :    
1000 :     =item tableName
1001 :    
1002 :     Name of the table whose field names and values are desired.
1003 :    
1004 :     =item dataHash
1005 :    
1006 :     Reference to a hash mapping fully-qualified ERDB field names to values.
1007 :    
1008 :     =item RETURN
1009 :    
1010 :     Returns a hash containing only the fields from the specified table and their values.
1011 :    
1012 :     =back
1013 :    
1014 :     =cut
1015 :    
1016 :     sub ExtractFields {
1017 :     # Get the parameters.
1018 :     my ($tableName, $dataHash) = @_;
1019 :     # Declare the return variable.
1020 :     my %retVal;
1021 :     # Extract the desired fields.
1022 :     for my $field (keys %$dataHash) {
1023 :     # Is this a field for the specified table?
1024 :     if ($field =~ /^$tableName\(([^)]+)/) {
1025 :     # Yes, put it in the output hash.
1026 :     $retVal{$1} = $dataHash->{$field};
1027 :     }
1028 :     }
1029 :     # Return the computed hash.
1030 :     return %retVal;
1031 :     }
1032 :    
1033 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3