[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Annotation of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     package SaplingGenomeLoader;
21 :    
22 :     use strict;
23 :     use Tracer;
24 :     use Stats;
25 :     use SeedUtils;
26 :     use SAPserver;
27 :     use Sapling;
28 : parrello 1.2 use AliasAnalysis;
29 : parrello 1.1
30 :     =head1 Sapling Genome Loader
31 :    
32 :     This package contains the methods used to load a genome into the Sapling from a genome
33 :     directory. This is not very efficient if you are trying to create a full database,
34 :     but is useful if you wish to add genomes one at a time. The information loaded will
35 :     include the basic genome data, the contigs and the DNA, subsystems, and the features.
36 :    
37 :     =head2 Main Methods
38 :    
39 :     =head3 Load
40 :    
41 :     my $stats = SaplingGenomeLoader::Load($sap, $genome, $directory);
42 :    
43 :     Load a genome from a genome directory into the sapling database.
44 :    
45 :     =over 4
46 :    
47 :     =item sap
48 :    
49 :     L<Sapling> object used to access the target database.
50 :    
51 :     =item genome
52 :    
53 :     ID of the genome being loaded.
54 :    
55 :     =item directory
56 :    
57 :     Name of the directory containing the genome information.
58 :    
59 :     =back
60 :    
61 :     =cut
62 :    
63 :     sub Load {
64 :     # Get the parameters.
65 :     my ($sap, $genome, $directory) = @_;
66 :     # Create the loader object.
67 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
68 :     # Load the contigs.
69 : parrello 1.2 Trace("Loading contigs for $genome.") if T(2);
70 : parrello 1.1 $loaderObject->LoadContigs();
71 :     # Load the features.
72 : parrello 1.2 Trace("Loading features for $genome.") if T(2);
73 : parrello 1.1 $loaderObject->LoadFeatures();
74 :     # Load the subsystem bindings.
75 : parrello 1.2 Trace("Loading subsystems for $genome.") if T(2);
76 : parrello 1.1 $loaderObject->LoadSubsystems();
77 :     # Create the Genome record and taxonomy information.
78 : parrello 1.2 Trace("Creating root for $genome.") if T(2);
79 : parrello 1.1 $loaderObject->CreateGenome();
80 :     # Return the statistics.
81 :     return $loaderObject->{stats};
82 :     }
83 :    
84 :     =head3 ClearGenome
85 :    
86 :     my $stats = SaplingGenomeObject::ClearGenome($sap, $genome);
87 :    
88 :     Delete the specified genome and all the related records from the specified sapling
89 :     database. This method can also be used to clean up after a failed or aborted load.
90 :    
91 :     =over 4
92 :    
93 :     =item sap
94 :    
95 :     L<Sapling> object used to access the target database.
96 :    
97 :     =item genome
98 :    
99 :     ID of the genome to delete.
100 :    
101 :     =item RETURN
102 :    
103 :     Returns a statistics object counting the records deleted.
104 :    
105 :     =back
106 :    
107 :     =cut
108 :    
109 :     sub ClearGenome {
110 :     # Get the parameters.
111 :     my ($sap, $genome) = @_;
112 :     # Create the statistics object.
113 :     my $stats = Stats->new();
114 :     # Delete the DNA.
115 :     DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');
116 :     # Delete the contigs.
117 :     DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
118 :     # Delete the features.
119 :     DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
120 :     # Delete the molecular machines.
121 :     DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
122 :     # Delete the genome itself.
123 :     my $subStats = $sap->Delete(Genome => $genome);
124 :     # Accumulate the statistics from the delete.
125 :     $stats->Accumulate($subStats);
126 :     # Return the statistics object.
127 :     return $stats;
128 :     }
129 :    
130 :     =head2 Loader Object Methods
131 :    
132 :     =head3 new
133 :    
134 :     my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
135 :    
136 :     Create a loader object that can be used to facilitate loading genome data from a
137 :     directory.
138 :    
139 :     =over 4
140 :    
141 :     =item sap
142 :    
143 :     L<Sapling> object used to access the target database.
144 :    
145 :     =item genome
146 :    
147 :     ID of the genome being loaded.
148 :    
149 :     =item directory
150 :    
151 :     Name of the directory containing the genome information.
152 :    
153 :     =back
154 :    
155 :     The object created contains the following fields.
156 :    
157 :     =over 4
158 :    
159 :     =item directory
160 :    
161 :     Name of the directory containing the genome data.
162 :    
163 :     =item genome
164 :    
165 :     ID of the genome being loaded.
166 :    
167 :     =item supportRecords
168 :    
169 :     A hash of hashes, used to track the support records known to exist in the database.
170 :    
171 :     =item sap
172 :    
173 :     L<Sapling> object used to access the database.
174 :    
175 :     =item stats
176 :    
177 :     L<Stats> object for tracking statistical information about the load.
178 :    
179 :     =back
180 :    
181 :     =cut
182 :    
183 :     sub new {
184 :     # Get the parameters.
185 :     my ($class, $sap, $genome, $directory) = @_;
186 :     # Create the object.
187 :     my $retVal = {
188 :     sap => $sap,
189 :     genome => $genome,
190 :     directory => $directory,
191 :     stats => Stats->new(qw(contigs dna pegs rnas)),
192 :     supportRecords => {}
193 :     };
194 :     # Bless and return it.
195 :     bless $retVal, $class;
196 :     return $retVal;
197 :     }
198 :    
199 :     =head3 LoadContigs
200 :    
201 :     $loaderObject->LoadContigs();
202 :    
203 :     Load the contig information into the database. This includes the contigs themselves and
204 :     the DNA. The number of contigs will be recorded as the C<contigs> statistic and the
205 :     number of base pairs as the C<dna> statistic.
206 :    
207 :     =cut
208 :    
209 :     sub LoadContigs {
210 :     # Get this object.
211 :     my ($self) = @_;
212 :     # Open the contigs file from the genome directory.
213 :     my $ih = Open(undef, "<$self->{directory}/contigs");
214 :     # Get the length of a DNA segment.
215 :     my $segmentLength = $self->{sap}->TuningParameter('maxSequenceLength');
216 :     # Get the genome ID.
217 :     my $genome = $self->{genome};
218 :     # These variables will contain the current contig ID, the current contig length,
219 :     # the ordinal of the current chunk, the accumulated DNA sequence, and its length.
220 :     my ($contigID, $contigLen, $ordinal, $chunk, $chunkLen) = (undef, 0, 0, '', 0);
221 :     # Loop through the contig file.
222 :     while (! eof $ih) {
223 :     # Get the current record.
224 :     my $line = <$ih>; chomp $line;
225 :     # Is this the start of a new contig?
226 :     if ($line =~ /^>(\S+)/) {
227 :     # Yes. Save the contig ID.
228 :     my $newContigID = $1;
229 :     # Is there a current contig?
230 :     if (defined $contigID) {
231 :     # Yes. Output the current chunk.
232 :     $self->OutputChunk($contigID, $ordinal, $chunk);
233 :     # Output the contig.
234 :     $self->OutputContig($contigID, $contigLen);
235 :     }
236 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
237 :     $self->FixContig(\$newContigID);
238 :     # Initialize the new contig.
239 :     $contigID = $newContigID;
240 :     $contigLen = 0;
241 :     $ordinal = 0;
242 :     $chunk = '';
243 :     $chunkLen = 0;
244 :     } else {
245 :     # Here we have more DNA in the current contig. Are we at the end of
246 :     # the current chunk?
247 :     my $lineLen = length($line);
248 :     my $newChunkLen = $chunkLen + $lineLen;
249 :     if ($newChunkLen < $segmentLength) {
250 :     # No. Add this line to the chunk.
251 :     $chunk .= $line;
252 :     $chunkLen += $lineLen;
253 :     } else {
254 :     # Yes. Create the actual chunk.
255 :     $chunk .= substr($line, 0, $segmentLength - $chunkLen);
256 :     # Write it out.
257 :     $self->OutputChunk($contigID, $ordinal, $chunk);
258 :     # Set up the new chunk.
259 :     $chunk = substr($line, $segmentLength - $chunkLen);
260 :     $ordinal++;
261 :     $chunkLen = length($chunk);
262 :     }
263 :     # Update the contig length.
264 :     $contigLen += $lineLen;
265 :     }
266 :     }
267 :     # Is there a current contig?
268 :     if (defined $contigID) {
269 :     # Yes. Output the residual chunk.
270 :     $self->OutputChunk($contigID, $ordinal, $chunk);
271 :     # Output the contig itself.
272 :     $self->OutputContig($contigID, $contigLen);
273 :     }
274 :     }
275 :    
276 :     =head3 OutputChunk
277 :    
278 :     $loaderObject->OutputChunk($contigID, $ordinal, $chunk);
279 :    
280 :     Output a chunk of DNA for the specified contig.
281 :    
282 :     =over 4
283 :    
284 :     =item contigID
285 :    
286 :     ID of the contig being output.
287 :    
288 :     =item ordinal
289 :    
290 :     Ordinal number of this chunk.
291 :    
292 :     =item chunk
293 :    
294 :     DNA sequence comprising this chunk.
295 :    
296 :     =back
297 :    
298 :     =cut
299 :    
300 :     sub OutputChunk {
301 :     # Get the parameters.
302 :     my ($self, $contigID, $ordinal, $chunk) = @_;
303 :     # Get the sapling object.
304 :     my $sap = $self->{sap};
305 :     # Compute the chunk ID.
306 :     my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
307 :     # Connect this sequence to the contig.
308 : parrello 1.2 $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
309 : parrello 1.1 # Create the DNA sequence.
310 :     $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
311 :     # Record the chunk.
312 :     $self->{stats}->Add(chunks => 1);
313 :     }
314 :    
315 :     =head3 OutputContig
316 :    
317 :     $loaderObject->OutputContig($contigID, $contigLen);
318 :    
319 :     Write out the current contig.
320 :    
321 :     =over 4
322 :    
323 :     =item contigID
324 :    
325 :     ID of the contig being written out.
326 :    
327 :     =item contigLen
328 :    
329 :     Length of the contig in base pairs.
330 :    
331 :     =back
332 :    
333 :     =cut
334 :    
335 :     sub OutputContig {
336 :     # Get the parameters.
337 :     my ($self, $contigID, $contigLen) = @_;
338 :     # Get the sapling object.
339 :     my $sap = $self->{sap};
340 :     # Connect the contig to the genome.
341 :     $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
342 :     # Output the contig record.
343 :     $sap->InsertObject('Contig', id => $contigID, length => $contigLen);
344 :     # Record the contig.
345 :     $self->{stats}->Add(contigs => 1);
346 :     $self->{stats}->Add(dna => $contigLen);
347 :     }
348 :    
349 :     =head3 LoadFeatures
350 :    
351 :     $loaderObject->LoadFeatures();
352 :    
353 :     Load the feature data into the database. This includes all of the features,
354 :     their protein sequences, and their subsystem information. The number of
355 :     features of each type will be stored in the statistics object, identified by
356 :     the feature type.
357 :    
358 :     =cut
359 :    
360 :     sub LoadFeatures {
361 :     # Get the parameters.
362 :     my ($self) = @_;
363 :     # Get the directory of feature types.
364 :     my $featureDir = "$self->{directory}/Features";
365 :     my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
366 :     # Create the feature records for the types found.
367 :     for my $type (@types) {
368 :     # Insure this is a genuine feature directory.
369 :     if (-f "$featureDir/$type/tbl") {
370 :     # Yes, load the feature data.
371 :     $self->LoadFeatureData($featureDir, $type);
372 :     }
373 :     }
374 :     # Check for protein sequences. If we have some, load them into the database.
375 :     if (-f "$featureDir/peg/fasta") {
376 :     $self->LoadProteinData("$featureDir/peg/fasta");
377 :     }
378 :     }
379 :    
380 :     =head3 LoadFeatureData
381 :    
382 :     $self->LoadFeatureData($featureDir, $type);
383 :    
384 :     Load the basic data for each feature into the database. The number of features of
385 :     the type found will be recorded in the statistics object.
386 :    
387 :     =over 4
388 :    
389 :     =item featureDir
390 :    
391 :     Name of the main directory containing all the feature type subdirectories.
392 :    
393 :     =item type
394 :    
395 :     Type of feature to load.
396 :    
397 :     =back
398 :    
399 :     =cut
400 :    
401 :     sub LoadFeatureData {
402 :     # Get the parameters.
403 :     my ($self, $featureDir, $type) = @_;
404 :     # Get the sapling database.
405 :     my $sap = $self->{sap};
406 :     # Get the maximum location segment length. We'll need this later.
407 :     my $maxLength = $sap->TuningParameter('maxLocationLength');
408 :     # Get the statistics object.
409 :     my $stats = $self->{stats};
410 :     # Read in the functional assignments.
411 :     my $assignHash = $self->ReadAssignments();
412 :     # This hash will track the features we've created. If a feature is found a second
413 :     # time, it overwrites the original.
414 :     my %fids;
415 :     # Insure we have a tbl file for this feature type.
416 :     my $fileName = "$featureDir/$type/tbl";
417 :     if (-f $fileName) {
418 :     # We have one, so open it for input.
419 :     my $ih = Open(undef, "<$fileName");
420 :     while (! eof $ih) {
421 :     # Read this feature's information.
422 :     my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
423 :     # If the feature already exists, delete it.
424 :     if (exists $fids{$fid}) {
425 :     $sap->Delete(Feature => $fid);
426 :     $stats->Add(duplicateFid => 1);
427 :     }
428 : parrello 1.2 # Otherwise connect this feature to the genome.
429 :     $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);
430 : parrello 1.1 # Now we must parse the locations. This will contain a list of the location
431 :     # data 4-tuples (contig, start, dir, len).
432 :     my @locData;
433 :     # This will contain the total sequence length.
434 :     my $length = 0;
435 :     # Loop through the locations.
436 :     for my $loc (split /\s*,\s*/, $locations) {
437 :     # Get this location's components.
438 :     my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);
439 :     # Normalize the contig.
440 :     $self->FixContig(\$contig);
441 :     # Compute the direction.
442 :     if ($start <= $stop) {
443 :     # Here we have the forward strand.
444 :     my $len = $stop + 1 - $start;
445 :     push @locData, [$contig, $start, '+', $len];
446 :     $length += $len;
447 :     } else {
448 :     # Here we have the reverse strand.
449 :     my $len = $start + 1 - $stop;
450 :     push @locData, [$contig, $stop, '-', $len];
451 :     $length += $len;
452 :     }
453 :     }
454 :     # Now we can create the feature record.
455 :     $sap->InsertObject('Feature', id => $fid, feature_type => $type,
456 :     function => $assignHash->{$fid}, locked => 0,
457 :     sequence_length => $length);
458 :     $fids{$fid} = 1;
459 :     $stats->Add($type => 1);
460 :     # The next step is to connect the feature record to its locations. This
461 :     # involves dividing the location into segments. The following variable
462 :     # will count the total number of segments.
463 :     my $segment = 0;
464 :     # Loop through the locations.
465 :     for my $loc (@locData) {
466 :     # Get the current location's information.
467 :     my ($contig, $left, $dir, $len) = @$loc;
468 :     # Peel off any segments.
469 :     while ($len > $maxLength) {
470 :     # Process according to the direction.
471 :     if ($dir eq '+') {
472 :     # Forward strand: peel from the beginning.
473 :     $self->ConnectLocation($fid, $contig, $segment, $left, $dir,
474 :     $maxLength);
475 :     $left += $maxLength;
476 :     } else {
477 :     # Reverse strand: peel from the end.
478 :     $self->ConnectLocation($fid, $contig, $segment,
479 :     $left + $len - $maxLength, $dir,
480 :     $maxLength);
481 :     }
482 :     # Denote we've used up a segment.
483 :     $len -= $maxLength;
484 :     $segment++;
485 :     }
486 :     # Output the last segment.
487 :     $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
488 :     }
489 : parrello 1.2 # Now we process the aliases and create the identifiers. We don't do this
490 :     # for RNA, because the RNA function is stored in the aliases.
491 :     if ($type ne 'rna') {
492 :     for my $alias (@aliases) {
493 :     my $normalized;
494 :     # Determine the type.
495 :     my $aliasType = AliasAnalysis::TypeOf($alias);
496 :     $stats->Add(aliasAll => 1);
497 :     # Is this a recognized type?
498 :     if ($aliasType) {
499 :     $stats->Add(aliasNormal => 1);
500 :     # Yes. Write it normally.
501 :     $self->CreateIdentifier($alias, B => $aliasType, $fid);
502 :     } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {
503 :     # No, but this is a specially-marked locus tag.
504 :     $normalized = $1;
505 :     $stats->Add(aliasLocus => 1);
506 :     $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
507 :     } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {
508 :     # No, but this is a natural locus tag.
509 :     $stats->Add(aliasLocus => 1);
510 :     $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
511 :     } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {
512 :     # No, but this is a natural gene name.
513 :     $stats->Add(aliasGene => 1);
514 :     $self->CreateIdentifier($normalized, B => 'GENE', $fid);
515 :     } elsif ($alias =~ /^\d+$/) {
516 :     # Here it's a naked number, which means it's a GI number
517 :     # of some sort.
518 :     $stats->Add(aliasGI => 1);
519 :     $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);
520 :     } elsif ($alias =~ /^protein_id\|(.+)/) {
521 :     # Here we have a REFSEQ protein ID. Right now we don't have a way to
522 :     # handle that, because we don't know the feature's protein ID here.
523 :     $stats->Add(aliasProtein => 1);
524 :     } elsif ($alias =~ /[:|]/) {
525 :     # Here it's an alias of an unknown type, so we skip it.
526 :     $stats->Add(aliasUnknown => 1);
527 :     } else {
528 :     # Here it's a miscellaneous type.
529 :     $stats->Add(aliasMisc => 1);
530 :     $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);
531 :     }
532 :     }
533 :     }
534 : parrello 1.1 }
535 :     }
536 :     }
537 :    
538 :     =head3 LoadProteinData
539 :    
540 :     $self->LoadProteinData($fileName);
541 :    
542 :     Load the protein sequences from the named FASTA file. The sequences will be stored
543 :     in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature
544 :     records themselves will not be created (it is presumed they are already there).
545 :    
546 :     =over 4
547 :    
548 :     =item fileName
549 :    
550 :     Name of the FASTA file containing the protein sequences for this genome.
551 :    
552 :     =back
553 :    
554 :     =cut
555 :    
556 :     sub LoadProteinData {
557 :     # Get the parameters.
558 :     my ($self, $fileName) = @_;
559 :     # Open the FASTA file for input.
560 :     my $ih = Open(undef, "<$fileName");
561 :     # We'll track the current protein in here.
562 :     my $fid;
563 :     my $sequence = "";
564 :     # Loop through the input file.
565 :     while (! eof $ih) {
566 :     # Get the current record.
567 :     my $line = <$ih>; chomp $line;
568 :     # Is this a label record.
569 :     if ($line =~ /^>(\S+)/) {
570 :     # Yes. Save the new feature ID.
571 :     my $newFid = $1;
572 :     # Do we have an existing protein?
573 :     if (defined $fid) {
574 :     # Yes. Write it out.
575 :     $self->WriteProtein($fid, $sequence);
576 :     }
577 :     # Initialize for the next protein.
578 :     $fid = $newFid;
579 :     $sequence = "";
580 :     } else {
581 :     # Here we have more letters for the current protein.
582 :     $sequence .= $line;
583 :     }
584 :     }
585 :     # Do we have a residual protein.
586 :     if (defined $fid) {
587 :     # Yes. Write it out.
588 :     $self->WriteProtein($fid, $sequence);
589 :     }
590 :     }
591 :    
592 :     =head3 WriteProtein
593 :    
594 :     $loaderObject->WriteProtein($fid, $sequence);
595 :    
596 :     Write out the specified protein sequence and associate it with the specified feature.
597 :     This requires checking to see if the protein sequence is already in the database.
598 :    
599 :     =over 4
600 :    
601 :     =item fid
602 :    
603 :     ID of the feature with the specified protein sequence.
604 :    
605 :     =item sequence
606 :    
607 :     Protein sequence for the indicated feature.
608 :    
609 :     =back
610 :    
611 :     =cut
612 :    
613 :     sub WriteProtein {
614 :     # Get the parameters.
615 :     my ($self, $fid, $sequence) = @_;
616 :     # Compute the key of the protein sequence.
617 :     my $protID = ERDB::DigestKey($sequence);
618 :     # Insure the protein exists.
619 :     $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
620 :     # Connect the feature to it.
621 :     $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
622 :     }
623 :    
624 :     =head3 InsureEntity
625 :    
626 :     my $createdFlag = $loaderObject->InsureEntity($entityType => $id, %fields);
627 :    
628 :     Insure that the specified record exists in the database. If no record is found of the
629 :     specified type with the specified ID, one will be created with the indicated fields.
630 :    
631 :     =over 4
632 :    
633 :     =item $entityType
634 :    
635 :     Type of entity to check.
636 :    
637 :     =item id
638 :    
639 :     ID of the entity instance in question.
640 :    
641 :     =item fields
642 :    
643 :     Hash mapping field names to values for all the fields in the desired entity record except
644 :     for the ID.
645 :    
646 :     =item RETURN
647 :    
648 :     Returns TRUE if a new object was created, FALSE if it already existed.
649 :    
650 :     =back
651 :    
652 :     =cut
653 :    
654 :     sub InsureEntity {
655 :     # Get the parameters.
656 :     my ($self, $entityType, $id, %fields) = @_;
657 :     # Get the database.
658 :     my $sap = $self->{sap};
659 :     # Get the support record ID hash.
660 :     my $supportHash = $self->{supportRecords};
661 :     # Denote we haven't created a new record.
662 :     my $retVal = 0;
663 :     # Get the sub-hash for this entity type.
664 :     my $entityHash = $supportHash->{$entityType};
665 :     if (! defined $entityHash) {
666 :     $entityHash = {};
667 :     $supportHash->{$entityType} = $entityHash;
668 :     }
669 :     # Check for this instance.
670 :     if (! $entityHash->{$id}) {
671 :     # It's not found. Check the database.
672 :     if (! $sap->Exists($entityType => $id)) {
673 :     # It's not in the database either, so create it.
674 :     $sap->InsertObject($entityType, id => $id, %fields);
675 :     $self->{stats}->Add(insertSupport => 1);
676 :     $retVal = 1;
677 :     }
678 :     # Mark the record in the hash so we know we have it.
679 :     $entityHash->{$id} = 1;
680 :     }
681 :     # Return the insertion indicator.
682 :     return $retVal;
683 :     }
684 :    
685 :     =head3 LoadSubsystems
686 :    
687 :     $loaderObject->LoadSubsystems();
688 :    
689 :     Load the subsystem data into the database. This includes all of the subsystems,
690 :     their categories, roles, and the bindings that connect them to this genome's features.
691 :    
692 :     =cut
693 :    
694 :     sub LoadSubsystems {
695 :     # Get the parameters.
696 :     my ($self) = @_;
697 :     # Get the sapling object.
698 :     my $sap = $self->{sap};
699 :     # Get the statistics object.
700 :     my $stats = $self->{stats};
701 :     # Get the genome ID.
702 :     my $genome = $self->{genome};
703 :     # Connect to the Sapling server so we can get the subsystem variant and role information.
704 :     my $sapO = SAPserver->new();
705 :     # This hash maps subsystem IDs to molecular machine IDs.
706 :     my %machines;
707 :     # This hash maps subsystem/role pairs to machine role IDs.
708 :     my %machineRoles;
709 :     # The first step is to create the subsystems themselves and connect them to the
710 :     # genome. A list is given in the subsystems file of the Subsystems directory.
711 :     my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
712 :     # Create the field hash for the subsystem query.
713 :     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)
714 :     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)
715 :     Subsystem(version)
716 :     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)
717 :     Includes(sequence)
718 :     Role(id) Role(hypothetical) Role(role-index));
719 :     my %fields = map { $_ => $_ } @fields;
720 :     # Loop through the subsystems in the file, insuring we have them in the database.
721 :     while (! eof $ih) {
722 :     # Get this subsystem.
723 :     my ($subsystem, $variant) = Tracer::GetLine($ih);
724 :     # Normalize the subsystem name.
725 :     $subsystem = $sap->SubsystemID($subsystem);
726 :     # Compute this subsystem's MD5.
727 :     my $subsystemMD5 = ERDB::DigestKey($subsystem);
728 :     # Insure the subsystem is in the database.
729 :     if (! $sap->Exists(Subsystem => $subsystem)) {
730 :     # The subsystem isn't present, so we need to read it. We get the subsystem,
731 :     # its roles, and its classification information. The variant will be added
732 :     # later if we need it.
733 :     my $roleList = $sapO->get(-objects => "Subsystem Includes Role",
734 :     -filter => {"Subsystem(id)" => ["=", $subsystem] },
735 :     -fields => \%fields);
736 :     # Only proceed if we found some roles.
737 :     if (@$roleList > 0) {
738 :     # Get the subsystem information from the first role and create the subsystem.
739 :     my $roleH = $roleList->[0];
740 :     my %subFields = ExtractFields(Subsystem => $roleH);
741 :     $sap->InsertObject('Subsystem', %subFields);
742 :     # Now loop through the roles. The Includes records are always inserted, but the
743 :     # roles are only inserted if they don't already exist.
744 :     for $roleH (@$roleList) {
745 :     # Create the Includes record.
746 :     my %incFields = ExtractFields(Includes => $roleH);
747 :     $sap->InsertObject('Includes', %incFields);
748 :     # Insure we have the role in place.
749 :     my %roleFields = ExtractFields(Role => $roleH);
750 :     my $roleID = $roleFields{id};
751 :     delete $roleFields{id};
752 :     $self->InsureEntity('Role', $roleID, %roleFields);
753 :     # Compute the machine-role ID.
754 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});
755 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
756 :     }
757 :     }
758 :     } else {
759 :     # We already have the subsystem in the database, but we still need to compute the
760 :     # machine role IDs. We need information from the sapling server for this.
761 :     my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);
762 :     # Loop through the roles.
763 :     my $roleList = $subHash->{$subsystem};
764 :     for my $roleTuple (@$roleList) {
765 :     my ($roleID, $abbr) = @$roleTuple;
766 :     my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
767 :     $machineRoles{$subsystem}{$roleID} = $machineRoleID;
768 :     }
769 :     }
770 :     # Now we need to connect the variant to the subsystem. First, we compute the real
771 :     # variant code by removing the asterisks.
772 :     my $variantCode = $variant;
773 :     $variantCode =~ s/^\*+//;
774 :     $variantCode =~ s/\*+$//;
775 :     # Compute the variant key.
776 :     my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
777 :     # Get the variant from the sapling server.
778 :     my $variantH = $sapO->get(-objects => "Variant",
779 :     -filter => {"Variant(id)" => ["=", $variantKey]},
780 :     -fields => {"Variant(code)" => "code",
781 :     "Variant(comment)" => "comment",
782 :     "Variant(type)" => "type",
783 :     "Variant(role-rule)" => "role-rule"},
784 :     -firstOnly => 1);
785 :     # Insure we have it in the database.
786 :     $self->InsureEntity('Variant', $variantKey, %$variantH);
787 :     $stats->Add(subsystems => 1);
788 :     # Connect it to the subsystem.
789 :     $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);
790 :     # Now we create the molecular machine connecting this genome to the subsystem
791 :     # variant.
792 :     my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
793 :     $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
794 :     $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
795 :     $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID);
796 :     # Remember the machine ID.
797 :     $machines{$subsystem} = $machineID;
798 :     }
799 :     # Now we go through the bindings file. This file connects the subsystem roles to
800 :     # the molecular machines.
801 :     $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
802 :     # Loop through the bindings.
803 :     while (! eof $ih) {
804 :     # Get the binding data.
805 :     my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
806 :     # Normalize the subsystem name.
807 :     $subsystem = $sap->SubsystemID($subsystem);
808 :     # Compute the machine role.
809 :     my $machineRoleID = $machineRoles{$subsystem}{$role};
810 :     # Insure it exists.
811 :     my $created = $self->InsureEntity(MachineRole => $machineRoleID);
812 :     if ($created) {
813 :     # We created the machine role, so connect it to the machine.
814 :     my $machineID = $machines{$subsystem};
815 :     $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID);
816 :     # Connect it to the role, too.
817 :     $sap->InsertObject('IsRoleOf', from_link => $role, to_link => $machineRoleID);
818 :     }
819 :     # Connect the feature.
820 :     $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
821 :     }
822 :     }
823 :    
824 :     =head3 CreateGenome
825 :    
826 :     $loaderObject->CreateGenome();
827 :    
828 :     Create the genome record.
829 :    
830 :     =cut
831 :    
832 :     # This hash maps cache statistics to genome record fields.
833 :     use constant GENOME_FIELDS => {genome_name => 'scientific-name',
834 :     genome_domain => 'domain'};
835 :     # This hash maps domains to the prokaryotic flag.
836 :     use constant PROK_FLAG => {Bacteria => 1, Archaea => 1};
837 :    
838 :     sub CreateGenome {
839 :     # Get the parameters.
840 :     my ($self) = @_;
841 :     # Get the genome directory.
842 :     my $dir = $self->{directory};
843 :     # Get the sapling database.
844 :     my $sap = $self->{sap};
845 :     # We'll put the genome attributes in here.
846 :     my %fields;
847 :     # Check for a basic statistics file.
848 :     my $statsFile = "$dir/cache.basic_statistics";
849 :     if (-f $statsFile) {
850 :     # We have the statistics file, so read the major attributes from it.
851 :     my $ih = Open(undef, "<$statsFile");
852 :     while (! eof $ih) {
853 :     my ($key, $value) = Tracer::GetLine($ih);
854 :     my $fieldKey = GENOME_FIELDS->{$key};
855 :     if ($fieldKey) {
856 :     $fields{$fieldKey} = $value;
857 :     }
858 :     }
859 :     # Translate the domain.
860 :     $fields{domain} = ucfirst $fields{domain};
861 :     # Denote the genome is complete.
862 :     $fields{complete} = 1;
863 :     } else {
864 :     # Check to see if this genome is complete.
865 :     $fields{complete} = (-f "$dir/COMPLETE" ? 1 : 0);
866 :     # Get the genome name.
867 :     my $ih = Open(undef, "<$dir/GENOME");
868 :     my $line = <$ih>; chomp $line;
869 :     $fields{'scientific-name'} = $line;
870 :     # Get the taxonomy and extract the domain from it.
871 :     $ih = Open(undef, "<$dir/TAXONOMY");
872 :     ($fields{domain}) = split /;/, <$ih>, 2;
873 :     }
874 :     # Get the counts from the statistics object.
875 :     my $stats = $self->{stats};
876 :     $fields{contigs} = $stats->Ask('contigs');
877 :     $fields{'dna-size'} = $stats->Ask('dna');
878 :     $fields{pegs} = $stats->Ask('peg');
879 :     $fields{rnas} = $stats->Ask('rna');
880 :     # Get the genetic code. The default is 11.
881 :     $fields{'genetic-code'} = 11;
882 :     my $geneticCodeFile = "$dir/GENETIC_CODE";
883 :     if (-f $geneticCodeFile) {
884 :     # There's a genetic code file, so we need to read it for the code.
885 :     my $ih = Open(undef, "<$geneticCodeFile");
886 :     ($fields{'genetic-code'}) = Tracer::GetLine($ih);
887 :     }
888 :     # Use the domain to determine whether or not the genome is prokaryotic.
889 :     $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
890 :     # Finally, add the genome ID.
891 :     $fields{id} = $self->{genome};
892 :     # Create the genome record.
893 :     $sap->InsertObject('Genome', %fields);
894 :     }
895 :    
896 :     =head3 ReadAssignments
897 :    
898 :     my $assignHash = $loaderObject->ReadAssignments();
899 :    
900 :     Return a hash mapping each feature ID to its functional assignment. This method
901 :     essentially reads the B<assigned_functions> file into memory.
902 :    
903 :     =cut
904 :    
905 :     sub ReadAssignments {
906 :     # Get the parameters.
907 :     my ($self) = @_;
908 :     # Create the return hash.
909 :     my $retVal = {};
910 :     # Loop through the assigned-functions file, storing results in the hash. Later
911 :     # results will override earlier ones.
912 :     my $ih = Open(undef, "<$self->{directory}/assigned_functions");
913 :     while (! eof $ih) {
914 :     my ($fid, $function) = Tracer::GetLine($ih);
915 :     $retVal->{$fid} = $function;
916 :     }
917 :     # Return the hash built.
918 :     return $retVal;
919 :     }
920 :    
921 :    
922 :     =head3 FixContig
923 :    
924 :     $loaderObject->FixContig(\$contigID);
925 :    
926 :     Insure that the specified contig ID contains the genome ID.
927 :    
928 :     =cut
929 :    
930 :     sub FixContig {
931 :     my ($self, $contigID) = @_;
932 :     # Compute the new contig ID. We need to insure it has a genome ID in front.
933 :     unless ($$contigID =~ /^\d+\.\d+\:/) {
934 :     $$contigID = "$self->{genome}:$$contigID";
935 :     }
936 :     }
937 :    
938 :     =head3 ConnectLocation
939 :    
940 :     $loaderObject->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
941 :    
942 :     Connect the specified feature to the specified contig location.
943 :    
944 :     =over 4
945 :    
946 :     =item fid
947 :    
948 :     ID of the relevant feature.
949 :    
950 :     =item contig
951 :    
952 :     ID of the contig containing this segment of the feature (normalized).
953 :    
954 :     =item segment
955 :    
956 :     Ordinal number of this segment.
957 :    
958 :     =item left
959 :    
960 :     Location of the segment's leftmost base pair.
961 :    
962 :     =item dir
963 :    
964 :     Direction (strand) of the segment (C<+> or C<->).
965 :    
966 :     =item len
967 :    
968 :     Length of the segment (which must be no greater than the configured maximum).
969 :    
970 :     =back
971 :    
972 :     =cut
973 :    
974 :     sub ConnectLocation {
975 :     # Get the parameters.
976 :     my ($self, $fid, $contig, $segment, $left, $dir, $len) = @_;
977 :     # Get the sapling database.
978 :     my $sap = $self->{sap};
979 :     # Create the relationship.
980 :     $sap->InsertObject('IsLocatedIn', from_link => $fid, to_link => $contig,
981 :     begin => $left, dir => $dir, len => $len,
982 :     ordinal => $segment);
983 :     # Record it in the statistics.
984 :     $self->{stats}->Add(segment => 1);
985 :     }
986 :    
987 :     =head2 Internal Utility Methods
988 :    
989 :     =head3 DeleteRelatedRecords
990 :    
991 :     DeleteRelatedRecords($sap, $genome, $stats, $relName, $entityName);
992 :    
993 :     Delete all the records in the named entity and relationship relating to the
994 :     specified genome and roll up the statistics in the specified statistics object.
995 :    
996 :     =over 4
997 :    
998 :     =item sap
999 :    
1000 :     L<Sapling> object for accessing the database.
1001 :    
1002 :     =item genome
1003 :    
1004 :     ID of the relevant genome.
1005 :    
1006 :     =item stats
1007 :    
1008 :     L<Stats> object for tracking the delete activity.
1009 :    
1010 :     =item relName
1011 :    
1012 :     Name of a relationship from the B<Genome> table.
1013 :    
1014 :     =item entityName
1015 :    
1016 :     Name of the entity on the other side of the relationship.
1017 :    
1018 :     =back
1019 :    
1020 :     =cut
1021 :    
1022 :     sub DeleteRelatedRecords {
1023 :     # Get the parameters.
1024 :     my ($sap, $genome, $stats, $relName, $entityName) = @_;
1025 :     # Get all the relationship records.
1026 :     my (@targets) = $sap->GetFlat($relName, "$relName(from-link) = ?", [$genome],
1027 :     "to-link");
1028 :     # Loop through the relationship records, deleting them and the target entity
1029 :     # records.
1030 :     for my $target (@targets) {
1031 :     # Delete the relationship instance.
1032 :     $sap->DeleteRow($relName, $genome, $target);
1033 :     $stats->Add($relName => 1);
1034 :     # Delete the entity instance.
1035 :     my $subStats = $sap->Delete($entityName, $target);
1036 :     # Roll up the statistics.
1037 :     $stats->Accumulate($subStats);
1038 :     }
1039 :     }
1040 :    
1041 :     =head3 ExtractFields
1042 :    
1043 :     my %fieldHash = SaplingGenomeLoader::ExtractFields($tableName, $dataHash);
1044 :    
1045 :     Extract from the incoming hash the field names and values from the specified table.
1046 :    
1047 :     =over 4
1048 :    
1049 :     =item tableName
1050 :    
1051 :     Name of the table whose field names and values are desired.
1052 :    
1053 :     =item dataHash
1054 :    
1055 :     Reference to a hash mapping fully-qualified ERDB field names to values.
1056 :    
1057 :     =item RETURN
1058 :    
1059 :     Returns a hash containing only the fields from the specified table and their values.
1060 :    
1061 :     =back
1062 :    
1063 :     =cut
1064 :    
1065 :     sub ExtractFields {
1066 :     # Get the parameters.
1067 :     my ($tableName, $dataHash) = @_;
1068 :     # Declare the return variable.
1069 :     my %retVal;
1070 :     # Extract the desired fields.
1071 :     for my $field (keys %$dataHash) {
1072 :     # Is this a field for the specified table?
1073 :     if ($field =~ /^$tableName\(([^)]+)/) {
1074 :     # Yes, put it in the output hash.
1075 :     $retVal{$1} = $dataHash->{$field};
1076 :     }
1077 :     }
1078 :     # Return the computed hash.
1079 :     return %retVal;
1080 :     }
1081 :    
1082 : parrello 1.2 =head3 CreateIdentifier
1083 :    
1084 :     $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);
1085 :    
1086 :     Link an identifier to a feature. The identifier is presented in prefixed form and is of the
1087 :     specified type and the specified confidence level.
1088 :    
1089 :     =over 4
1090 :    
1091 :     =item alias
1092 :    
1093 :     Identifier to connect to the feature.
1094 :    
1095 :     =item conf
1096 :    
1097 :     Confidence level (C<A> curated, C<B> normal, C<C> protein only).
1098 :    
1099 :     =item aliasType
1100 :    
1101 :     Type of alias (e.g. C<NCBI>, C<LocusTag>).
1102 :    
1103 :     =item fid
1104 :    
1105 :     ID of the relevant feature.
1106 :    
1107 :     =back
1108 :    
1109 :     =cut
1110 :    
1111 :     sub CreateIdentifier {
1112 :     # Get the parameters.
1113 :     my ($self, $alias, $conf, $aliasType, $fid) = @_;
1114 :     # Get the Sapling object.
1115 :     my $sap = $self->{sap};
1116 :     # Compute the identifier's natural form.
1117 :     my $natural = $alias;
1118 :     if ($natural =~ /[:|](.+)/) {
1119 :     $natural = $1;
1120 :     }
1121 :     # Insure the identifier exists in the database.
1122 :     $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);
1123 :     # Connect the identifier to the feature.
1124 :     $sap->InsertObject('Identifies', from_link => $alias, to_link => $fid, conf => $conf);
1125 :     }
1126 :    
1127 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3