[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Tue Dec 14 19:48:38 2010 UTC revision 1.18, Wed May 8 20:28:44 2013 UTC
# Line 25  Line 25 
25      use SeedUtils;      use SeedUtils;
26      use SAPserver;      use SAPserver;
27      use Sapling;      use Sapling;
28        use AliasAnalysis;
29        use BaseSaplingLoader;
30        use MD5Computer;
31        use base qw(SaplingDataLoader);
32    
33  =head1 Sapling Genome Loader  =head1 Sapling Genome Loader
34    
# Line 65  Line 69 
69      # Create the loader object.      # Create the loader object.
70      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
71      # Load the contigs.      # Load the contigs.
72        Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
73      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
74      # Load the features.      # Load the features.
75        Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
76      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
77        # Check for annotation history. If we have it, load the history records into the
78        # database.
79        if (-f "$directory/annotations") {
80            Trace("Processing annotations.") if T(SaplingDataLoader => 3);
81            $loaderObject->LoadAnnotations("$directory/annotations");
82        }
83      # Load the subsystem bindings.      # Load the subsystem bindings.
84        Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
85      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
86      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
87        Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
88      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
89      # Return the statistics.      # Return the statistics.
90      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 106  Line 120 
120      my ($sap, $genome) = @_;      my ($sap, $genome) = @_;
121      # Create the statistics object.      # Create the statistics object.
122      my $stats = Stats->new();      my $stats = Stats->new();
123      # Delete the DNA.      # Delete the DNA sequences.
124      DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
125        for my $seq (@seqs) {
126            my $delStats = $sap->Delete(DNASequence => $seq);
127            $stats->Accumulate($delStats);
128        }
129      # Delete the contigs.      # Delete the contigs.
130      DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
131      # Delete the features.      # Delete the features.
132      DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
133      # Delete the molecular machines.      # Delete the molecular machines.
134      DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
135      # Delete the genome itself.      # Delete the genome itself.
136      my $subStats = $sap->Delete(Genome => $genome);      my $subStats = $sap->Delete(Genome => $genome);
137      # Accumulate the statistics from the delete.      # Accumulate the statistics from the delete.
138      $stats->Accumulate($subStats);      $stats->Accumulate($subStats);
139        Trace("Statistics for delete of $genome:\n" . $stats->Show()) if T(SaplingDataLoader => 3);
140      # Return the statistics object.      # Return the statistics object.
141      return $stats;      return $stats;
142  }  }
143    
144    
145    =head3 Process
146    
147        my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
148    
149    Load genome data from the specified directory. If the genome data already
150    exists in the database, it will be deleted first.
151    
152    =over 4
153    
154    =item sap
155    
156    L</Sapling> object for accessing the database.
157    
158    =item genome
159    
160    ID of the genome whose  data is being loaded.
161    
162    =item directory
163    
164    Name of the directory containing the genome data files. If omitted, the
165    genome will be deleted from the database.
166    
167    =item RETURN
168    
169    Returns a statistics object describing the activity during the reload.
170    
171    =back
172    
173    =cut
174    
175    sub Process {
176        # Get the parameters.
177        my ($sap, $genome, $directory) = @_;
178        # Clear the existing data for the specified genome.
179        my $stats = ClearGenome($sap, $genome);
180        # Load the new expression data from the specified directory (if one is
181        # specified).
182        if ($directory) {
183            my $newStats = Load($sap, $genome, $directory);
184            # Merge the statistics.
185            $stats->Accumulate($newStats);
186        }
187        # Return the result.
188        return $stats;
189    }
190    
191    
192  =head2 Loader Object Methods  =head2 Loader Object Methods
193    
194  =head3 new  =head3 new
# Line 179  Line 246 
246      # Get the parameters.      # Get the parameters.
247      my ($class, $sap, $genome, $directory) = @_;      my ($class, $sap, $genome, $directory) = @_;
248      # Create the object.      # Create the object.
249      my $retVal = {      my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
250          sap => $sap,      # Add our specialized data.
251          genome => $genome,      $retVal->{genome} = $genome;
252          directory => $directory,      $retVal->{directory} = $directory;
253          stats => Stats->new(qw(contigs dna pegs rnas)),      # Leave the assignment hash undefined until we populate it.
254          supportRecords => {}      $retVal->{assignHash} = undef;
255      };      # Create an MD5 Computer for this genome.
256      # Bless and return it.      $retVal->{md5} = MD5Computer->new();
257      bless $retVal, $class;      # Return the result.
258      return $retVal;      return $retVal;
259  }  }
260    
# Line 196  Line 263 
263      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
264    
265  Load the contig information into the database. This includes the contigs themselves and  Load the contig information into the database. This includes the contigs themselves and
266  the DNA. The number of contigs will be recorded as the C<contigs> statistic and the  the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
267  number of base pairs as the C<dna> statistic.  number of base pairs as the C<dna> statistic, and the number of GC instances as the
268    C<gc_content> statistic.
269    
270  =cut  =cut
271    
# Line 211  Line 279 
279      # Get the genome ID.      # Get the genome ID.
280      my $genome = $self->{genome};      my $genome = $self->{genome};
281      # These variables will contain the current contig ID, the current contig length,      # These variables will contain the current contig ID, the current contig length,
282      # the ordinal of the current chunk, the accumulated DNA sequence, and its length.      # the accumulated DNA sequence for the current chunk, and its length.
283      my ($contigID, $contigLen, $ordinal, $chunk, $chunkLen) = (undef, 0, 0, '', 0);      my ($contigID, $contigLen, $chunk, $chunkLen) = (undef, 0, '', 0);
284        # This variable contains a list of the chunks, for use in computing the MD5.
285        my @chunks;
286      # Loop through the contig file.      # Loop through the contig file.
287      while (! eof $ih) {      while (! eof $ih) {
288          # Get the current record.          # Get the current record.
# Line 223  Line 293 
293              my $newContigID = $1;              my $newContigID = $1;
294              # Is there a current contig?              # Is there a current contig?
295              if (defined $contigID) {              if (defined $contigID) {
296                  # Yes. Output the current chunk.                  # Yes. Output the contig.
297                  $self->OutputChunk($contigID, $ordinal, $chunk);                  $self->OutputContig($contigID, $contigLen, $chunk, \@chunks);
                 # Output the contig.  
                 $self->OutputContig($contigID, $contigLen);  
298              }              }
299              # Compute the new contig ID. We need to insure it has a genome ID in front.              # Compute the new contig ID. We need to insure it has a genome ID in front.
300              $self->FixContig(\$newContigID);              $self->FixContig(\$newContigID);
301              # Initialize the new contig.              # Initialize the new contig.
302              $contigID = $newContigID;              $contigID = $newContigID;
303              $contigLen = 0;              $contigLen = 0;
             $ordinal = 0;  
304              $chunk = '';              $chunk = '';
305              $chunkLen = 0;              $chunkLen = 0;
306                @chunks = ();
307          } else {          } else {
308              # Here we have more DNA in the current contig. Are we at the end of              # Here we have more DNA in the current contig. Are we at the end of
309              # the current chunk?              # the current chunk?
# Line 249  Line 317 
317                  # Yes. Create the actual chunk.                  # Yes. Create the actual chunk.
318                  $chunk .= substr($line, 0, $segmentLength - $chunkLen);                  $chunk .= substr($line, 0, $segmentLength - $chunkLen);
319                  # Write it out.                  # Write it out.
320                  $self->OutputChunk($contigID, $ordinal, $chunk);                  $self->OutputChunk($contigID, scalar @chunks, $chunk);
321                  # Set up the new chunk.                  # Set up the new chunk.
322                  $chunk = substr($line, $segmentLength - $chunkLen);                  $chunk = substr($line, $segmentLength - $chunkLen);
                 $ordinal++;  
323                  $chunkLen = length($chunk);                  $chunkLen = length($chunk);
324                    push @chunks, $chunk;
325              }              }
326              # Update the contig length.              # Update the contig length.
327              $contigLen += $lineLen;              $contigLen += $lineLen;
# Line 261  Line 329 
329      }      }
330      # Is there a current contig?      # Is there a current contig?
331      if (defined $contigID) {      if (defined $contigID) {
332          # Yes. Output the residual chunk.          # Yes. Output the contig itself.
333          $self->OutputChunk($contigID, $ordinal, $chunk);          $self->OutputContig($contigID, $contigLen, $chunk, \@chunks);
         # Output the contig itself.  
         $self->OutputContig($contigID, $contigLen);  
334      }      }
335  }  }
336    
# Line 300  Line 366 
366      # Compute the chunk ID.      # Compute the chunk ID.
367      my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');      my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
368      # Connect this sequence to the contig.      # Connect this sequence to the contig.
369      $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunk);      $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
370      # Create the DNA sequence.      # Create the DNA sequence.
371      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
372      # Record the chunk.      # Record the chunk.
373      $self->{stats}->Add(chunks => 1);      $self->{stats}->Add(chunks => 1);
374        # Update the GC count.
375        $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
376  }  }
377    
378  =head3 OutputContig  =head3 OutputContig
379    
380      $loaderObject->OutputContig($contigID, $contigLen);      $loaderObject->OutputContig($contigID, $contigLen, $chunk, \@chunks);
381    
382  Write out the current contig.  Write out the current contig.
383    
# Line 323  Line 391 
391    
392  Length of the contig in base pairs.  Length of the contig in base pairs.
393    
394    =item chunk
395    
396    Last DNA chunk of the contig.
397    
398    =item chunks
399    
400    Reference to a list of the DNA chunks up to (but not including) the last one.
401    
402  =back  =back
403    
404  =cut  =cut
405    
406  sub OutputContig {  sub OutputContig {
407      # Get the parameters.      # Get the parameters.
408      my ($self, $contigID, $contigLen) = @_;      my ($self, $contigID, $contigLen, $chunk, $chunks) = @_;
409      # Get the sapling object.      # Get the sapling object.
410      my $sap = $self->{sap};      my $sap = $self->{sap};
411        # Get the MD5 computer.
412        my $md5C = $self->{md5};
413        # Output the last chunk.
414        $self->OutputChunk($contigID, scalar @$chunks, $chunk);
415      # Connect the contig to the genome.      # Connect the contig to the genome.
416      $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);      $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
417        # Compute the MD5.
418        push @$chunks, $chunk;
419        my $contigMD5 = $md5C->ProcessContig($contigID, $chunks);
420      # Output the contig record.      # Output the contig record.
421      $sap->InsertObject('Contig', id => $contigID, length => $contigLen);      $sap->InsertObject('Contig', id => $contigID, length => $contigLen,
422                md5_identifier => $contigMD5);
423      # Record the contig.      # Record the contig.
424      $self->{stats}->Add(contigs => 1);      $self->{stats}->Add(contigs => 1);
425      $self->{stats}->Add(dna => $contigLen);      $self->{stats}->Add(dna => $contigLen);
# Line 355  Line 439 
439  sub LoadFeatures {  sub LoadFeatures {
440      # Get the parameters.      # Get the parameters.
441      my ($self) = @_;      my ($self) = @_;
442        # Read in the functional assignments.
443        Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
444        my $assignHash = $self->ReadAssignments();
445        $self->{assignHash} = $assignHash;
446      # Get the directory of feature types.      # Get the directory of feature types.
447      my $featureDir = "$self->{directory}/Features";      my $featureDir = "$self->{directory}/Features";
448      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
449        # Check for protein sequences. If we have some, load them into a hash.
450        my $protHash = {};
451        if (-f "$featureDir/peg/fasta") {
452            Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
453            $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
454        }
455      # Create the feature records for the types found.      # Create the feature records for the types found.
456      for my $type (@types) {      for my $type (@types) {
457          # Insure this is a genuine feature directory.          # Insure this is a genuine feature directory.
458          if (-f "$featureDir/$type/tbl") {          if (-f "$featureDir/$type/tbl") {
459              # Yes, load the feature data.              # Yes. Read in the evidence codes (if any).
460              $self->LoadFeatureData($featureDir, $type);              my $evHash = {};
461                my $tranFile = "$featureDir/$type/Attributes/transaction_log";
462                if (-f $tranFile) {
463                    $evHash = $self->LoadEvidenceCodes($tranFile);
464          }          }
465                # Now load the feature data.
466                $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
467      }      }
     # Check for protein sequences. If we have some, load them into the database.  
     if (-f "$featureDir/peg/fasta") {  
         $self->LoadProteinData("$featureDir/peg/fasta");  
468      }      }
469  }  }
470    
471    =head3 LoadEvidenceCodes
472    
473        my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
474    
475    Load the evidence codes from the specified attribute transaction log file into a
476    hash. The log file is in tab-delimited format. The first column contains the
477    transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
478    the third an attribute name (we'll ignore everything but C<evidence_code>), and
479    the fourth the attribute value.
480    
481    =over 4
482    
483    =item attributeFile
484    
485    Name of the attribute transaction log file.
486    
487    =item RETURN
488    
489    Returns a reference to a hash mapping each feature ID to a comma-delimited list
490    of its evidence codes.
491    
492    =back
493    
494    =cut
495    
496    sub LoadEvidenceCodes {
497        # Get the parameters.
498        my ($self, $attributeFile) = @_;
499        # Get the Sapling database.
500        my $sap = $self->{sap};
501        # Get the statistics object.
502        my $stats = $self->{stats};
503        # Get the assignment hash: we use this to filter the feature IDs.
504        my $assignHash = $self->{assignHash};
505        # Open the attribute log file for input.
506        my $ih = Open(undef, "<$attributeFile");
507        # This two-dimensional hash will hold the evidence codes for each feature.
508        my %retVal;
509        # Loop through the attribute log file.
510        while (! eof $ih) {
511            # Get the current attribute record.
512            my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
513            $stats->Add(attributeLine => 1);
514            # Insure we have all the pieces we need.
515            if (! $command || ! $fid || $key ne 'evidence_code') {
516                $stats->Add(attributeLineSkipped => 1);
517            } elsif (! $assignHash->{$fid}) {
518                # Here the attribute is for a deleted feature.
519                $stats->Add(attributeFidSkipped => 1);
520            } else {
521                # Get the sub-hash for this feature.
522                if (! exists $retVal{$fid}) {
523                    $retVal{$fid} = {};
524                }
525                my $featureSubHash = $retVal{$fid};
526                # Process according to the command.
527                if ($command eq 'ADD') {
528                    # Here we are adding a new evidence code.
529                    $featureSubHash->{$value} = 1;
530                    $stats->Add(attributeAdd => 1);
531                } elsif ($command eq 'DELETE') {
532                    # Here we are deleting an evidence code.
533                    delete $featureSubHash->{$value};
534                    $stats->Add(attributeDelete => 1);
535                } else {
536                    # here we have an unrecognized command.
537                    $stats->Add(attributeCommandSkip => 1);
538                }
539            }
540        }
541        # Loop through the hash, converting each sub-hash to a comma-delimited list of
542        # evidence codes.
543        for my $fid (keys %retVal) {
544            $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
545        }
546        # Return the evidence hash.
547        return \%retVal;
548    }
549    
550    
551  =head3 LoadFeatureData  =head3 LoadFeatureData
552    
553      $self->LoadFeatureData($featureDir, $type);      $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
554    
555  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
556  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 389  Line 565 
565    
566  Type of feature to load.  Type of feature to load.
567    
568    =item protHash
569    
570    Reference to a hash mapping each feature ID for a protein-encoding gene to
571    its protein sequence.
572    
573    =item evHash
574    
575    Reference to a hash mapping each feature ID to a comma-delimited list of
576    its evidence codes (if any).
577    
578  =back  =back
579    
580  =cut  =cut
581    
582  sub LoadFeatureData {  sub LoadFeatureData {
583      # Get the parameters.      # Get the parameters.
584      my ($self, $featureDir, $type) = @_;      my ($self, $featureDir, $type, $protHash, $evHash) = @_;
585      # Get the sapling database.      # Get the sapling database.
586      my $sap = $self->{sap};      my $sap = $self->{sap};
     # Get the maximum location  segment length. We'll need this later.  
     my $maxLength = $sap->TuningParameter('maxLocationLength');  
587      # Get the statistics object.      # Get the statistics object.
588      my $stats = $self->{stats};      my $stats = $self->{stats};
589      # Read in the functional assignments.      # Get the assignment hash. This tells us our functional assignments. This method is
590      my $assignHash = $self->ReadAssignments();      # also where we remove the deleted features from it.
591        my $assignHash = $self->{assignHash};
592      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
593      # time, it overwrites the original.      # time, it overwrites the original.
594      my %fids;      my %fidHash;
595        # This hash tracks the deleted features. We don't want to update these.
596        my %deleted_features;
597      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
598      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
599      if (-f $fileName) {      if (-f $fileName) {
600          # We have one, so open it for input.          # We have one, so we can read through it. First, however, we need to get the list
601            # of deleted features and remove them from the assignment hash. This insures
602            # that they are not used by subsequent methods.
603            my $deleteFile = "$featureDir/$type/deleted.features";
604            if (-f $deleteFile) {
605                my $dh = Open(undef, "<$deleteFile");
606                while (! eof $dh) {
607                    my ($deletedFid) = Tracer::GetLine($dh);
608                    if (exists $assignHash->{$deletedFid}) {
609                        delete $assignHash->{$deletedFid};
610                        $stats->Add(deletedFid => 1);
611                        $deleted_features{$deletedFid} = 1;
612                    }
613                }
614            }
615            # Open the main file for input.
616            Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
617          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
618          while (! eof $ih) {          while (! eof $ih) {
619              # Read this feature's information.              # Read this feature's information.
620              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
621              # If the feature already exists, delete it.              # Only proceed if the feature is NOT deleted.
622              if (exists $fids{$fid}) {              if (!$deleted_features{$fid}) {
623                    # If the feature already exists, delete it. (This should be extremely rare.)
624                    if ($fidHash{$fid}) {
625                  $sap->Delete(Feature => $fid);                  $sap->Delete(Feature => $fid);
626                  $stats->Add(duplicateFid => 1);                  $stats->Add(duplicateFid => 1);
627              } else {              } else {
628                  # Otherwise connect it to the genome.                      # It doesn't exist, so record it in the statistics.
                 $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);  
             }  
             # Now we must parse the locations. This will contain a list of the location  
             # data 4-tuples (contig, start, dir, len).  
             my @locData;  
             # This will contain the total sequence length.  
             my $length = 0;  
             # Loop through the locations.  
             for my $loc (split /\s*,\s*/, $locations) {  
                 # Get this location's components.  
                 my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);  
                 # Normalize the contig.  
                 $self->FixContig(\$contig);  
                 # Compute the direction.  
                 if ($start <= $stop) {  
                     # Here we have the forward strand.  
                     my $len = $stop + 1 - $start;  
                     push @locData, [$contig, $start, '+', $len];  
                     $length += $len;  
                 } else {  
                     # Here we have the reverse strand.  
                     my $len = $start + 1 - $stop;  
                     push @locData, [$contig, $stop, '-', $len];  
                     $length += $len;  
                 }  
             }  
             # Now we can create the feature record.  
             $sap->InsertObject('Feature', id => $fid, feature_type => $type,  
                                function => $assignHash->{$fid}, locked => 0,  
                                sequence_length => $length);  
             $fids{$fid} = 1;  
629              $stats->Add($type => 1);              $stats->Add($type => 1);
             # The next step is to connect the feature record to its locations. This  
             # involves dividing the location into segments. The following variable  
             # will count the total number of segments.  
             my $segment = 0;  
             # Loop through the locations.  
             for my $loc (@locData) {  
                 # Get the current location's information.  
                 my ($contig, $left, $dir, $len) = @$loc;  
                 # Peel off any segments.  
                 while ($len > $maxLength) {  
                     # Process according to the direction.  
                     if ($dir eq '+') {  
                         # Forward strand: peel from the beginning.  
                         $self->ConnectLocation($fid, $contig, $segment, $left, $dir,  
                                                $maxLength);  
                         $left += $maxLength;  
                     } else {  
                         # Reverse strand: peel from the end.  
                         $self->ConnectLocation($fid, $contig, $segment,  
                                               $left + $len - $maxLength, $dir,  
                                               $maxLength);  
                     }  
                     # Denote we've used up a segment.  
                     $len -= $maxLength;  
                     $segment++;  
630                  }                  }
631                  # Output the last segment.                  # If this is RNA, the alias list is always empty. Sometimes, the functional
632                  $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);                  # assignment is found there.
633                    if ($type eq 'rna') {
634                        if (! $assignHash->{$fid}) {
635                            $assignHash->{$fid} = $aliases[0];
636                        }
637                        @aliases = ();
638                    }
639                    # Add the feature to the database.
640                    my $function = $assignHash->{$fid} || "";
641                    $self->AddFeature($fid, $function, $locations, \@aliases,
642                                      $protHash->{$fid}, $evHash->{$fid});
643                    # Denote we've added this feature, so that if a duplicate occurs we're ready.
644                    $fidHash{$fid} = 1;
645              }              }
646          }          }
647      }      }
648  }  }
649    
650    
651  =head3 LoadProteinData  =head3 LoadProteinData
652    
653      $self->LoadProteinData($fileName);      my $protHash = $self->LoadProteinData($fileName);
654    
655  Load the protein sequences from the named FASTA file. The sequences will be stored  Load the protein sequences from the named FASTA file. The sequences will be stored
656  in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature  in a hash by FIG feature ID.
 records themselves will not be created (it is presumed they are already there).  
657    
658  =over 4  =over 4
659    
# Line 500  Line 661 
661    
662  Name of the FASTA file containing the protein sequences for this genome.  Name of the FASTA file containing the protein sequences for this genome.
663    
664    =item RETURN
665    
666    Returns a hash mapping feature IDs to protein sequences.
667    
668  =back  =back
669    
670  =cut  =cut
# Line 509  Line 674 
674      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
675      # Open the FASTA file for input.      # Open the FASTA file for input.
676      my $ih = Open(undef, "<$fileName");      my $ih = Open(undef, "<$fileName");
677        # Create the return hash.
678        my $retVal = {};
679      # We'll track the current protein in here.      # We'll track the current protein in here.
680      my $fid;      my $fid;
681      my $sequence = "";      my $sequence = "";
# Line 522  Line 689 
689              my $newFid = $1;              my $newFid = $1;
690              # Do we have an existing protein?              # Do we have an existing protein?
691              if (defined $fid) {              if (defined $fid) {
692                  # Yes. Write it out.                  # Yes. Store it in the hash.
693                  $self->WriteProtein($fid, $sequence);                  $retVal->{$fid} = $sequence;
694              }              }
695              # Initialize for the next protein.              # Initialize for the next protein.
696              $fid = $newFid;              $fid = $newFid;
# Line 535  Line 702 
702      }      }
703      # Do we have a residual protein.      # Do we have a residual protein.
704      if (defined $fid) {      if (defined $fid) {
705          # Yes. Write it out.          # Yes. Store it in the hash.
706          $self->WriteProtein($fid, $sequence);          $retVal->{$fid} = $sequence;
707        }
708        # Return the hash.
709        return $retVal;
710    }
711    
712    
713    =head3 LoadAnnotations
714    
715        $loaderObject->LoadAnnotations($fileName);
716    
717    Read in the annotation history information and use it to create annotation records.
718    
719    =over 4
720    
721    =item fileName
722    
723    Name of the annotation history file. This file is formatted with four fields per
724    record. Each field is on a separate line, with a double slash (C<//>) used as the
725    line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
726    (formatted as an integer), (2) the user name, and (3) the annotation text.
727    
728    =back
729    
730    =cut
731    
732    sub LoadAnnotations {
733        # Get the parameters.
734        my ($self, $fileName) = @_;
735        # Get the assignment Hash. We use this to filter out deleted features.
736        my $assignHash = $self->{assignHash};
737        # Get the Sapling database.
738        my $sap = $self->{sap};
739        # Get the statistics object.
740        my $stats = $self->{stats};
741        # Open the input file.
742        my $ih = Tracer::Open(undef, "<$fileName");
743        # Loop through the input.
744        while (! eof $ih) {
745            # Read in the peg, timestamp, and user ID.
746            my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
747            # Only proceed if the feature is not deleted.
748            if ($assignHash->{$fid}) {
749                # Add the annotation to this feature.
750                $self->MakeAnnotation($fid, $text, $user, $timestamp);
751            }
752      }      }
753  }  }
754    
755    
756  =head3 WriteProtein  =head3 WriteProtein
757    
758      $loaderObject->WriteProtein($fid, $sequence);      $loaderObject->WriteProtein($fid, $sequence);
# Line 565  Line 778 
778      # Get the parameters.      # Get the parameters.
779      my ($self, $fid, $sequence) = @_;      my ($self, $fid, $sequence) = @_;
780      # Compute the key of the protein sequence.      # Compute the key of the protein sequence.
781      my $protID = ERDB::DigestKey($sequence);      my $protID = $self->{sap}->ProteinID($sequence);
782      # Insure the protein exists.      # Insure the protein exists.
783      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
784      # Connect the feature to it.      # Connect the feature to it.
785      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
786  }  }
787    
788  =head3 InsureEntity  =head3 LoadSubsystems
789    
790      my $createdFlag = $loaderObject->InsureEntity($entityType => $id, %fields);      $loaderObject->LoadSubsystems($subsysList);
791    
792  Insure that the specified record exists in the database. If no record is found of the  Load the subsystem data into the database. This requires looking through the
793  specified type with the specified ID, one will be created with the indicated fields.  bindings and using them to connect to the subsystems that already exist in the
794    database. If the subsystem does not exist, its bindings will not be loaded.
795    
796  =over 4  =over 4
797    
798  =item $entityType  =item subsysList
   
 Type of entity to check.  
   
 =item id  
   
 ID of the entity instance in question.  
   
 =item fields  
799    
800  Hash mapping field names to values for all the fields in the desired entity record except  Reference to a list of subsystem IDs. If specified, only the subsystems in the
801  for the ID.  list will be processed. This is useful when a particular set of subsystems
802    has been replaced.
 =item RETURN  
   
 Returns TRUE if a new object was created, FALSE if it already existed.  
803    
804  =back  =back
805    
806  =cut  =cut
807    
 sub InsureEntity {  
     # Get the parameters.  
     my ($self, $entityType, $id, %fields) = @_;  
     # Get the database.  
     my $sap = $self->{sap};  
     # Get the support record ID hash.  
     my $supportHash = $self->{supportRecords};  
     # Denote we haven't created a new record.  
     my $retVal = 0;  
     # Get the sub-hash for this entity type.  
     my $entityHash = $supportHash->{$entityType};  
     if (! defined $entityHash) {  
         $entityHash = {};  
         $supportHash->{$entityType} = $entityHash;  
     }  
     # Check for this instance.  
     if (! $entityHash->{$id}) {  
         # It's not found. Check the database.  
         if (! $sap->Exists($entityType => $id)) {  
             # It's not in the database either, so create it.  
             $sap->InsertObject($entityType, id => $id, %fields);  
             $self->{stats}->Add(insertSupport => 1);  
             $retVal = 1;  
         }  
         # Mark the record in the hash so we know we have it.  
         $entityHash->{$id} = 1;  
     }  
     # Return the insertion indicator.  
     return $retVal;  
 }  
   
 =head3 LoadSubsystems  
   
     $loaderObject->LoadSubsystems();  
   
 Load the subsystem data into the database. This includes all of the subsystems,  
 their categories, roles, and the bindings that connect them to this genome's features.  
   
 =cut  
   
808  sub LoadSubsystems {  sub LoadSubsystems {
809      # Get the parameters.      # Get the parameters.
810      my ($self) = @_;      my ($self, $subsysList) = @_;
811      # Get the sapling object.      # Get the sapling object.
812      my $sap = $self->{sap};      my $sap = $self->{sap};
813      # Get the statistics object.      # Get the statistics object.
814      my $stats = $self->{stats};      my $stats = $self->{stats};
815      # Get the genome ID.      # Get the genome ID.
816      my $genome = $self->{genome};      my $genome = $self->{genome};
817      # Connect to the Sapling server so we can get the subsystem variant and role information.      # Compute the subsystem and binding file names.
818      my $sapO = SAPserver->new();      my $subFileName = "$self->{directory}/Subsystems/subsystems";
819        my $bindFileName = "$self->{directory}/Subsystems/bindings";
820        # Only proceed if both exist.
821        if (! -f $subFileName || ! -f $bindFileName) {
822            Trace("Missing subsystem data for $genome.") if T(1);
823            $stats->Add(noSubsystems => 1);
824        } else {
825      # This hash maps subsystem IDs to molecular machine IDs.      # This hash maps subsystem IDs to molecular machine IDs.
826      my %machines;      my %machines;
827      # This hash maps subsystem/role pairs to machine role IDs.      # This hash maps subsystem/role pairs to machine role IDs.
828      my %machineRoles;      my %machineRoles;
829      # The first step is to create the subsystems themselves and connect them to the          # This hash will contain the list of subsystems found in the database.
830      # genome. A list is given in the subsystems file of the Subsystems directory.          my %subsystems;
831      my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");          # We loop through the subsystems, looking for the ones already in the
832      # Create the field hash for the subsystem query.          # database. The list is given in the subsystems file of the Subsystems
833      my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)          # directory.
834                      Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)          my $ih = Open(undef, "<$subFileName");
                     Subsystem(version)  
                     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)  
                     Includes(sequence)  
                     Role(id) Role(hypothetical) Role(role-index));  
     my %fields = map { $_ => $_ } @fields;  
835      # Loop through the subsystems in the file, insuring we have them in the database.      # Loop through the subsystems in the file, insuring we have them in the database.
836      while (! eof $ih) {      while (! eof $ih) {
837          # Get this subsystem.          # Get this subsystem.
838          my ($subsystem, $variant) = Tracer::GetLine($ih);          my ($subsystem, $variant) = Tracer::GetLine($ih);
839                Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
840          # Normalize the subsystem name.          # Normalize the subsystem name.
841          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
842                # Insure the subsystem is in the database and is one we're interested in.
843                if ((! $subsysList || (grep { $_ eq $subsystem } @$subsysList)) &&
844                        $sap->Exists(Subsystem => $subsystem)) {
845                    # We have the subsystem in the database. We need to compute the machine
846                    # role IDs and create the molecular machine. First, we need to remember
847                    # the subsystem.
848                    $subsystems{$subsystem} = 1;
849          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
850          my $subsystemMD5 = ERDB::DigestKey($subsystem);          my $subsystemMD5 = ERDB::DigestKey($subsystem);
851          # Insure the subsystem is in the database.                  my $rolePrefix = "$subsystemMD5:$genome";
         if (! $sap->Exists(Subsystem => $subsystem)) {  
             # The subsystem isn't present, so we need to read it. We get the subsystem,  
             # its roles, and its classification information. The variant will be added  
             # later if we need it.  
             my $roleList = $sapO->get(-objects => "Subsystem Includes Role",  
                                       -filter => {"Subsystem(id)" => ["=", $subsystem] },  
                                       -fields => \%fields);  
             # Only proceed if we found some roles.  
             if (@$roleList > 0) {  
                 # Get the subsystem information from the first role and create the subsystem.  
                 my $roleH = $roleList->[0];  
                 my %subFields = ExtractFields(Subsystem => $roleH);  
                 $sap->InsertObject('Subsystem', %subFields);  
                 # Now loop through the roles. The Includes records are always inserted, but the  
                 # roles are only inserted if they don't already exist.  
                 for $roleH (@$roleList) {  
                     # Create the Includes record.  
                     my %incFields = ExtractFields(Includes => $roleH);  
                     $sap->InsertObject('Includes', %incFields);  
                     # Insure we have the role in place.  
                     my %roleFields = ExtractFields(Role => $roleH);  
                     my $roleID = $roleFields{id};  
                     delete $roleFields{id};  
                     $self->InsureEntity('Role', $roleID, %roleFields);  
                     # Compute the machine-role ID.  
                     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});  
                     $machineRoles{$subsystem}{$roleID} = $machineRoleID;  
                 }  
             }  
         } else {  
             # We already have the subsystem in the database, but we still need to compute the  
             # machine role IDs. We need information from the sapling server for this.  
             my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);  
852              # Loop through the roles.              # Loop through the roles.
853              my $roleList = $subHash->{$subsystem};                  my @roleList = $sap->GetAll('Includes', 'Includes(from-link) = ?',
854              for my $roleTuple (@$roleList) {                          [$subsystem], "to-link abbreviation");
855                    for my $roleTuple (@roleList) {
856                  my ($roleID, $abbr) = @$roleTuple;                  my ($roleID, $abbr) = @$roleTuple;
857                  my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);                      my $machineRoleID = $rolePrefix . '::' . $abbr;
858                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;
859              }              }
860          }                  # Next we need the variant code and key.
861          # Now we need to connect the variant to the subsystem. First, we compute the real                  my $variantCode = BaseSaplingLoader::Starless($variant);
         # variant code by removing the asterisks.  
         my $variantCode = $variant;  
         $variantCode =~ s/^\*+//;  
         $variantCode =~ s/\*+$//;  
         # Compute the variant key.  
862          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
863          # Get the variant from the sapling server.                  # Now we create the molecular machine connecting this genome to the
864          my $variantH = $sapO->get(-objects => "Variant",                  # subsystem variant.
                                   -filter => {"Variant(id)" => ["=", $variantKey]},  
                                   -fields => {"Variant(code)" => "code",  
                                               "Variant(comment)" => "comment",  
                                               "Variant(type)" => "type",  
                                               "Variant(role-rule)" => "role-rule"},  
                                   -firstOnly => 1);  
         # Insure we have it in the database.  
         $self->InsureEntity('Variant', $variantKey, %$variantH);  
         $stats->Add(subsystems => 1);  
         # Connect it to the subsystem.  
         $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);  
         # Now we create the molecular machine connecting this genome to the subsystem  
         # variant.  
865          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
866          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
867          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
# Line 747  Line 869 
869          # Remember the machine ID.          # Remember the machine ID.
870          $machines{$subsystem} = $machineID;          $machines{$subsystem} = $machineID;
871      }      }
872      # Now we go through the bindings file. This file connects the subsystem roles to          }
873      # the molecular machines.          # Now we go through the bindings file. This file connects the subsystem
874      $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");          # roles to the molecular machines.
875            $ih = Open(undef, "<$bindFileName");
876      # Loop through the bindings.      # Loop through the bindings.
877      while (! eof $ih) {      while (! eof $ih) {
878          # Get the binding data.          # Get the binding data.
879          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
880          # Normalize the subsystem name.          # Normalize the subsystem name.
881          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
882                # Insure the subsystem is in the database.
883                if ($subsystems{$subsystem}) {
884          # Compute the machine role.          # Compute the machine role.
885          my $machineRoleID = $machineRoles{$subsystem}{$role};          my $machineRoleID = $machineRoles{$subsystem}{$role};
886          # Insure it exists.          # Insure it exists.
# Line 771  Line 896 
896          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
897      }      }
898  }  }
899        }
900    }
901    
902  =head3 CreateGenome  =head3 CreateGenome
903    
# Line 793  Line 920 
920      my $dir = $self->{directory};      my $dir = $self->{directory};
921      # Get the sapling database.      # Get the sapling database.
922      my $sap = $self->{sap};      my $sap = $self->{sap};
923        # Get the MD5 computer.
924        my $md5C = $self->{md5};
925      # We'll put the genome attributes in here.      # We'll put the genome attributes in here.
926      my %fields;      my %fields;
927      # Check for a basic statistics file.      # Check for a basic statistics file.
# Line 820  Line 949 
949          $fields{'scientific-name'} = $line;          $fields{'scientific-name'} = $line;
950          # Get the taxonomy and extract the domain from it.          # Get the taxonomy and extract the domain from it.
951          $ih = Open(undef, "<$dir/TAXONOMY");          $ih = Open(undef, "<$dir/TAXONOMY");
952          ($fields{domain}) = split /;/, <$ih>, 2;          ($fields{domain}) = split m/;/, <$ih>, 2;
953      }      }
954      # Get the counts from the statistics object.      # Get the counts from the statistics object.
955      my $stats = $self->{stats};      my $stats = $self->{stats};
# Line 828  Line 957 
957      $fields{'dna-size'} = $stats->Ask('dna');      $fields{'dna-size'} = $stats->Ask('dna');
958      $fields{pegs} = $stats->Ask('peg');      $fields{pegs} = $stats->Ask('peg');
959      $fields{rnas} = $stats->Ask('rna');      $fields{rnas} = $stats->Ask('rna');
960        $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
961      # Get the genetic code. The default is 11.      # Get the genetic code. The default is 11.
962      $fields{'genetic-code'} = 11;      $fields{'genetic-code'} = 11;
963      my $geneticCodeFile = "$dir/GENETIC_CODE";      my $geneticCodeFile = "$dir/GENETIC_CODE";
# Line 838  Line 968 
968      }      }
969      # Use the domain to determine whether or not the genome is prokaryotic.      # Use the domain to determine whether or not the genome is prokaryotic.
970      $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;      $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
971        # Compute the genome MD5.
972        $fields{md5_identifier} = $md5C->CloseGenome();
973      # Finally, add the genome ID.      # Finally, add the genome ID.
974      $fields{id} = $self->{genome};      $fields{id} = $self->{genome};
975      # Create the genome record.      # Create the genome record.
# Line 937  Line 1069 
1069    
1070  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1071    
1072  =head3 DeleteRelatedRecords  =head3 ReadAnnotation
1073    
1074      DeleteRelatedRecords($sap, $genome, $stats, $relName, $entityName);      my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1075    
1076  Delete all the records in the named entity and relationship relating to the  Read the next record from an annotation file. The next record must exist (that is, an
1077  specified genome and roll up the statistics in the specified statistics object.  end-of-file check should have been performed before calling this method).
1078    
1079  =over 4  =over 4
1080    
1081  =item sap  =item ih
   
 L<Sapling> object for accessing the database.  
   
 =item genome  
   
 ID of the relevant genome.  
   
 =item stats  
   
 L<Stats> object for tracking the delete activity.  
   
 =item relName  
   
 Name of a relationship from the B<Genome> table.  
   
 =item entityName  
1082    
1083  Name of the entity on the other side of the relationship.  Open file handle for the annotation file.
   
 =back  
   
 =cut  
   
 sub DeleteRelatedRecords {  
     # Get the parameters.  
     my ($sap, $genome, $stats, $relName, $entityName) = @_;  
     # Get all the relationship records.  
     my (@targets) = $sap->GetFlat($relName, "$relName(from-link) = ?", [$genome],  
                                   "to-link");  
     # Loop through the relationship records, deleting them and the target entity  
     # records.  
     for my $target (@targets) {  
         # Delete the relationship instance.  
         $sap->DeleteRow($relName, $genome, $target);  
         $stats->Add($relName => 1);  
         # Delete the entity instance.  
         my $subStats = $sap->Delete($entityName, $target);  
         # Roll up the statistics.  
         $stats->Accumulate($subStats);  
     }  
 }  
   
 =head3 ExtractFields  
   
     my %fieldHash = SaplingGenomeLoader::ExtractFields($tableName, $dataHash);  
   
 Extract from the incoming hash the field names and values from the specified table.  
   
 =over 4  
   
 =item tableName  
   
 Name of the table whose field names and values are desired.  
   
 =item dataHash  
   
 Reference to a hash mapping fully-qualified ERDB field names to values.  
1084    
1085  =item RETURN  =item RETURN
1086    
1087  Returns a hash containing only the fields from the specified table and their values.  Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1088    timestamp, (2) the user ID, and (3) the annotation text.
1089    
1090  =back  =back
1091    
1092  =cut  =cut
1093    
1094  sub ExtractFields {  sub ReadAnnotation {
1095      # Get the parameters.      # Get the parameter.
1096      my ($tableName, $dataHash) = @_;      my ($ih) = @_;
1097      # Declare the return variable.      # Read the three fixed fields.
1098      my %retVal;      my $fid = <$ih>; chomp $fid;
1099      # Extract the desired fields.      my $timestamp = <$ih>; chomp $timestamp;
1100      for my $field (keys %$dataHash) {      my $user = <$ih>; chomp $user;
1101          # Is this a field for the specified table?      # Loop through the lines of the text field.
1102          if ($field =~ /^$tableName\(([^)]+)/) {      my $text = "";
1103              # Yes, put it in the output hash.      my $line = <$ih>;
1104              $retVal{$1} = $dataHash->{$field};      while (defined($line) && $line ne "//\n") {
1105          }          $text .= $line;
1106      }          $line = <$ih>;
1107      # Return the computed hash.      }
1108      return %retVal;      # Remove the trailing new-line from the text.
1109        chomp $text;
1110        # Return the fields.
1111        return ($fid, $timestamp, $user, $text);
1112  }  }
1113    
1114  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.18

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3