[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.5, Sun Feb 13 13:02:30 2011 UTC revision 1.20, Fri Jul 10 14:22:27 2015 UTC
# Line 26  Line 26 
26      use SAPserver;      use SAPserver;
27      use Sapling;      use Sapling;
28      use AliasAnalysis;      use AliasAnalysis;
29        use BaseSaplingLoader;
30        use MD5Computer;
31      use base qw(SaplingDataLoader);      use base qw(SaplingDataLoader);
32    
33  =head1 Sapling Genome Loader  =head1 Sapling Genome Loader
# Line 67  Line 69 
69      # Create the loader object.      # Create the loader object.
70      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
71      # Load the contigs.      # Load the contigs.
72      Trace("Loading contigs for $genome.") if T(2);      Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
73      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
74      # Load the features.      # Load the features.
75      Trace("Loading features for $genome.") if T(2);      Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
76      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
77        # Check for annotation history. If we have it, load the history records into the
78        # database.
79        if (-f "$directory/annotations") {
80            Trace("Processing annotations.") if T(SaplingDataLoader => 3);
81            $loaderObject->LoadAnnotations("$directory/annotations");
82        }
83      # Load the subsystem bindings.      # Load the subsystem bindings.
84      Trace("Loading subsystems for $genome.") if T(2);      Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
85      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
86      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
87      Trace("Creating root for $genome.") if T(2);      Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
88      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
89      # Return the statistics.      # Return the statistics.
90      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 112  Line 120 
120      my ($sap, $genome) = @_;      my ($sap, $genome) = @_;
121      # Create the statistics object.      # Create the statistics object.
122      my $stats = Stats->new();      my $stats = Stats->new();
123      # Delete the DNA.      # Delete the DNA sequences.
124      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
125        for my $seq (@seqs) {
126            my $delStats = $sap->Delete(DNASequence => $seq);
127            $stats->Accumulate($delStats);
128        }
129      # Delete the contigs.      # Delete the contigs.
130      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
131      # Delete the features.      # Delete the features.
# Line 124  Line 136 
136      my $subStats = $sap->Delete(Genome => $genome);      my $subStats = $sap->Delete(Genome => $genome);
137      # Accumulate the statistics from the delete.      # Accumulate the statistics from the delete.
138      $stats->Accumulate($subStats);      $stats->Accumulate($subStats);
139        Trace("Statistics for delete of $genome:\n" . $stats->Show()) if T(SaplingDataLoader => 3);
140      # Return the statistics object.      # Return the statistics object.
141      return $stats;      return $stats;
142  }  }
143    
144    
145    =head3 Process
146    
147        my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
148    
149    Load genome data from the specified directory. If the genome data already
150    exists in the database, it will be deleted first.
151    
152    =over 4
153    
154    =item sap
155    
156    L</Sapling> object for accessing the database.
157    
158    =item genome
159    
160    ID of the genome whose  data is being loaded.
161    
162    =item directory
163    
164    Name of the directory containing the genome data files. If omitted, the
165    genome will be deleted from the database.
166    
167    =item RETURN
168    
169    Returns a statistics object describing the activity during the reload.
170    
171    =back
172    
173    =cut
174    
175    sub Process {
176        # Get the parameters.
177        my ($sap, $genome, $directory) = @_;
178        # Clear the existing data for the specified genome.
179        my $stats = ClearGenome($sap, $genome);
180        # Load the new genome data from the specified directory (if one is
181        # specified).
182        if ($directory) {
183            my $newStats = Load($sap, $genome, $directory);
184            # Merge the statistics.
185            $stats->Accumulate($newStats);
186        }
187        # Return the result.
188        return $stats;
189    }
190    
191    
192  =head2 Loader Object Methods  =head2 Loader Object Methods
193    
194  =head3 new  =head3 new
# Line 189  Line 250 
250      # Add our specialized data.      # Add our specialized data.
251      $retVal->{genome} = $genome;      $retVal->{genome} = $genome;
252      $retVal->{directory} = $directory;      $retVal->{directory} = $directory;
253        # Leave the assignment hash undefined until we populate it.
254        $retVal->{assignHash} = undef;
255        # Create an MD5 Computer for this genome.
256        $retVal->{md5} = MD5Computer->new();
257      # Return the result.      # Return the result.
258      return $retVal;      return $retVal;
259  }  }
# Line 198  Line 263 
263      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
264    
265  Load the contig information into the database. This includes the contigs themselves and  Load the contig information into the database. This includes the contigs themselves and
266  the DNA. The number of contigs will be recorded as the C<contigs> statistic and the  the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
267  number of base pairs as the C<dna> statistic.  number of base pairs as the C<dna> statistic, and the number of GC instances as the
268    C<gc_content> statistic.
269    
270  =cut  =cut
271    
# Line 213  Line 279 
279      # Get the genome ID.      # Get the genome ID.
280      my $genome = $self->{genome};      my $genome = $self->{genome};
281      # These variables will contain the current contig ID, the current contig length,      # These variables will contain the current contig ID, the current contig length,
282      # the ordinal of the current chunk, the accumulated DNA sequence, and its length.      # the accumulated DNA sequence for the current chunk, and its length.
283      my ($contigID, $contigLen, $ordinal, $chunk, $chunkLen) = (undef, 0, 0, '', 0);      my ($contigID, $contigLen, $chunk, $chunkLen) = (undef, 0, '', 0);
284        # This variable contains a list of the chunks, for use in computing the MD5.
285        my @chunks;
286      # Loop through the contig file.      # Loop through the contig file.
287      while (! eof $ih) {      while (! eof $ih) {
288          # Get the current record.          # Get the current record.
# Line 225  Line 293 
293              my $newContigID = $1;              my $newContigID = $1;
294              # Is there a current contig?              # Is there a current contig?
295              if (defined $contigID) {              if (defined $contigID) {
296                  # Yes. Output the current chunk.                  # Yes. Output the contig.
297                  $self->OutputChunk($contigID, $ordinal, $chunk);                  $self->OutputContig($contigID, $contigLen, $chunk, \@chunks);
                 # Output the contig.  
                 $self->OutputContig($contigID, $contigLen);  
298              }              }
299              # Compute the new contig ID. We need to insure it has a genome ID in front.              # Compute the new contig ID. We need to insure it has a genome ID in front.
300              $self->FixContig(\$newContigID);              $self->FixContig(\$newContigID);
301              # Initialize the new contig.              # Initialize the new contig.
302              $contigID = $newContigID;              $contigID = $newContigID;
303              $contigLen = 0;              $contigLen = 0;
             $ordinal = 0;  
304              $chunk = '';              $chunk = '';
305              $chunkLen = 0;              $chunkLen = 0;
306                @chunks = ();
307          } else {          } else {
308              # Here we have more DNA in the current contig. Are we at the end of              # Here we have more DNA in the current contig. Are we at the end of
309              # the current chunk?              # the current chunk?
# Line 251  Line 317 
317                  # Yes. Create the actual chunk.                  # Yes. Create the actual chunk.
318                  $chunk .= substr($line, 0, $segmentLength - $chunkLen);                  $chunk .= substr($line, 0, $segmentLength - $chunkLen);
319                  # Write it out.                  # Write it out.
320                  $self->OutputChunk($contigID, $ordinal, $chunk);                  $self->OutputChunk($contigID, scalar @chunks, $chunk);
321                  # Set up the new chunk.                  # Set up the new chunk.
322                  $chunk = substr($line, $segmentLength - $chunkLen);                  $chunk = substr($line, $segmentLength - $chunkLen);
                 $ordinal++;  
323                  $chunkLen = length($chunk);                  $chunkLen = length($chunk);
324                    push @chunks, $chunk;
325              }              }
326              # Update the contig length.              # Update the contig length.
327              $contigLen += $lineLen;              $contigLen += $lineLen;
# Line 263  Line 329 
329      }      }
330      # Is there a current contig?      # Is there a current contig?
331      if (defined $contigID) {      if (defined $contigID) {
332          # Yes. Output the residual chunk.          # Yes. Output the contig itself.
333          $self->OutputChunk($contigID, $ordinal, $chunk);          $self->OutputContig($contigID, $contigLen, $chunk, \@chunks);
         # Output the contig itself.  
         $self->OutputContig($contigID, $contigLen);  
334      }      }
335  }  }
336    
# Line 307  Line 371 
371      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
372      # Record the chunk.      # Record the chunk.
373      $self->{stats}->Add(chunks => 1);      $self->{stats}->Add(chunks => 1);
374        # Update the GC count.
375        $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
376  }  }
377    
378  =head3 OutputContig  =head3 OutputContig
379    
380      $loaderObject->OutputContig($contigID, $contigLen);      $loaderObject->OutputContig($contigID, $contigLen, $chunk, \@chunks);
381    
382  Write out the current contig.  Write out the current contig.
383    
# Line 325  Line 391 
391    
392  Length of the contig in base pairs.  Length of the contig in base pairs.
393    
394    =item chunk
395    
396    Last DNA chunk of the contig.
397    
398    =item chunks
399    
400    Reference to a list of the DNA chunks up to (but not including) the last one.
401    
402  =back  =back
403    
404  =cut  =cut
405    
406  sub OutputContig {  sub OutputContig {
407      # Get the parameters.      # Get the parameters.
408      my ($self, $contigID, $contigLen) = @_;      my ($self, $contigID, $contigLen, $chunk, $chunks) = @_;
409      # Get the sapling object.      # Get the sapling object.
410      my $sap = $self->{sap};      my $sap = $self->{sap};
411        # Get the MD5 computer.
412        my $md5C = $self->{md5};
413        # Output the last chunk.
414        $self->OutputChunk($contigID, scalar @$chunks, $chunk);
415      # Connect the contig to the genome.      # Connect the contig to the genome.
416      $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);      $sap->InsertObject('IsMadeUpOf', from_link => $self->{genome}, to_link => $contigID);
417        # Compute the MD5.
418        push @$chunks, $chunk;
419        my $contigMD5 = $md5C->ProcessContig($contigID, $chunks);
420      # Output the contig record.      # Output the contig record.
421      $sap->InsertObject('Contig', id => $contigID, length => $contigLen);      $sap->InsertObject('Contig', id => $contigID, length => $contigLen,
422                md5_identifier => $contigMD5);
423      # Record the contig.      # Record the contig.
424      $self->{stats}->Add(contigs => 1);      $self->{stats}->Add(contigs => 1);
425      $self->{stats}->Add(dna => $contigLen);      $self->{stats}->Add(dna => $contigLen);
# Line 357  Line 439 
439  sub LoadFeatures {  sub LoadFeatures {
440      # Get the parameters.      # Get the parameters.
441      my ($self) = @_;      my ($self) = @_;
442        # Read in the functional assignments.
443        Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
444        my $assignHash = $self->ReadAssignments();
445        $self->{assignHash} = $assignHash;
446      # Get the directory of feature types.      # Get the directory of feature types.
447      my $featureDir = "$self->{directory}/Features";      my $featureDir = "$self->{directory}/Features";
448      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
449        # Check for protein sequences. If we have some, load them into a hash.
450        my $protHash = {};
451        if (-f "$featureDir/peg/fasta") {
452            Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
453            $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
454        }
455      # Create the feature records for the types found.      # Create the feature records for the types found.
456      for my $type (@types) {      for my $type (@types) {
457          # Insure this is a genuine feature directory.          # Insure this is a genuine feature directory.
458          if (-f "$featureDir/$type/tbl") {          if (-f "$featureDir/$type/tbl") {
459              # Yes, load the feature data.              # Yes. Read in the evidence codes (if any).
460              $self->LoadFeatureData($featureDir, $type);              my $evHash = {};
461                my $tranFile = "$featureDir/$type/Attributes/transaction_log";
462                if (-f $tranFile) {
463                    $evHash = $self->LoadEvidenceCodes($tranFile);
464          }          }
465                # Now load the feature data.
466                $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
467      }      }
     # Check for protein sequences. If we have some, load them into the database.  
     if (-f "$featureDir/peg/fasta") {  
         $self->LoadProteinData("$featureDir/peg/fasta");  
468      }      }
469  }  }
470    
471    =head3 LoadEvidenceCodes
472    
473        my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
474    
475    Load the evidence codes from the specified attribute transaction log file into a
476    hash. The log file is in tab-delimited format. The first column contains the
477    transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
478    the third an attribute name (we'll ignore everything but C<evidence_code>), and
479    the fourth the attribute value.
480    
481    =over 4
482    
483    =item attributeFile
484    
485    Name of the attribute transaction log file.
486    
487    =item RETURN
488    
489    Returns a reference to a hash mapping each feature ID to a comma-delimited list
490    of its evidence codes.
491    
492    =back
493    
494    =cut
495    
496    sub LoadEvidenceCodes {
497        # Get the parameters.
498        my ($self, $attributeFile) = @_;
499        # Get the Sapling database.
500        my $sap = $self->{sap};
501        # Get the statistics object.
502        my $stats = $self->{stats};
503        # Get the assignment hash: we use this to filter the feature IDs.
504        my $assignHash = $self->{assignHash};
505        # Open the attribute log file for input.
506        my $ih = Open(undef, "<$attributeFile");
507        # This two-dimensional hash will hold the evidence codes for each feature.
508        my %retVal;
509        # Loop through the attribute log file.
510        while (! eof $ih) {
511            # Get the current attribute record.
512            my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
513            $stats->Add(attributeLine => 1);
514            # Insure we have all the pieces we need.
515            if (! $command || ! $fid || $key ne 'evidence_code') {
516                $stats->Add(attributeLineSkipped => 1);
517            } elsif (! $assignHash->{$fid}) {
518                # Here the attribute is for a deleted feature.
519                $stats->Add(attributeFidSkipped => 1);
520            } else {
521                # Get the sub-hash for this feature.
522                if (! exists $retVal{$fid}) {
523                    $retVal{$fid} = {};
524                }
525                my $featureSubHash = $retVal{$fid};
526                # Process according to the command.
527                if ($command eq 'ADD') {
528                    # Here we are adding a new evidence code.
529                    $featureSubHash->{$value} = 1;
530                    $stats->Add(attributeAdd => 1);
531                } elsif ($command eq 'DELETE') {
532                    # Here we are deleting an evidence code.
533                    delete $featureSubHash->{$value};
534                    $stats->Add(attributeDelete => 1);
535                } else {
536                    # here we have an unrecognized command.
537                    $stats->Add(attributeCommandSkip => 1);
538                }
539            }
540        }
541        # Loop through the hash, converting each sub-hash to a comma-delimited list of
542        # evidence codes.
543        for my $fid (keys %retVal) {
544            $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
545        }
546        # Return the evidence hash.
547        return \%retVal;
548    }
549    
550    
551  =head3 LoadFeatureData  =head3 LoadFeatureData
552    
553      $loaderObject->LoadFeatureData($featureDir, $type);      $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
554    
555  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
556  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 391  Line 565 
565    
566  Type of feature to load.  Type of feature to load.
567    
568    =item protHash
569    
570    Reference to a hash mapping each feature ID for a protein-encoding gene to
571    its protein sequence.
572    
573    =item evHash
574    
575    Reference to a hash mapping each feature ID to a comma-delimited list of
576    its evidence codes (if any).
577    
578  =back  =back
579    
580  =cut  =cut
581    
582  sub LoadFeatureData {  sub LoadFeatureData {
583      # Get the parameters.      # Get the parameters.
584      my ($self, $featureDir, $type) = @_;      my ($self, $featureDir, $type, $protHash, $evHash) = @_;
585      # Get the sapling database.      # Get the sapling database.
586      my $sap = $self->{sap};      my $sap = $self->{sap};
     # Get the maximum location  segment length. We'll need this later.  
     my $maxLength = $sap->TuningParameter('maxLocationLength');  
587      # Get the statistics object.      # Get the statistics object.
588      my $stats = $self->{stats};      my $stats = $self->{stats};
589      # Read in the functional assignments.      # Get the assignment hash. This tells us our functional assignments. This method is
590      my $assignHash = $self->ReadAssignments();      # also where we remove the deleted features from it.
591        my $assignHash = $self->{assignHash};
592      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
593      # time, it overwrites the original.      # time, it overwrites the original.
594      my %fids;      my %fidHash;
595        # This hash tracks the deleted features. We don't want to update these.
596        my %deleted_features;
597      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
598      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
599      if (-f $fileName) {      if (-f $fileName) {
600          # We have one, so we can read through it. First, however, we need to get the list          # We have one, so we can read through it. First, however, we need to get the list
601          # of deleted features.          # of deleted features and remove them from the assignment hash. This insures
602          my %deletedFids;          # that they are not used by subsequent methods.
603          my $deleteFile = "$featureDir/$type/deleted.features";          my $deleteFile = "$featureDir/$type/deleted.features";
604          if (-f $deleteFile) {          if (-f $deleteFile) {
605              %deletedFids = map { $_ => 1 } Tracer::GetFile($deleteFile);              my $dh = Open(undef, "<$deleteFile");
606                while (! eof $dh) {
607                    my ($deletedFid) = Tracer::GetLine($dh);
608                    if (exists $assignHash->{$deletedFid}) {
609                        delete $assignHash->{$deletedFid};
610                        $stats->Add(deletedFid => 1);
611                        $deleted_features{$deletedFid} = 1;
612                    }
613                }
614          }          }
615          # Open the main file for input.          # Open the main file for input.
616            Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
617          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
618          while (! eof $ih) {          while (! eof $ih) {
619              # Read this feature's information.              # Read this feature's information.
620              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
621              # Only proceed if the feature is NOT deleted.              # Only proceed if the feature is NOT deleted.
622              if (! exists $deletedFids{$fid}) {              if (!$deleted_features{$fid}) {
623                  # If the feature already exists, delete it. (This should be extremely rare.)                  # If the feature already exists, delete it. (This should be extremely rare.)
624                  if (exists $fids{$fid}) {                  if ($fidHash{$fid}) {
625                      $sap->Delete(Feature => $fid);                      $sap->Delete(Feature => $fid);
626                      $stats->Add(duplicateFid => 1);                      $stats->Add(duplicateFid => 1);
                 }  
                 # Otherwise connect this feature to the genome.  
                 $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);  
                 # Now we must parse the locations. This will contain a list of the location  
                 # data 4-tuples (contig, start, dir, len).  
                 my @locData;  
                 # This will contain the total sequence length.  
                 my $length = 0;  
                 # Loop through the locations.  
                 for my $loc (split /\s*,\s*/, $locations) {  
                     # Get this location's components.  
                     my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);  
                     # Normalize the contig.  
                     $self->FixContig(\$contig);  
                     # Compute the direction.  
                     if ($start <= $stop) {  
                         # Here we have the forward strand.  
                         my $len = $stop + 1 - $start;  
                         push @locData, [$contig, $start, '+', $len];  
                         $length += $len;  
627                      } else {                      } else {
628                          # Here we have the reverse strand.                      # It doesn't exist, so record it in the statistics.
                         my $len = $start + 1 - $stop;  
                         push @locData, [$contig, $stop, '-', $len];  
                         $length += $len;  
                     }  
                 }  
                 # Now we can create the feature record.  
                 $sap->InsertObject('Feature', id => $fid, feature_type => $type,  
                                    function => $assignHash->{$fid}, locked => 0,  
                                    sequence_length => $length);  
                 $fids{$fid} = 1;  
629                  $stats->Add($type => 1);                  $stats->Add($type => 1);
                 # The next step is to connect the feature record to its locations. This  
                 # involves dividing the location into segments. The following variable  
                 # will count the total number of segments.  
                 my $segment = 0;  
                 # Loop through the locations.  
                 for my $loc (@locData) {  
                     # Get the current location's information.  
                     my ($contig, $left, $dir, $len) = @$loc;  
                     # Peel off any segments.  
                     while ($len > $maxLength) {  
                         # Process according to the direction.  
                         if ($dir eq '+') {  
                             # Forward strand: peel from the beginning.  
                             $self->ConnectLocation($fid, $contig, $segment, $left, $dir,  
                                                    $maxLength);  
                             $left += $maxLength;  
                         } else {  
                             # Reverse strand: peel from the end.  
                             $self->ConnectLocation($fid, $contig, $segment,  
                                                   $left + $len - $maxLength, $dir,  
                                                   $maxLength);  
                         }  
                         # Denote we've used up a segment.  
                         $len -= $maxLength;  
                         $segment++;  
                     }  
                     # Output the last segment.  
                     $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);  
                 }  
                 # Now we process the aliases and create the identifiers. We don't do this  
                 # for RNA, because the RNA function is stored in the aliases.  
                 if ($type ne 'rna') {  
                     for my $alias (@aliases) {  
                         my $normalized;  
                         # Determine the type.  
                         my $aliasType = AliasAnalysis::TypeOf($alias);  
                         $stats->Add(aliasAll => 1);  
                         # Is this a recognized type?  
                         if ($aliasType) {  
                             $stats->Add(aliasNormal => 1);  
                             # Yes. Write it normally.  
                             $self->CreateIdentifier($alias, B => $aliasType, $fid);  
                         } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {  
                             # No, but this is a specially-marked locus tag.  
                             $normalized = $1;  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {  
                             # No, but this is a natural locus tag.  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {  
                             # No, but this is a natural gene name.  
                             $stats->Add(aliasGene => 1);  
                             $self->CreateIdentifier($normalized, B => 'GENE', $fid);  
                         } elsif ($alias =~ /^\d+$/) {  
                             # Here it's a naked number, which means it's a GI number  
                             # of some sort.  
                             $stats->Add(aliasGI => 1);  
                             $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);  
                         } elsif ($alias =~ /^protein_id\|(.+)/) {  
                             # Here we have a REFSEQ protein ID. Right now we don't have a way to  
                             # handle that, because we don't know the feature's protein ID here.  
                             $stats->Add(aliasProtein => 1);  
                         } elsif ($alias =~ /[:|]/) {  
                             # Here it's an alias of an unknown type, so we skip it.  
                             $stats->Add(aliasUnknown => 1);  
                         } else {  
                             # Here it's a miscellaneous type.  
                             $stats->Add(aliasMisc => 1);  
                             $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);  
630                          }                          }
631                    # If this is RNA, the alias list is always empty. Sometimes, the functional
632                    # assignment is found there.
633                    if ($type eq 'rna') {
634                        if (! $assignHash->{$fid}) {
635                            $assignHash->{$fid} = $aliases[0];
636                        }
637                        @aliases = ();
638                    }
639                    # Add the feature to the database.
640                    my $function = $assignHash->{$fid} || "";
641                    $self->AddFeature($fid, $function, $locations, \@aliases,
642                                      $protHash->{$fid}, $evHash->{$fid});
643                    # Denote we've added this feature, so that if a duplicate occurs we're ready.
644                    $fidHash{$fid} = 1;
645                      }                      }
646                  }                  }
647              }              }
648          }          }
649      }  
     # Now loop through the features, connecting them to their roles. Note that deleted  
     # features will not be in the assignment hash.  
     for my $fid (keys %$assignHash) {  
         # Get the roles and the error count.  
         my ($roles, $errors) = SeedUtils::roles_for_loading($assignHash->{$fid});  
         # Accumulate the errors in the stats object.  
         $stats->Add(roleErrors => $errors);  
         # Is this a suspicious function?  
         if (! defined $roles) {  
             # Yes, so track it.  
             $stats->Add(badFunction => 1);  
         } else {  
             # No, connect the roles.  
             for my $role (@$roles) {  
                 # Insure this role exists.  
                 my $hypo = hypo($role);  
                 $self->InsureEntity(Role => $role, hypothetical => $hypo);  
                 # Connect it to the feature.  
                 $sap->InsertObject('IsFunctionalIn', from_link => $role, to_link => $fid);  
             }  
         }  
     }  
 }  
650    
651  =head3 LoadProteinData  =head3 LoadProteinData
652    
653      $self->LoadProteinData($fileName);      my $protHash = $self->LoadProteinData($fileName);
654    
655  Load the protein sequences from the named FASTA file. The sequences will be stored  Load the protein sequences from the named FASTA file. The sequences will be stored
656  in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature  in a hash by FIG feature ID.
 records themselves will not be created (it is presumed they are already there).  
657    
658  =over 4  =over 4
659    
# Line 578  Line 661 
661    
662  Name of the FASTA file containing the protein sequences for this genome.  Name of the FASTA file containing the protein sequences for this genome.
663    
664    =item RETURN
665    
666    Returns a hash mapping feature IDs to protein sequences.
667    
668  =back  =back
669    
670  =cut  =cut
# Line 587  Line 674 
674      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
675      # Open the FASTA file for input.      # Open the FASTA file for input.
676      my $ih = Open(undef, "<$fileName");      my $ih = Open(undef, "<$fileName");
677        # Create the return hash.
678        my $retVal = {};
679      # We'll track the current protein in here.      # We'll track the current protein in here.
680      my $fid;      my $fid;
681      my $sequence = "";      my $sequence = "";
# Line 600  Line 689 
689              my $newFid = $1;              my $newFid = $1;
690              # Do we have an existing protein?              # Do we have an existing protein?
691              if (defined $fid) {              if (defined $fid) {
692                  # Yes. Write it out.                  # Yes. Store it in the hash.
693                  $self->WriteProtein($fid, $sequence);                  $retVal->{$fid} = $sequence;
694              }              }
695              # Initialize for the next protein.              # Initialize for the next protein.
696              $fid = $newFid;              $fid = $newFid;
# Line 613  Line 702 
702      }      }
703      # Do we have a residual protein.      # Do we have a residual protein.
704      if (defined $fid) {      if (defined $fid) {
705          # Yes. Write it out.          # Yes. Store it in the hash.
706          $self->WriteProtein($fid, $sequence);          $retVal->{$fid} = $sequence;
707      }      }
708        # Return the hash.
709        return $retVal;
710  }  }
711    
712    
713    =head3 LoadAnnotations
714    
715        $loaderObject->LoadAnnotations($fileName);
716    
717    Read in the annotation history information and use it to create annotation records.
718    
719    =over 4
720    
721    =item fileName
722    
723    Name of the annotation history file. This file is formatted with four fields per
724    record. Each field is on a separate line, with a double slash (C<//>) used as the
725    line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
726    (formatted as an integer), (2) the user name, and (3) the annotation text.
727    
728    =back
729    
730    =cut
731    
732    sub LoadAnnotations {
733        # Get the parameters.
734        my ($self, $fileName) = @_;
735        # Get the assignment Hash. We use this to filter out deleted features.
736        my $assignHash = $self->{assignHash};
737        # Get the Sapling database.
738        my $sap = $self->{sap};
739        # Get the statistics object.
740        my $stats = $self->{stats};
741        # Open the input file.
742        my $ih = Tracer::Open(undef, "<$fileName");
743        # Loop through the input.
744        while (! eof $ih) {
745            # Read in the peg, timestamp, and user ID.
746            my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
747            # Only proceed if the feature is not deleted.
748            if ($assignHash->{$fid}) {
749                # Add the annotation to this feature.
750                $self->MakeAnnotation($fid, $text, $user, $timestamp);
751            }
752        }
753    }
754    
755    
756  =head3 WriteProtein  =head3 WriteProtein
757    
758      $loaderObject->WriteProtein($fid, $sequence);      $loaderObject->WriteProtein($fid, $sequence);
# Line 643  Line 778 
778      # Get the parameters.      # Get the parameters.
779      my ($self, $fid, $sequence) = @_;      my ($self, $fid, $sequence) = @_;
780      # Compute the key of the protein sequence.      # Compute the key of the protein sequence.
781      my $protID = ERDB::DigestKey($sequence);      my $protID = $self->{sap}->ProteinID($sequence);
782      # Insure the protein exists.      # Insure the protein exists.
783      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
784      # Connect the feature to it.      # Connect the feature to it.
# Line 652  Line 787 
787    
788  =head3 LoadSubsystems  =head3 LoadSubsystems
789    
790      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems($subsysList);
791    
792  Load the subsystem data into the database. This includes all of the subsystems,  Load the subsystem data into the database. This requires looking through the
793  their categories, roles, and the bindings that connect them to this genome's features.  bindings and using them to connect to the subsystems that already exist in the
794    database. If the subsystem does not exist, its bindings will not be loaded.
795    
796    =over 4
797    
798    =item subsysList
799    
800    Reference to a list of subsystem IDs. If specified, only the subsystems in the
801    list will be processed. This is useful when a particular set of subsystems
802    has been replaced.
803    
804    =back
805    
806  =cut  =cut
807    
808  sub LoadSubsystems {  sub LoadSubsystems {
809      # Get the parameters.      # Get the parameters.
810      my ($self) = @_;      my ($self, $subsysList) = @_;
811      # Get the sapling object.      # Get the sapling object.
812      my $sap = $self->{sap};      my $sap = $self->{sap};
813      # Get the statistics object.      # Get the statistics object.
814      my $stats = $self->{stats};      my $stats = $self->{stats};
815      # Get the genome ID.      # Get the genome ID.
816      my $genome = $self->{genome};      my $genome = $self->{genome};
817      # Connect to the Sapling server so we can get the subsystem variant and role information.      # Compute the subsystem and binding file names.
818      my $sapO = SAPserver->new();      my $subFileName = "$self->{directory}/Subsystems/subsystems";
819        my $bindFileName = "$self->{directory}/Subsystems/bindings";
820        # Get a hash of the molecular machines already connected to this genome.
821        my %machinesOld = map { $_ => 1 } $sap->GetFlat('Uses', 'Uses(from-link) = ?',
822                [$genome], 'to-link');
823        # Only proceed if both exist.
824        if (! -f $subFileName || ! -f $bindFileName) {
825            Trace("Missing subsystem data for $genome.") if T(1);
826            $stats->Add(noSubsystems => 1);
827        } else {
828      # This hash maps subsystem IDs to molecular machine IDs.      # This hash maps subsystem IDs to molecular machine IDs.
829      my %machines;      my %machines;
830      # This hash maps subsystem/role pairs to machine role IDs.      # This hash maps subsystem/role pairs to machine role IDs.
831      my %machineRoles;      my %machineRoles;
832      # The first step is to create the subsystems themselves and connect them to the          # This hash will contain the list of subsystems found in the database.
833      # genome. A list is given in the subsystems file of the Subsystems directory.          my %subsystems;
834      my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");          # We loop through the subsystems, looking for the ones already in the
835      # Create the field hash for the subsystem query.          # database. The list is given in the subsystems file of the Subsystems
836      my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)          # directory.
837                      Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)          my $ih = Open(undef, "<$subFileName");
                     Subsystem(version)  
                     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)  
                     Includes(sequence)  
                     Role(id) Role(hypothetical) Role(role-index));  
     my %fields = map { $_ => $_ } @fields;  
838      # Loop through the subsystems in the file, insuring we have them in the database.      # Loop through the subsystems in the file, insuring we have them in the database.
839      while (! eof $ih) {      while (! eof $ih) {
840          # Get this subsystem.          # Get this subsystem.
841          my ($subsystem, $variant) = Tracer::GetLine($ih);          my ($subsystem, $variant) = Tracer::GetLine($ih);
842                Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
843          # Normalize the subsystem name.          # Normalize the subsystem name.
844          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
845                # Insure the subsystem is in the database and is one we're interested in.
846                if ((! $subsysList || (grep { $_ eq $subsystem } @$subsysList)) &&
847                        $sap->Exists(Subsystem => $subsystem)) {
848                    # We have the subsystem in the database. We need to compute the machine
849                    # role IDs and create the molecular machine. First, we need to remember
850                    # the subsystem.
851                    $subsystems{$subsystem} = 1;
852          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
853          my $subsystemMD5 = ERDB::DigestKey($subsystem);          my $subsystemMD5 = ERDB::DigestKey($subsystem);
854          # Insure the subsystem is in the database.                  my $rolePrefix = "$subsystemMD5:$genome";
         if (! $sap->Exists(Subsystem => $subsystem)) {  
             # The subsystem isn't present, so we need to read it. We get the subsystem,  
             # its roles, and its classification information. The variant will be added  
             # later if we need it.  
             my $roleList = $sapO->get(-objects => "Subsystem Includes Role",  
                                       -filter => {"Subsystem(id)" => ["=", $subsystem] },  
                                       -fields => \%fields);  
             # Only proceed if we found some roles.  
             if (@$roleList > 0) {  
                 # Get the subsystem information from the first role and create the subsystem.  
                 my $roleH = $roleList->[0];  
                 my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);  
                 $sap->InsertObject('Subsystem', %subFields);  
                 # Now loop through the roles. The Includes records are always inserted, but the  
                 # roles are only inserted if they don't already exist.  
                 for $roleH (@$roleList) {  
                     # Create the Includes record.  
                     my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);  
                     $sap->InsertObject('Includes', %incFields);  
                     # Insure we have the role in place.  
                     my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);  
                     my $roleID = $roleFields{id};  
                     delete $roleFields{id};  
                     $self->InsureEntity('Role', $roleID, %roleFields);  
                     # Compute the machine-role ID.  
                     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});  
                     $machineRoles{$subsystem}{$roleID} = $machineRoleID;  
                 }  
             }  
         } else {  
             # We already have the subsystem in the database, but we still need to compute the  
             # machine role IDs. We need information from the sapling server for this.  
             my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);  
855              # Loop through the roles.              # Loop through the roles.
856              my $roleList = $subHash->{$subsystem};                  my @roleList = $sap->GetAll('Includes', 'Includes(from-link) = ?',
857              for my $roleTuple (@$roleList) {                          [$subsystem], "to-link abbreviation");
858                    for my $roleTuple (@roleList) {
859                  my ($roleID, $abbr) = @$roleTuple;                  my ($roleID, $abbr) = @$roleTuple;
860                  my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);                      my $machineRoleID = $rolePrefix . '::' . $abbr;
861                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;
862              }              }
863          }                  # Next we need the variant code and key.
864          # Now we need to connect the variant to the subsystem. First, we compute the real                  my $variantCode = BaseSaplingLoader::Starless($variant);
         # variant code by removing the asterisks.  
         my $variantCode = $variant;  
         $variantCode =~ s/^\*+//;  
         $variantCode =~ s/\*+$//;  
         # Compute the variant key.  
865          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
866          # Get the variant from the sapling server.                  # Now we create the molecular machine connecting this genome to the
867          my $variantH = $sapO->get(-objects => "Variant",                  # subsystem variant.
                                   -filter => {"Variant(id)" => ["=", $variantKey]},  
                                   -fields => {"Variant(code)" => "code",  
                                               "Variant(comment)" => "comment",  
                                               "Variant(type)" => "type",  
                                               "Variant(role-rule)" => "role-rule"},  
                                   -firstOnly => 1);  
         # Insure we have it in the database.  
         $self->InsureEntity('Variant', $variantKey, %$variantH);  
         $stats->Add(subsystems => 1);  
         # Connect it to the subsystem.  
         $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);  
         # Now we create the molecular machine connecting this genome to the subsystem  
         # variant.  
868          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
869                    # Does it already exist?
870                    if ($machinesOld{$machineID}) {
871                        # Yes. Output a warning.
872                        Trace("Machine $machineID already found for $subsystem and genome $genome.") if T(1);
873                        $stats->Add(duplicateMachine => 1);
874                    } else {
875          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
876          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
877          $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID);          $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID);
878                    }
879          # Remember the machine ID.          # Remember the machine ID.
880          $machines{$subsystem} = $machineID;          $machines{$subsystem} = $machineID;
881      }      }
882      # Now we go through the bindings file. This file connects the subsystem roles to          }
883      # the molecular machines.          # Now we go through the bindings file. This file connects the subsystem
884      $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");          # roles to the molecular machines.
885            $ih = Open(undef, "<$bindFileName");
886      # Loop through the bindings.      # Loop through the bindings.
887      while (! eof $ih) {      while (! eof $ih) {
888          # Get the binding data.          # Get the binding data.
889          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
890          # Normalize the subsystem name.          # Normalize the subsystem name.
891          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
892                # Insure the subsystem is in the database.
893                if ($subsystems{$subsystem}) {
894          # Compute the machine role.          # Compute the machine role.
895          my $machineRoleID = $machineRoles{$subsystem}{$role};          my $machineRoleID = $machineRoles{$subsystem}{$role};
896          # Insure it exists.          # Insure it exists.
# Line 788  Line 906 
906          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
907      }      }
908  }  }
909        }
910    }
911    
912  =head3 CreateGenome  =head3 CreateGenome
913    
# Line 810  Line 930 
930      my $dir = $self->{directory};      my $dir = $self->{directory};
931      # Get the sapling database.      # Get the sapling database.
932      my $sap = $self->{sap};      my $sap = $self->{sap};
933        # Get the MD5 computer.
934        my $md5C = $self->{md5};
935      # We'll put the genome attributes in here.      # We'll put the genome attributes in here.
936      my %fields;      my %fields;
937      # Check for a basic statistics file.      # Check for a basic statistics file.
# Line 837  Line 959 
959          $fields{'scientific-name'} = $line;          $fields{'scientific-name'} = $line;
960          # Get the taxonomy and extract the domain from it.          # Get the taxonomy and extract the domain from it.
961          $ih = Open(undef, "<$dir/TAXONOMY");          $ih = Open(undef, "<$dir/TAXONOMY");
962          ($fields{domain}) = split /;/, <$ih>, 2;          ($fields{domain}) = split m/;/, <$ih>, 2;
963      }      }
964      # Get the counts from the statistics object.      # Get the counts from the statistics object.
965      my $stats = $self->{stats};      my $stats = $self->{stats};
# Line 845  Line 967 
967      $fields{'dna-size'} = $stats->Ask('dna');      $fields{'dna-size'} = $stats->Ask('dna');
968      $fields{pegs} = $stats->Ask('peg');      $fields{pegs} = $stats->Ask('peg');
969      $fields{rnas} = $stats->Ask('rna');      $fields{rnas} = $stats->Ask('rna');
970        $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
971      # Get the genetic code. The default is 11.      # Get the genetic code. The default is 11.
972      $fields{'genetic-code'} = 11;      $fields{'genetic-code'} = 11;
973      my $geneticCodeFile = "$dir/GENETIC_CODE";      my $geneticCodeFile = "$dir/GENETIC_CODE";
# Line 855  Line 978 
978      }      }
979      # Use the domain to determine whether or not the genome is prokaryotic.      # Use the domain to determine whether or not the genome is prokaryotic.
980      $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;      $fields{prokaryotic} = PROK_FLAG->{$fields{domain}} || 0;
981        # Compute the genome MD5.
982        $fields{md5_identifier} = $md5C->CloseGenome();
983      # Finally, add the genome ID.      # Finally, add the genome ID.
984      $fields{id} = $self->{genome};      $fields{id} = $self->{genome};
985      # Create the genome record.      # Create the genome record.
# Line 954  Line 1079 
1079    
1080  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1081    
1082  =head3 CreateIdentifier  =head3 ReadAnnotation
1083    
1084      $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);      my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1085    
1086  Link an identifier to a feature. The identifier is presented in prefixed form and is of the  Read the next record from an annotation file. The next record must exist (that is, an
1087  specified type and the specified confidence level.  end-of-file check should have been performed before calling this method).
1088    
1089  =over 4  =over 4
1090    
1091  =item alias  =item ih
1092    
1093  Identifier to connect to the feature.  Open file handle for the annotation file.
1094    
1095  =item conf  =item RETURN
   
 Confidence level (C<A> curated, C<B> normal, C<C> protein only).  
   
 =item aliasType  
   
 Type of alias (e.g. C<NCBI>, C<LocusTag>).  
   
 =item fid  
1096    
1097  ID of the relevant feature.  Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1098    timestamp, (2) the user ID, and (3) the annotation text.
1099    
1100  =back  =back
1101    
1102  =cut  =cut
1103    
1104  sub CreateIdentifier {  sub ReadAnnotation {
1105      # Get the parameters.      # Get the parameter.
1106      my ($self, $alias, $conf, $aliasType, $fid) = @_;      my ($ih) = @_;
1107      # Get the Sapling object.      # Read the three fixed fields.
1108      my $sap = $self->{sap};      my $fid = <$ih>; chomp $fid;
1109      # Compute the identifier's natural form.      my $timestamp = <$ih>; chomp $timestamp;
1110      my $natural = $alias;      my $user = <$ih>; chomp $user;
1111      if ($natural =~ /[:|](.+)/) {      # Loop through the lines of the text field.
1112          $natural = $1;      my $text = "";
1113      }      my $line = <$ih>;
1114      # Insure the identifier exists in the database.      while (defined($line) && $line ne "//\n") {
1115      $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);          $text .= $line;
1116      # Connect the identifier to the feature.          $line = <$ih>;
1117      $sap->InsertObject('IsIdentifiedBy', to_link => $alias, from_link => $fid, conf => $conf);      }
1118        # Remove the trailing new-line from the text.
1119        chomp $text;
1120        # Return the fields.
1121        return ($fid, $timestamp, $user, $text);
1122  }  }
1123    
1124  1;  1;

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.20

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3