[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.8, Fri Apr 1 20:48:01 2011 UTC revision 1.17, Tue Mar 26 02:10:46 2013 UTC
# Line 67  Line 67 
67      # Create the loader object.      # Create the loader object.
68      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
69      # Load the contigs.      # Load the contigs.
70      Trace("Loading contigs for $genome.") if T(2);      Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
71      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
72      # Load the features.      # Load the features.
73      Trace("Loading features for $genome.") if T(2);      Trace("Loading features for $genome.") if T(SaplingDataLoader => 2);
74      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
75      # Check for annotation history. If we have it, load the history records into the      # Check for annotation history. If we have it, load the history records into the
76      # database.      # database.
77      if (-f "$directory/annotations") {      if (-f "$directory/annotations") {
78          Trace("Processing annotations.") if T(3);          Trace("Processing annotations.") if T(SaplingDataLoader => 3);
79          $loaderObject->LoadAnnotations("$directory/annotations");          $loaderObject->LoadAnnotations("$directory/annotations");
80      }      }
81      # Load the subsystem bindings.      # Load the subsystem bindings.
82      Trace("Loading subsystems for $genome.") if T(2);      Trace("Loading subsystems for $genome.") if T(SaplingDataLoader => 2);
83      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
84      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
85      Trace("Creating root for $genome.") if T(2);      Trace("Creating root for $genome.") if T(SaplingDataLoader => 2);
86      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
87      # Return the statistics.      # Return the statistics.
88      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 118  Line 118 
118      my ($sap, $genome) = @_;      my ($sap, $genome) = @_;
119      # Create the statistics object.      # Create the statistics object.
120      my $stats = Stats->new();      my $stats = Stats->new();
121      # Delete the DNA.      # Delete the DNA sequences.
122      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      my @seqs = $sap->GetFlat('DNASequence', 'DNASequence(id) LIKE ?', ["$genome:%"], 'id');
123        for my $seq (@seqs) {
124            my $delStats = $sap->Delete(DNASequence => $seq);
125            $stats->Accumulate($delStats);
126        }
127      # Delete the contigs.      # Delete the contigs.
128      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
129      # Delete the features.      # Delete the features.
# Line 154  Line 158 
158    
159  =item directory  =item directory
160    
161  Name of the directory containing the genome data files.  Name of the directory containing the genome data files. If omitted, the
162    genome will be deleted from the database.
163    
164  =item RETURN  =item RETURN
165    
# Line 169  Line 174 
174      my ($sap, $genome, $directory) = @_;      my ($sap, $genome, $directory) = @_;
175      # Clear the existing data for the specified genome.      # Clear the existing data for the specified genome.
176      my $stats = ClearGenome($sap, $genome);      my $stats = ClearGenome($sap, $genome);
177      # Load the new expression data from the specified directory.      # Load the new expression data from the specified directory (if one is
178        # specified).
179        if ($directory) {
180      my $newStats = Load($sap, $genome, $directory);      my $newStats = Load($sap, $genome, $directory);
181      # Merge the statistics.      # Merge the statistics.
182      $stats->Accumulate($newStats);      $stats->Accumulate($newStats);
183        }
184      # Return the result.      # Return the result.
185      return $stats;      return $stats;
186  }  }
# Line 227  Line 235 
235    
236  L<Stats> object for tracking statistical information about the load.  L<Stats> object for tracking statistical information about the load.
237    
 =item timestamps  
   
 A hash of hashes, keyed by feature ID. The sub-hashes are keyed by annotation timestamp,  
 and used to prevent duplicate timestamps.  
   
238  =back  =back
239    
240  =cut  =cut
# Line 244  Line 247 
247      # Add our specialized data.      # Add our specialized data.
248      $retVal->{genome} = $genome;      $retVal->{genome} = $genome;
249      $retVal->{directory} = $directory;      $retVal->{directory} = $directory;
250      $retVal->{timestamps} = {};      # Leave the assignment hash undefined until we populate it.
251        $retVal->{assignHash} = undef;
252      # Return the result.      # Return the result.
253      return $retVal;      return $retVal;
254  }  }
# Line 254  Line 258 
258      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
259    
260  Load the contig information into the database. This includes the contigs themselves and  Load the contig information into the database. This includes the contigs themselves and
261  the DNA. The number of contigs will be recorded as the C<contigs> statistic and the  the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
262  number of base pairs as the C<dna> statistic.  number of base pairs as the C<dna> statistic, and the number of GC instances as the
263    C<gc_content> statistic.
264    
265  =cut  =cut
266    
# Line 363  Line 368 
368      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
369      # Record the chunk.      # Record the chunk.
370      $self->{stats}->Add(chunks => 1);      $self->{stats}->Add(chunks => 1);
371        # Update the GC count.
372        $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
373  }  }
374    
375  =head3 OutputContig  =head3 OutputContig
# Line 414  Line 421 
421      # Get the parameters.      # Get the parameters.
422      my ($self) = @_;      my ($self) = @_;
423      # Read in the functional assignments.      # Read in the functional assignments.
424      Trace("Reading functional assignments.") if T(3);      Trace("Reading functional assignments.") if T(SaplingDataLoader => 3);
425      my $assignHash = $self->ReadAssignments();      my $assignHash = $self->ReadAssignments();
426        $self->{assignHash} = $assignHash;
427      # Get the directory of feature types.      # Get the directory of feature types.
428      my $featureDir = "$self->{directory}/Features";      my $featureDir = "$self->{directory}/Features";
429      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
430        # Check for protein sequences. If we have some, load them into a hash.
431        my $protHash = {};
432        if (-f "$featureDir/peg/fasta") {
433            Trace("Processing protein sequences.") if T(SaplingDataLoader => 3);
434            $protHash = $self->LoadProteinData("$featureDir/peg/fasta");
435        }
436      # Create the feature records for the types found.      # Create the feature records for the types found.
437      for my $type (@types) {      for my $type (@types) {
438          # Insure this is a genuine feature directory.          # Insure this is a genuine feature directory.
439          if (-f "$featureDir/$type/tbl") {          if (-f "$featureDir/$type/tbl") {
440              # Yes, load the feature data.              # Yes. Read in the evidence codes (if any).
441              $self->LoadFeatureData($featureDir, $type, $assignHash);              my $evHash = {};
442                my $tranFile = "$featureDir/$type/Attributes/transaction_log";
443                if (-f $tranFile) {
444                    $evHash = $self->LoadEvidenceCodes($tranFile);
445          }          }
446                # Now load the feature data.
447                $self->LoadFeatureData($featureDir, $type, $protHash, $evHash);
448            }
449        }
450    }
451    
452    =head3 LoadEvidenceCodes
453    
454        my $evHash = $loaderObject->LoadEvidenceCodes($attributeFile);
455    
456    Load the evidence codes from the specified attribute transaction log file into a
457    hash. The log file is in tab-delimited format. The first column contains the
458    transaction code (either C<ADD> or C<DELETE>), the second column a feature ID,
459    the third an attribute name (we'll ignore everything but C<evidence_code>), and
460    the fourth the attribute value.
461    
462    =over 4
463    
464    =item attributeFile
465    
466    Name of the attribute transaction log file.
467    
468    =item RETURN
469    
470    Returns a reference to a hash mapping each feature ID to a comma-delimited list
471    of its evidence codes.
472    
473    =back
474    
475    =cut
476    
477    sub LoadEvidenceCodes {
478        # Get the parameters.
479        my ($self, $attributeFile) = @_;
480        # Get the Sapling database.
481        my $sap = $self->{sap};
482        # Get the statistics object.
483        my $stats = $self->{stats};
484        # Get the assignment hash: we use this to filter the feature IDs.
485        my $assignHash = $self->{assignHash};
486        # Open the attribute log file for input.
487        my $ih = Open(undef, "<$attributeFile");
488        # This two-dimensional hash will hold the evidence codes for each feature.
489        my %retVal;
490        # Loop through the attribute log file.
491        while (! eof $ih) {
492            # Get the current attribute record.
493            my ($command, $fid, $key, $value) = Tracer::GetLine($ih);
494            $stats->Add(attributeLine => 1);
495            # Insure we have all the pieces we need.
496            if (! $command || ! $fid || $key ne 'evidence_code') {
497                $stats->Add(attributeLineSkipped => 1);
498            } elsif (! $assignHash->{$fid}) {
499                # Here the attribute is for a deleted feature.
500                $stats->Add(attributeFidSkipped => 1);
501            } else {
502                # Get the sub-hash for this feature.
503                if (! exists $retVal{$fid}) {
504                    $retVal{$fid} = {};
505                }
506                my $featureSubHash = $retVal{$fid};
507                # Process according to the command.
508                if ($command eq 'ADD') {
509                    # Here we are adding a new evidence code.
510                    $featureSubHash->{$value} = 1;
511                    $stats->Add(attributeAdd => 1);
512                } elsif ($command eq 'DELETE') {
513                    # Here we are deleting an evidence code.
514                    delete $featureSubHash->{$value};
515                    $stats->Add(attributeDelete => 1);
516                } else {
517                    # here we have an unrecognized command.
518                    $stats->Add(attributeCommandSkip => 1);
519      }      }
     # Check for protein sequences. If we have some, load them into the database.  
     if (-f "$featureDir/peg/fasta") {  
         Trace("Processing protein sequences.") if T(3);  
         $self->LoadProteinData("$featureDir/peg/fasta");  
520      }      }
     # Now loop through the features, connecting them to their roles. Note that deleted  
     # features will not be in the assignment hash.  
     Trace("Connecting features to roles.") if T(3);  
     for my $fid (keys %$assignHash) {  
         $self->ConnectFunctionRoles($fid, $assignHash->{$fid});  
521      }      }
522        # Loop through the hash, converting each sub-hash to a comma-delimited list of
523        # evidence codes.
524        for my $fid (keys %retVal) {
525            $retVal{$fid} = join(",", sort keys %{$retVal{$fid}});
526  }  }
527        # Return the evidence hash.
528        return \%retVal;
529    }
530    
531    
532  =head3 LoadFeatureData  =head3 LoadFeatureData
533    
534      $loaderObject->LoadFeatureData($featureDir, $type, $assignHash);      $loaderObject->LoadFeatureData($featureDir, $type, $protHash, $evHash);
535    
536  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
537  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 457  Line 546 
546    
547  Type of feature to load.  Type of feature to load.
548    
549  =item assignHash  =item protHash
550    
551    Reference to a hash mapping each feature ID for a protein-encoding gene to
552    its protein sequence.
553    
554  Reference to a hash mapping each feature ID to its functional assignment.  =item evHash
555    
556    Reference to a hash mapping each feature ID to a comma-delimited list of
557    its evidence codes (if any).
558    
559  =back  =back
560    
# Line 467  Line 562 
562    
563  sub LoadFeatureData {  sub LoadFeatureData {
564      # Get the parameters.      # Get the parameters.
565      my ($self, $featureDir, $type, $assignHash) = @_;      my ($self, $featureDir, $type, $protHash, $evHash) = @_;
566      # Get the sapling database.      # Get the sapling database.
567      my $sap = $self->{sap};      my $sap = $self->{sap};
     # Get the maximum location  segment length. We'll need this later.  
     my $maxLength = $sap->TuningParameter('maxLocationLength');  
568      # Get the statistics object.      # Get the statistics object.
569      my $stats = $self->{stats};      my $stats = $self->{stats};
570        # Get the assignment hash. This tells us our functional assignments. This method is
571        # also where we remove the deleted features from it.
572        my $assignHash = $self->{assignHash};
573      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
574      # time, it overwrites the original.      # time, it overwrites the original.
575      my $fidHash = $self->{timestamps};      my %fidHash;
576      # Finally, we need the timestamp hash. The initial feature population      # This hash tracks the deleted features. We don't want to update these.
577        my %deleted_features;
578      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
579      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
580      if (-f $fileName) {      if (-f $fileName) {
581          # We have one, so we can read through it. First, however, we need to get the list          # We have one, so we can read through it. First, however, we need to get the list
582          # of deleted features.          # of deleted features and remove them from the assignment hash. This insures
583          my %deletedFids;          # that they are not used by subsequent methods.
584          my $deleteFile = "$featureDir/$type/deleted.features";          my $deleteFile = "$featureDir/$type/deleted.features";
585          if (-f $deleteFile) {          if (-f $deleteFile) {
586              %deletedFids = map { $_ => 1 } Tracer::GetFile($deleteFile);              my $dh = Open(undef, "<$deleteFile");
587                while (! eof $dh) {
588                    my ($deletedFid) = Tracer::GetLine($dh);
589                    if (exists $assignHash->{$deletedFid}) {
590                        delete $assignHash->{$deletedFid};
591                        $stats->Add(deletedFid => 1);
592                        $deleted_features{$deletedFid} = 1;
593                    }
594                }
595          }          }
596          # Open the main file for input.          # Open the main file for input.
597          Trace("Reading features from $fileName.") if T(3);          Trace("Reading features from $fileName.") if T(SaplingDataLoader => 3);
598          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
599          while (! eof $ih) {          while (! eof $ih) {
600              # Read this feature's information.              # Read this feature's information.
601              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
602              # Only proceed if the feature is NOT deleted.              # Only proceed if the feature is NOT deleted.
603              if (! exists $deletedFids{$fid}) {              if (!$deleted_features{$fid}) {
604                  # If the feature already exists, delete it. (This should be extremely rare.)                  # If the feature already exists, delete it. (This should be extremely rare.)
605                  if (exists $fidHash->{$fid}) {                  if ($fidHash{$fid}) {
606                      $sap->Delete(Feature => $fid);                      $sap->Delete(Feature => $fid);
607                      $stats->Add(duplicateFid => 1);                      $stats->Add(duplicateFid => 1);
608                  }                  }
609                  # Otherwise connect this feature to the genome.                  # If this is RNA, the alias list is always empty. Sometimes, the functional
610                  $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);                  # assignment is found there.
611                  # Now we must parse the locations. This will contain a list of the location                  if ($type eq 'rna') {
612                  # data 4-tuples (contig, start, dir, len).                      if (! $assignHash->{$fid}) {
613                  my @locData;                          $assignHash->{$fid} = $aliases[0];
614                  # This will contain the total sequence length.                      }
615                  my $length = 0;                      @aliases = ();
616                  # Loop through the locations.                  }
617                  for my $loc (split /\s*,\s*/, $locations) {                  # Add the feature to the database.
618                      # Get this location's components.                  my $function = $assignHash->{$fid} || "";
619                      my ($contig, $start, $stop) = ($loc =~ /(.+)_(\d+)_(\d+)$/);                  $self->AddFeature($fid, $function, $locations, \@aliases,
620                      # Normalize the contig.                                    $protHash->{$fid}, $evHash->{$fid});
621                      $self->FixContig(\$contig);                  # Denote we've added this feature, so that if a duplicate occurs we're ready.
622                      # Compute the direction.                  $fidHash{$fid} = 1;
                     if ($start <= $stop) {  
                         # Here we have the forward strand.  
                         my $len = $stop + 1 - $start;  
                         push @locData, [$contig, $start, '+', $len];  
                         $length += $len;  
                     } else {  
                         # Here we have the reverse strand.  
                         my $len = $start + 1 - $stop;  
                         push @locData, [$contig, $stop, '-', $len];  
                         $length += $len;  
                     }  
                 }  
                 # Now we can create the feature record.  
                 $sap->InsertObject('Feature', id => $fid, feature_type => $type,  
                                    function => $assignHash->{$fid}, locked => 0,  
                                    sequence_length => $length);  
                 $fidHash->{$fid} = {};  
                 $stats->Add($type => 1);  
                 # The next step is to connect the feature record to its locations. This  
                 # involves dividing the location into segments. The following variable  
                 # will count the total number of segments.  
                 my $segment = 0;  
                 # Loop through the locations.  
                 for my $loc (@locData) {  
                     # Get the current location's information.  
                     my ($contig, $left, $dir, $len) = @$loc;  
                     # Peel off any segments.  
                     while ($len > $maxLength) {  
                         # Process according to the direction.  
                         if ($dir eq '+') {  
                             # Forward strand: peel from the beginning.  
                             $self->ConnectLocation($fid, $contig, $segment, $left, $dir,  
                                                    $maxLength);  
                             $left += $maxLength;  
                         } else {  
                             # Reverse strand: peel from the end.  
                             $self->ConnectLocation($fid, $contig, $segment,  
                                                   $left + $len - $maxLength, $dir,  
                                                   $maxLength);  
                         }  
                         # Denote we've used up a segment.  
                         $len -= $maxLength;  
                         $segment++;  
                     }  
                     # Output the last segment.  
                     $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);  
                 }  
                 # Now we process the aliases and create the identifiers. We don't do this  
                 # for RNA, because the RNA function is stored in the aliases.  
                 if ($type ne 'rna') {  
                     for my $alias (@aliases) {  
                         my $normalized;  
                         # Determine the type.  
                         my $aliasType = AliasAnalysis::TypeOf($alias);  
                         $stats->Add(aliasAll => 1);  
                         # Is this a recognized type?  
                         if ($aliasType) {  
                             $stats->Add(aliasNormal => 1);  
                             # Yes. Write it normally.  
                             $self->CreateIdentifier($alias, B => $aliasType, $fid);  
                         } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {  
                             # No, but this is a specially-marked locus tag.  
                             $normalized = $1;  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {  
                             # No, but this is a natural locus tag.  
                             $stats->Add(aliasLocus => 1);  
                             $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);  
                         } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {  
                             # No, but this is a natural gene name.  
                             $stats->Add(aliasGene => 1);  
                             $self->CreateIdentifier($normalized, B => 'GENE', $fid);  
                         } elsif ($alias =~ /^\d+$/) {  
                             # Here it's a naked number, which means it's a GI number  
                             # of some sort.  
                             $stats->Add(aliasGI => 1);  
                             $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);  
                         } elsif ($alias =~ /^protein_id\|(.+)/) {  
                             # Here we have a REFSEQ protein ID. Right now we don't have a way to  
                             # handle that, because we don't know the feature's protein ID here.  
                             $stats->Add(aliasProtein => 1);  
                         } elsif ($alias =~ /[:|]/) {  
                             # Here it's an alias of an unknown type, so we skip it.  
                             $stats->Add(aliasUnknown => 1);  
                         } else {  
                             # Here it's a miscellaneous type.  
                             $stats->Add(aliasMisc => 1);  
                             $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);  
                         }  
                     }  
                 }  
623              }              }
624          }          }
625      }      }
626  }  }
627    
628    
629  =head3 LoadProteinData  =head3 LoadProteinData
630    
631      $self->LoadProteinData($fileName);      my $protHash = $self->LoadProteinData($fileName);
632    
633  Load the protein sequences from the named FASTA file. The sequences will be stored  Load the protein sequences from the named FASTA file. The sequences will be stored
634  in the B<ProteinSequence> table and linked to the FIG feature IDs, but the feature  in a hash by FIG feature ID.
 records themselves will not be created (it is presumed they are already there).  
635    
636  =over 4  =over 4
637    
# Line 626  Line 639 
639    
640  Name of the FASTA file containing the protein sequences for this genome.  Name of the FASTA file containing the protein sequences for this genome.
641    
642    =item RETURN
643    
644    Returns a hash mapping feature IDs to protein sequences.
645    
646  =back  =back
647    
648  =cut  =cut
# Line 635  Line 652 
652      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
653      # Open the FASTA file for input.      # Open the FASTA file for input.
654      my $ih = Open(undef, "<$fileName");      my $ih = Open(undef, "<$fileName");
655        # Create the return hash.
656        my $retVal = {};
657      # We'll track the current protein in here.      # We'll track the current protein in here.
658      my $fid;      my $fid;
659      my $sequence = "";      my $sequence = "";
# Line 648  Line 667 
667              my $newFid = $1;              my $newFid = $1;
668              # Do we have an existing protein?              # Do we have an existing protein?
669              if (defined $fid) {              if (defined $fid) {
670                  # Yes. Write it out.                  # Yes. Store it in the hash.
671                  $self->WriteProtein($fid, $sequence);                  $retVal->{$fid} = $sequence;
672              }              }
673              # Initialize for the next protein.              # Initialize for the next protein.
674              $fid = $newFid;              $fid = $newFid;
# Line 661  Line 680 
680      }      }
681      # Do we have a residual protein.      # Do we have a residual protein.
682      if (defined $fid) {      if (defined $fid) {
683          # Yes. Write it out.          # Yes. Store it in the hash.
684          $self->WriteProtein($fid, $sequence);          $retVal->{$fid} = $sequence;
685      }      }
686        # Return the hash.
687        return $retVal;
688  }  }
689    
690    
# Line 689  Line 710 
710  sub LoadAnnotations {  sub LoadAnnotations {
711      # Get the parameters.      # Get the parameters.
712      my ($self, $fileName) = @_;      my ($self, $fileName) = @_;
713      # Get the timestamp hash.      # Get the assignment Hash. We use this to filter out deleted features.
714      my $timeHash = $self->{timestamps};      my $assignHash = $self->{assignHash};
715      # Get the Sapling database.      # Get the Sapling database.
716      my $sap = $self->{sap};      my $sap = $self->{sap};
717      # Get the statistics object.      # Get the statistics object.
# Line 701  Line 722 
722      while (! eof $ih) {      while (! eof $ih) {
723          # Read in the peg, timestamp, and user ID.          # Read in the peg, timestamp, and user ID.
724          my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);          my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
725          # Only proceed if the feature exists.          # Only proceed if the feature is not deleted.
726          if (! exists $timeHash->{$fid}) {          if ($assignHash->{$fid}) {
727              $stats->Add(skippedAnnotation => 1);              # Add the annotation to this feature.
728          } else {              $self->MakeAnnotation($fid, $text, $user, $timestamp);
             # Change assignments by the master user to FIG assignments.  
             $text =~ s/Set master function/Set FIG function/s;  
             # Insure the time stamp is valid.  
             if ($timestamp =~ /^\d+$/) {  
                 # Here it's a number. We need to insure the one we use to form  
                 # the key is unique.  
                 my $keyStamp = $timestamp;  
                 while ($timeHash->{$fid}{$keyStamp}) {  
                     $keyStamp++;  
                     $stats->Add(skippedStamp => 1);  
                 }  
                 # Form the annotation ID.  
                 my $annotationID = SaplingDataLoader::ComputeAnnotationID($fid, $keyStamp);  
                 $timeHash->{$fid}{$keyStamp} = 1;  
                 # Generate the annotation.  
                 $sap->InsertObject('IsAnnotatedBy', from_link => $fid, to_link => $annotationID);  
                 $sap->InsertObject('Annotation', id => $annotationID, annotation_time => $timestamp,  
                             comment => $text, annotator => $user);  
             } else {  
                 # Here we have an invalid time stamp.  
                 Trace("Invalid time stamp \"$timestamp\" in annotations for $fid.") if T(1);  
             }  
729          }          }
730      }      }
731  }  }
# Line 766  Line 765 
765    
766  =head3 LoadSubsystems  =head3 LoadSubsystems
767    
768      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems($subsysList);
769    
770    Load the subsystem data into the database. This requires looking through the
771    bindings and using them to connect to the subsystems that already exist in the
772    database. If the subsystem does not exist, its bindings will not be loaded.
773    
774    =over 4
775    
776    =item subsysList
777    
778  Load the subsystem data into the database. This includes all of the subsystems,  Reference to a list of subsystem IDs. If specified, only the subsystems in the
779  their categories, roles, and the bindings that connect them to this genome's features.  list will be processed. This is useful when a particular set of subsystems
780    has been replaced.
781    
782    =back
783    
784  =cut  =cut
785    
786  sub LoadSubsystems {  sub LoadSubsystems {
787      # Get the parameters.      # Get the parameters.
788      my ($self) = @_;      my ($self, $subsysList) = @_;
789      # Get the sapling object.      # Get the sapling object.
790      my $sap = $self->{sap};      my $sap = $self->{sap};
791      # Get the statistics object.      # Get the statistics object.
792      my $stats = $self->{stats};      my $stats = $self->{stats};
793      # Get the genome ID.      # Get the genome ID.
794      my $genome = $self->{genome};      my $genome = $self->{genome};
     # Connect to the Sapling server so we can get the subsystem variant and role information.  
     my $sapO = SAPserver->new();  
795      # This hash maps subsystem IDs to molecular machine IDs.      # This hash maps subsystem IDs to molecular machine IDs.
796      my %machines;      my %machines;
797      # This hash maps subsystem/role pairs to machine role IDs.      # This hash maps subsystem/role pairs to machine role IDs.
798      my %machineRoles;      my %machineRoles;
799      # The first step is to create the subsystems themselves and connect them to the      # This hash will contain the list of subsystems found in the database.
800      # genome. A list is given in the subsystems file of the Subsystems directory.      my %subsystems;
801        # We loop through the subsystems, looking for the ones already in the
802        # database. The list is given in the subsystems file of the Subsystems
803        # directory.
804      my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");      my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
     # Create the field hash for the subsystem query.  
     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)  
                     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)  
                     Subsystem(version)  
                     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)  
                     Includes(sequence)  
                     Role(id) Role(hypothetical) Role(role-index));  
     my %fields = map { $_ => $_ } @fields;  
805      # Loop through the subsystems in the file, insuring we have them in the database.      # Loop through the subsystems in the file, insuring we have them in the database.
806      while (! eof $ih) {      while (! eof $ih) {
807          # Get this subsystem.          # Get this subsystem.
808          my ($subsystem, $variant) = Tracer::GetLine($ih);          my ($subsystem, $variant) = Tracer::GetLine($ih);
809            Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
810          # Normalize the subsystem name.          # Normalize the subsystem name.
811          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
812            # Insure the subsystem is in the database and is one we're interested in.
813            if ((! $subsysList || (grep { $_ eq $subsystem } @$subsysList)) &&
814                    $sap->Exists(Subsystem => $subsystem)) {
815                # We have the subsystem in the database. We need to compute the machine
816                # role IDs and create the molecular machine. First, we need to remember
817                # the subsystem.
818                $subsystems{$subsystem} = 1;
819          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
820          my $subsystemMD5 = ERDB::DigestKey($subsystem);          my $subsystemMD5 = ERDB::DigestKey($subsystem);
         # Insure the subsystem is in the database.  
         if (! $sap->Exists(Subsystem => $subsystem)) {  
             # The subsystem isn't present, so we need to read it. We get the subsystem,  
             # its roles, and its classification information. The variant will be added  
             # later if we need it.  
             my $roleList = $sapO->get(-objects => "Subsystem Includes Role",  
                                       -filter => {"Subsystem(id)" => ["=", $subsystem] },  
                                       -fields => \%fields);  
             # Only proceed if we found some roles.  
             if (@$roleList > 0) {  
                 # Get the subsystem information from the first role and create the subsystem.  
                 my $roleH = $roleList->[0];  
                 my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);  
                 $sap->InsertObject('Subsystem', %subFields);  
                 # Now loop through the roles. The Includes records are always inserted, but the  
                 # roles are only inserted if they don't already exist.  
                 for $roleH (@$roleList) {  
                     # Create the Includes record.  
                     my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);  
                     $sap->InsertObject('Includes', %incFields);  
                     # Insure we have the role in place.  
                     my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);  
                     my $roleID = $roleFields{id};  
                     delete $roleFields{id};  
                     $self->InsureEntity('Role', $roleID, %roleFields);  
                     # Compute the machine-role ID.  
                     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});  
                     $machineRoles{$subsystem}{$roleID} = $machineRoleID;  
                 }  
             }  
         } else {  
             # We already have the subsystem in the database, but we still need to compute the  
             # machine role IDs. We need information from the sapling server for this.  
             my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);  
821              # Loop through the roles.              # Loop through the roles.
822              my $roleList = $subHash->{$subsystem};              my @roleList = $sap->GetAll('Includes', 'Includes(from-link) = ?',
823              for my $roleTuple (@$roleList) {                      [$subsystem], "to-link abbreviation");
824                for my $roleTuple (@roleList) {
825                  my ($roleID, $abbr) = @$roleTuple;                  my ($roleID, $abbr) = @$roleTuple;
826                  my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);                  my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
827                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;
828              }              }
829          }              # Next we need the variant code and key.
830          # Now we need to connect the variant to the subsystem. First, we compute the real              my $variantCode = BaseSaplingLoader::Starless($variant);
         # variant code by removing the asterisks.  
         my $variantCode = $variant;  
         $variantCode =~ s/^\*+//;  
         $variantCode =~ s/\*+$//;  
         # Compute the variant key.  
831          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
832          # Get the variant from the sapling server.              # Now we create the molecular machine connecting this genome to the
833          my $variantH = $sapO->get(-objects => "Variant",              # subsystem variant.
                                   -filter => {"Variant(id)" => ["=", $variantKey]},  
                                   -fields => {"Variant(code)" => "code",  
                                               "Variant(comment)" => "comment",  
                                               "Variant(type)" => "type",  
                                               "Variant(role-rule)" => "role-rule"},  
                                   -firstOnly => 1);  
         # Insure we have it in the database.  
         $self->InsureEntity('Variant', $variantKey, %$variantH);  
         $stats->Add(subsystems => 1);  
         # Connect it to the subsystem.  
         $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);  
         # Now we create the molecular machine connecting this genome to the subsystem  
         # variant.  
834          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
835          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
836          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
# Line 878  Line 838 
838          # Remember the machine ID.          # Remember the machine ID.
839          $machines{$subsystem} = $machineID;          $machines{$subsystem} = $machineID;
840      }      }
841      # Now we go through the bindings file. This file connects the subsystem roles to      }
842      # the molecular machines.      # Now we go through the bindings file. This file connects the subsystem
843        # roles to the molecular machines.
844      $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");      $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
845      # Loop through the bindings.      # Loop through the bindings.
846      while (! eof $ih) {      while (! eof $ih) {
# Line 887  Line 848 
848          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
849          # Normalize the subsystem name.          # Normalize the subsystem name.
850          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
851            # Insure the subsystem is in the database.
852            if ($subsystems{$subsystem}) {
853          # Compute the machine role.          # Compute the machine role.
854          my $machineRoleID = $machineRoles{$subsystem}{$role};          my $machineRoleID = $machineRoles{$subsystem}{$role};
855          # Insure it exists.          # Insure it exists.
# Line 902  Line 865 
865          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
866      }      }
867  }  }
868    }
869    
870  =head3 CreateGenome  =head3 CreateGenome
871    
# Line 959  Line 923 
923      $fields{'dna-size'} = $stats->Ask('dna');      $fields{'dna-size'} = $stats->Ask('dna');
924      $fields{pegs} = $stats->Ask('peg');      $fields{pegs} = $stats->Ask('peg');
925      $fields{rnas} = $stats->Ask('rna');      $fields{rnas} = $stats->Ask('rna');
926        $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
927      # Get the genetic code. The default is 11.      # Get the genetic code. The default is 11.
928      $fields{'genetic-code'} = 11;      $fields{'genetic-code'} = 11;
929      my $geneticCodeFile = "$dir/GENETIC_CODE";      my $geneticCodeFile = "$dir/GENETIC_CODE";
# Line 1066  Line 1031 
1031      $self->{stats}->Add(segment => 1);      $self->{stats}->Add(segment => 1);
1032  }  }
1033    
 =head3 CreateIdentifier  
   
     $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);  
   
 Link an identifier to a feature. The identifier is presented in prefixed form and is of the  
 specified type and the specified confidence level.  
   
 =over 4  
   
 =item alias  
   
 Identifier to connect to the feature.  
   
 =item conf  
   
 Confidence level (C<A> curated, C<B> normal, C<C> protein only).  
   
 =item aliasType  
   
 Type of alias (e.g. C<NCBI>, C<LocusTag>).  
   
 =item fid  
   
 ID of the relevant feature.  
   
 =back  
   
 =cut  
   
 sub CreateIdentifier {  
     # Get the parameters.  
     my ($self, $alias, $conf, $aliasType, $fid) = @_;  
     # Get the Sapling object.  
     my $sap = $self->{sap};  
     # Compute the identifier's natural form.  
     my $natural = $alias;  
     if ($natural =~ /[:|](.+)/) {  
         $natural = $1;  
     }  
     # Insure the identifier exists in the database.  
     $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);  
     # Connect the identifier to the feature.  
     $sap->InsertObject('IsIdentifiedBy', to_link => $alias, from_link => $fid, conf => $conf);  
 }  
   
1034  =head2 Internal Utility Methods  =head2 Internal Utility Methods
1035    
1036  =head3 ReadAnnotation  =head3 ReadAnnotation
# Line 1145  Line 1065 
1065      # Loop through the lines of the text field.      # Loop through the lines of the text field.
1066      my $text = "";      my $text = "";
1067      my $line = <$ih>;      my $line = <$ih>;
1068      while ($line ne "//\n") {      while (defined($line) && $line ne "//\n") {
1069          $text .= $line;          $text .= $line;
1070          $line = <$ih>;          $line = <$ih>;
1071      }      }

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.17

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3