[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Tue Jan 11 15:04:03 2011 UTC revision 1.8, Fri Apr 1 20:48:01 2011 UTC
# Line 26  Line 26 
26      use SAPserver;      use SAPserver;
27      use Sapling;      use Sapling;
28      use AliasAnalysis;      use AliasAnalysis;
29        use base qw(SaplingDataLoader);
30    
31  =head1 Sapling Genome Loader  =head1 Sapling Genome Loader
32    
# Line 71  Line 72 
72      # Load the features.      # Load the features.
73      Trace("Loading features for $genome.") if T(2);      Trace("Loading features for $genome.") if T(2);
74      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
75        # Check for annotation history. If we have it, load the history records into the
76        # database.
77        if (-f "$directory/annotations") {
78            Trace("Processing annotations.") if T(3);
79            $loaderObject->LoadAnnotations("$directory/annotations");
80        }
81      # Load the subsystem bindings.      # Load the subsystem bindings.
82      Trace("Loading subsystems for $genome.") if T(2);      Trace("Loading subsystems for $genome.") if T(2);
83      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
# Line 112  Line 119 
119      # Create the statistics object.      # Create the statistics object.
120      my $stats = Stats->new();      my $stats = Stats->new();
121      # Delete the DNA.      # Delete the DNA.
122      DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');
123      # Delete the contigs.      # Delete the contigs.
124      DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
125      # Delete the features.      # Delete the features.
126      DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
127      # Delete the molecular machines.      # Delete the molecular machines.
128      DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
129      # Delete the genome itself.      # Delete the genome itself.
130      my $subStats = $sap->Delete(Genome => $genome);      my $subStats = $sap->Delete(Genome => $genome);
131      # Accumulate the statistics from the delete.      # Accumulate the statistics from the delete.
# Line 127  Line 134 
134      return $stats;      return $stats;
135  }  }
136    
137    
138    =head3 Process
139    
140        my $stats = SaplingGenomeLoader::Process($sap, $genome, $directory);
141    
142    Load genome data from the specified directory. If the genome data already
143    exists in the database, it will be deleted first.
144    
145    =over 4
146    
147    =item sap
148    
149    L</Sapling> object for accessing the database.
150    
151    =item genome
152    
153    ID of the genome whose  data is being loaded.
154    
155    =item directory
156    
157    Name of the directory containing the genome data files.
158    
159    =item RETURN
160    
161    Returns a statistics object describing the activity during the reload.
162    
163    =back
164    
165    =cut
166    
167    sub Process {
168        # Get the parameters.
169        my ($sap, $genome, $directory) = @_;
170        # Clear the existing data for the specified genome.
171        my $stats = ClearGenome($sap, $genome);
172        # Load the new expression data from the specified directory.
173        my $newStats = Load($sap, $genome, $directory);
174        # Merge the statistics.
175        $stats->Accumulate($newStats);
176        # Return the result.
177        return $stats;
178    }
179    
180    
181  =head2 Loader Object Methods  =head2 Loader Object Methods
182    
183  =head3 new  =head3 new
# Line 176  Line 227 
227    
228  L<Stats> object for tracking statistical information about the load.  L<Stats> object for tracking statistical information about the load.
229    
230    =item timestamps
231    
232    A hash of hashes, keyed by feature ID. The sub-hashes are keyed by annotation timestamp,
233    and used to prevent duplicate timestamps.
234    
235  =back  =back
236    
237  =cut  =cut
# Line 184  Line 240 
240      # Get the parameters.      # Get the parameters.
241      my ($class, $sap, $genome, $directory) = @_;      my ($class, $sap, $genome, $directory) = @_;
242      # Create the object.      # Create the object.
243      my $retVal = {      my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
244          sap => $sap,      # Add our specialized data.
245          genome => $genome,      $retVal->{genome} = $genome;
246          directory => $directory,      $retVal->{directory} = $directory;
247          stats => Stats->new(qw(contigs dna pegs rnas)),      $retVal->{timestamps} = {};
248          supportRecords => {}      # Return the result.
     };  
     # Bless and return it.  
     bless $retVal, $class;  
249      return $retVal;      return $retVal;
250  }  }
251    
# Line 360  Line 413 
413  sub LoadFeatures {  sub LoadFeatures {
414      # Get the parameters.      # Get the parameters.
415      my ($self) = @_;      my ($self) = @_;
416        # Read in the functional assignments.
417        Trace("Reading functional assignments.") if T(3);
418        my $assignHash = $self->ReadAssignments();
419      # Get the directory of feature types.      # Get the directory of feature types.
420      my $featureDir = "$self->{directory}/Features";      my $featureDir = "$self->{directory}/Features";
421      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);      my @types = Tracer::OpenDir("$self->{directory}/Features", 1);
# Line 368  Line 424 
424          # Insure this is a genuine feature directory.          # Insure this is a genuine feature directory.
425          if (-f "$featureDir/$type/tbl") {          if (-f "$featureDir/$type/tbl") {
426              # Yes, load the feature data.              # Yes, load the feature data.
427              $self->LoadFeatureData($featureDir, $type);              $self->LoadFeatureData($featureDir, $type, $assignHash);
428          }          }
429      }      }
430      # Check for protein sequences. If we have some, load them into the database.      # Check for protein sequences. If we have some, load them into the database.
431      if (-f "$featureDir/peg/fasta") {      if (-f "$featureDir/peg/fasta") {
432            Trace("Processing protein sequences.") if T(3);
433          $self->LoadProteinData("$featureDir/peg/fasta");          $self->LoadProteinData("$featureDir/peg/fasta");
434      }      }
435        # Now loop through the features, connecting them to their roles. Note that deleted
436        # features will not be in the assignment hash.
437        Trace("Connecting features to roles.") if T(3);
438        for my $fid (keys %$assignHash) {
439            $self->ConnectFunctionRoles($fid, $assignHash->{$fid});
440        }
441  }  }
442    
443  =head3 LoadFeatureData  =head3 LoadFeatureData
444    
445      $self->LoadFeatureData($featureDir, $type);      $loaderObject->LoadFeatureData($featureDir, $type, $assignHash);
446    
447  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
448  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 394  Line 457 
457    
458  Type of feature to load.  Type of feature to load.
459    
460    =item assignHash
461    
462    Reference to a hash mapping each feature ID to its functional assignment.
463    
464  =back  =back
465    
466  =cut  =cut
467    
468  sub LoadFeatureData {  sub LoadFeatureData {
469      # Get the parameters.      # Get the parameters.
470      my ($self, $featureDir, $type) = @_;      my ($self, $featureDir, $type, $assignHash) = @_;
471      # Get the sapling database.      # Get the sapling database.
472      my $sap = $self->{sap};      my $sap = $self->{sap};
473      # Get the maximum location  segment length. We'll need this later.      # Get the maximum location  segment length. We'll need this later.
474      my $maxLength = $sap->TuningParameter('maxLocationLength');      my $maxLength = $sap->TuningParameter('maxLocationLength');
475      # Get the statistics object.      # Get the statistics object.
476      my $stats = $self->{stats};      my $stats = $self->{stats};
     # Read in the functional assignments.  
     my $assignHash = $self->ReadAssignments();  
477      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
478      # time, it overwrites the original.      # time, it overwrites the original.
479      my %fids;      my $fidHash = $self->{timestamps};
480        # Finally, we need the timestamp hash. The initial feature population
481      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
482      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
483      if (-f $fileName) {      if (-f $fileName) {
484          # We have one, so open it for input.          # We have one, so we can read through it. First, however, we need to get the list
485            # of deleted features.
486            my %deletedFids;
487            my $deleteFile = "$featureDir/$type/deleted.features";
488            if (-f $deleteFile) {
489                %deletedFids = map { $_ => 1 } Tracer::GetFile($deleteFile);
490            }
491            # Open the main file for input.
492            Trace("Reading features from $fileName.") if T(3);
493          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
494          while (! eof $ih) {          while (! eof $ih) {
495              # Read this feature's information.              # Read this feature's information.
496              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
497              # If the feature already exists, delete it.              # Only proceed if the feature is NOT deleted.
498              if (exists $fids{$fid}) {              if (! exists $deletedFids{$fid}) {
499                    # If the feature already exists, delete it. (This should be extremely rare.)
500                    if (exists $fidHash->{$fid}) {
501                  $sap->Delete(Feature => $fid);                  $sap->Delete(Feature => $fid);
502                  $stats->Add(duplicateFid => 1);                  $stats->Add(duplicateFid => 1);
503              }              }
# Line 455  Line 531 
531              $sap->InsertObject('Feature', id => $fid, feature_type => $type,              $sap->InsertObject('Feature', id => $fid, feature_type => $type,
532                                 function => $assignHash->{$fid}, locked => 0,                                 function => $assignHash->{$fid}, locked => 0,
533                                 sequence_length => $length);                                 sequence_length => $length);
534              $fids{$fid} = 1;                  $fidHash->{$fid} = {};
535              $stats->Add($type => 1);              $stats->Add($type => 1);
536              # The next step is to connect the feature record to its locations. This              # The next step is to connect the feature record to its locations. This
537              # involves dividing the location into segments. The following variable              # involves dividing the location into segments. The following variable
# Line 534  Line 610 
610          }          }
611      }      }
612  }  }
613    }
614    
615  =head3 LoadProteinData  =head3 LoadProteinData
616    
# Line 589  Line 666 
666      }      }
667  }  }
668    
669    
670    =head3 LoadAnnotations
671    
672        $loaderObject->LoadAnnotations($fileName);
673    
674    Read in the annotation history information and use it to create annotation records.
675    
676    =over 4
677    
678    =item fileName
679    
680    Name of the annotation history file. This file is formatted with four fields per
681    record. Each field is on a separate line, with a double slash (C<//>) used as the
682    line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
683    (formatted as an integer), (2) the user name, and (3) the annotation text.
684    
685    =back
686    
687    =cut
688    
689    sub LoadAnnotations {
690        # Get the parameters.
691        my ($self, $fileName) = @_;
692        # Get the timestamp hash.
693        my $timeHash = $self->{timestamps};
694        # Get the Sapling database.
695        my $sap = $self->{sap};
696        # Get the statistics object.
697        my $stats = $self->{stats};
698        # Open the input file.
699        my $ih = Tracer::Open(undef, "<$fileName");
700        # Loop through the input.
701        while (! eof $ih) {
702            # Read in the peg, timestamp, and user ID.
703            my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
704            # Only proceed if the feature exists.
705            if (! exists $timeHash->{$fid}) {
706                $stats->Add(skippedAnnotation => 1);
707            } else {
708                # Change assignments by the master user to FIG assignments.
709                $text =~ s/Set master function/Set FIG function/s;
710                # Insure the time stamp is valid.
711                if ($timestamp =~ /^\d+$/) {
712                    # Here it's a number. We need to insure the one we use to form
713                    # the key is unique.
714                    my $keyStamp = $timestamp;
715                    while ($timeHash->{$fid}{$keyStamp}) {
716                        $keyStamp++;
717                        $stats->Add(skippedStamp => 1);
718                    }
719                    # Form the annotation ID.
720                    my $annotationID = SaplingDataLoader::ComputeAnnotationID($fid, $keyStamp);
721                    $timeHash->{$fid}{$keyStamp} = 1;
722                    # Generate the annotation.
723                    $sap->InsertObject('IsAnnotatedBy', from_link => $fid, to_link => $annotationID);
724                    $sap->InsertObject('Annotation', id => $annotationID, annotation_time => $timestamp,
725                                comment => $text, annotator => $user);
726                } else {
727                    # Here we have an invalid time stamp.
728                    Trace("Invalid time stamp \"$timestamp\" in annotations for $fid.") if T(1);
729                }
730            }
731        }
732    }
733    
734    
735  =head3 WriteProtein  =head3 WriteProtein
736    
737      $loaderObject->WriteProtein($fid, $sequence);      $loaderObject->WriteProtein($fid, $sequence);
# Line 614  Line 757 
757      # Get the parameters.      # Get the parameters.
758      my ($self, $fid, $sequence) = @_;      my ($self, $fid, $sequence) = @_;
759      # Compute the key of the protein sequence.      # Compute the key of the protein sequence.
760      my $protID = ERDB::DigestKey($sequence);      my $protID = $self->{sap}->ProteinID($sequence);
761      # Insure the protein exists.      # Insure the protein exists.
762      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);      $self->InsureEntity(ProteinSequence => $protID, sequence => $sequence);
763      # Connect the feature to it.      # Connect the feature to it.
764      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
765  }  }
766    
 =head3 InsureEntity  
   
     my $createdFlag = $loaderObject->InsureEntity($entityType => $id, %fields);  
   
 Insure that the specified record exists in the database. If no record is found of the  
 specified type with the specified ID, one will be created with the indicated fields.  
   
 =over 4  
   
 =item $entityType  
   
 Type of entity to check.  
   
 =item id  
   
 ID of the entity instance in question.  
   
 =item fields  
   
 Hash mapping field names to values for all the fields in the desired entity record except  
 for the ID.  
   
 =item RETURN  
   
 Returns TRUE if a new object was created, FALSE if it already existed.  
   
 =back  
   
 =cut  
   
 sub InsureEntity {  
     # Get the parameters.  
     my ($self, $entityType, $id, %fields) = @_;  
     # Get the database.  
     my $sap = $self->{sap};  
     # Get the support record ID hash.  
     my $supportHash = $self->{supportRecords};  
     # Denote we haven't created a new record.  
     my $retVal = 0;  
     # Get the sub-hash for this entity type.  
     my $entityHash = $supportHash->{$entityType};  
     if (! defined $entityHash) {  
         $entityHash = {};  
         $supportHash->{$entityType} = $entityHash;  
     }  
     # Check for this instance.  
     if (! $entityHash->{$id}) {  
         # It's not found. Check the database.  
         if (! $sap->Exists($entityType => $id)) {  
             # It's not in the database either, so create it.  
             $sap->InsertObject($entityType, id => $id, %fields);  
             $self->{stats}->Add(insertSupport => 1);  
             $retVal = 1;  
         }  
         # Mark the record in the hash so we know we have it.  
         $entityHash->{$id} = 1;  
     }  
     # Return the insertion indicator.  
     return $retVal;  
 }  
   
767  =head3 LoadSubsystems  =head3 LoadSubsystems
768    
769      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
# Line 737  Line 819 
819              if (@$roleList > 0) {              if (@$roleList > 0) {
820                  # Get the subsystem information from the first role and create the subsystem.                  # Get the subsystem information from the first role and create the subsystem.
821                  my $roleH = $roleList->[0];                  my $roleH = $roleList->[0];
822                  my %subFields = ExtractFields(Subsystem => $roleH);                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
823                  $sap->InsertObject('Subsystem', %subFields);                  $sap->InsertObject('Subsystem', %subFields);
824                  # Now loop through the roles. The Includes records are always inserted, but the                  # Now loop through the roles. The Includes records are always inserted, but the
825                  # roles are only inserted if they don't already exist.                  # roles are only inserted if they don't already exist.
826                  for $roleH (@$roleList) {                  for $roleH (@$roleList) {
827                      # Create the Includes record.                      # Create the Includes record.
828                      my %incFields = ExtractFields(Includes => $roleH);                      my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);
829                      $sap->InsertObject('Includes', %incFields);                      $sap->InsertObject('Includes', %incFields);
830                      # Insure we have the role in place.                      # Insure we have the role in place.
831                      my %roleFields = ExtractFields(Role => $roleH);                      my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);
832                      my $roleID = $roleFields{id};                      my $roleID = $roleFields{id};
833                      delete $roleFields{id};                      delete $roleFields{id};
834                      $self->InsureEntity('Role', $roleID, %roleFields);                      $self->InsureEntity('Role', $roleID, %roleFields);
# Line 984  Line 1066 
1066      $self->{stats}->Add(segment => 1);      $self->{stats}->Add(segment => 1);
1067  }  }
1068    
 =head2 Internal Utility Methods  
   
 =head3 DeleteRelatedRecords  
   
     DeleteRelatedRecords($sap, $genome, $stats, $relName, $entityName);  
   
 Delete all the records in the named entity and relationship relating to the  
 specified genome and roll up the statistics in the specified statistics object.  
   
 =over 4  
   
 =item sap  
   
 L<Sapling> object for accessing the database.  
   
 =item genome  
   
 ID of the relevant genome.  
   
 =item stats  
   
 L<Stats> object for tracking the delete activity.  
   
 =item relName  
   
 Name of a relationship from the B<Genome> table.  
   
 =item entityName  
   
 Name of the entity on the other side of the relationship.  
   
 =back  
   
 =cut  
   
 sub DeleteRelatedRecords {  
     # Get the parameters.  
     my ($sap, $genome, $stats, $relName, $entityName) = @_;  
     # Get all the relationship records.  
     my (@targets) = $sap->GetFlat($relName, "$relName(from-link) = ?", [$genome],  
                                   "to-link");  
     # Loop through the relationship records, deleting them and the target entity  
     # records.  
     for my $target (@targets) {  
         # Delete the relationship instance.  
         $sap->DeleteRow($relName, $genome, $target);  
         $stats->Add($relName => 1);  
         # Delete the entity instance.  
         my $subStats = $sap->Delete($entityName, $target);  
         # Roll up the statistics.  
         $stats->Accumulate($subStats);  
     }  
 }  
   
 =head3 ExtractFields  
   
     my %fieldHash = SaplingGenomeLoader::ExtractFields($tableName, $dataHash);  
   
 Extract from the incoming hash the field names and values from the specified table.  
   
 =over 4  
   
 =item tableName  
   
 Name of the table whose field names and values are desired.  
   
 =item dataHash  
   
 Reference to a hash mapping fully-qualified ERDB field names to values.  
   
 =item RETURN  
   
 Returns a hash containing only the fields from the specified table and their values.  
   
 =back  
   
 =cut  
   
 sub ExtractFields {  
     # Get the parameters.  
     my ($tableName, $dataHash) = @_;  
     # Declare the return variable.  
     my %retVal;  
     # Extract the desired fields.  
     for my $field (keys %$dataHash) {  
         # Is this a field for the specified table?  
         if ($field =~ /^$tableName\(([^)]+)/) {  
             # Yes, put it in the output hash.  
             $retVal{$1} = $dataHash->{$field};  
         }  
     }  
     # Return the computed hash.  
     return %retVal;  
 }  
   
1069  =head3 CreateIdentifier  =head3 CreateIdentifier
1070    
1071      $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);      $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);
# Line 1121  Line 1108 
1108      # Insure the identifier exists in the database.      # Insure the identifier exists in the database.
1109      $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);      $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);
1110      # Connect the identifier to the feature.      # Connect the identifier to the feature.
1111      $sap->InsertObject('Identifies', from_link => $alias, to_link => $fid, conf => $conf);      $sap->InsertObject('IsIdentifiedBy', to_link => $alias, from_link => $fid, conf => $conf);
1112    }
1113    
1114    =head2 Internal Utility Methods
1115    
1116    =head3 ReadAnnotation
1117    
1118        my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1119    
1120    Read the next record from an annotation file. The next record must exist (that is, an
1121    end-of-file check should have been performed before calling this method).
1122    
1123    =over 4
1124    
1125    =item ih
1126    
1127    Open file handle for the annotation file.
1128    
1129    =item RETURN
1130    
1131    Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1132    timestamp, (2) the user ID, and (3) the annotation text.
1133    
1134    =back
1135    
1136    =cut
1137    
1138    sub ReadAnnotation {
1139        # Get the parameter.
1140        my ($ih) = @_;
1141        # Read the three fixed fields.
1142        my $fid = <$ih>; chomp $fid;
1143        my $timestamp = <$ih>; chomp $timestamp;
1144        my $user = <$ih>; chomp $user;
1145        # Loop through the lines of the text field.
1146        my $text = "";
1147        my $line = <$ih>;
1148        while ($line ne "//\n") {
1149            $text .= $line;
1150            $line = <$ih>;
1151        }
1152        # Remove the trailing new-line from the text.
1153        chomp $text;
1154        # Return the fields.
1155        return ($fid, $timestamp, $user, $text);
1156  }  }
1157    
1158  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.8

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3