[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Tue Dec 14 19:48:38 2010 UTC revision 1.6, Sat Feb 26 19:05:32 2011 UTC
# Line 25  Line 25 
25      use SeedUtils;      use SeedUtils;
26      use SAPserver;      use SAPserver;
27      use Sapling;      use Sapling;
28        use AliasAnalysis;
29        use base qw(SaplingDataLoader);
30    
31  =head1 Sapling Genome Loader  =head1 Sapling Genome Loader
32    
# Line 65  Line 67 
67      # Create the loader object.      # Create the loader object.
68      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
69      # Load the contigs.      # Load the contigs.
70        Trace("Loading contigs for $genome.") if T(2);
71      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
72      # Load the features.      # Load the features.
73        Trace("Loading features for $genome.") if T(2);
74      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
75      # Load the subsystem bindings.      # Load the subsystem bindings.
76        Trace("Loading subsystems for $genome.") if T(2);
77      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
78      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
79        Trace("Creating root for $genome.") if T(2);
80      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
81      # Return the statistics.      # Return the statistics.
82      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 107  Line 113 
113      # Create the statistics object.      # Create the statistics object.
114      my $stats = Stats->new();      my $stats = Stats->new();
115      # Delete the DNA.      # Delete the DNA.
116      DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');
117      # Delete the contigs.      # Delete the contigs.
118      DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
119      # Delete the features.      # Delete the features.
120      DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
121      # Delete the molecular machines.      # Delete the molecular machines.
122      DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
123      # Delete the genome itself.      # Delete the genome itself.
124      my $subStats = $sap->Delete(Genome => $genome);      my $subStats = $sap->Delete(Genome => $genome);
125      # Accumulate the statistics from the delete.      # Accumulate the statistics from the delete.
# Line 171  Line 177 
177    
178  L<Stats> object for tracking statistical information about the load.  L<Stats> object for tracking statistical information about the load.
179    
180    =item timestamps
181    
182    A hash of hashes, keyed by feature ID. The sub-hashes are keyed by annotation timestamp,
183    and used to prevent duplicate timestamps.
184    
185  =back  =back
186    
187  =cut  =cut
# Line 179  Line 190 
190      # Get the parameters.      # Get the parameters.
191      my ($class, $sap, $genome, $directory) = @_;      my ($class, $sap, $genome, $directory) = @_;
192      # Create the object.      # Create the object.
193      my $retVal = {      my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
194          sap => $sap,      # Add our specialized data.
195          genome => $genome,      $retVal->{genome} = $genome;
196          directory => $directory,      $retVal->{directory} = $directory;
197          stats => Stats->new(qw(contigs dna pegs rnas)),      $retVal->{timestamps} = {};
198          supportRecords => {}      # Return the result.
     };  
     # Bless and return it.  
     bless $retVal, $class;  
199      return $retVal;      return $retVal;
200  }  }
201    
# Line 300  Line 308 
308      # Compute the chunk ID.      # Compute the chunk ID.
309      my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');      my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
310      # Connect this sequence to the contig.      # Connect this sequence to the contig.
311      $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunk);      $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
312      # Create the DNA sequence.      # Create the DNA sequence.
313      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
314      # Record the chunk.      # Record the chunk.
# Line 370  Line 378 
378      if (-f "$featureDir/peg/fasta") {      if (-f "$featureDir/peg/fasta") {
379          $self->LoadProteinData("$featureDir/peg/fasta");          $self->LoadProteinData("$featureDir/peg/fasta");
380      }      }
381        # Check for annotation history. If we have it, load the history records into the
382        # database.
383        if (-f "$featureDir/annotations") {
384            $self->LoadAnnotations("$featureDir/annotations");
385        }
386  }  }
387    
388  =head3 LoadFeatureData  =head3 LoadFeatureData
389    
390      $self->LoadFeatureData($featureDir, $type);      $loaderObject->LoadFeatureData($featureDir, $type);
391    
392  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
393  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 406  Line 419 
419      my $assignHash = $self->ReadAssignments();      my $assignHash = $self->ReadAssignments();
420      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
421      # time, it overwrites the original.      # time, it overwrites the original.
422      my %fids;      my $fidHash = $self->{timestamps};
423        # Finally, we need the timestamp hash. The initial feature population
424      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
425      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
426      if (-f $fileName) {      if (-f $fileName) {
427          # We have one, so open it for input.          # We have one, so we can read through it. First, however, we need to get the list
428            # of deleted features.
429            my %deletedFids;
430            my $deleteFile = "$featureDir/$type/deleted.features";
431            if (-f $deleteFile) {
432                %deletedFids = map { $_ => 1 } Tracer::GetFile($deleteFile);
433            }
434            # Open the main file for input.
435          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
436          while (! eof $ih) {          while (! eof $ih) {
437              # Read this feature's information.              # Read this feature's information.
438              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
439              # If the feature already exists, delete it.              # Only proceed if the feature is NOT deleted.
440              if (exists $fids{$fid}) {              if (! exists $deletedFids{$fid}) {
441                    # If the feature already exists, delete it. (This should be extremely rare.)
442                    if (exists $fidHash->{$fid}) {
443                  $sap->Delete(Feature => $fid);                  $sap->Delete(Feature => $fid);
444                  $stats->Add(duplicateFid => 1);                  $stats->Add(duplicateFid => 1);
             } else {  
                 # Otherwise connect it to the genome.  
                 $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);  
445              }              }
446                    # Otherwise connect this feature to the genome.
447                    $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);
448              # Now we must parse the locations. This will contain a list of the location              # Now we must parse the locations. This will contain a list of the location
449              # data 4-tuples (contig, start, dir, len).              # data 4-tuples (contig, start, dir, len).
450              my @locData;              my @locData;
# Line 451  Line 473 
473              $sap->InsertObject('Feature', id => $fid, feature_type => $type,              $sap->InsertObject('Feature', id => $fid, feature_type => $type,
474                                 function => $assignHash->{$fid}, locked => 0,                                 function => $assignHash->{$fid}, locked => 0,
475                                 sequence_length => $length);                                 sequence_length => $length);
476              $fids{$fid} = 1;                  $fidHash->{$fid} = {};
477              $stats->Add($type => 1);              $stats->Add($type => 1);
478              # The next step is to connect the feature record to its locations. This              # The next step is to connect the feature record to its locations. This
479              # involves dividing the location into segments. The following variable              # involves dividing the location into segments. The following variable
# Line 482  Line 504 
504                  # Output the last segment.                  # Output the last segment.
505                  $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);                  $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
506              }              }
507                    # Now we process the aliases and create the identifiers. We don't do this
508                    # for RNA, because the RNA function is stored in the aliases.
509                    if ($type ne 'rna') {
510                        for my $alias (@aliases) {
511                            my $normalized;
512                            # Determine the type.
513                            my $aliasType = AliasAnalysis::TypeOf($alias);
514                            $stats->Add(aliasAll => 1);
515                            # Is this a recognized type?
516                            if ($aliasType) {
517                                $stats->Add(aliasNormal => 1);
518                                # Yes. Write it normally.
519                                $self->CreateIdentifier($alias, B => $aliasType, $fid);
520                            } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {
521                                # No, but this is a specially-marked locus tag.
522                                $normalized = $1;
523                                $stats->Add(aliasLocus => 1);
524                                $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
525                            } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {
526                                # No, but this is a natural locus tag.
527                                $stats->Add(aliasLocus => 1);
528                                $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
529                            } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {
530                                # No, but this is a natural gene name.
531                                $stats->Add(aliasGene => 1);
532                                $self->CreateIdentifier($normalized, B => 'GENE', $fid);
533                            } elsif ($alias =~ /^\d+$/) {
534                                # Here it's a naked number, which means it's a GI number
535                                # of some sort.
536                                $stats->Add(aliasGI => 1);
537                                $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);
538                            } elsif ($alias =~ /^protein_id\|(.+)/) {
539                                # Here we have a REFSEQ protein ID. Right now we don't have a way to
540                                # handle that, because we don't know the feature's protein ID here.
541                                $stats->Add(aliasProtein => 1);
542                            } elsif ($alias =~ /[:|]/) {
543                                # Here it's an alias of an unknown type, so we skip it.
544                                $stats->Add(aliasUnknown => 1);
545                            } else {
546                                # Here it's a miscellaneous type.
547                                $stats->Add(aliasMisc => 1);
548                                $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);
549                            }
550                        }
551                    }
552                }
553            }
554        }
555        # Now loop through the features, connecting them to their roles. Note that deleted
556        # features will not be in the assignment hash.
557        for my $fid (keys %$assignHash) {
558            # Get the roles and the error count.
559            my ($roles, $errors) = SeedUtils::roles_for_loading($assignHash->{$fid});
560            # Accumulate the errors in the stats object.
561            $stats->Add(roleErrors => $errors);
562            # Is this a suspicious function?
563            if (! defined $roles) {
564                # Yes, so track it.
565                $stats->Add(badFunction => 1);
566            } else {
567                # No, connect the roles.
568                for my $role (@$roles) {
569                    # Insure this role exists.
570                    my $hypo = hypo($role);
571                    $self->InsureEntity(Role => $role, hypothetical => $hypo);
572                    # Connect it to the feature.
573                    $sap->InsertObject('IsFunctionalIn', from_link => $role, to_link => $fid);
574                }
575          }          }
576      }      }
577  }  }
# Line 540  Line 630 
630      }      }
631  }  }
632    
633    
634    =head3 LoadAnnotations
635    
636        $loaderObject->LoadAnnotations($fileName);
637    
638    Read in the annotation history information and use it to create annotation records.
639    
640    =over 4
641    
642    =item fileName
643    
644    Name of the annotation history file. This file is formatted with four fields per
645    record. Each field is on a separate line, with a double slash (C<//>) used as the
646    line terminator. The fields, in order, are (0) the feature ID, (1) the timestamp
647    (formatted as an integer), (2) the user name, and (3) the annotation text.
648    
649    =back
650    
651    =cut
652    
653    sub LoadAnnotations {
654        # Get the parameters.
655        my ($self, $fileName) = @_;
656        # Get the timestamp hash.
657        my $timeHash = $self->{timestamps};
658        # Get the Sapling database.
659        my $sap = $self->{sap};
660        # Get the statistics object.
661        my $stats = $self->{stats};
662        # Open the input file.
663        my $ih = Tracer::Open(undef, "<$fileName");
664        # Loop through the input.
665        while (! eof $ih) {
666            # Read in the peg, timestamp, and user ID.
667            my ($fid, $timestamp, $user, $text) = ReadAnnotation($ih);
668            # Only proceed if the feature exists.
669            if (! exists $timeHash->{$fid}) {
670                $stats->Add(skippedAnnotation => 1);
671            } else {
672                # Change assignments by the master user to FIG assignments.
673                $text =~ s/Set master function/Set FIG function/s;
674                # Insure the time stamp is valid.
675                if ($timestamp =~ /^\d+$/) {
676                    # Here it's a number. We need to insure the one we use to form
677                    # the key is unique.
678                    my $keyStamp = $timestamp;
679                    while ($timeHash->{$fid}{$keyStamp}) {
680                        $keyStamp++;
681                        $stats->Add(skippedStamp => 1);
682                    }
683                    # Form the annotation ID.
684                    my $annotationID = "$fid:" . Tracer::Pad(9999999999 - $keyStamp, 10,
685                                                             1, "0");
686                    $timeHash->{$fid}{$keyStamp} = 1;
687                    # Generate the annotation.
688                    $sap->InsertObject('IsAnnotatedBy', from_link => $fid, to_link => $annotationID);
689                    $sap->InsertObject('Annotation', id => $annotationID, annotation_time => $timestamp,
690                                comment => $text, annotator => $user);
691                } else {
692                    # Here we have an invalid time stamp.
693                    Trace("Invalid time stamp \"$timestamp\" in annotations for $fid.") if T(1);
694                }
695            }
696        }
697    }
698    
699    
700  =head3 WriteProtein  =head3 WriteProtein
701    
702      $loaderObject->WriteProtein($fid, $sequence);      $loaderObject->WriteProtein($fid, $sequence);
# Line 572  Line 729 
729      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
730  }  }
731    
 =head3 InsureEntity  
   
     my $createdFlag = $loaderObject->InsureEntity($entityType => $id, %fields);  
   
 Insure that the specified record exists in the database. If no record is found of the  
 specified type with the specified ID, one will be created with the indicated fields.  
   
 =over 4  
   
 =item $entityType  
   
 Type of entity to check.  
   
 =item id  
   
 ID of the entity instance in question.  
   
 =item fields  
   
 Hash mapping field names to values for all the fields in the desired entity record except  
 for the ID.  
   
 =item RETURN  
   
 Returns TRUE if a new object was created, FALSE if it already existed.  
   
 =back  
   
 =cut  
   
 sub InsureEntity {  
     # Get the parameters.  
     my ($self, $entityType, $id, %fields) = @_;  
     # Get the database.  
     my $sap = $self->{sap};  
     # Get the support record ID hash.  
     my $supportHash = $self->{supportRecords};  
     # Denote we haven't created a new record.  
     my $retVal = 0;  
     # Get the sub-hash for this entity type.  
     my $entityHash = $supportHash->{$entityType};  
     if (! defined $entityHash) {  
         $entityHash = {};  
         $supportHash->{$entityType} = $entityHash;  
     }  
     # Check for this instance.  
     if (! $entityHash->{$id}) {  
         # It's not found. Check the database.  
         if (! $sap->Exists($entityType => $id)) {  
             # It's not in the database either, so create it.  
             $sap->InsertObject($entityType, id => $id, %fields);  
             $self->{stats}->Add(insertSupport => 1);  
             $retVal = 1;  
         }  
         # Mark the record in the hash so we know we have it.  
         $entityHash->{$id} = 1;  
     }  
     # Return the insertion indicator.  
     return $retVal;  
 }  
   
732  =head3 LoadSubsystems  =head3 LoadSubsystems
733    
734      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
# Line 688  Line 784 
784              if (@$roleList > 0) {              if (@$roleList > 0) {
785                  # Get the subsystem information from the first role and create the subsystem.                  # Get the subsystem information from the first role and create the subsystem.
786                  my $roleH = $roleList->[0];                  my $roleH = $roleList->[0];
787                  my %subFields = ExtractFields(Subsystem => $roleH);                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
788                  $sap->InsertObject('Subsystem', %subFields);                  $sap->InsertObject('Subsystem', %subFields);
789                  # Now loop through the roles. The Includes records are always inserted, but the                  # Now loop through the roles. The Includes records are always inserted, but the
790                  # roles are only inserted if they don't already exist.                  # roles are only inserted if they don't already exist.
791                  for $roleH (@$roleList) {                  for $roleH (@$roleList) {
792                      # Create the Includes record.                      # Create the Includes record.
793                      my %incFields = ExtractFields(Includes => $roleH);                      my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);
794                      $sap->InsertObject('Includes', %incFields);                      $sap->InsertObject('Includes', %incFields);
795                      # Insure we have the role in place.                      # Insure we have the role in place.
796                      my %roleFields = ExtractFields(Role => $roleH);                      my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);
797                      my $roleID = $roleFields{id};                      my $roleID = $roleFields{id};
798                      delete $roleFields{id};                      delete $roleFields{id};
799                      $self->InsureEntity('Role', $roleID, %roleFields);                      $self->InsureEntity('Role', $roleID, %roleFields);
# Line 935  Line 1031 
1031      $self->{stats}->Add(segment => 1);      $self->{stats}->Add(segment => 1);
1032  }  }
1033    
1034  =head2 Internal Utility Methods  =head3 CreateIdentifier
   
 =head3 DeleteRelatedRecords  
1035    
1036      DeleteRelatedRecords($sap, $genome, $stats, $relName, $entityName);      $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);
1037    
1038  Delete all the records in the named entity and relationship relating to the  Link an identifier to a feature. The identifier is presented in prefixed form and is of the
1039  specified genome and roll up the statistics in the specified statistics object.  specified type and the specified confidence level.
1040    
1041  =over 4  =over 4
1042    
1043  =item sap  =item alias
1044    
1045  L<Sapling> object for accessing the database.  Identifier to connect to the feature.
1046    
1047  =item genome  =item conf
   
 ID of the relevant genome.  
1048    
1049  =item stats  Confidence level (C<A> curated, C<B> normal, C<C> protein only).
1050    
1051  L<Stats> object for tracking the delete activity.  =item aliasType
1052    
1053  =item relName  Type of alias (e.g. C<NCBI>, C<LocusTag>).
1054    
1055  Name of a relationship from the B<Genome> table.  =item fid
   
 =item entityName  
1056    
1057  Name of the entity on the other side of the relationship.  ID of the relevant feature.
1058    
1059  =back  =back
1060    
1061  =cut  =cut
1062    
1063  sub DeleteRelatedRecords {  sub CreateIdentifier {
1064      # Get the parameters.      # Get the parameters.
1065      my ($sap, $genome, $stats, $relName, $entityName) = @_;      my ($self, $alias, $conf, $aliasType, $fid) = @_;
1066      # Get all the relationship records.      # Get the Sapling object.
1067      my (@targets) = $sap->GetFlat($relName, "$relName(from-link) = ?", [$genome],      my $sap = $self->{sap};
1068                                    "to-link");      # Compute the identifier's natural form.
1069      # Loop through the relationship records, deleting them and the target entity      my $natural = $alias;
1070      # records.      if ($natural =~ /[:|](.+)/) {
1071      for my $target (@targets) {          $natural = $1;
1072          # Delete the relationship instance.      }
1073          $sap->DeleteRow($relName, $genome, $target);      # Insure the identifier exists in the database.
1074          $stats->Add($relName => 1);      $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);
1075          # Delete the entity instance.      # Connect the identifier to the feature.
1076          my $subStats = $sap->Delete($entityName, $target);      $sap->InsertObject('IsIdentifiedBy', to_link => $alias, from_link => $fid, conf => $conf);
         # Roll up the statistics.  
         $stats->Accumulate($subStats);  
     }  
1077  }  }
1078    
1079  =head3 ExtractFields  =head2 Internal Utility Methods
1080    
1081      my %fieldHash = SaplingGenomeLoader::ExtractFields($tableName, $dataHash);  =head3 ReadAnnotation
1082    
1083  Extract from the incoming hash the field names and values from the specified table.      my ($fid, $timestamp, $user, $text) = SaplingGenomeLoader::ReadAnnotation($ih);
1084    
1085  =over 4  Read the next record from an annotation file. The next record must exist (that is, an
1086    end-of-file check should have been performed before calling this method).
1087    
1088  =item tableName  =over 4
   
 Name of the table whose field names and values are desired.  
1089    
1090  =item dataHash  =item ih
1091    
1092  Reference to a hash mapping fully-qualified ERDB field names to values.  Open file handle for the annotation file.
1093    
1094  =item RETURN  =item RETURN
1095    
1096  Returns a hash containing only the fields from the specified table and their values.  Returns a list containing the four fields of the record read-- (0) the feature ID, (1) the
1097    timestamp, (2) the user ID, and (3) the annotation text.
1098    
1099  =back  =back
1100    
1101  =cut  =cut
1102    
1103  sub ExtractFields {  sub ReadAnnotation {
1104      # Get the parameters.      # Get the parameter.
1105      my ($tableName, $dataHash) = @_;      my ($ih) = @_;
1106      # Declare the return variable.      # Read the three fixed fields.
1107      my %retVal;      my $fid = <$ih>; chomp $fid;
1108      # Extract the desired fields.      my $timestamp = <$ih>; chomp $timestamp;
1109      for my $field (keys %$dataHash) {      my $user = <$ih>; chomp $user;
1110          # Is this a field for the specified table?      # Loop through the lines of the text field.
1111          if ($field =~ /^$tableName\(([^)]+)/) {      my $text = "";
1112              # Yes, put it in the output hash.      my $line = <$ih>;
1113              $retVal{$1} = $dataHash->{$field};      while ($line ne "//\n") {
1114          }          $text .= $line;
1115      }          $line = <$ih>;
1116      # Return the computed hash.      }
1117      return %retVal;      # Remove the trailing new-line from the text.
1118        chomp $text;
1119        # Return the fields.
1120        return ($fid, $timestamp, $user, $text);
1121  }  }
1122    
1123  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.6

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3