[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory | Revision Log | View Patch

revision 1.1, Tue Dec 14 19:48:38 2010 UTC | revision 1.5, Sun Feb 13 13:02:30 2011 UTC
# Line 25  Line 25 
25      use SeedUtils;      use SeedUtils;
26      use SAPserver;      use SAPserver;
27      use Sapling;      use Sapling;
28        use AliasAnalysis;
29        use base qw(SaplingDataLoader);
30    
31  =head1 Sapling Genome Loader  =head1 Sapling Genome Loader
32    
# Line 65  Line 67 
67      # Create the loader object.      # Create the loader object.
68      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
69      # Load the contigs.      # Load the contigs.
70        Trace("Loading contigs for $genome.") if T(2);
71      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
72      # Load the features.      # Load the features.
73        Trace("Loading features for $genome.") if T(2);
74      $loaderObject->LoadFeatures();      $loaderObject->LoadFeatures();
75      # Load the subsystem bindings.      # Load the subsystem bindings.
76        Trace("Loading subsystems for $genome.") if T(2);
77      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
78      # Create the Genome record and taxonomy information.      # Create the Genome record and taxonomy information.
79        Trace("Creating root for $genome.") if T(2);
80      $loaderObject->CreateGenome();      $loaderObject->CreateGenome();
81      # Return the statistics.      # Return the statistics.
82      return $loaderObject->{stats};      return $loaderObject->{stats};
# Line 107  Line 113 
113      # Create the statistics object.      # Create the statistics object.
114      my $stats = Stats->new();      my $stats = Stats->new();
115      # Delete the DNA.      # Delete the DNA.
116      DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'HasSection', 'DNASequence');
117      # Delete the contigs.      # Delete the contigs.
118      DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsMadeUpOf', 'Contig');
119      # Delete the features.      # Delete the features.
120      DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'IsOwnerOf', 'Feature');
121      # Delete the molecular machines.      # Delete the molecular machines.
122      DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');      SaplingDataLoader::DeleteRelatedRecords($sap, $genome, $stats, 'Uses', 'MolecularMachine');
123      # Delete the genome itself.      # Delete the genome itself.
124      my $subStats = $sap->Delete(Genome => $genome);      my $subStats = $sap->Delete(Genome => $genome);
125      # Accumulate the statistics from the delete.      # Accumulate the statistics from the delete.
# Line 179  Line 185 
185      # Get the parameters.      # Get the parameters.
186      my ($class, $sap, $genome, $directory) = @_;      my ($class, $sap, $genome, $directory) = @_;
187      # Create the object.      # Create the object.
188      my $retVal = {      my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
189          sap => $sap,      # Add our specialized data.
190          genome => $genome,      $retVal->{genome} = $genome;
191          directory => $directory,      $retVal->{directory} = $directory;
192          stats => Stats->new(qw(contigs dna pegs rnas)),      # Return the result.
         supportRecords => {}  
     };  
     # Bless and return it.  
     bless $retVal, $class;  
193      return $retVal;      return $retVal;
194  }  }
195    
# Line 300  Line 302 
302      # Compute the chunk ID.      # Compute the chunk ID.
303      my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');      my $chunkID = "$contigID:" . Tracer::Pad($ordinal, 7, 1, '0');
304      # Connect this sequence to the contig.      # Connect this sequence to the contig.
305      $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunk);      $sap->InsertObject('HasSection', from_link => $contigID, to_link => $chunkID);
306      # Create the DNA sequence.      # Create the DNA sequence.
307      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
308      # Record the chunk.      # Record the chunk.
# Line 374  Line 376 
376    
377  =head3 LoadFeatureData  =head3 LoadFeatureData
378    
379      $self->LoadFeatureData($featureDir, $type);      $loaderObject->LoadFeatureData($featureDir, $type);
380    
381  Load the basic data for each feature into the database. The number of features of  Load the basic data for each feature into the database. The number of features of
382  the type found will be recorded in the statistics object.  the type found will be recorded in the statistics object.
# Line 410  Line 412 
412      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
413      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
414      if (-f $fileName) {      if (-f $fileName) {
415          # We have one, so open it for input.          # We have one, so we can read through it. First, however, we need to get the list
416            # of deleted features.
417            my %deletedFids;
418            my $deleteFile = "$featureDir/$type/deleted.features";
419            if (-f $deleteFile) {
420                %deletedFids = map { $_ => 1 } Tracer::GetFile($deleteFile);
421            }
422            # Open the main file for input.
423          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
424          while (! eof $ih) {          while (! eof $ih) {
425              # Read this feature's information.              # Read this feature's information.
426              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
427              # If the feature already exists, delete it.              # Only proceed if the feature is NOT deleted.
428                if (! exists $deletedFids{$fid}) {
429                    # If the feature already exists, delete it. (This should be extremely rare.)
430              if (exists $fids{$fid}) {              if (exists $fids{$fid}) {
431                  $sap->Delete(Feature => $fid);                  $sap->Delete(Feature => $fid);
432                  $stats->Add(duplicateFid => 1);                  $stats->Add(duplicateFid => 1);
             } else {  
                 # Otherwise connect it to the genome.  
                 $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);  
433              }              }
434                    # Otherwise connect this feature to the genome.
435                    $sap->InsertObject('IsOwnerOf', from_link => $self->{genome}, to_link => $fid);
436              # Now we must parse the locations. This will contain a list of the location              # Now we must parse the locations. This will contain a list of the location
437              # data 4-tuples (contig, start, dir, len).              # data 4-tuples (contig, start, dir, len).
438              my @locData;              my @locData;
# Line 482  Line 492 
492                  # Output the last segment.                  # Output the last segment.
493                  $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);                  $self->ConnectLocation($fid, $contig, $segment, $left, $dir, $len);
494              }              }
495                    # Now we process the aliases and create the identifiers. We don't do this
496                    # for RNA, because the RNA function is stored in the aliases.
497                    if ($type ne 'rna') {
498                        for my $alias (@aliases) {
499                            my $normalized;
500                            # Determine the type.
501                            my $aliasType = AliasAnalysis::TypeOf($alias);
502                            $stats->Add(aliasAll => 1);
503                            # Is this a recognized type?
504                            if ($aliasType) {
505                                $stats->Add(aliasNormal => 1);
506                                # Yes. Write it normally.
507                                $self->CreateIdentifier($alias, B => $aliasType, $fid);
508                            } elsif ($alias =~ /^LocusTag:(.+)/ || $alias =~ /^(?:locus|locus_tag|LocusTag)\|(.+)/) {
509                                # No, but this is a specially-marked locus tag.
510                                $normalized = $1;
511                                $stats->Add(aliasLocus => 1);
512                                $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
513                            } elsif ($normalized = AliasAnalysis::IsNatural(LocusTag => $alias)) {
514                                # No, but this is a natural locus tag.
515                                $stats->Add(aliasLocus => 1);
516                                $self->CreateIdentifier($normalized, B => 'LocusTag', $fid);
517                            } elsif ($normalized = AliasAnalysis::IsNatural(GENE => $alias)) {
518                                # No, but this is a natural gene name.
519                                $stats->Add(aliasGene => 1);
520                                $self->CreateIdentifier($normalized, B => 'GENE', $fid);
521                            } elsif ($alias =~ /^\d+$/) {
522                                # Here it's a naked number, which means it's a GI number
523                                # of some sort.
524                                $stats->Add(aliasGI => 1);
525                                $self->CreateIdentifier("gi|$alias", B => 'NCBI', $fid);
526                            } elsif ($alias =~ /^protein_id\|(.+)/) {
527                                # Here we have a REFSEQ protein ID. Right now we don't have a way to
528                                # handle that, because we don't know the feature's protein ID here.
529                                $stats->Add(aliasProtein => 1);
530                            } elsif ($alias =~ /[:|]/) {
531                                # Here it's an alias of an unknown type, so we skip it.
532                                $stats->Add(aliasUnknown => 1);
533                            } else {
534                                # Here it's a miscellaneous type.
535                                $stats->Add(aliasMisc => 1);
536                                $self->CreateIdentifier($alias, B => 'Miscellaneous', $fid);
537                            }
538                        }
539                    }
540                }
541            }
542        }
543        # Now loop through the features, connecting them to their roles. Note that deleted
544        # features will not be in the assignment hash.
545        for my $fid (keys %$assignHash) {
546            # Get the roles and the error count.
547            my ($roles, $errors) = SeedUtils::roles_for_loading($assignHash->{$fid});
548            # Accumulate the errors in the stats object.
549            $stats->Add(roleErrors => $errors);
550            # Is this a suspicious function?
551            if (! defined $roles) {
552                # Yes, so track it.
553                $stats->Add(badFunction => 1);
554            } else {
555                # No, connect the roles.
556                for my $role (@$roles) {
557                    # Insure this role exists.
558                    my $hypo = hypo($role);
559                    $self->InsureEntity(Role => $role, hypothetical => $hypo);
560                    # Connect it to the feature.
561                    $sap->InsertObject('IsFunctionalIn', from_link => $role, to_link => $fid);
562                }
563          }          }
564      }      }
565  }  }
# Line 572  Line 650 
650      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);      $self->{sap}->InsertObject('IsProteinFor', from_link => $protID, to_link => $fid);
651  }  }
652    
 =head3 InsureEntity  
   
     my $createdFlag = $loaderObject->InsureEntity($entityType => $id, %fields);  
   
 Insure that the specified record exists in the database. If no record is found of the  
 specified type with the specified ID, one will be created with the indicated fields.  
   
 =over 4  
   
 =item $entityType  
   
 Type of entity to check.  
   
 =item id  
   
 ID of the entity instance in question.  
   
 =item fields  
   
 Hash mapping field names to values for all the fields in the desired entity record except  
 for the ID.  
   
 =item RETURN  
   
 Returns TRUE if a new object was created, FALSE if it already existed.  
   
 =back  
   
 =cut  
   
 sub InsureEntity {  
     # Get the parameters.  
     my ($self, $entityType, $id, %fields) = @_;  
     # Get the database.  
     my $sap = $self->{sap};  
     # Get the support record ID hash.  
     my $supportHash = $self->{supportRecords};  
     # Denote we haven't created a new record.  
     my $retVal = 0;  
     # Get the sub-hash for this entity type.  
     my $entityHash = $supportHash->{$entityType};  
     if (! defined $entityHash) {  
         $entityHash = {};  
         $supportHash->{$entityType} = $entityHash;  
     }  
     # Check for this instance.  
     if (! $entityHash->{$id}) {  
         # It's not found. Check the database.  
         if (! $sap->Exists($entityType => $id)) {  
             # It's not in the database either, so create it.  
             $sap->InsertObject($entityType, id => $id, %fields);  
             $self->{stats}->Add(insertSupport => 1);  
             $retVal = 1;  
         }  
         # Mark the record in the hash so we know we have it.  
         $entityHash->{$id} = 1;  
     }  
     # Return the insertion indicator.  
     return $retVal;  
 }  
   
653  =head3 LoadSubsystems  =head3 LoadSubsystems
654    
655      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems();
# Line 688  Line 705 
705              if (@$roleList > 0) {              if (@$roleList > 0) {
706                  # Get the subsystem information from the first role and create the subsystem.                  # Get the subsystem information from the first role and create the subsystem.
707                  my $roleH = $roleList->[0];                  my $roleH = $roleList->[0];
708                  my %subFields = ExtractFields(Subsystem => $roleH);                  my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);
709                  $sap->InsertObject('Subsystem', %subFields);                  $sap->InsertObject('Subsystem', %subFields);
710                  # Now loop through the roles. The Includes records are always inserted, but the                  # Now loop through the roles. The Includes records are always inserted, but the
711                  # roles are only inserted if they don't already exist.                  # roles are only inserted if they don't already exist.
712                  for $roleH (@$roleList) {                  for $roleH (@$roleList) {
713                      # Create the Includes record.                      # Create the Includes record.
714                      my %incFields = ExtractFields(Includes => $roleH);                      my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);
715                      $sap->InsertObject('Includes', %incFields);                      $sap->InsertObject('Includes', %incFields);
716                      # Insure we have the role in place.                      # Insure we have the role in place.
717                      my %roleFields = ExtractFields(Role => $roleH);                      my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);
718                      my $roleID = $roleFields{id};                      my $roleID = $roleFields{id};
719                      delete $roleFields{id};                      delete $roleFields{id};
720                      $self->InsureEntity('Role', $roleID, %roleFields);                      $self->InsureEntity('Role', $roleID, %roleFields);
# Line 937  Line 954 
954    
955  =head2 Internal Utility Methods  =head2 Internal Utility Methods
956    
957  =head3 DeleteRelatedRecords  =head3 CreateIdentifier
958    
959      DeleteRelatedRecords($sap, $genome, $stats, $relName, $entityName);      $loaderObject->CreateIdentifier($alias, $conf, $aliasType, $fid);
960    
961  Delete all the records in the named entity and relationship relating to the  Link an identifier to a feature. The identifier is presented in prefixed form and is of the
962  specified genome and roll up the statistics in the specified statistics object.  specified type and the specified confidence level.
963    
964  =over 4  =over 4
965    
966  =item sap  =item alias
   
 L<Sapling> object for accessing the database.  
   
 =item genome  
   
 ID of the relevant genome.  
967    
968  =item stats  Identifier to connect to the feature.
   
 L<Stats> object for tracking the delete activity.  
969    
970  =item relName  =item conf
971    
972  Name of a relationship from the B<Genome> table.  Confidence level (C<A> curated, C<B> normal, C<C> protein only).
973    
974  =item entityName  =item aliasType
975    
976  Name of the entity on the other side of the relationship.  Type of alias (e.g. C<NCBI>, C<LocusTag>).
977    
978  =back  =item fid
   
 =cut  
   
 sub DeleteRelatedRecords {  
     # Get the parameters.  
     my ($sap, $genome, $stats, $relName, $entityName) = @_;  
     # Get all the relationship records.  
     my (@targets) = $sap->GetFlat($relName, "$relName(from-link) = ?", [$genome],  
                                   "to-link");  
     # Loop through the relationship records, deleting them and the target entity  
     # records.  
     for my $target (@targets) {  
         # Delete the relationship instance.  
         $sap->DeleteRow($relName, $genome, $target);  
         $stats->Add($relName => 1);  
         # Delete the entity instance.  
         my $subStats = $sap->Delete($entityName, $target);  
         # Roll up the statistics.  
         $stats->Accumulate($subStats);  
     }  
 }  
   
 =head3 ExtractFields  
   
     my %fieldHash = SaplingGenomeLoader::ExtractFields($tableName, $dataHash);  
   
 Extract from the incoming hash the field names and values from the specified table.  
   
 =over 4  
   
 =item tableName  
   
 Name of the table whose field names and values are desired.  
   
 =item dataHash  
   
 Reference to a hash mapping fully-qualified ERDB field names to values.  
   
 =item RETURN  
979    
980  Returns a hash containing only the fields from the specified table and their values.  ID of the relevant feature.
981    
982  =back  =back
983    
984  =cut  =cut
985    
986  sub ExtractFields {  sub CreateIdentifier {
987      # Get the parameters.      # Get the parameters.
988      my ($tableName, $dataHash) = @_;      my ($self, $alias, $conf, $aliasType, $fid) = @_;
989      # Declare the return variable.      # Get the Sapling object.
990      my %retVal;      my $sap = $self->{sap};
991      # Extract the desired fields.      # Compute the identifier's natural form.
992      for my $field (keys %$dataHash) {      my $natural = $alias;
993          # Is this a field for the specified table?      if ($natural =~ /[:|](.+)/) {
994          if ($field =~ /^$tableName\(([^)]+)/) {          $natural = $1;
995              # Yes, put it in the output hash.      }
996              $retVal{$1} = $dataHash->{$field};      # Insure the identifier exists in the database.
997          }      $self->InsureEntity(Identifier => $alias, source => $aliasType, natural_form => $natural);
998      }      # Connect the identifier to the feature.
999      # Return the computed hash.      $sap->InsertObject('IsIdentifiedBy', to_link => $alias, from_link => $fid, conf => $conf);
     return %retVal;  
1000  }  }
1001    
1002  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.5

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3