[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.13, Fri Jul 22 19:20:11 2011 UTC revision 1.17, Tue Mar 26 02:10:46 2013 UTC
# Line 57  Line 57 
57    
58  Name of the directory containing the genome information.  Name of the directory containing the genome information.
59    
 =item assignHash  
   
 Hash of feature IDs to functional assignments. Deleted features are removed, which  
 means only features listed in this hash can be legally inserted into the database.  
   
60  =back  =back
61    
62  =cut  =cut
# Line 163  Line 158 
158    
159  =item directory  =item directory
160    
161  Name of the directory containing the genome data files.  Name of the directory containing the genome data files. If omitted, the
162    genome will be deleted from the database.
163    
164  =item RETURN  =item RETURN
165    
# Line 178  Line 174 
174      my ($sap, $genome, $directory) = @_;      my ($sap, $genome, $directory) = @_;
175      # Clear the existing data for the specified genome.      # Clear the existing data for the specified genome.
176      my $stats = ClearGenome($sap, $genome);      my $stats = ClearGenome($sap, $genome);
177      # Load the new expression data from the specified directory.      # Load the new expression data from the specified directory (if one is
178        # specified).
179        if ($directory) {
180      my $newStats = Load($sap, $genome, $directory);      my $newStats = Load($sap, $genome, $directory);
181      # Merge the statistics.      # Merge the statistics.
182      $stats->Accumulate($newStats);      $stats->Accumulate($newStats);
183        }
184      # Return the result.      # Return the result.
185      return $stats;      return $stats;
186  }  }
# Line 259  Line 258 
258      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
259    
260  Load the contig information into the database. This includes the contigs themselves and  Load the contig information into the database. This includes the contigs themselves and
261  the DNA. The number of contigs will be recorded as the C<contigs> statistic and the  the DNA. The number of contigs will be recorded as the C<contigs> statistic, the
262  number of base pairs as the C<dna> statistic.  number of base pairs as the C<dna> statistic, and the number of GC instances as the
263    C<gc_content> statistic.
264    
265  =cut  =cut
266    
# Line 368  Line 368 
368      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);      $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk);
369      # Record the chunk.      # Record the chunk.
370      $self->{stats}->Add(chunks => 1);      $self->{stats}->Add(chunks => 1);
371        # Update the GC count.
372        $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//));
373  }  }
374    
375  =head3 OutputContig  =head3 OutputContig
# Line 571  Line 573 
573      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
574      # time, it overwrites the original.      # time, it overwrites the original.
575      my %fidHash;      my %fidHash;
576      # Finally, we need the timestamp hash. The initial feature population      # This hash tracks the deleted features. We don't want to update these.
577        my %deleted_features;
578      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
579      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
580      if (-f $fileName) {      if (-f $fileName) {
# Line 586  Line 589 
589                  if (exists $assignHash->{$deletedFid}) {                  if (exists $assignHash->{$deletedFid}) {
590                      delete $assignHash->{$deletedFid};                      delete $assignHash->{$deletedFid};
591                      $stats->Add(deletedFid => 1);                      $stats->Add(deletedFid => 1);
592                        $deleted_features{$deletedFid} = 1;
593                  }                  }
594              }              }
595          }          }
# Line 596  Line 600 
600              # Read this feature's information.              # Read this feature's information.
601              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);              my ($fid, $locations, @aliases) = Tracer::GetLine($ih);
602              # Only proceed if the feature is NOT deleted.              # Only proceed if the feature is NOT deleted.
603              if (exists $assignHash->{$fid}) {              if (!$deleted_features{$fid}) {
604                  # If the feature already exists, delete it. (This should be extremely rare.)                  # If the feature already exists, delete it. (This should be extremely rare.)
605                  if ($fidHash{$fid}) {                  if ($fidHash{$fid}) {
606                      $sap->Delete(Feature => $fid);                      $sap->Delete(Feature => $fid);
# Line 611  Line 615 
615                      @aliases = ();                      @aliases = ();
616                  }                  }
617                  # Add the feature to the database.                  # Add the feature to the database.
618                  $self->AddFeature($fid, $assignHash->{$fid}, $locations, \@aliases,                  my $function = $assignHash->{$fid} || "";
619                    $self->AddFeature($fid, $function, $locations, \@aliases,
620                                    $protHash->{$fid}, $evHash->{$fid});                                    $protHash->{$fid}, $evHash->{$fid});
621                  # Denote we've added this feature, so that if a duplicate occurs we're ready.                  # Denote we've added this feature, so that if a duplicate occurs we're ready.
622                  $fidHash{$fid} = 1;                  $fidHash{$fid} = 1;
# Line 760  Line 765 
765    
766  =head3 LoadSubsystems  =head3 LoadSubsystems
767    
768      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems($subsysList);
769    
770    Load the subsystem data into the database. This requires looking through the
771    bindings and using them to connect to the subsystems that already exist in the
772    database. If the subsystem does not exist, its bindings will not be loaded.
773    
774  Load the subsystem data into the database. This includes all of the subsystems,  =over 4
775  their categories, roles, and the bindings that connect them to this genome's features.  
776    =item subsysList
777    
778    Reference to a list of subsystem IDs. If specified, only the subsystems in the
779    list will be processed. This is useful when a particular set of subsystems
780    has been replaced.
781    
782    =back
783    
784  =cut  =cut
785    
786  sub LoadSubsystems {  sub LoadSubsystems {
787      # Get the parameters.      # Get the parameters.
788      my ($self) = @_;      my ($self, $subsysList) = @_;
789      # Get the sapling object.      # Get the sapling object.
790      my $sap = $self->{sap};      my $sap = $self->{sap};
791      # Get the statistics object.      # Get the statistics object.
792      my $stats = $self->{stats};      my $stats = $self->{stats};
793      # Get the genome ID.      # Get the genome ID.
794      my $genome = $self->{genome};      my $genome = $self->{genome};
     # Connect to the Sapling server so we can get the subsystem variant and role information.  
     my $sapO = SAPserver->new();  
795      # This hash maps subsystem IDs to molecular machine IDs.      # This hash maps subsystem IDs to molecular machine IDs.
796      my %machines;      my %machines;
797      # This hash maps subsystem/role pairs to machine role IDs.      # This hash maps subsystem/role pairs to machine role IDs.
798      my %machineRoles;      my %machineRoles;
799      # The first step is to create the subsystems themselves and connect them to the      # This hash will contain the list of subsystems found in the database.
800      # genome. A list is given in the subsystems file of the Subsystems directory.      my %subsystems;
801        # We loop through the subsystems, looking for the ones already in the
802        # database. The list is given in the subsystems file of the Subsystems
803        # directory.
804      my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");      my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
     # Create the field hash for the subsystem query.  
     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)  
                     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)  
                     Subsystem(version)  
                     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)  
                     Includes(sequence)  
                     Role(id) Role(hypothetical) Role(role-index));  
     my %fields = map { $_ => $_ } @fields;  
805      # Loop through the subsystems in the file, insuring we have them in the database.      # Loop through the subsystems in the file, insuring we have them in the database.
806      while (! eof $ih) {      while (! eof $ih) {
807          # Get this subsystem.          # Get this subsystem.
# Line 800  Line 809 
809          Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);          Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
810          # Normalize the subsystem name.          # Normalize the subsystem name.
811          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
812            # Insure the subsystem is in the database and is one we're interested in.
813            if ((! $subsysList || (grep { $_ eq $subsystem } @$subsysList)) &&
814                    $sap->Exists(Subsystem => $subsystem)) {
815                # We have the subsystem in the database. We need to compute the machine
816                # role IDs and create the molecular machine. First, we need to remember
817                # the subsystem.
818                $subsystems{$subsystem} = 1;
819          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
820          my $subsystemMD5 = ERDB::DigestKey($subsystem);          my $subsystemMD5 = ERDB::DigestKey($subsystem);
         # Insure the subsystem is in the database.  
         if (! $sap->Exists(Subsystem => $subsystem)) {  
             # The subsystem isn't present, so we need to read it. We get the subsystem,  
             # its roles, and its classification information. The variant will be added  
             # later if we need it.  
             my $roleList = $sapO->get(-objects => "Subsystem Includes Role",  
                                       -filter => {"Subsystem(id)" => ["=", $subsystem] },  
                                       -fields => \%fields);  
             # Only proceed if we found some roles.  
             if (@$roleList > 0) {  
                 # Get the subsystem information from the first role and create the subsystem.  
                 my $roleH = $roleList->[0];  
                 my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);  
                 $sap->InsertObject('Subsystem', %subFields);  
                 $stats->Add(subsystems => 1);  
                 # Now loop through the roles. The Includes records are always inserted, but the  
                 # roles are only inserted if they don't already exist.  
                 for $roleH (@$roleList) {  
                     # Create the Includes record.  
                     my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);  
                     $sap->InsertObject('Includes', %incFields);  
                     # Insure we have the role in place.  
                     my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);  
                     my $roleID = $roleFields{id};  
                     delete $roleFields{id};  
                     $self->InsureEntity('Role', $roleID, %roleFields);  
                     # Compute the machine-role ID.  
                     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});  
                     $machineRoles{$subsystem}{$roleID} = $machineRoleID;  
                     $stats->Add(subsystemRoles => 1);  
                 }  
             }  
         } else {  
             # We already have the subsystem in the database, but we still need to compute the  
             # machine role IDs. We need information from the sapling server for this.  
             my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);  
821              # Loop through the roles.              # Loop through the roles.
822              my $roleList = $subHash->{$subsystem};              my @roleList = $sap->GetAll('Includes', 'Includes(from-link) = ?',
823              for my $roleTuple (@$roleList) {                      [$subsystem], "to-link abbreviation");
824                for my $roleTuple (@roleList) {
825                  my ($roleID, $abbr) = @$roleTuple;                  my ($roleID, $abbr) = @$roleTuple;
826                  my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);                  my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
827                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;
828              }              }
829          }              # Next we need the variant code and key.
830          # Now we need to connect the variant to the subsystem. First, we compute the real              my $variantCode = BaseSaplingLoader::Starless($variant);
         # variant code by removing the asterisks.  
         my $variantCode = $variant;  
         $variantCode =~ s/^\*+//;  
         $variantCode =~ s/\*+$//;  
         # Compute the variant key.  
831          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
832          # Insure we have it in the database.              # Now we create the molecular machine connecting this genome to the
833          if (! $sap->Exists(Variant => $variantKey)) {              # subsystem variant.
             # Get the variant from the sapling server.  
             my $variantH = $sapO->get(-objects => "Variant",  
                                       -filter => {"Variant(id)" => ["=", $variantKey]},  
                                       -fields => {"Variant(code)" => "code",  
                                                   "Variant(comment)" => "comment",  
                                                   "Variant(type)" => "type",  
                                                   "Variant(role-rule)" => "role-rule"},  
                                       -firstOnly => 1);  
             $self->InsureEntity('Variant', $variantKey, %$variantH);  
             # Connect it to the subsystem.  
             $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);  
             $stats->Add(subsystemVariants => 1);  
         }  
         # Now we create the molecular machine connecting this genome to the subsystem  
         # variant.  
834          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
835          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
836          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
# Line 877  Line 838 
838          # Remember the machine ID.          # Remember the machine ID.
839          $machines{$subsystem} = $machineID;          $machines{$subsystem} = $machineID;
840      }      }
841      # Now we go through the bindings file. This file connects the subsystem roles to      }
842      # the molecular machines.      # Now we go through the bindings file. This file connects the subsystem
843        # roles to the molecular machines.
844      $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");      $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
845      # Loop through the bindings.      # Loop through the bindings.
846      while (! eof $ih) {      while (! eof $ih) {
# Line 886  Line 848 
848          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
849          # Normalize the subsystem name.          # Normalize the subsystem name.
850          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
851            # Insure the subsystem is in the database.
852            if ($subsystems{$subsystem}) {
853          # Compute the machine role.          # Compute the machine role.
854          my $machineRoleID = $machineRoles{$subsystem}{$role};          my $machineRoleID = $machineRoles{$subsystem}{$role};
855          # Insure it exists.          # Insure it exists.
# Line 901  Line 865 
865          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
866      }      }
867  }  }
868    }
869    
870  =head3 CreateGenome  =head3 CreateGenome
871    
# Line 958  Line 923 
923      $fields{'dna-size'} = $stats->Ask('dna');      $fields{'dna-size'} = $stats->Ask('dna');
924      $fields{pegs} = $stats->Ask('peg');      $fields{pegs} = $stats->Ask('peg');
925      $fields{rnas} = $stats->Ask('rna');      $fields{rnas} = $stats->Ask('rna');
926        $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna');
927      # Get the genetic code. The default is 11.      # Get the genetic code. The default is 11.
928      $fields{'genetic-code'} = 11;      $fields{'genetic-code'} = 11;
929      my $geneticCodeFile = "$dir/GENETIC_CODE";      my $geneticCodeFile = "$dir/GENETIC_CODE";

Legend:
Removed from v.1.13  
changed lines
  Added in v.1.17

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3