[Bio] / Sprout / SaplingGenomeLoader.pm Repository:
ViewVC logotype

Diff of /Sprout/SaplingGenomeLoader.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.15, Mon Dec 5 22:05:15 2011 UTC revision 1.17, Tue Mar 26 02:10:46 2013 UTC
# Line 57  Line 57 
57    
58  Name of the directory containing the genome information.  Name of the directory containing the genome information.
59    
 =item disconnected  
   
 True if the application is disconnected from the network - do not  
 attempt to contact a SAP server for more data.  
   
 =item assignHash  
   
 Hash of feature IDs to functional assignments. Deleted features are removed, which  
 means only features listed in this hash can be legally inserted into the database.  
   
60  =back  =back
61    
62  =cut  =cut
63    
64  sub Load {  sub Load {
65      # Get the parameters.      # Get the parameters.
66      my ($sap, $genome, $directory, $disconnected) = @_;      my ($sap, $genome, $directory) = @_;
67      # Create the loader object.      # Create the loader object.
68      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory, $disconnected);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
69      # Load the contigs.      # Load the contigs.
70      Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);      Trace("Loading contigs for $genome.") if T(SaplingDataLoader => 2);
71      $loaderObject->LoadContigs();      $loaderObject->LoadContigs();
# Line 168  Line 158 
158    
159  =item directory  =item directory
160    
161  Name of the directory containing the genome data files.  Name of the directory containing the genome data files. If omitted, the
162    genome will be deleted from the database.
163    
164  =item RETURN  =item RETURN
165    
# Line 183  Line 174 
174      my ($sap, $genome, $directory) = @_;      my ($sap, $genome, $directory) = @_;
175      # Clear the existing data for the specified genome.      # Clear the existing data for the specified genome.
176      my $stats = ClearGenome($sap, $genome);      my $stats = ClearGenome($sap, $genome);
177      # Load the new expression data from the specified directory.      # Load the new expression data from the specified directory (if one is
178        # specified).
179        if ($directory) {
180      my $newStats = Load($sap, $genome, $directory);      my $newStats = Load($sap, $genome, $directory);
181      # Merge the statistics.      # Merge the statistics.
182      $stats->Accumulate($newStats);      $stats->Accumulate($newStats);
183        }
184      # Return the result.      # Return the result.
185      return $stats;      return $stats;
186  }  }
# Line 196  Line 190 
190    
191  =head3 new  =head3 new
192    
193      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory, $disconnected);      my $loaderObject = SaplingGenomeLoader->new($sap, $genome, $directory);
194    
195  Create a loader object that can be used to facilitate loading genome data from a  Create a loader object that can be used to facilitate loading genome data from a
196  directory.  directory.
# Line 215  Line 209 
209    
210  Name of the directory containing the genome information.  Name of the directory containing the genome information.
211    
 =item disconnected  
   
 Set to a true value if the application should be considered to be disconnected  
 from the network - that is, do not attempt to connect to a Sapling server  
 to load subsystem data.  
   
212  =back  =back
213    
214  The object created contains the following fields.  The object created contains the following fields.
# Line 253  Line 241 
241    
242  sub new {  sub new {
243      # Get the parameters.      # Get the parameters.
244      my ($class, $sap, $genome, $directory, $disconnected) = @_;      my ($class, $sap, $genome, $directory) = @_;
245      # Create the object.      # Create the object.
246      my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));      my $retVal = SaplingDataLoader::new($class, $sap, qw(contigs dna pegs rnas));
247      # Add our specialized data.      # Add our specialized data.
# Line 261  Line 249 
249      $retVal->{directory} = $directory;      $retVal->{directory} = $directory;
250      # Leave the assignment hash undefined until we populate it.      # Leave the assignment hash undefined until we populate it.
251      $retVal->{assignHash} = undef;      $retVal->{assignHash} = undef;
     $retVal->{disconnected} = defined($disconnected) ? 1 : 0;  
252      # Return the result.      # Return the result.
253      return $retVal;      return $retVal;
254  }  }
# Line 586  Line 573 
573      # This hash will track the features we've created. If a feature is found a second      # This hash will track the features we've created. If a feature is found a second
574      # time, it overwrites the original.      # time, it overwrites the original.
575      my %fidHash;      my %fidHash;
576      # Finally, we need the timestamp hash. The initial feature population      # This hash tracks the deleted features. We don't want to update these.
577        my %deleted_features;
578      # Insure we have a tbl file for this feature type.      # Insure we have a tbl file for this feature type.
579      my $fileName = "$featureDir/$type/tbl";      my $fileName = "$featureDir/$type/tbl";
     my %deleted_features;  
580      if (-f $fileName) {      if (-f $fileName) {
581          # We have one, so we can read through it. First, however, we need to get the list          # We have one, so we can read through it. First, however, we need to get the list
582          # of deleted features and remove them from the assignment hash. This insures          # of deleted features and remove them from the assignment hash. This insures
# Line 628  Line 615 
615                      @aliases = ();                      @aliases = ();
616                  }                  }
617                  # Add the feature to the database.                  # Add the feature to the database.
618                  $self->AddFeature($fid, $assignHash->{$fid}, $locations, \@aliases,                  my $function = $assignHash->{$fid} || "";
619                    $self->AddFeature($fid, $function, $locations, \@aliases,
620                                    $protHash->{$fid}, $evHash->{$fid});                                    $protHash->{$fid}, $evHash->{$fid});
621                  # Denote we've added this feature, so that if a duplicate occurs we're ready.                  # Denote we've added this feature, so that if a duplicate occurs we're ready.
622                  $fidHash{$fid} = 1;                  $fidHash{$fid} = 1;
# Line 777  Line 765 
765    
766  =head3 LoadSubsystems  =head3 LoadSubsystems
767    
768      $loaderObject->LoadSubsystems();      $loaderObject->LoadSubsystems($subsysList);
769    
770    Load the subsystem data into the database. This requires looking through the
771    bindings and using them to connect to the subsystems that already exist in the
772    database. If the subsystem does not exist, its bindings will not be loaded.
773    
774  Load the subsystem data into the database. This includes all of the subsystems,  =over 4
775  their categories, roles, and the bindings that connect them to this genome's features.  
776    =item subsysList
777    
778    Reference to a list of subsystem IDs. If specified, only the subsystems in the
779    list will be processed. This is useful when a particular set of subsystems
780    has been replaced.
781    
782    =back
783    
784  =cut  =cut
785    
786  sub LoadSubsystems {  sub LoadSubsystems {
787      # Get the parameters.      # Get the parameters.
788      my ($self) = @_;      my ($self, $subsysList) = @_;
   
     #  
     # If we are running in disconnected mode, do not actually load subsystems.  
     # They rely too much on information from the external sapling.  
     #  
     if ($self->{disconnected})  
     {  
         return;  
     }  
   
789      # Get the sapling object.      # Get the sapling object.
790      my $sap = $self->{sap};      my $sap = $self->{sap};
791      # Get the statistics object.      # Get the statistics object.
792      my $stats = $self->{stats};      my $stats = $self->{stats};
793      # Get the genome ID.      # Get the genome ID.
794      my $genome = $self->{genome};      my $genome = $self->{genome};
     # Connect to the Sapling server so we can get the subsystem variant and role information.  
     my $sapO = SAPserver->new();  
795      # This hash maps subsystem IDs to molecular machine IDs.      # This hash maps subsystem IDs to molecular machine IDs.
796      my %machines;      my %machines;
797      # This hash maps subsystem/role pairs to machine role IDs.      # This hash maps subsystem/role pairs to machine role IDs.
798      my %machineRoles;      my %machineRoles;
799      # The first step is to create the subsystems themselves and connect them to the      # This hash will contain the list of subsystems found in the database.
800      # genome. A list is given in the subsystems file of the Subsystems directory.      my %subsystems;
801        # We loop through the subsystems, looking for the ones already in the
802        # database. The list is given in the subsystems file of the Subsystems
803        # directory.
804      my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");      my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems");
     # Create the field hash for the subsystem query.  
     my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description)  
                     Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable)  
                     Subsystem(version)  
                     Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary)  
                     Includes(sequence)  
                     Role(id) Role(hypothetical) Role(role-index));  
     my %fields = map { $_ => $_ } @fields;  
805      # Loop through the subsystems in the file, insuring we have them in the database.      # Loop through the subsystems in the file, insuring we have them in the database.
806      while (! eof $ih) {      while (! eof $ih) {
807          # Get this subsystem.          # Get this subsystem.
# Line 827  Line 809 
809          Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);          Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3);
810          # Normalize the subsystem name.          # Normalize the subsystem name.
811          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
812            # Insure the subsystem is in the database and is one we're interested in.
813            if ((! $subsysList || (grep { $_ eq $subsystem } @$subsysList)) &&
814                    $sap->Exists(Subsystem => $subsystem)) {
815                # We have the subsystem in the database. We need to compute the machine
816                # role IDs and create the molecular machine. First, we need to remember
817                # the subsystem.
818                $subsystems{$subsystem} = 1;
819          # Compute this subsystem's MD5.          # Compute this subsystem's MD5.
820          my $subsystemMD5 = ERDB::DigestKey($subsystem);          my $subsystemMD5 = ERDB::DigestKey($subsystem);
         # Insure the subsystem is in the database.  
         if (! $sap->Exists(Subsystem => $subsystem)) {  
             # The subsystem isn't present, so we need to read it. We get the subsystem,  
             # its roles, and its classification information. The variant will be added  
             # later if we need it.  
             my $roleList = $sapO->get(-objects => "Subsystem Includes Role",  
                                       -filter => {"Subsystem(id)" => ["=", $subsystem] },  
                                       -fields => \%fields);  
             # Only proceed if we found some roles.  
             if (@$roleList > 0) {  
                 # Get the subsystem information from the first role and create the subsystem.  
                 my $roleH = $roleList->[0];  
                 my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH);  
                 $sap->InsertObject('Subsystem', %subFields);  
                 $stats->Add(subsystems => 1);  
                 # Now loop through the roles. The Includes records are always inserted, but the  
                 # roles are only inserted if they don't already exist.  
                 for $roleH (@$roleList) {  
                     # Create the Includes record.  
                     my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH);  
                     $sap->InsertObject('Includes', %incFields);  
                     # Insure we have the role in place.  
                     my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH);  
                     my $roleID = $roleFields{id};  
                     delete $roleFields{id};  
                     $self->InsureEntity('Role', $roleID, %roleFields);  
                     # Compute the machine-role ID.  
                     my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation});  
                     $machineRoles{$subsystem}{$roleID} = $machineRoleID;  
                     $stats->Add(subsystemRoles => 1);  
                 }  
             }  
         } else {  
             # We already have the subsystem in the database, but we still need to compute the  
             # machine role IDs. We need information from the sapling server for this.  
             my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1);  
821              # Loop through the roles.              # Loop through the roles.
822              my $roleList = $subHash->{$subsystem};              my @roleList = $sap->GetAll('Includes', 'Includes(from-link) = ?',
823              for my $roleTuple (@$roleList) {                      [$subsystem], "to-link abbreviation");
824                for my $roleTuple (@roleList) {
825                  my ($roleID, $abbr) = @$roleTuple;                  my ($roleID, $abbr) = @$roleTuple;
826                  my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);                  my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr);
827                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;                  $machineRoles{$subsystem}{$roleID} = $machineRoleID;
828              }              }
829          }              # Next we need the variant code and key.
830          # Now we need to connect the variant to the subsystem. First, we compute the real              my $variantCode = BaseSaplingLoader::Starless($variant);
         # variant code by removing the asterisks.  
         my $variantCode = $variant;  
         $variantCode =~ s/^\*+//;  
         $variantCode =~ s/\*+$//;  
         # Compute the variant key.  
831          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");          my $variantKey = ERDB::DigestKey("$subsystem:$variantCode");
832          # Insure we have it in the database.              # Now we create the molecular machine connecting this genome to the
833          if (! $sap->Exists(Variant => $variantKey)) {              # subsystem variant.
             # Get the variant from the sapling server.  
             my $variantH = $sapO->get(-objects => "Variant",  
                                       -filter => {"Variant(id)" => ["=", $variantKey]},  
                                       -fields => {"Variant(code)" => "code",  
                                                   "Variant(comment)" => "comment",  
                                                   "Variant(type)" => "type",  
                                                   "Variant(role-rule)" => "role-rule"},  
                                       -firstOnly => 1);  
             $self->InsureEntity('Variant', $variantKey, %$variantH);  
             # Connect it to the subsystem.  
             $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey);  
             $stats->Add(subsystemVariants => 1);  
         }  
         # Now we create the molecular machine connecting this genome to the subsystem  
         # variant.  
834          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");          my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome");
835          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);          $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID);
836          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');          $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => '');
# Line 904  Line 838 
838          # Remember the machine ID.          # Remember the machine ID.
839          $machines{$subsystem} = $machineID;          $machines{$subsystem} = $machineID;
840      }      }
841      # Now we go through the bindings file. This file connects the subsystem roles to      }
842      # the molecular machines.      # Now we go through the bindings file. This file connects the subsystem
843        # roles to the molecular machines.
844      $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");      $ih = Open(undef, "<$self->{directory}/Subsystems/bindings");
845      # Loop through the bindings.      # Loop through the bindings.
846      while (! eof $ih) {      while (! eof $ih) {
# Line 913  Line 848 
848          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);          my ($subsystem, $role, $fid) = Tracer::GetLine($ih);
849          # Normalize the subsystem name.          # Normalize the subsystem name.
850          $subsystem = $sap->SubsystemID($subsystem);          $subsystem = $sap->SubsystemID($subsystem);
851            # Insure the subsystem is in the database.
852            if ($subsystems{$subsystem}) {
853          # Compute the machine role.          # Compute the machine role.
854          my $machineRoleID = $machineRoles{$subsystem}{$role};          my $machineRoleID = $machineRoles{$subsystem}{$role};
855          # Insure it exists.          # Insure it exists.
# Line 928  Line 865 
865          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);          $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid);
866      }      }
867  }  }
868    }
869    
870  =head3 CreateGenome  =head3 CreateGenome
871    

Legend:
Removed from v.1.15  
changed lines
  Added in v.1.17

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3