--- SaplingGenomeLoader.pm 2011/07/22 19:20:11 1.13 +++ SaplingGenomeLoader.pm 2013/03/26 02:10:46 1.17 @@ -57,11 +57,6 @@ Name of the directory containing the genome information. -=item assignHash - -Hash of feature IDs to functional assignments. Deleted features are removed, which -means only features listed in this hash can be legally inserted into the database. - =back =cut @@ -163,7 +158,8 @@ =item directory -Name of the directory containing the genome data files. +Name of the directory containing the genome data files. If omitted, the +genome will be deleted from the database. =item RETURN @@ -178,10 +174,13 @@ my ($sap, $genome, $directory) = @_; # Clear the existing data for the specified genome. my $stats = ClearGenome($sap, $genome); - # Load the new expression data from the specified directory. - my $newStats = Load($sap, $genome, $directory); - # Merge the statistics. - $stats->Accumulate($newStats); + # Load the new expression data from the specified directory (if one is + # specified). + if ($directory) { + my $newStats = Load($sap, $genome, $directory); + # Merge the statistics. + $stats->Accumulate($newStats); + } # Return the result. return $stats; } @@ -259,8 +258,9 @@ $loaderObject->LoadContigs(); Load the contig information into the database. This includes the contigs themselves and -the DNA. The number of contigs will be recorded as the C statistic and the -number of base pairs as the C statistic. +the DNA. The number of contigs will be recorded as the C statistic, the +number of base pairs as the C statistic, and the number of GC instances as the +C statistic. =cut @@ -368,6 +368,8 @@ $sap->InsertObject('DNASequence', id => $chunkID, sequence => $chunk); # Record the chunk. $self->{stats}->Add(chunks => 1); + # Update the GC count. + $self->{stats}->Add(gc_content => ($chunk =~ tr/GCgc//)); } =head3 OutputContig @@ -571,7 +573,8 @@ # This hash will track the features we've created. If a feature is found a second # time, it overwrites the original. my %fidHash; - # Finally, we need the timestamp hash. The initial feature population + # This hash tracks the deleted features. We don't want to update these. + my %deleted_features; # Insure we have a tbl file for this feature type. my $fileName = "$featureDir/$type/tbl"; if (-f $fileName) { @@ -586,6 +589,7 @@ if (exists $assignHash->{$deletedFid}) { delete $assignHash->{$deletedFid}; $stats->Add(deletedFid => 1); + $deleted_features{$deletedFid} = 1; } } } @@ -596,7 +600,7 @@ # Read this feature's information. my ($fid, $locations, @aliases) = Tracer::GetLine($ih); # Only proceed if the feature is NOT deleted. - if (exists $assignHash->{$fid}) { + if (!$deleted_features{$fid}) { # If the feature already exists, delete it. (This should be extremely rare.) if ($fidHash{$fid}) { $sap->Delete(Feature => $fid); @@ -611,7 +615,8 @@ @aliases = (); } # Add the feature to the database. - $self->AddFeature($fid, $assignHash->{$fid}, $locations, \@aliases, + my $function = $assignHash->{$fid} || ""; + $self->AddFeature($fid, $function, $locations, \@aliases, $protHash->{$fid}, $evHash->{$fid}); # Denote we've added this feature, so that if a duplicate occurs we're ready. $fidHash{$fid} = 1; @@ -760,39 +765,43 @@ =head3 LoadSubsystems - $loaderObject->LoadSubsystems(); + $loaderObject->LoadSubsystems($subsysList); + +Load the subsystem data into the database. This requires looking through the +bindings and using them to connect to the subsystems that already exist in the +database. If the subsystem does not exist, its bindings will not be loaded. + +=over 4 + +=item subsysList + +Reference to a list of subsystem IDs. If specified, only the subsystems in the +list will be processed. This is useful when a particular set of subsystems +has been replaced. -Load the subsystem data into the database. This includes all of the subsystems, -their categories, roles, and the bindings that connect them to this genome's features. +=back =cut sub LoadSubsystems { # Get the parameters. - my ($self) = @_; + my ($self, $subsysList) = @_; # Get the sapling object. my $sap = $self->{sap}; # Get the statistics object. my $stats = $self->{stats}; # Get the genome ID. my $genome = $self->{genome}; - # Connect to the Sapling server so we can get the subsystem variant and role information. - my $sapO = SAPserver->new(); # This hash maps subsystem IDs to molecular machine IDs. my %machines; # This hash maps subsystem/role pairs to machine role IDs. my %machineRoles; - # The first step is to create the subsystems themselves and connect them to the - # genome. A list is given in the subsystems file of the Subsystems directory. + # This hash will contain the list of subsystems found in the database. + my %subsystems; + # We loop through the subsystems, looking for the ones already in the + # database. The list is given in the subsystems file of the Subsystems + # directory. my $ih = Open(undef, "<$self->{directory}/Subsystems/subsystems"); - # Create the field hash for the subsystem query. - my @fields = qw(Subsystem(id) Subsystem(cluster-based) Subsystem(curator) Subsystem(description) - Subsystem(experimental) Subsystem(notes) Subsystem(private) Subsystem(usable) - Subsystem(version) - Includes(from-link) Includes(to-link) Includes(abbreviation) Includes(auxiliary) - Includes(sequence) - Role(id) Role(hypothetical) Role(role-index)); - my %fields = map { $_ => $_ } @fields; # Loop through the subsystems in the file, insuring we have them in the database. while (! eof $ih) { # Get this subsystem. @@ -800,85 +809,38 @@ Trace("Processing subsystem $subsystem variant $variant.") if T(SaplingDataLoader => 3); # Normalize the subsystem name. $subsystem = $sap->SubsystemID($subsystem); - # Compute this subsystem's MD5. - my $subsystemMD5 = ERDB::DigestKey($subsystem); - # Insure the subsystem is in the database. - if (! $sap->Exists(Subsystem => $subsystem)) { - # The subsystem isn't present, so we need to read it. We get the subsystem, - # its roles, and its classification information. The variant will be added - # later if we need it. - my $roleList = $sapO->get(-objects => "Subsystem Includes Role", - -filter => {"Subsystem(id)" => ["=", $subsystem] }, - -fields => \%fields); - # Only proceed if we found some roles. - if (@$roleList > 0) { - # Get the subsystem information from the first role and create the subsystem. - my $roleH = $roleList->[0]; - my %subFields = SaplingDataLoader::ExtractFields(Subsystem => $roleH); - $sap->InsertObject('Subsystem', %subFields); - $stats->Add(subsystems => 1); - # Now loop through the roles. The Includes records are always inserted, but the - # roles are only inserted if they don't already exist. - for $roleH (@$roleList) { - # Create the Includes record. - my %incFields = SaplingDataLoader::ExtractFields(Includes => $roleH); - $sap->InsertObject('Includes', %incFields); - # Insure we have the role in place. - my %roleFields = SaplingDataLoader::ExtractFields(Role => $roleH); - my $roleID = $roleFields{id}; - delete $roleFields{id}; - $self->InsureEntity('Role', $roleID, %roleFields); - # Compute the machine-role ID. - my $machineRoleID = join(":", $subsystemMD5, $genome, $incFields{abbreviation}); - $machineRoles{$subsystem}{$roleID} = $machineRoleID; - $stats->Add(subsystemRoles => 1); - } - } - } else { - # We already have the subsystem in the database, but we still need to compute the - # machine role IDs. We need information from the sapling server for this. - my $subHash = $sapO->subsystem_roles(-ids => $subsystem, -aux => 1, -abbr => 1); + # Insure the subsystem is in the database and is one we're interested in. + if ((! $subsysList || (grep { $_ eq $subsystem } @$subsysList)) && + $sap->Exists(Subsystem => $subsystem)) { + # We have the subsystem in the database. We need to compute the machine + # role IDs and create the molecular machine. First, we need to remember + # the subsystem. + $subsystems{$subsystem} = 1; + # Compute this subsystem's MD5. + my $subsystemMD5 = ERDB::DigestKey($subsystem); # Loop through the roles. - my $roleList = $subHash->{$subsystem}; - for my $roleTuple (@$roleList) { + my @roleList = $sap->GetAll('Includes', 'Includes(from-link) = ?', + [$subsystem], "to-link abbreviation"); + for my $roleTuple (@roleList) { my ($roleID, $abbr) = @$roleTuple; my $machineRoleID = join(":", $subsystemMD5, $genome, $abbr); $machineRoles{$subsystem}{$roleID} = $machineRoleID; } + # Next we need the variant code and key. + my $variantCode = BaseSaplingLoader::Starless($variant); + my $variantKey = ERDB::DigestKey("$subsystem:$variantCode"); + # Now we create the molecular machine connecting this genome to the + # subsystem variant. + my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome"); + $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID); + $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => ''); + $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID); + # Remember the machine ID. + $machines{$subsystem} = $machineID; } - # Now we need to connect the variant to the subsystem. First, we compute the real - # variant code by removing the asterisks. - my $variantCode = $variant; - $variantCode =~ s/^\*+//; - $variantCode =~ s/\*+$//; - # Compute the variant key. - my $variantKey = ERDB::DigestKey("$subsystem:$variantCode"); - # Insure we have it in the database. - if (! $sap->Exists(Variant => $variantKey)) { - # Get the variant from the sapling server. - my $variantH = $sapO->get(-objects => "Variant", - -filter => {"Variant(id)" => ["=", $variantKey]}, - -fields => {"Variant(code)" => "code", - "Variant(comment)" => "comment", - "Variant(type)" => "type", - "Variant(role-rule)" => "role-rule"}, - -firstOnly => 1); - $self->InsureEntity('Variant', $variantKey, %$variantH); - # Connect it to the subsystem. - $sap->InsertObject('Describes', from_link => $subsystem, to_link => $variantKey); - $stats->Add(subsystemVariants => 1); - } - # Now we create the molecular machine connecting this genome to the subsystem - # variant. - my $machineID = ERDB::DigestKey("$subsystem:$variantCode:$genome"); - $sap->InsertObject('Uses', from_link => $genome, to_link => $machineID); - $sap->InsertObject('MolecularMachine', id => $machineID, curated => 0, region => ''); - $sap->InsertObject('IsImplementedBy', from_link => $variantKey, to_link => $machineID); - # Remember the machine ID. - $machines{$subsystem} = $machineID; } - # Now we go through the bindings file. This file connects the subsystem roles to - # the molecular machines. + # Now we go through the bindings file. This file connects the subsystem + # roles to the molecular machines. $ih = Open(undef, "<$self->{directory}/Subsystems/bindings"); # Loop through the bindings. while (! eof $ih) { @@ -886,19 +848,22 @@ my ($subsystem, $role, $fid) = Tracer::GetLine($ih); # Normalize the subsystem name. $subsystem = $sap->SubsystemID($subsystem); - # Compute the machine role. - my $machineRoleID = $machineRoles{$subsystem}{$role}; - # Insure it exists. - my $created = $self->InsureEntity(MachineRole => $machineRoleID); - if ($created) { - # We created the machine role, so connect it to the machine. - my $machineID = $machines{$subsystem}; - $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID); - # Connect it to the role, too. - $sap->InsertObject('IsRoleOf', from_link => $role, to_link => $machineRoleID); + # Insure the subsystem is in the database. + if ($subsystems{$subsystem}) { + # Compute the machine role. + my $machineRoleID = $machineRoles{$subsystem}{$role}; + # Insure it exists. + my $created = $self->InsureEntity(MachineRole => $machineRoleID); + if ($created) { + # We created the machine role, so connect it to the machine. + my $machineID = $machines{$subsystem}; + $sap->InsertObject('IsMachineOf', from_link => $machineID, to_link => $machineRoleID); + # Connect it to the role, too. + $sap->InsertObject('IsRoleOf', from_link => $role, to_link => $machineRoleID); + } + # Connect the feature. + $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid); } - # Connect the feature. - $sap->InsertObject('Contains', from_link => $machineRoleID, to_link => $fid); } } @@ -958,6 +923,7 @@ $fields{'dna-size'} = $stats->Ask('dna'); $fields{pegs} = $stats->Ask('peg'); $fields{rnas} = $stats->Ask('rna'); + $fields{gc_content} = $stats->Ask('gc_content') * 100 / $stats->Ask('dna'); # Get the genetic code. The default is 11. $fields{'genetic-code'} = 11; my $geneticCodeFile = "$dir/GENETIC_CODE";