[Bio] / FigKernelScripts / TransactFeatures.pl Repository:
ViewVC logotype

Diff of /FigKernelScripts/TransactFeatures.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Tue Aug 9 00:57:48 2005 UTC revision 1.3, Thu Aug 11 03:10:35 2005 UTC
# Line 151  Line 151 
151  use File::Path;  use File::Path;
152  use FIG;  use FIG;
153  use Stats;  use Stats;
154    use TransactionProcessor;
155    use ApplyTransactions;
156    use CountTransactions;
157    use AnnotateTransactions;
158    use FixTransactions;
159    
160  # Get the command-line options.  # Get the command-line options.
161  my ($options, @parameters) = Tracer::ParseCommand({ trace => 3, safe => 0 }, @ARGV);  my ($options, @parameters) = Tracer::ParseCommand({ trace => 3, safe => 0 }, @ARGV);
# Line 159  Line 164 
164  TSetup("$traceLevel Tracer DocUtils FIG", "TEXT");  TSetup("$traceLevel Tracer DocUtils FIG", "TEXT");
165  # Get the FIG object.  # Get the FIG object.
166  my $fig = FIG->new();  my $fig = FIG->new();
 # Get its database handle.  
 my $dbh = $fig->db_handle;  
167  # Get the command.  # Get the command.
168  my $mainCommand = lc shift @parameters;  my $mainCommand = lc shift @parameters;
169  Trace("$mainCommand command specified.") if T(2);  # Create the transaction object.
170    my $controlBlock;
171  # Create the ID table. This maps each organism/ftype pair to the currently-  if ($mainCommand eq 'count' || $mainCommand eq 'register') {
172  # available ID number. If we're counting, we leave it empty. If we're not      $controlBlock = CountTransactions->new($options, $mainCommand, @parameters);
173  # counting, we need to read it in.  } elsif ($mainCommand eq 'process') {
174  my %idHash = ();      $controlBlock = ApplyTransactions->new($options, $mainCommand, @parameters);
175  if ($mainCommand eq 'process') {  } elsif ($mainCommand eq 'fix') {
176      my $inCount = 0;      $controlBlock = FixTransactions->new($options, $mainCommand, @parameters);
177      Open(\*IDFILE, "<$parameters[1]");  } else {
178      while (my $idRecord = <IDFILE>) {      Confess("Invalid command \"$mainCommand\" specified on command line.");
         chomp $idRecord;  
         my ($orgID, $ftype, $firstNumber) = split /\t/, $idRecord;  
         $idHash{"$orgID.$ftype"} = $firstNumber;  
         $inCount++;  
     }  
     Trace("$inCount ID ranges read in from $parameters[1].") if T(2);  
179  }  }
180    # Setup the process.
181  # Create some counters we can use for statistical purposes.  $controlBlock->Setup();
 my $stats = Stats->new("genomes", "add", "change", "delete");  
182  # Verify that the organism directory exists.  # Verify that the organism directory exists.
183  if (! -d $parameters[0]) {  if (! -d $parameters[0]) {
184      Confess("Directory of genome files \"$parameters[0]\" not found.");      Confess("Directory of genome files \"$parameters[0]\" not found.");
# Line 206  Line 202 
202      } else {      } else {
203          # Loop through the organisms.          # Loop through the organisms.
204          for my $genomeID (sort keys %transFiles) {          for my $genomeID (sort keys %transFiles) {
205                # Start this organism.
206              Trace("Processing changes for $genomeID.") if T(3);              Trace("Processing changes for $genomeID.") if T(3);
             # Create a statistics object for this organism.  
             my $orgStats = Stats->new("add", "change", "delete");  
             # Create a control block for passing around our key data.  
             my $controlBlock = { stats => $orgStats, genomeID => $genomeID,  
                                  idHash => \%idHash, options => $options,  
                                  fig => $fig, command => $mainCommand };  
             # Open the organism file.  
207              my $orgFileName = $transFiles{$genomeID};              my $orgFileName = $transFiles{$genomeID};
208                $controlBlock->StartGenome($genomeID, $orgFileName);
209                # Open the organism file.
210              Open(\*TRANS, "<$orgFileName");              Open(\*TRANS, "<$orgFileName");
211                # Clear the transaction counter.
212              my $tranCount = 0;              my $tranCount = 0;
             # If we're processing rather than counting, open a file for  
             # writing out corrected transactions and optionally start a  
             # database transaction.  
             if ($mainCommand eq 'process') {  
                 Open(\*TRANSOUT, ">$orgFileName.tbl");  
                 if ($options->{safe}) {  
                     $dbh->begin_tran();  
                 }  
             }  
213              # Loop through the organism's data.              # Loop through the organism's data.
214              while (my $transaction = <TRANS>) {              while (my $transaction = <TRANS>) {
215                  # Parse the record.                  # Parse the record.
# Line 237  Line 221 
221                  # Process according to the transaction type.                  # Process according to the transaction type.
222                  my $command = lc shift @fields;                  my $command = lc shift @fields;
223                  if ($command eq 'add') {                  if ($command eq 'add') {
224                      Add($controlBlock, @fields);                      $controlBlock->Add(@fields);
225                  } elsif ($command eq 'delete') {                  } elsif ($command eq 'delete') {
226                      Delete($controlBlock, @fields);                      $controlBlock->Delete(@fields);
227                  } elsif ($command eq 'change') {                  } elsif ($command eq 'change') {
228                      Change($controlBlock, @fields);                      $controlBlock->Change(@fields);
229                  } else {                  } else {
230                      $orgStats->AddMessage("Invalid command $command in line $tranCount for genome $genomeID");                      $controlBlock->AddMessage("Invalid command $command in line $tranCount for genome $genomeID");
231                  }                  }
232                  $orgStats->Add($command, 1);                  $controlBlock->IncrementStat($command);
233              }              }
234                # Terminate processing for this genome.
235                my $orgStats = $controlBlock->EndGenome();
236              Trace("Statistics for $genomeID\n\n" . $orgStats->Show()) if T(3);              Trace("Statistics for $genomeID\n\n" . $orgStats->Show()) if T(3);
             # Merge the statistics for this run into the globals statistics object.  
             $stats->Accumulate($orgStats);  
             $stats->Add("genomes", 1);  
237              # Close the transaction input file.              # Close the transaction input file.
238              close TRANS;              close TRANS;
             # If we're processing, close the transaction output file  
             # and optionally end the database transaction.  
             if ($mainCommand eq 'process') {  
                 close TRANSOUT;  
                 if ($options->{safe}) {  
                     $dbh->commit_tran();  
                 }  
             }  
         }  
     }  
     Trace("Statistics for this run\n\n" . $stats->Show()) if T(1);  
     # If we're counting, we need to write out the counts file or allocate IDs  
     # from the clearinghouse.  
     if ($mainCommand ne "process") {  
         # Loop through the ID hash, printing the counts. We will also write them  
         # to a file called "counts.tbl".  
         my $countfile = "$parameters[0]/counts.tbl";  
         Open(\*COUNTFILE, ">$countfile");  
         print "\nTable of Counts\n";  
         for my $idKey (keys %idHash) {  
             $idKey =~ /^(\d+\.\d+)\.([a-z]+)$/;  
             my ($org, $ftype) = ($1, $2);  
             my $count = $idHash{$idKey};  
             print "$idKey\t$count\n";  
             print COUNTFILE "$org\t$ftype\t$count\n";  
         }  
         close COUNTFILE;  
         if ($mainCommand eq "register") {  
             # Here we are registering as well as counting. This process also produces  
             # the ID file.  
             Trace("Submitting ID file to clearing house.") if T(2);  
             system("register_features_batch <$countfile >$parameters[1]");  
             Trace("Clearing house request complete.") if T(2);  
239          }          }
240      }      }
241        # Terminate processing.
242        $controlBlock->Teardown();
243        Trace("Statistics for this run\n\n" . $controlBlock->Show()) if T(1);
244      Trace("Processing complete.") if T(1);      Trace("Processing complete.") if T(1);
245  }  }
246    
 =head2 Utility Methods  
   
 =head3 Add  
   
 C<< Add($controlBlock, $newID, $locations, $translation); >>  
   
 Add a new feature to the data store.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item newID  
   
 ID to give to the new feature.  
   
 =item locations  
   
 Location of the new feature, in the form of a comma-separated list of location  
 strings in SEED format.  
   
 =item translation (optional)  
   
 Protein translation string for the new feature. If this field is omitted and  
 the feature is a peg, the translation will be generated by normal means.  
   
 =back  
   
 =cut  
   
 sub Add {  
     my ($controlBlock, $newID, $locations, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # Extract the feature type and ordinal number from the new ID.  
     my ($ftype, $ordinal, $key) = ParseNewID($controlBlock, $newID);  
     # If we're counting, we need to count the ID. Otherwise, we need to  
     # add the new feature.  
     if ($controlBlock->{command} ne 'process') {  
         $controlBlock->{idHash}->{$key}++;  
     } else {  
         # Here we need to add the new feature.  
         my $realID = AddFeature($controlBlock, $ordinal, $key, $ftype,  
                                 "", $locations, $translation);  
         Trace("Feature $realID added for pseudo-ID $newID.") if T(4);  
         # Write a corrected transaction to the transaction output file.  
         print TRANSOUT "add\t$realID\t$locations\t$translation\n";  
     }  
 }  
   
 =head3 Change  
   
 C<< Change($controlBlock, $fid, $newID, $locations, $aliases, $translation); >>  
   
 Replace a feature to the data store. The feature will be marked for deletion and  
 a new feature will be put in its place.  
   
 This is a much more complicated process than adding a feature. In addition to  
 the add, we have to create new aliases and transfer across the assignment and  
 the annotations.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item fid  
   
 ID of the feature being changed.  
   
 =item newID  
   
 New ID to give to the feature.  
   
 =item locations  
   
 New location to give to the feature, in the form of a comma-separated list of location  
 strings in SEED format.  
   
 =item aliases (optional)  
   
 A new list of alias names for the feature.  
   
 =item translation (optional)  
   
 New protein translation string for the feature. If this field is omitted and  
 the feature is a peg, the translation will be generated by normal means.  
   
 =back  
   
 =cut  
   
 sub Change {  
     my ($controlBlock, $fid, $newID, $locations, $aliases, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # Extract the feature type and ordinal number from the new ID.  
     my ($ftype, $ordinal, $key) = ParseNewID($controlBlock, $newID);  
     # If we're counting, we need to count the ID. Otherwise, we need to  
     # replace the feature.  
     if ($controlBlock->{command} ne 'process') {  
         $controlBlock->{idHash}->{$key}++;  
     } else {  
         # Here we can go ahead and change the feature. First, we must  
         # get the old feature's assignment and annotations. Note that  
         # for the annotations we ask for the time in its raw format.  
         my @functions = $fig->function_of($fid);  
         my @annotations = $fig->feature_annotations($fid, 1);  
         # Create some counters.  
         my ($assignCount, $annotateCount) = (0, 0);  
         # Add the new version of the feature and get its ID.  
         my $realID = AddFeature($controlBlock, $ordinal, $key, $ftype, $locations,  
                                 $aliases, $translation);  
         # Copy over the assignments.  
         for my $assignment (@functions) {  
             my ($user, $function) = @{$assignment};  
             $fig->assign_function($realID, $user, $function);  
             $assignCount++;  
         }  
         # Copy over the annotations.  
         for my $annotation (@annotations) {  
             my ($oldID, $timestamp, $user, $annotation) = @{$annotation};  
             $fig->add_annotation($realID, $user, $annotation, $timestamp);  
             $annotateCount++;  
         }  
         # Mark the old feature for deletion.  
         $fig->delete_feature($fid);  
         # Tell the user what we did.  
         $controlBlock->{stats}->Add("assignments", $assignCount);  
         $controlBlock->{stats}->Add("annotations", $annotateCount);  
         Trace("Feature $realID created from $fid. $assignCount assignments and $annotateCount annotations copied.") if T(4);  
         # Write a corrected transaction to the transaction output file.  
         print TRANSOUT "change\t$fid\t$realID\t$locations\t$aliases\t$translation\n";  
     }  
 }  
   
 =head3 Delete  
   
 C<< Delete($controlBlock, $fid); >>  
   
 Delete a feature from the data store. The feature will be marked as deleted,  
 which will remove it from consideration by most FIG methods. A garbage  
 collection job will be run later to permanently delete the feature.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item fid  
   
 ID of the feature to delete.  
   
 =back  
   
 =cut  
   
 sub Delete {  
     my ($controlBlock, $fid) = @_;  
     my $fig = $controlBlock->{fig};  
     # Extract the feature type and count it.  
     my $ftype = FIG::ftype($fid);  
     $controlBlock->{stats}->Add($ftype, 1);  
     # If we're not counting, delete the feature.  
     if ($controlBlock->{command} eq 'process') {  
         # Mark the feature for deletion.  
         $fig->delete_feature($fid);  
         # Echo the transaction to the transaction output file.  
         print TRANSOUT "del\t$fid\n";  
     }  
 }  
   
 =head3 ParseNewID  
   
 C<< my ($ftype, $ordinal, $key) = ParseNewID($controlBlock, $newID); >>  
   
 Extract the feature type and ordinal number from an incoming new ID.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures needed to manage transactions.  
   
 =item newID  
   
 New ID specification taken from a transaction input record. This contains the  
 feature type followed by a period and then the ordinal number of the ID.  
   
 =item RETURN  
   
 Returna a three-element list. If successful, the list will contain the feature  
 type followed by the ordinal number and the key to use in the ID hash to find  
 the feature's true ID number. If the incoming ID is invalid, the list  
 will contain three C<undef>s.  
   
 =back  
   
 =cut  
   
 sub ParseNewID {  
     # Get the parameters.  
     my ($controlBlock, $newID) = @_;  
     my ($ftype, $ordinal, $key);  
     # Parse the ID.  
     if ($newID =~ /^([a-z]+)\.(\d+)$/) {  
         # Here we have a valid ID.  
         ($ftype, $ordinal) = ($1, $2);  
         $key = $controlBlock->{genomeID} . ".$ftype";  
         # Update the feature type count in the statistics.  
         $controlBlock->{stats}->Add($ftype, 1);  
     } else {  
         # Here we have an invalid ID.  
         $controlBlock->{stats}->AddMessage("Invalid ID $newID found in line " .  
                                            $controlBlock->{line} . " for genome " .  
                                            $controlBlock->{genomeID} . ".");  
     }  
     # Return the result.  
     return ($ftype, $ordinal, $key);  
 }  
   
 =head3 GetRealID  
   
 C<< my $realID = GetRealID($controlBlock, $ftype, $ordinal, $key); >>  
   
 Compute the real ID of a new feature. This involves interrogating the ID hash and  
 formatting a full-blown ID out of little bits of information.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing data used to manage the transaction process.  
   
 =item ordinal  
   
 Zero-based ordinal number of this feature. The ordinal number is added to the value  
 stored in the control block's ID hash to compute the real feature number.  
   
 =item key  
   
 Key in the ID hash relevant to this feature.  
   
 =item RETURN  
   
 Returns a fully-formatted FIG ID for the new feature.  
   
 =back  
   
 =cut  
   
 sub GetRealID {  
     # Get the parameters.  
     my ($controlBlock, $ordinal, $key) = @_;  
     #Declare the return value.  
     my $retVal;  
     # Get the base value for the feature ID number.  
     my $base = $controlBlock->{idHash}->{$key};  
     # If it didn't exist, we have an error.  
     if (! defined $base) {  
         Confess("No ID range found for genome ID and feature type $key.");  
     } else {  
         # Now we have enough data to format the ID.  
         my $num = $base + $ordinal;  
         $retVal = "fig|$key.$num";  
     }  
     # Return the result.  
     return $retVal;  
 }  
   
 =head3 CheckTranslation  
   
 C<< my $actualTranslation = CheckTranslation($controlBlock, $ftype, $locations, $translation); >>  
   
 If we are processing a PEG, insure we have a translation for the peg's locations.  
   
 This method checks the feature type and the incoming translation string. If the  
 translation string is empty and the feature type is C<peg>, it will generate  
 a translation string using the specified locations for the genome currently  
 being processed.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing data used to manage the transaction process.  
   
 =item ftype  
   
 Feature type (C<peg>, C<rna>, etc.)  
   
 =item locations  
   
 Comma-delimited list of location strings for the feature in question.  
   
 =item translation (optional)  
   
 If specified, will be returned to the caller as the result.  
   
 =item RETURN  
   
 Returns the protein translation string for the specified locations, or C<undef>  
 if no translation is warranted.  
   
 =back  
   
 =cut  
   
 sub CheckTranslation {  
     # Get the parameters.  
     my ($controlBlock, $ftype, $locations, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # Declare the return variable.  
     my $retVal;  
     if ($ftype eq 'peg') {  
         # Here we have a protein encoding gene. Check to see if we already have  
         # a translation.  
         if (defined $translation) {  
             # Pass it back unmodified.  
             $retVal = $translation;  
         } else {  
             # Here we need to compute the translation.  
             my $dna = $fig->dna_seq($controlBlock->{genomeID}, $locations);  
             $retVal = FIG::translate($dna);  
         }  
     }  
     # Return the result.  
     return $retVal;  
 }  
   
 =head3 AddFeature  
   
 C<< my $realID = AddFeature($controlBlock, $ordinal, $key, $ftype, $locations, $translation); >>  
   
 Add the specified feature to the FIG data store. This involves generating the new feature's  
 ID, creating the translation (if needed), adding the feature to the data store, and  
 queueing a request to update the similarities. The generated ID will be returned to the  
 caller.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item ordinal  
   
 Zero-based ordinal number of the proposed feature in the ID space. This is added to the  
 base ID number to get the real ID number.  
   
 =item key  
   
 Key to use for getting the base ID number from the ID hash.  
   
 =item ftype  
   
 Proposed feature type (C<peg>, C<rna>, etc.)  
   
 =item locations  
   
 Location of the new feature, in the form of a comma-separated list of location  
 strings in SEED format.  
   
 =item aliases (optional)  
   
 A new list of alias names for the feature.  
   
 =item translation (optional)  
   
 Protein translation string for the new feature. If this field is omitted and  
 the feature is a peg, the translation will be generated by normal means.  
   
 =back  
   
 =cut  
   
 sub AddFeature {  
     # Get the parameters.  
     my ($controlBlock, $ordinal, $key, $ftype, $locations, $aliases, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # We want to add a new feature using the information provided. First, we  
     # generate its ID.  
     my $retVal = GetRealID($controlBlock, $ordinal, $key);  
     # Next, we insure that we have a translation.  
     my $actualTranslation = CheckTranslation($controlBlock, $ftype,  
                                              $locations, $translation);  
     # Now we add it to FIG.  
     $fig->add_feature($controlBlock->{genomeID}, $ftype, $locations, "",  
                       $actualTranslation, $retVal);  
     # Tell FIG to recompute the similarities.  
     $fig->enqueue_similarities([$retVal]);  
     # Return the ID we generated.  
     return $retVal;  
 }  
247    
248  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.3

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3