[Bio] / FigKernelScripts / TransactFeatures.pl Repository:
ViewVC logotype

Diff of /FigKernelScripts/TransactFeatures.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Tue Aug 9 00:57:48 2005 UTC revision 1.6, Thu Aug 11 05:16:46 2005 UTC
# Line 32  Line 32 
32  a copy of each transaction file in which the pseudo-IDs have been replaced by  a copy of each transaction file in which the pseudo-IDs have been replaced by
33  real IDs.  real IDs.
34    
35    =item annotate
36    
37    Annotate the features created by the transactions so as to indicate how they were
38    derived.
39    
40    =item fix
41    
42    Fix the locations of new features and verify the translations of new and changed
43    features.
44    
45  =back  =back
46    
47  =head2 The Transaction File  =head2 The Transaction File
# Line 140  Line 150 
150  Wrap each organism's processing in a database transaction. This makes the process  Wrap each organism's processing in a database transaction. This makes the process
151  slightly more restartable than it would be otherwise.  slightly more restartable than it would be otherwise.
152    
153    =item noAlias
154    
155    Assume that the transaction files do not contain aliases. This means that in CHANGE
156    records the translation will immediately follow the location.
157    
158  =cut  =cut
159    
160  use strict;  use strict;
# Line 151  Line 166 
166  use File::Path;  use File::Path;
167  use FIG;  use FIG;
168  use Stats;  use Stats;
169    use TransactionProcessor;
170    use ApplyTransactions;
171    use CountTransactions;
172    use AnnotateTransactions;
173    use FixTransactions;
174    
175  # Get the command-line options.  # Get the command-line options.
176  my ($options, @parameters) = Tracer::ParseCommand({ trace => 3, safe => 0 }, @ARGV);  my ($options, @parameters) = Tracer::ParseCommand({ trace => 3, safe => 0, noAlias => 0 }, @ARGV);
177  # Set up tracing.  # Set up tracing.
178  my $traceLevel = $options->{trace};  my $traceLevel = $options->{trace};
179  TSetup("$traceLevel Tracer DocUtils FIG", "TEXT");  TSetup("$traceLevel Tracer DocUtils FIG", "TEXT");
180  # Get the FIG object.  # Get the FIG object.
181  my $fig = FIG->new();  my $fig = FIG->new();
 # Get its database handle.  
 my $dbh = $fig->db_handle;  
182  # Get the command.  # Get the command.
183  my $mainCommand = lc shift @parameters;  my $mainCommand = lc shift @parameters;
184  Trace("$mainCommand command specified.") if T(2);  # Create the transaction object.
185    my $controlBlock;
186  # Create the ID table. This maps each organism/ftype pair to the currently-  if ($mainCommand eq 'count' || $mainCommand eq 'register') {
187  # available ID number. If we're counting, we leave it empty. If we're not      $controlBlock = CountTransactions->new($options, $mainCommand, @parameters);
188  # counting, we need to read it in.  } elsif ($mainCommand eq 'process') {
189  my %idHash = ();      $controlBlock = ApplyTransactions->new($options, $mainCommand, @parameters);
190  if ($mainCommand eq 'process') {  } elsif ($mainCommand eq 'annotate') {
191      my $inCount = 0;      $controlBlock = AnnotateTransactions->new($options, $mainCommand, @parameters);
192      Open(\*IDFILE, "<$parameters[1]");  } elsif ($mainCommand eq 'fix') {
193      while (my $idRecord = <IDFILE>) {      $controlBlock = FixTransactions->new($options, $mainCommand, @parameters);
194          chomp $idRecord;  } else {
195          my ($orgID, $ftype, $firstNumber) = split /\t/, $idRecord;      Confess("Invalid command \"$mainCommand\" specified on command line.");
         $idHash{"$orgID.$ftype"} = $firstNumber;  
         $inCount++;  
     }  
     Trace("$inCount ID ranges read in from $parameters[1].") if T(2);  
196  }  }
197    # Setup the process.
198  # Create some counters we can use for statistical purposes.  $controlBlock->Setup();
 my $stats = Stats->new("genomes", "add", "change", "delete");  
199  # Verify that the organism directory exists.  # Verify that the organism directory exists.
200  if (! -d $parameters[0]) {  if (! -d $parameters[0]) {
201      Confess("Directory of genome files \"$parameters[0]\" not found.");      Confess("Directory of genome files \"$parameters[0]\" not found.");
# Line 206  Line 219 
219      } else {      } else {
220          # Loop through the organisms.          # Loop through the organisms.
221          for my $genomeID (sort keys %transFiles) {          for my $genomeID (sort keys %transFiles) {
222                # Start this organism.
223              Trace("Processing changes for $genomeID.") if T(3);              Trace("Processing changes for $genomeID.") if T(3);
             # Create a statistics object for this organism.  
             my $orgStats = Stats->new("add", "change", "delete");  
             # Create a control block for passing around our key data.  
             my $controlBlock = { stats => $orgStats, genomeID => $genomeID,  
                                  idHash => \%idHash, options => $options,  
                                  fig => $fig, command => $mainCommand };  
             # Open the organism file.  
224              my $orgFileName = $transFiles{$genomeID};              my $orgFileName = $transFiles{$genomeID};
225                $controlBlock->StartGenome($genomeID, $orgFileName);
226                # Open the organism file.
227              Open(\*TRANS, "<$orgFileName");              Open(\*TRANS, "<$orgFileName");
228                # Clear the transaction counter.
229              my $tranCount = 0;              my $tranCount = 0;
             # If we're processing rather than counting, open a file for  
             # writing out corrected transactions and optionally start a  
             # database transaction.  
             if ($mainCommand eq 'process') {  
                 Open(\*TRANSOUT, ">$orgFileName.tbl");  
                 if ($options->{safe}) {  
                     $dbh->begin_tran();  
                 }  
             }  
230              # Loop through the organism's data.              # Loop through the organism's data.
231              while (my $transaction = <TRANS>) {              while (my $transaction = <TRANS>) {
232                  # Parse the record.                  # Parse the record.
# Line 237  Line 238 
238                  # Process according to the transaction type.                  # Process according to the transaction type.
239                  my $command = lc shift @fields;                  my $command = lc shift @fields;
240                  if ($command eq 'add') {                  if ($command eq 'add') {
241                      Add($controlBlock, @fields);                      $controlBlock->Add(@fields);
242                  } elsif ($command eq 'delete') {                  } elsif ($command eq 'delete') {
243                      Delete($controlBlock, @fields);                      $controlBlock->Delete(@fields);
244                  } elsif ($command eq 'change') {                  } elsif ($command eq 'change') {
245                      Change($controlBlock, @fields);                      # Here we have a special case. If "noalias" is in effect, we need
246                        # to splice an empty field in before the translation.
247                        if ($controlBlock->Option("noAlias")) {
248                            splice @fields, 3, 0, "";
249                        }
250                        $controlBlock->Change(@fields);
251                  } else {                  } else {
252                      $orgStats->AddMessage("Invalid command $command in line $tranCount for genome $genomeID");                      $controlBlock->AddMessage("Invalid command $command in line $tranCount for genome $genomeID");
253                  }                  }
254                  $orgStats->Add($command, 1);                  $controlBlock->IncrementStat($command);
255              }              }
256              Trace("Statistics for $genomeID\n\n" . $orgStats->Show()) if T(3);              # Terminate processing for this genome.
257              # Merge the statistics for this run into the globals statistics object.              my $orgStats = $controlBlock->EndGenome();
258              $stats->Accumulate($orgStats);              Trace("Statistics for $genomeID\n\n" . $orgStats->Show() . "\n") if T(3);
             $stats->Add("genomes", 1);  
259              # Close the transaction input file.              # Close the transaction input file.
260              close TRANS;              close TRANS;
             # If we're processing, close the transaction output file  
             # and optionally end the database transaction.  
             if ($mainCommand eq 'process') {  
                 close TRANSOUT;  
                 if ($options->{safe}) {  
                     $dbh->commit_tran();  
                 }  
             }  
         }  
     }  
     Trace("Statistics for this run\n\n" . $stats->Show()) if T(1);  
     # If we're counting, we need to write out the counts file or allocate IDs  
     # from the clearinghouse.  
     if ($mainCommand ne "process") {  
         # Loop through the ID hash, printing the counts. We will also write them  
         # to a file called "counts.tbl".  
         my $countfile = "$parameters[0]/counts.tbl";  
         Open(\*COUNTFILE, ">$countfile");  
         print "\nTable of Counts\n";  
         for my $idKey (keys %idHash) {  
             $idKey =~ /^(\d+\.\d+)\.([a-z]+)$/;  
             my ($org, $ftype) = ($1, $2);  
             my $count = $idHash{$idKey};  
             print "$idKey\t$count\n";  
             print COUNTFILE "$org\t$ftype\t$count\n";  
         }  
         close COUNTFILE;  
         if ($mainCommand eq "register") {  
             # Here we are registering as well as counting. This process also produces  
             # the ID file.  
             Trace("Submitting ID file to clearing house.") if T(2);  
             system("register_features_batch <$countfile >$parameters[1]");  
             Trace("Clearing house request complete.") if T(2);  
261          }          }
262      }      }
263        # Terminate processing.
264        $controlBlock->Teardown();
265        Trace("Statistics for this run\n\n" . $controlBlock->Show() . "\n") if T(1);
266      Trace("Processing complete.") if T(1);      Trace("Processing complete.") if T(1);
267  }  }
268    
 =head2 Utility Methods  
   
 =head3 Add  
   
 C<< Add($controlBlock, $newID, $locations, $translation); >>  
   
 Add a new feature to the data store.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item newID  
   
 ID to give to the new feature.  
   
 =item locations  
   
 Location of the new feature, in the form of a comma-separated list of location  
 strings in SEED format.  
   
 =item translation (optional)  
   
 Protein translation string for the new feature. If this field is omitted and  
 the feature is a peg, the translation will be generated by normal means.  
   
 =back  
   
 =cut  
   
 sub Add {  
     my ($controlBlock, $newID, $locations, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # Extract the feature type and ordinal number from the new ID.  
     my ($ftype, $ordinal, $key) = ParseNewID($controlBlock, $newID);  
     # If we're counting, we need to count the ID. Otherwise, we need to  
     # add the new feature.  
     if ($controlBlock->{command} ne 'process') {  
         $controlBlock->{idHash}->{$key}++;  
     } else {  
         # Here we need to add the new feature.  
         my $realID = AddFeature($controlBlock, $ordinal, $key, $ftype,  
                                 "", $locations, $translation);  
         Trace("Feature $realID added for pseudo-ID $newID.") if T(4);  
         # Write a corrected transaction to the transaction output file.  
         print TRANSOUT "add\t$realID\t$locations\t$translation\n";  
     }  
 }  
   
 =head3 Change  
   
 C<< Change($controlBlock, $fid, $newID, $locations, $aliases, $translation); >>  
   
 Replace a feature to the data store. The feature will be marked for deletion and  
 a new feature will be put in its place.  
   
 This is a much more complicated process than adding a feature. In addition to  
 the add, we have to create new aliases and transfer across the assignment and  
 the annotations.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item fid  
   
 ID of the feature being changed.  
   
 =item newID  
   
 New ID to give to the feature.  
   
 =item locations  
   
 New location to give to the feature, in the form of a comma-separated list of location  
 strings in SEED format.  
   
 =item aliases (optional)  
   
 A new list of alias names for the feature.  
   
 =item translation (optional)  
   
 New protein translation string for the feature. If this field is omitted and  
 the feature is a peg, the translation will be generated by normal means.  
   
 =back  
   
 =cut  
   
 sub Change {  
     my ($controlBlock, $fid, $newID, $locations, $aliases, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # Extract the feature type and ordinal number from the new ID.  
     my ($ftype, $ordinal, $key) = ParseNewID($controlBlock, $newID);  
     # If we're counting, we need to count the ID. Otherwise, we need to  
     # replace the feature.  
     if ($controlBlock->{command} ne 'process') {  
         $controlBlock->{idHash}->{$key}++;  
     } else {  
         # Here we can go ahead and change the feature. First, we must  
         # get the old feature's assignment and annotations. Note that  
         # for the annotations we ask for the time in its raw format.  
         my @functions = $fig->function_of($fid);  
         my @annotations = $fig->feature_annotations($fid, 1);  
         # Create some counters.  
         my ($assignCount, $annotateCount) = (0, 0);  
         # Add the new version of the feature and get its ID.  
         my $realID = AddFeature($controlBlock, $ordinal, $key, $ftype, $locations,  
                                 $aliases, $translation);  
         # Copy over the assignments.  
         for my $assignment (@functions) {  
             my ($user, $function) = @{$assignment};  
             $fig->assign_function($realID, $user, $function);  
             $assignCount++;  
         }  
         # Copy over the annotations.  
         for my $annotation (@annotations) {  
             my ($oldID, $timestamp, $user, $annotation) = @{$annotation};  
             $fig->add_annotation($realID, $user, $annotation, $timestamp);  
             $annotateCount++;  
         }  
         # Mark the old feature for deletion.  
         $fig->delete_feature($fid);  
         # Tell the user what we did.  
         $controlBlock->{stats}->Add("assignments", $assignCount);  
         $controlBlock->{stats}->Add("annotations", $annotateCount);  
         Trace("Feature $realID created from $fid. $assignCount assignments and $annotateCount annotations copied.") if T(4);  
         # Write a corrected transaction to the transaction output file.  
         print TRANSOUT "change\t$fid\t$realID\t$locations\t$aliases\t$translation\n";  
     }  
 }  
   
 =head3 Delete  
   
 C<< Delete($controlBlock, $fid); >>  
   
 Delete a feature from the data store. The feature will be marked as deleted,  
 which will remove it from consideration by most FIG methods. A garbage  
 collection job will be run later to permanently delete the feature.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item fid  
   
 ID of the feature to delete.  
   
 =back  
   
 =cut  
   
 sub Delete {  
     my ($controlBlock, $fid) = @_;  
     my $fig = $controlBlock->{fig};  
     # Extract the feature type and count it.  
     my $ftype = FIG::ftype($fid);  
     $controlBlock->{stats}->Add($ftype, 1);  
     # If we're not counting, delete the feature.  
     if ($controlBlock->{command} eq 'process') {  
         # Mark the feature for deletion.  
         $fig->delete_feature($fid);  
         # Echo the transaction to the transaction output file.  
         print TRANSOUT "del\t$fid\n";  
     }  
 }  
   
 =head3 ParseNewID  
   
 C<< my ($ftype, $ordinal, $key) = ParseNewID($controlBlock, $newID); >>  
   
 Extract the feature type and ordinal number from an incoming new ID.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures needed to manage transactions.  
   
 =item newID  
   
 New ID specification taken from a transaction input record. This contains the  
 feature type followed by a period and then the ordinal number of the ID.  
   
 =item RETURN  
   
 Returna a three-element list. If successful, the list will contain the feature  
 type followed by the ordinal number and the key to use in the ID hash to find  
 the feature's true ID number. If the incoming ID is invalid, the list  
 will contain three C<undef>s.  
   
 =back  
   
 =cut  
   
 sub ParseNewID {  
     # Get the parameters.  
     my ($controlBlock, $newID) = @_;  
     my ($ftype, $ordinal, $key);  
     # Parse the ID.  
     if ($newID =~ /^([a-z]+)\.(\d+)$/) {  
         # Here we have a valid ID.  
         ($ftype, $ordinal) = ($1, $2);  
         $key = $controlBlock->{genomeID} . ".$ftype";  
         # Update the feature type count in the statistics.  
         $controlBlock->{stats}->Add($ftype, 1);  
     } else {  
         # Here we have an invalid ID.  
         $controlBlock->{stats}->AddMessage("Invalid ID $newID found in line " .  
                                            $controlBlock->{line} . " for genome " .  
                                            $controlBlock->{genomeID} . ".");  
     }  
     # Return the result.  
     return ($ftype, $ordinal, $key);  
 }  
   
 =head3 GetRealID  
   
 C<< my $realID = GetRealID($controlBlock, $ftype, $ordinal, $key); >>  
   
 Compute the real ID of a new feature. This involves interrogating the ID hash and  
 formatting a full-blown ID out of little bits of information.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing data used to manage the transaction process.  
   
 =item ordinal  
   
 Zero-based ordinal number of this feature. The ordinal number is added to the value  
 stored in the control block's ID hash to compute the real feature number.  
   
 =item key  
   
 Key in the ID hash relevant to this feature.  
   
 =item RETURN  
   
 Returns a fully-formatted FIG ID for the new feature.  
   
 =back  
   
 =cut  
   
 sub GetRealID {  
     # Get the parameters.  
     my ($controlBlock, $ordinal, $key) = @_;  
     #Declare the return value.  
     my $retVal;  
     # Get the base value for the feature ID number.  
     my $base = $controlBlock->{idHash}->{$key};  
     # If it didn't exist, we have an error.  
     if (! defined $base) {  
         Confess("No ID range found for genome ID and feature type $key.");  
     } else {  
         # Now we have enough data to format the ID.  
         my $num = $base + $ordinal;  
         $retVal = "fig|$key.$num";  
     }  
     # Return the result.  
     return $retVal;  
 }  
   
 =head3 CheckTranslation  
   
 C<< my $actualTranslation = CheckTranslation($controlBlock, $ftype, $locations, $translation); >>  
   
 If we are processing a PEG, insure we have a translation for the peg's locations.  
   
 This method checks the feature type and the incoming translation string. If the  
 translation string is empty and the feature type is C<peg>, it will generate  
 a translation string using the specified locations for the genome currently  
 being processed.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing data used to manage the transaction process.  
   
 =item ftype  
   
 Feature type (C<peg>, C<rna>, etc.)  
   
 =item locations  
   
 Comma-delimited list of location strings for the feature in question.  
   
 =item translation (optional)  
   
 If specified, will be returned to the caller as the result.  
   
 =item RETURN  
   
 Returns the protein translation string for the specified locations, or C<undef>  
 if no translation is warranted.  
   
 =back  
   
 =cut  
   
 sub CheckTranslation {  
     # Get the parameters.  
     my ($controlBlock, $ftype, $locations, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # Declare the return variable.  
     my $retVal;  
     if ($ftype eq 'peg') {  
         # Here we have a protein encoding gene. Check to see if we already have  
         # a translation.  
         if (defined $translation) {  
             # Pass it back unmodified.  
             $retVal = $translation;  
         } else {  
             # Here we need to compute the translation.  
             my $dna = $fig->dna_seq($controlBlock->{genomeID}, $locations);  
             $retVal = FIG::translate($dna);  
         }  
     }  
     # Return the result.  
     return $retVal;  
 }  
   
 =head3 AddFeature  
   
 C<< my $realID = AddFeature($controlBlock, $ordinal, $key, $ftype, $locations, $translation); >>  
   
 Add the specified feature to the FIG data store. This involves generating the new feature's  
 ID, creating the translation (if needed), adding the feature to the data store, and  
 queueing a request to update the similarities. The generated ID will be returned to the  
 caller.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item ordinal  
   
 Zero-based ordinal number of the proposed feature in the ID space. This is added to the  
 base ID number to get the real ID number.  
   
 =item key  
   
 Key to use for getting the base ID number from the ID hash.  
   
 =item ftype  
   
 Proposed feature type (C<peg>, C<rna>, etc.)  
   
 =item locations  
   
 Location of the new feature, in the form of a comma-separated list of location  
 strings in SEED format.  
   
 =item aliases (optional)  
   
 A new list of alias names for the feature.  
   
 =item translation (optional)  
   
 Protein translation string for the new feature. If this field is omitted and  
 the feature is a peg, the translation will be generated by normal means.  
   
 =back  
   
 =cut  
   
 sub AddFeature {  
     # Get the parameters.  
     my ($controlBlock, $ordinal, $key, $ftype, $locations, $aliases, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # We want to add a new feature using the information provided. First, we  
     # generate its ID.  
     my $retVal = GetRealID($controlBlock, $ordinal, $key);  
     # Next, we insure that we have a translation.  
     my $actualTranslation = CheckTranslation($controlBlock, $ftype,  
                                              $locations, $translation);  
     # Now we add it to FIG.  
     $fig->add_feature($controlBlock->{genomeID}, $ftype, $locations, "",  
                       $actualTranslation, $retVal);  
     # Tell FIG to recompute the similarities.  
     $fig->enqueue_similarities([$retVal]);  
     # Return the ID we generated.  
     return $retVal;  
 }  
269    
270  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.6

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3