[Bio] / FigKernelScripts / TransactFeatures.pl Repository:
ViewVC logotype

Diff of /FigKernelScripts/TransactFeatures.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Tue Aug 9 00:57:48 2005 UTC revision 1.9, Mon Aug 15 20:29:10 2005 UTC
# Line 32  Line 32 
32  a copy of each transaction file in which the pseudo-IDs have been replaced by  a copy of each transaction file in which the pseudo-IDs have been replaced by
33  real IDs.  real IDs.
34    
35    =item annotate
36    
37    Annotate the features created by the transactions so as to indicate how they were
38    derived.
39    
40    =item check
41    
42    Verify that the locations and translations of the new and changed features are
43    correct.
44    
45    =item fix
46    
47    Fix the locations and translations of the new and changed features.
48    
49    =item aliasMove
50    
51    Move the aliases from the old features to the ones that replaced them.
52    
53    =item attribute
54    
55    Move the attributes from the old features to the ones that replaced them.
56    
57  =back  =back
58    
59  =head2 The Transaction File  =head2 The Transaction File
# Line 140  Line 162 
162  Wrap each organism's processing in a database transaction. This makes the process  Wrap each organism's processing in a database transaction. This makes the process
163  slightly more restartable than it would be otherwise.  slightly more restartable than it would be otherwise.
164    
165    =item noAlias
166    
167    Assume that the transaction files do not contain aliases. This means that in CHANGE
168    records the translation will immediately follow the location.
169    
170    =item sql
171    
172    Trace SQL commands.
173    
174    =back
175    
176  =cut  =cut
177    
178  use strict;  use strict;
# Line 151  Line 184 
184  use File::Path;  use File::Path;
185  use FIG;  use FIG;
186  use Stats;  use Stats;
187    use TransactionProcessor;
188    use ApplyTransactions;
189    use CountTransactions;
190    use AnnotateTransactions;
191    use AttributeTransactions;
192    use FixTransactions;
193    use MoveAliases;
194    
195  # Get the command-line options.  # Get the command-line options.
196  my ($options, @parameters) = Tracer::ParseCommand({ trace => 3, safe => 0 }, @ARGV);  my ($options, @parameters) = Tracer::ParseCommand({ trace => 3, sql => 0, safe => 0, noAlias => 0 },
197                                                      @ARGV);
198    # Get the command.
199    my $mainCommand = lc shift @parameters;
200  # Set up tracing.  # Set up tracing.
201  my $traceLevel = $options->{trace};  my $traceLevel = $options->{trace};
202  TSetup("$traceLevel Tracer DocUtils FIG", "TEXT");  my $tracing = "$traceLevel Tracer DocUtils FIG";
203    if ($options->{sql}) {
204        $tracing .= " SQL";
205    }
206    TSetup($tracing, "TEXT");
207  # Get the FIG object.  # Get the FIG object.
208  my $fig = FIG->new();  my $fig = FIG->new();
209  # Get its database handle.  # Create the transaction object.
210  my $dbh = $fig->db_handle;  my $controlBlock;
211  # Get the command.  if ($mainCommand eq 'count' || $mainCommand eq 'register') {
212  my $mainCommand = lc shift @parameters;      $controlBlock = CountTransactions->new($options, $mainCommand, @parameters);
213  Trace("$mainCommand command specified.") if T(2);  } elsif ($mainCommand eq 'process') {
214        $controlBlock = ApplyTransactions->new($options, $mainCommand, @parameters);
215  # Create the ID table. This maps each organism/ftype pair to the currently-  } elsif ($mainCommand eq 'annotate') {
216  # available ID number. If we're counting, we leave it empty. If we're not      $controlBlock = AnnotateTransactions->new($options, $mainCommand, @parameters);
217  # counting, we need to read it in.  } elsif ($mainCommand eq 'fix' || $mainCommand eq 'check') {
218  my %idHash = ();      $controlBlock = FixTransactions->new($options, $mainCommand, @parameters);
219  if ($mainCommand eq 'process') {  } elsif ($mainCommand eq 'aliasmove') {
220      my $inCount = 0;      $controlBlock = MoveAliases->new($options, $mainCommand, @parameters);
221      Open(\*IDFILE, "<$parameters[1]");  } elsif ($mainCommand eq 'attribute') {
222      while (my $idRecord = <IDFILE>) {      $controlBlock = AttributeTransactions->new($options, $mainCommand, @parameters);
223          chomp $idRecord;  } else {
224          my ($orgID, $ftype, $firstNumber) = split /\t/, $idRecord;      Confess("Invalid command \"$mainCommand\" specified on command line.");
         $idHash{"$orgID.$ftype"} = $firstNumber;  
         $inCount++;  
     }  
     Trace("$inCount ID ranges read in from $parameters[1].") if T(2);  
225  }  }
226    # Setup the process.
227  # Create some counters we can use for statistical purposes.  $controlBlock->Setup();
 my $stats = Stats->new("genomes", "add", "change", "delete");  
228  # Verify that the organism directory exists.  # Verify that the organism directory exists.
229  if (! -d $parameters[0]) {  if (! -d $parameters[0]) {
230      Confess("Directory of genome files \"$parameters[0]\" not found.");      Confess("Directory of genome files \"$parameters[0]\" not found.");
# Line 206  Line 248 
248      } else {      } else {
249          # Loop through the organisms.          # Loop through the organisms.
250          for my $genomeID (sort keys %transFiles) {          for my $genomeID (sort keys %transFiles) {
251                # Start this organism.
252              Trace("Processing changes for $genomeID.") if T(3);              Trace("Processing changes for $genomeID.") if T(3);
             # Create a statistics object for this organism.  
             my $orgStats = Stats->new("add", "change", "delete");  
             # Create a control block for passing around our key data.  
             my $controlBlock = { stats => $orgStats, genomeID => $genomeID,  
                                  idHash => \%idHash, options => $options,  
                                  fig => $fig, command => $mainCommand };  
             # Open the organism file.  
253              my $orgFileName = $transFiles{$genomeID};              my $orgFileName = $transFiles{$genomeID};
254                $controlBlock->StartGenome($genomeID, $orgFileName);
255                # Open the organism file.
256              Open(\*TRANS, "<$orgFileName");              Open(\*TRANS, "<$orgFileName");
257                # Clear the transaction counter.
258              my $tranCount = 0;              my $tranCount = 0;
             # If we're processing rather than counting, open a file for  
             # writing out corrected transactions and optionally start a  
             # database transaction.  
             if ($mainCommand eq 'process') {  
                 Open(\*TRANSOUT, ">$orgFileName.tbl");  
                 if ($options->{safe}) {  
                     $dbh->begin_tran();  
                 }  
             }  
259              # Loop through the organism's data.              # Loop through the organism's data.
260              while (my $transaction = <TRANS>) {              while (my $transaction = <TRANS>) {
261                  # Parse the record.                  # Parse the record.
# Line 237  Line 267 
267                  # Process according to the transaction type.                  # Process according to the transaction type.
268                  my $command = lc shift @fields;                  my $command = lc shift @fields;
269                  if ($command eq 'add') {                  if ($command eq 'add') {
270                      Add($controlBlock, @fields);                      $controlBlock->Add(@fields);
271                  } elsif ($command eq 'delete') {                  } elsif ($command eq 'delete') {
272                      Delete($controlBlock, @fields);                      $controlBlock->Delete(@fields);
273                  } elsif ($command eq 'change') {                  } elsif ($command eq 'change') {
274                      Change($controlBlock, @fields);                      # Here we have a special case. If "noalias" is in effect, we need
275                        # to splice an empty field in before the translation.
276                        if ($controlBlock->Option("noAlias")) {
277                            splice @fields, 3, 0, "";
278                        }
279                        $controlBlock->Change(@fields);
280                  } else {                  } else {
281                      $orgStats->AddMessage("Invalid command $command in line $tranCount for genome $genomeID");                      $controlBlock->AddMessage("Invalid command $command in line $tranCount for genome $genomeID");
282                  }                  }
283                  $orgStats->Add($command, 1);                  $controlBlock->IncrementStat($command);
284              }              }
285              Trace("Statistics for $genomeID\n\n" . $orgStats->Show()) if T(3);              # Terminate processing for this genome.
286              # Merge the statistics for this run into the globals statistics object.              my $orgStats = $controlBlock->EndGenome();
287              $stats->Accumulate($orgStats);              Trace("Statistics for $genomeID\n\n" . $orgStats->Show() . "\n") if T(3);
             $stats->Add("genomes", 1);  
288              # Close the transaction input file.              # Close the transaction input file.
289              close TRANS;              close TRANS;
             # If we're processing, close the transaction output file  
             # and optionally end the database transaction.  
             if ($mainCommand eq 'process') {  
                 close TRANSOUT;  
                 if ($options->{safe}) {  
                     $dbh->commit_tran();  
                 }  
             }  
         }  
     }  
     Trace("Statistics for this run\n\n" . $stats->Show()) if T(1);  
     # If we're counting, we need to write out the counts file or allocate IDs  
     # from the clearinghouse.  
     if ($mainCommand ne "process") {  
         # Loop through the ID hash, printing the counts. We will also write them  
         # to a file called "counts.tbl".  
         my $countfile = "$parameters[0]/counts.tbl";  
         Open(\*COUNTFILE, ">$countfile");  
         print "\nTable of Counts\n";  
         for my $idKey (keys %idHash) {  
             $idKey =~ /^(\d+\.\d+)\.([a-z]+)$/;  
             my ($org, $ftype) = ($1, $2);  
             my $count = $idHash{$idKey};  
             print "$idKey\t$count\n";  
             print COUNTFILE "$org\t$ftype\t$count\n";  
         }  
         close COUNTFILE;  
         if ($mainCommand eq "register") {  
             # Here we are registering as well as counting. This process also produces  
             # the ID file.  
             Trace("Submitting ID file to clearing house.") if T(2);  
             system("register_features_batch <$countfile >$parameters[1]");  
             Trace("Clearing house request complete.") if T(2);  
290          }          }
291      }      }
292        # Terminate processing.
293        $controlBlock->Teardown();
294        Trace("Statistics for this run\n\n" . $controlBlock->Show() . "\n") if T(1);
295      Trace("Processing complete.") if T(1);      Trace("Processing complete.") if T(1);
296  }  }
297    
 =head2 Utility Methods  
   
 =head3 Add  
   
 C<< Add($controlBlock, $newID, $locations, $translation); >>  
   
 Add a new feature to the data store.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item newID  
   
 ID to give to the new feature.  
   
 =item locations  
   
 Location of the new feature, in the form of a comma-separated list of location  
 strings in SEED format.  
   
 =item translation (optional)  
   
 Protein translation string for the new feature. If this field is omitted and  
 the feature is a peg, the translation will be generated by normal means.  
   
 =back  
   
 =cut  
   
 sub Add {  
     my ($controlBlock, $newID, $locations, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # Extract the feature type and ordinal number from the new ID.  
     my ($ftype, $ordinal, $key) = ParseNewID($controlBlock, $newID);  
     # If we're counting, we need to count the ID. Otherwise, we need to  
     # add the new feature.  
     if ($controlBlock->{command} ne 'process') {  
         $controlBlock->{idHash}->{$key}++;  
     } else {  
         # Here we need to add the new feature.  
         my $realID = AddFeature($controlBlock, $ordinal, $key, $ftype,  
                                 "", $locations, $translation);  
         Trace("Feature $realID added for pseudo-ID $newID.") if T(4);  
         # Write a corrected transaction to the transaction output file.  
         print TRANSOUT "add\t$realID\t$locations\t$translation\n";  
     }  
 }  
   
 =head3 Change  
   
 C<< Change($controlBlock, $fid, $newID, $locations, $aliases, $translation); >>  
   
 Replace a feature to the data store. The feature will be marked for deletion and  
 a new feature will be put in its place.  
   
 This is a much more complicated process than adding a feature. In addition to  
 the add, we have to create new aliases and transfer across the assignment and  
 the annotations.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item fid  
   
 ID of the feature being changed.  
   
 =item newID  
   
 New ID to give to the feature.  
   
 =item locations  
   
 New location to give to the feature, in the form of a comma-separated list of location  
 strings in SEED format.  
   
 =item aliases (optional)  
   
 A new list of alias names for the feature.  
   
 =item translation (optional)  
   
 New protein translation string for the feature. If this field is omitted and  
 the feature is a peg, the translation will be generated by normal means.  
   
 =back  
   
 =cut  
   
 sub Change {  
     my ($controlBlock, $fid, $newID, $locations, $aliases, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # Extract the feature type and ordinal number from the new ID.  
     my ($ftype, $ordinal, $key) = ParseNewID($controlBlock, $newID);  
     # If we're counting, we need to count the ID. Otherwise, we need to  
     # replace the feature.  
     if ($controlBlock->{command} ne 'process') {  
         $controlBlock->{idHash}->{$key}++;  
     } else {  
         # Here we can go ahead and change the feature. First, we must  
         # get the old feature's assignment and annotations. Note that  
         # for the annotations we ask for the time in its raw format.  
         my @functions = $fig->function_of($fid);  
         my @annotations = $fig->feature_annotations($fid, 1);  
         # Create some counters.  
         my ($assignCount, $annotateCount) = (0, 0);  
         # Add the new version of the feature and get its ID.  
         my $realID = AddFeature($controlBlock, $ordinal, $key, $ftype, $locations,  
                                 $aliases, $translation);  
         # Copy over the assignments.  
         for my $assignment (@functions) {  
             my ($user, $function) = @{$assignment};  
             $fig->assign_function($realID, $user, $function);  
             $assignCount++;  
         }  
         # Copy over the annotations.  
         for my $annotation (@annotations) {  
             my ($oldID, $timestamp, $user, $annotation) = @{$annotation};  
             $fig->add_annotation($realID, $user, $annotation, $timestamp);  
             $annotateCount++;  
         }  
         # Mark the old feature for deletion.  
         $fig->delete_feature($fid);  
         # Tell the user what we did.  
         $controlBlock->{stats}->Add("assignments", $assignCount);  
         $controlBlock->{stats}->Add("annotations", $annotateCount);  
         Trace("Feature $realID created from $fid. $assignCount assignments and $annotateCount annotations copied.") if T(4);  
         # Write a corrected transaction to the transaction output file.  
         print TRANSOUT "change\t$fid\t$realID\t$locations\t$aliases\t$translation\n";  
     }  
 }  
   
 =head3 Delete  
   
 C<< Delete($controlBlock, $fid); >>  
   
 Delete a feature from the data store. The feature will be marked as deleted,  
 which will remove it from consideration by most FIG methods. A garbage  
 collection job will be run later to permanently delete the feature.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item fid  
   
 ID of the feature to delete.  
   
 =back  
   
 =cut  
   
 sub Delete {  
     my ($controlBlock, $fid) = @_;  
     my $fig = $controlBlock->{fig};  
     # Extract the feature type and count it.  
     my $ftype = FIG::ftype($fid);  
     $controlBlock->{stats}->Add($ftype, 1);  
     # If we're not counting, delete the feature.  
     if ($controlBlock->{command} eq 'process') {  
         # Mark the feature for deletion.  
         $fig->delete_feature($fid);  
         # Echo the transaction to the transaction output file.  
         print TRANSOUT "del\t$fid\n";  
     }  
 }  
   
 =head3 ParseNewID  
   
 C<< my ($ftype, $ordinal, $key) = ParseNewID($controlBlock, $newID); >>  
   
 Extract the feature type and ordinal number from an incoming new ID.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures needed to manage transactions.  
   
 =item newID  
   
 New ID specification taken from a transaction input record. This contains the  
 feature type followed by a period and then the ordinal number of the ID.  
   
 =item RETURN  
   
 Returna a three-element list. If successful, the list will contain the feature  
 type followed by the ordinal number and the key to use in the ID hash to find  
 the feature's true ID number. If the incoming ID is invalid, the list  
 will contain three C<undef>s.  
   
 =back  
   
 =cut  
   
 sub ParseNewID {  
     # Get the parameters.  
     my ($controlBlock, $newID) = @_;  
     my ($ftype, $ordinal, $key);  
     # Parse the ID.  
     if ($newID =~ /^([a-z]+)\.(\d+)$/) {  
         # Here we have a valid ID.  
         ($ftype, $ordinal) = ($1, $2);  
         $key = $controlBlock->{genomeID} . ".$ftype";  
         # Update the feature type count in the statistics.  
         $controlBlock->{stats}->Add($ftype, 1);  
     } else {  
         # Here we have an invalid ID.  
         $controlBlock->{stats}->AddMessage("Invalid ID $newID found in line " .  
                                            $controlBlock->{line} . " for genome " .  
                                            $controlBlock->{genomeID} . ".");  
     }  
     # Return the result.  
     return ($ftype, $ordinal, $key);  
 }  
   
 =head3 GetRealID  
   
 C<< my $realID = GetRealID($controlBlock, $ftype, $ordinal, $key); >>  
   
 Compute the real ID of a new feature. This involves interrogating the ID hash and  
 formatting a full-blown ID out of little bits of information.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing data used to manage the transaction process.  
   
 =item ordinal  
   
 Zero-based ordinal number of this feature. The ordinal number is added to the value  
 stored in the control block's ID hash to compute the real feature number.  
   
 =item key  
   
 Key in the ID hash relevant to this feature.  
   
 =item RETURN  
   
 Returns a fully-formatted FIG ID for the new feature.  
   
 =back  
   
 =cut  
   
 sub GetRealID {  
     # Get the parameters.  
     my ($controlBlock, $ordinal, $key) = @_;  
     #Declare the return value.  
     my $retVal;  
     # Get the base value for the feature ID number.  
     my $base = $controlBlock->{idHash}->{$key};  
     # If it didn't exist, we have an error.  
     if (! defined $base) {  
         Confess("No ID range found for genome ID and feature type $key.");  
     } else {  
         # Now we have enough data to format the ID.  
         my $num = $base + $ordinal;  
         $retVal = "fig|$key.$num";  
     }  
     # Return the result.  
     return $retVal;  
 }  
   
 =head3 CheckTranslation  
   
 C<< my $actualTranslation = CheckTranslation($controlBlock, $ftype, $locations, $translation); >>  
   
 If we are processing a PEG, insure we have a translation for the peg's locations.  
   
 This method checks the feature type and the incoming translation string. If the  
 translation string is empty and the feature type is C<peg>, it will generate  
 a translation string using the specified locations for the genome currently  
 being processed.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing data used to manage the transaction process.  
   
 =item ftype  
   
 Feature type (C<peg>, C<rna>, etc.)  
   
 =item locations  
   
 Comma-delimited list of location strings for the feature in question.  
   
 =item translation (optional)  
   
 If specified, will be returned to the caller as the result.  
   
 =item RETURN  
   
 Returns the protein translation string for the specified locations, or C<undef>  
 if no translation is warranted.  
   
 =back  
   
 =cut  
   
 sub CheckTranslation {  
     # Get the parameters.  
     my ($controlBlock, $ftype, $locations, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # Declare the return variable.  
     my $retVal;  
     if ($ftype eq 'peg') {  
         # Here we have a protein encoding gene. Check to see if we already have  
         # a translation.  
         if (defined $translation) {  
             # Pass it back unmodified.  
             $retVal = $translation;  
         } else {  
             # Here we need to compute the translation.  
             my $dna = $fig->dna_seq($controlBlock->{genomeID}, $locations);  
             $retVal = FIG::translate($dna);  
         }  
     }  
     # Return the result.  
     return $retVal;  
 }  
   
 =head3 AddFeature  
   
 C<< my $realID = AddFeature($controlBlock, $ordinal, $key, $ftype, $locations, $translation); >>  
   
 Add the specified feature to the FIG data store. This involves generating the new feature's  
 ID, creating the translation (if needed), adding the feature to the data store, and  
 queueing a request to update the similarities. The generated ID will be returned to the  
 caller.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item ordinal  
   
 Zero-based ordinal number of the proposed feature in the ID space. This is added to the  
 base ID number to get the real ID number.  
   
 =item key  
   
 Key to use for getting the base ID number from the ID hash.  
   
 =item ftype  
   
 Proposed feature type (C<peg>, C<rna>, etc.)  
   
 =item locations  
   
 Location of the new feature, in the form of a comma-separated list of location  
 strings in SEED format.  
   
 =item aliases (optional)  
   
 A new list of alias names for the feature.  
   
 =item translation (optional)  
   
 Protein translation string for the new feature. If this field is omitted and  
 the feature is a peg, the translation will be generated by normal means.  
   
 =back  
   
 =cut  
   
 sub AddFeature {  
     # Get the parameters.  
     my ($controlBlock, $ordinal, $key, $ftype, $locations, $aliases, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # We want to add a new feature using the information provided. First, we  
     # generate its ID.  
     my $retVal = GetRealID($controlBlock, $ordinal, $key);  
     # Next, we insure that we have a translation.  
     my $actualTranslation = CheckTranslation($controlBlock, $ftype,  
                                              $locations, $translation);  
     # Now we add it to FIG.  
     $fig->add_feature($controlBlock->{genomeID}, $ftype, $locations, "",  
                       $actualTranslation, $retVal);  
     # Tell FIG to recompute the similarities.  
     $fig->enqueue_similarities([$retVal]);  
     # Return the ID we generated.  
     return $retVal;  
 }  
298    
299  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.9

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3