[Bio] / FigKernelScripts / TransactFeatures.pl Repository:
ViewVC logotype

Diff of /FigKernelScripts/TransactFeatures.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Tue Aug 9 00:57:48 2005 UTC revision 1.14, Thu Sep 1 09:39:31 2005 UTC
# Line 7  Line 7 
7  a command. The second specifies a directory full of transaction files. The third  a command. The second specifies a directory full of transaction files. The third
8  specifies a file that tells us which feature IDs are available for each organism.  specifies a file that tells us which feature IDs are available for each organism.
9    
10  C<TransactFeatures> I<[options]> I<command> I<transactionDirectory> I<idFile>  C<TransactFeatures> [I<options>] I<command> I<transactionDirectory> I<idFile>
11    
12  The supported commands are  The supported commands are
13    
# Line 28  Line 28 
28    
29  =item process  =item process
30    
31  Process the transactions and update the FIG data store. This will also create  Process the transactions and update the FIG data store. This will also update
32  a copy of each transaction file in which the pseudo-IDs have been replaced by  the NR file and queue features for similarity generation.
33  real IDs.  
34    =item fudge
35    
36    Convert transactions that have already been applied to new transactions that can
37    be used to test the transaction processor.
38    
39  =back  =back
40    
# Line 140  Line 144 
144  Wrap each organism's processing in a database transaction. This makes the process  Wrap each organism's processing in a database transaction. This makes the process
145  slightly more restartable than it would be otherwise.  slightly more restartable than it would be otherwise.
146    
147    =item noAlias
148    
149    Assume that the transaction files do not contain aliases. This means that in CHANGE
150    records the translation will immediately follow the location.
151    
152    =item sql
153    
154    Trace SQL commands.
155    
156    =item start
157    
158    ID of the first genome to process. This allows restarting a transaction run that failed
159    in the middle. The default is to run all transaction files.
160    
161    =back
162    
163  =cut  =cut
164    
165  use strict;  use strict;
# Line 151  Line 171 
171  use File::Path;  use File::Path;
172  use FIG;  use FIG;
173  use Stats;  use Stats;
174    use TransactionProcessor;
175    use ApplyTransactions;
176    use CountTransactions;
177    use FudgeTransactions;
178    
179  # Get the command-line options.  # Get the command-line options.
180  my ($options, @parameters) = Tracer::ParseCommand({ trace => 3, safe => 0 }, @ARGV);  my ($options, @parameters) = Tracer::ParseCommand({ trace => 3, sql => 0, safe => 0, noAlias => 0,
181                                                        start => ' '},
182                                                      @ARGV);
183    # Get the command.
184    my $mainCommand = lc shift @parameters;
185  # Set up tracing.  # Set up tracing.
186  my $traceLevel = $options->{trace};  my $traceLevel = $options->{trace};
187  TSetup("$traceLevel Tracer DocUtils FIG", "TEXT");  my $tracing = "$traceLevel Tracer DocUtils FIG";
188    if ($options->{sql}) {
189        $tracing .= " SQL";
190    }
191    TSetup($tracing, "TEXT");
192  # Get the FIG object.  # Get the FIG object.
193  my $fig = FIG->new();  my $fig = FIG->new();
194  # Get its database handle.  # Create the transaction object.
195  my $dbh = $fig->db_handle;  my $controlBlock;
196  # Get the command.  if ($mainCommand eq 'count' || $mainCommand eq 'register') {
197  my $mainCommand = lc shift @parameters;      $controlBlock = CountTransactions->new($options, $mainCommand, @parameters);
198  Trace("$mainCommand command specified.") if T(2);  } elsif ($mainCommand eq 'process') {
199        $controlBlock = ApplyTransactions->new($options, $mainCommand, @parameters);
200  # Create the ID table. This maps each organism/ftype pair to the currently-  } elsif ($mainCommand eq 'fudge') {
201  # available ID number. If we're counting, we leave it empty. If we're not      $controlBlock = FudgeTransactions->new($options, $mainCommand, @parameters);
202  # counting, we need to read it in.  } else {
203  my %idHash = ();      Confess("Invalid command \"$mainCommand\" specified on command line.");
 if ($mainCommand eq 'process') {  
     my $inCount = 0;  
     Open(\*IDFILE, "<$parameters[1]");  
     while (my $idRecord = <IDFILE>) {  
         chomp $idRecord;  
         my ($orgID, $ftype, $firstNumber) = split /\t/, $idRecord;  
         $idHash{"$orgID.$ftype"} = $firstNumber;  
         $inCount++;  
     }  
     Trace("$inCount ID ranges read in from $parameters[1].") if T(2);  
204  }  }
205    # Setup the process.
206  # Create some counters we can use for statistical purposes.  $controlBlock->Setup();
 my $stats = Stats->new("genomes", "add", "change", "delete");  
207  # Verify that the organism directory exists.  # Verify that the organism directory exists.
208  if (! -d $parameters[0]) {  if (! -d $parameters[0]) {
209      Confess("Directory of genome files \"$parameters[0]\" not found.");      Confess("Directory of genome files \"$parameters[0]\" not found.");
# Line 192  Line 213 
213      my $orgsFound = 0;      my $orgsFound = 0;
214      my %transFiles = ();      my %transFiles = ();
215      my @transDirectory = OpenDir($parameters[0], 1);      my @transDirectory = OpenDir($parameters[0], 1);
216        # Pull out the "start" option value. This will be a space if all genomes should
217        # be processed, in which case it will always compare less than the genome ID.
218        my $startGenome = $options->{start};
219      # The next step is to create a hash of organism IDs to file names. This      # The next step is to create a hash of organism IDs to file names. This
220      # saves us some painful parsing later.      # saves us some painful parsing later.
221      for my $transFileName (@transDirectory) {      for my $transFileName (@transDirectory) {
222            # Parse the file name. This will only match if it's a real transaction file.
223          if ($transFileName =~ /^tbl_diff_(\d+\.\d+)$/) {          if ($transFileName =~ /^tbl_diff_(\d+\.\d+)$/) {
224                # Get the genome ID;
225                my $genomeID = $1;
226                # If we're skipping, only include this genome ID if it's equal to
227                # or greater than the start value.
228                if ($genomeID ge $startGenome) {
229              $transFiles{$1} = "$parameters[0]/$transFileName";              $transFiles{$1} = "$parameters[0]/$transFileName";
230              $orgsFound++;              $orgsFound++;
231          }          }
232      }      }
233        }
234      Trace("$orgsFound genome transaction files found in directory $parameters[0].") if T(2);      Trace("$orgsFound genome transaction files found in directory $parameters[0].") if T(2);
235      if (! $orgsFound) {      if (! $orgsFound) {
236          Confess("No \"tbl_diff\" files found in directory $parameters[1].");          Confess("No \"tbl_diff\" files found in directory $parameters[1].");
237      } else {      } else {
238          # Loop through the organisms.          # Loop through the organisms.
239          for my $genomeID (sort keys %transFiles) {          for my $genomeID (sort keys %transFiles) {
240                # Start this organism.
241              Trace("Processing changes for $genomeID.") if T(3);              Trace("Processing changes for $genomeID.") if T(3);
             # Create a statistics object for this organism.  
             my $orgStats = Stats->new("add", "change", "delete");  
             # Create a control block for passing around our key data.  
             my $controlBlock = { stats => $orgStats, genomeID => $genomeID,  
                                  idHash => \%idHash, options => $options,  
                                  fig => $fig, command => $mainCommand };  
             # Open the organism file.  
242              my $orgFileName = $transFiles{$genomeID};              my $orgFileName = $transFiles{$genomeID};
243                $controlBlock->StartGenome($genomeID, $orgFileName);
244                # Open the organism file.
245              Open(\*TRANS, "<$orgFileName");              Open(\*TRANS, "<$orgFileName");
246                # Clear the transaction counter.
247              my $tranCount = 0;              my $tranCount = 0;
             # If we're processing rather than counting, open a file for  
             # writing out corrected transactions and optionally start a  
             # database transaction.  
             if ($mainCommand eq 'process') {  
                 Open(\*TRANSOUT, ">$orgFileName.tbl");  
                 if ($options->{safe}) {  
                     $dbh->begin_tran();  
                 }  
             }  
248              # Loop through the organism's data.              # Loop through the organism's data.
249              while (my $transaction = <TRANS>) {              while (my $transaction = <TRANS>) {
250                  # Parse the record.                  # Parse the record.
# Line 237  Line 256 
256                  # Process according to the transaction type.                  # Process according to the transaction type.
257                  my $command = lc shift @fields;                  my $command = lc shift @fields;
258                  if ($command eq 'add') {                  if ($command eq 'add') {
259                      Add($controlBlock, @fields);                      $controlBlock->Add(@fields);
260                  } elsif ($command eq 'delete') {                  } elsif ($command eq 'delete') {
261                      Delete($controlBlock, @fields);                      $controlBlock->Delete(@fields);
262                  } elsif ($command eq 'change') {                  } elsif ($command eq 'change') {
263                      Change($controlBlock, @fields);                      # Here we have a special case. If "noalias" is in effect, we need
264                        # to splice an empty field in before the translation.
265                        if ($controlBlock->Option("noAlias")) {
266                            splice @fields, 3, 0, "";
267                        }
268                        $controlBlock->Change(@fields);
269                  } else {                  } else {
270                      $orgStats->AddMessage("Invalid command $command in line $tranCount for genome $genomeID");                      $controlBlock->AddMessage("Invalid command $command in line $tranCount for genome $genomeID");
271                  }                  }
272                  $orgStats->Add($command, 1);                  $controlBlock->IncrementStat($command);
273              }              }
             Trace("Statistics for $genomeID\n\n" . $orgStats->Show()) if T(3);  
             # Merge the statistics for this run into the globals statistics object.  
             $stats->Accumulate($orgStats);  
             $stats->Add("genomes", 1);  
274              # Close the transaction input file.              # Close the transaction input file.
275              close TRANS;              close TRANS;
276              # If we're processing, close the transaction output file              # Terminate processing for this genome.
277              # and optionally end the database transaction.              my $orgStats = $controlBlock->EndGenome();
278              if ($mainCommand eq 'process') {              Trace("Statistics for $genomeID\n\n" . $orgStats->Show() . "\n") if T(3);
                 close TRANSOUT;  
                 if ($options->{safe}) {  
                     $dbh->commit_tran();  
                 }  
             }  
         }  
     }  
     Trace("Statistics for this run\n\n" . $stats->Show()) if T(1);  
     # If we're counting, we need to write out the counts file or allocate IDs  
     # from the clearinghouse.  
     if ($mainCommand ne "process") {  
         # Loop through the ID hash, printing the counts. We will also write them  
         # to a file called "counts.tbl".  
         my $countfile = "$parameters[0]/counts.tbl";  
         Open(\*COUNTFILE, ">$countfile");  
         print "\nTable of Counts\n";  
         for my $idKey (keys %idHash) {  
             $idKey =~ /^(\d+\.\d+)\.([a-z]+)$/;  
             my ($org, $ftype) = ($1, $2);  
             my $count = $idHash{$idKey};  
             print "$idKey\t$count\n";  
             print COUNTFILE "$org\t$ftype\t$count\n";  
         }  
         close COUNTFILE;  
         if ($mainCommand eq "register") {  
             # Here we are registering as well as counting. This process also produces  
             # the ID file.  
             Trace("Submitting ID file to clearing house.") if T(2);  
             system("register_features_batch <$countfile >$parameters[1]");  
             Trace("Clearing house request complete.") if T(2);  
279          }          }
280      }      }
281        # Terminate processing.
282        $controlBlock->Teardown();
283        Trace("Statistics for this run\n\n" . $controlBlock->Show() . "\n") if T(1);
284      Trace("Processing complete.") if T(1);      Trace("Processing complete.") if T(1);
285  }  }
286    
 =head2 Utility Methods  
   
 =head3 Add  
   
 C<< Add($controlBlock, $newID, $locations, $translation); >>  
   
 Add a new feature to the data store.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item newID  
   
 ID to give to the new feature.  
   
 =item locations  
   
 Location of the new feature, in the form of a comma-separated list of location  
 strings in SEED format.  
   
 =item translation (optional)  
   
 Protein translation string for the new feature. If this field is omitted and  
 the feature is a peg, the translation will be generated by normal means.  
   
 =back  
   
 =cut  
   
 sub Add {  
     my ($controlBlock, $newID, $locations, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # Extract the feature type and ordinal number from the new ID.  
     my ($ftype, $ordinal, $key) = ParseNewID($controlBlock, $newID);  
     # If we're counting, we need to count the ID. Otherwise, we need to  
     # add the new feature.  
     if ($controlBlock->{command} ne 'process') {  
         $controlBlock->{idHash}->{$key}++;  
     } else {  
         # Here we need to add the new feature.  
         my $realID = AddFeature($controlBlock, $ordinal, $key, $ftype,  
                                 "", $locations, $translation);  
         Trace("Feature $realID added for pseudo-ID $newID.") if T(4);  
         # Write a corrected transaction to the transaction output file.  
         print TRANSOUT "add\t$realID\t$locations\t$translation\n";  
     }  
 }  
   
 =head3 Change  
   
 C<< Change($controlBlock, $fid, $newID, $locations, $aliases, $translation); >>  
   
 Replace a feature to the data store. The feature will be marked for deletion and  
 a new feature will be put in its place.  
   
 This is a much more complicated process than adding a feature. In addition to  
 the add, we have to create new aliases and transfer across the assignment and  
 the annotations.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item fid  
   
 ID of the feature being changed.  
   
 =item newID  
   
 New ID to give to the feature.  
   
 =item locations  
   
 New location to give to the feature, in the form of a comma-separated list of location  
 strings in SEED format.  
   
 =item aliases (optional)  
   
 A new list of alias names for the feature.  
   
 =item translation (optional)  
   
 New protein translation string for the feature. If this field is omitted and  
 the feature is a peg, the translation will be generated by normal means.  
   
 =back  
   
 =cut  
   
 sub Change {  
     my ($controlBlock, $fid, $newID, $locations, $aliases, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # Extract the feature type and ordinal number from the new ID.  
     my ($ftype, $ordinal, $key) = ParseNewID($controlBlock, $newID);  
     # If we're counting, we need to count the ID. Otherwise, we need to  
     # replace the feature.  
     if ($controlBlock->{command} ne 'process') {  
         $controlBlock->{idHash}->{$key}++;  
     } else {  
         # Here we can go ahead and change the feature. First, we must  
         # get the old feature's assignment and annotations. Note that  
         # for the annotations we ask for the time in its raw format.  
         my @functions = $fig->function_of($fid);  
         my @annotations = $fig->feature_annotations($fid, 1);  
         # Create some counters.  
         my ($assignCount, $annotateCount) = (0, 0);  
         # Add the new version of the feature and get its ID.  
         my $realID = AddFeature($controlBlock, $ordinal, $key, $ftype, $locations,  
                                 $aliases, $translation);  
         # Copy over the assignments.  
         for my $assignment (@functions) {  
             my ($user, $function) = @{$assignment};  
             $fig->assign_function($realID, $user, $function);  
             $assignCount++;  
         }  
         # Copy over the annotations.  
         for my $annotation (@annotations) {  
             my ($oldID, $timestamp, $user, $annotation) = @{$annotation};  
             $fig->add_annotation($realID, $user, $annotation, $timestamp);  
             $annotateCount++;  
         }  
         # Mark the old feature for deletion.  
         $fig->delete_feature($fid);  
         # Tell the user what we did.  
         $controlBlock->{stats}->Add("assignments", $assignCount);  
         $controlBlock->{stats}->Add("annotations", $annotateCount);  
         Trace("Feature $realID created from $fid. $assignCount assignments and $annotateCount annotations copied.") if T(4);  
         # Write a corrected transaction to the transaction output file.  
         print TRANSOUT "change\t$fid\t$realID\t$locations\t$aliases\t$translation\n";  
     }  
 }  
   
 =head3 Delete  
   
 C<< Delete($controlBlock, $fid); >>  
   
 Delete a feature from the data store. The feature will be marked as deleted,  
 which will remove it from consideration by most FIG methods. A garbage  
 collection job will be run later to permanently delete the feature.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item fid  
   
 ID of the feature to delete.  
   
 =back  
   
 =cut  
   
 sub Delete {  
     my ($controlBlock, $fid) = @_;  
     my $fig = $controlBlock->{fig};  
     # Extract the feature type and count it.  
     my $ftype = FIG::ftype($fid);  
     $controlBlock->{stats}->Add($ftype, 1);  
     # If we're not counting, delete the feature.  
     if ($controlBlock->{command} eq 'process') {  
         # Mark the feature for deletion.  
         $fig->delete_feature($fid);  
         # Echo the transaction to the transaction output file.  
         print TRANSOUT "del\t$fid\n";  
     }  
 }  
   
 =head3 ParseNewID  
   
 C<< my ($ftype, $ordinal, $key) = ParseNewID($controlBlock, $newID); >>  
   
 Extract the feature type and ordinal number from an incoming new ID.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures needed to manage transactions.  
   
 =item newID  
   
 New ID specification taken from a transaction input record. This contains the  
 feature type followed by a period and then the ordinal number of the ID.  
   
 =item RETURN  
   
 Returna a three-element list. If successful, the list will contain the feature  
 type followed by the ordinal number and the key to use in the ID hash to find  
 the feature's true ID number. If the incoming ID is invalid, the list  
 will contain three C<undef>s.  
   
 =back  
   
 =cut  
   
 sub ParseNewID {  
     # Get the parameters.  
     my ($controlBlock, $newID) = @_;  
     my ($ftype, $ordinal, $key);  
     # Parse the ID.  
     if ($newID =~ /^([a-z]+)\.(\d+)$/) {  
         # Here we have a valid ID.  
         ($ftype, $ordinal) = ($1, $2);  
         $key = $controlBlock->{genomeID} . ".$ftype";  
         # Update the feature type count in the statistics.  
         $controlBlock->{stats}->Add($ftype, 1);  
     } else {  
         # Here we have an invalid ID.  
         $controlBlock->{stats}->AddMessage("Invalid ID $newID found in line " .  
                                            $controlBlock->{line} . " for genome " .  
                                            $controlBlock->{genomeID} . ".");  
     }  
     # Return the result.  
     return ($ftype, $ordinal, $key);  
 }  
   
 =head3 GetRealID  
   
 C<< my $realID = GetRealID($controlBlock, $ftype, $ordinal, $key); >>  
   
 Compute the real ID of a new feature. This involves interrogating the ID hash and  
 formatting a full-blown ID out of little bits of information.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing data used to manage the transaction process.  
   
 =item ordinal  
   
 Zero-based ordinal number of this feature. The ordinal number is added to the value  
 stored in the control block's ID hash to compute the real feature number.  
   
 =item key  
   
 Key in the ID hash relevant to this feature.  
   
 =item RETURN  
   
 Returns a fully-formatted FIG ID for the new feature.  
   
 =back  
   
 =cut  
   
 sub GetRealID {  
     # Get the parameters.  
     my ($controlBlock, $ordinal, $key) = @_;  
     #Declare the return value.  
     my $retVal;  
     # Get the base value for the feature ID number.  
     my $base = $controlBlock->{idHash}->{$key};  
     # If it didn't exist, we have an error.  
     if (! defined $base) {  
         Confess("No ID range found for genome ID and feature type $key.");  
     } else {  
         # Now we have enough data to format the ID.  
         my $num = $base + $ordinal;  
         $retVal = "fig|$key.$num";  
     }  
     # Return the result.  
     return $retVal;  
 }  
   
 =head3 CheckTranslation  
   
 C<< my $actualTranslation = CheckTranslation($controlBlock, $ftype, $locations, $translation); >>  
   
 If we are processing a PEG, insure we have a translation for the peg's locations.  
   
 This method checks the feature type and the incoming translation string. If the  
 translation string is empty and the feature type is C<peg>, it will generate  
 a translation string using the specified locations for the genome currently  
 being processed.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing data used to manage the transaction process.  
   
 =item ftype  
   
 Feature type (C<peg>, C<rna>, etc.)  
   
 =item locations  
   
 Comma-delimited list of location strings for the feature in question.  
   
 =item translation (optional)  
   
 If specified, will be returned to the caller as the result.  
   
 =item RETURN  
   
 Returns the protein translation string for the specified locations, or C<undef>  
 if no translation is warranted.  
   
 =back  
   
 =cut  
   
 sub CheckTranslation {  
     # Get the parameters.  
     my ($controlBlock, $ftype, $locations, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # Declare the return variable.  
     my $retVal;  
     if ($ftype eq 'peg') {  
         # Here we have a protein encoding gene. Check to see if we already have  
         # a translation.  
         if (defined $translation) {  
             # Pass it back unmodified.  
             $retVal = $translation;  
         } else {  
             # Here we need to compute the translation.  
             my $dna = $fig->dna_seq($controlBlock->{genomeID}, $locations);  
             $retVal = FIG::translate($dna);  
         }  
     }  
     # Return the result.  
     return $retVal;  
 }  
   
 =head3 AddFeature  
   
 C<< my $realID = AddFeature($controlBlock, $ordinal, $key, $ftype, $locations, $translation); >>  
   
 Add the specified feature to the FIG data store. This involves generating the new feature's  
 ID, creating the translation (if needed), adding the feature to the data store, and  
 queueing a request to update the similarities. The generated ID will be returned to the  
 caller.  
   
 =over 4  
   
 =item controlBlock  
   
 Reference to a hash containing the data structures required to manage feature  
 transactions.  
   
 =item ordinal  
   
 Zero-based ordinal number of the proposed feature in the ID space. This is added to the  
 base ID number to get the real ID number.  
   
 =item key  
   
 Key to use for getting the base ID number from the ID hash.  
   
 =item ftype  
   
 Proposed feature type (C<peg>, C<rna>, etc.)  
   
 =item locations  
   
 Location of the new feature, in the form of a comma-separated list of location  
 strings in SEED format.  
   
 =item aliases (optional)  
   
 A new list of alias names for the feature.  
   
 =item translation (optional)  
   
 Protein translation string for the new feature. If this field is omitted and  
 the feature is a peg, the translation will be generated by normal means.  
   
 =back  
   
 =cut  
   
 sub AddFeature {  
     # Get the parameters.  
     my ($controlBlock, $ordinal, $key, $ftype, $locations, $aliases, $translation) = @_;  
     my $fig = $controlBlock->{fig};  
     # We want to add a new feature using the information provided. First, we  
     # generate its ID.  
     my $retVal = GetRealID($controlBlock, $ordinal, $key);  
     # Next, we insure that we have a translation.  
     my $actualTranslation = CheckTranslation($controlBlock, $ftype,  
                                              $locations, $translation);  
     # Now we add it to FIG.  
     $fig->add_feature($controlBlock->{genomeID}, $ftype, $locations, "",  
                       $actualTranslation, $retVal);  
     # Tell FIG to recompute the similarities.  
     $fig->enqueue_similarities([$retVal]);  
     # Return the ID we generated.  
     return $retVal;  
 }  
287    
288  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.14

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3