[Bio] / Sprout / EvCodeRefresh.pl Repository:
ViewVC logotype

Diff of /Sprout/EvCodeRefresh.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Wed Jul 2 06:20:30 2008 UTC revision 1.3, Fri Jul 11 01:05:46 2008 UTC
# Line 21  Line 21 
21  use Tracer;  use Tracer;
22  use CustomAttributes;  use CustomAttributes;
23  use Stats;  use Stats;
24    use FIG;
25    
26  =head1 EvCodeRefresh Script  =head1 EvCodeRefresh Script
27    
28      EvCodeRefresh [options] <filename>      EvCodeRefresh [options] <filename>
29    
30  Refresh evidence codes from a sequential file  Recompute evidence codes.
31    
32  =head2 Introduction  =head2 Introduction
33    
34  This script loads evidence codes from a tab-delimited file into the  This script generates and reloads evidence codes.
 B<IsEvidencedBy> table. The incoming file should contain two columns of data-- a  
 feature ID followed by an evidence code. For compatibility, there may be an  
 intervening column that is ignored.  
   
 =head2 Positional Parameters  
   
 =over 4  
   
 =item filename  
   
 Name of the file from which the evidence codes are to be loaded. The file must  
 be a tab-delimited file containing two or three columns. The first column must  
 contain a feature ID, and the last column an evidence code to be applied to that  
 feature. The evidence code will automatically be split into a class (e.g.  
 C<ilit>, C<ff>) and a modifier.  
   
 =back  
35    
36  =head2 Command-Line Options  =head2 Command-Line Options
37    
# Line 58  Line 42 
42  Specifies the tracing level. The higher the tracing level, the more messages  Specifies the tracing level. The higher the tracing level, the more messages
43  will appear in the trace log. Use E to specify emergency tracing.  will appear in the trace log. Use E to specify emergency tracing.
44    
45  =item append  =item load
46    
47  Normally, the existing evidence codes are erased before the data from the file  Specifies a file containing evidence codes. The file should contain either 2 or 3
48  is loaded. If C<append> is specified, then the erase is suppressed, and the  columns. Each record should contain a feature ID in the first column and a
49  existing codes are kept.  corresponding evidence code in the last column. If this option is specified,
50    the evidence codes are loaded from the file. Otherwise, the evidence codes are
51    recomputed into a new temporary file.
52    
53  =item classes  =item classes
54    
# Line 105  Line 91 
91  # Get the command-line options and parameters.  # Get the command-line options and parameters.
92  my ($options, @parameters) = StandardSetup([qw(ERDB CustomAttributes) ],  my ($options, @parameters) = StandardSetup([qw(ERDB CustomAttributes) ],
93                                             {                                             {
94                                                  load => ["", "file containing evidence codes; if none specified, evidence codes will be computed"],
95                                                trace => ["2", "tracing level"],                                                trace => ["2", "tracing level"],
96                                                append => ["", "if specified, the evidence codes will be appended to existing codes"],                                                classes => ["", "evidence class file name"],
                                               classes => ["", "evidence class file name (optional)"],  
97                                                phone => ["", "phone number (international format) to call when load finishes"]                                                phone => ["", "phone number (international format) to call when load finishes"]
98                                             },                                             },
99                                             "<filename>",                                             "",
100                                             @ARGV);                                             @ARGV);
101  # Set a variable to contain return type information.  # Set a variable to contain return type information.
102  my $rtype;  my $rtype;
103  # Insure we catch errors.  # Insure we catch errors.
104  eval {  eval {
105      # Get the attributes database.      # Create the FIG object.
106        my $fig = FIG->new();
107        # We'll count our progress in here.
108        my $stats = Stats->new();
109        # Create a temporary directory for our calculations.
110        my $dest=$FIG_Config::temp."/evidence_codes.".time;
111        mkdir ($dest, 0775);
112        # The file name for the evidence codes will be put in here.
113        my $fileName;
114        if ($options->{load}) {
115            # Here we're loading from a precomputed file.
116            $fileName = $options->{load};
117            Trace("Evidence codes will be loaded from precomputed file $fileName") if T(2);
118        } else {
119            # Here we need to create the evidence codes.
120            $fileName = GetEvCodes($fig, $dest, $stats);
121            # Check for literature data.
122            if (-s "$FIG_Config::data/Dlits/dlits") {
123                # We have some, so roll it in.
124                Trace("Loading dlits.") if T(3);
125                system "$FIG_Config::bin/load_dlits";
126                Trace("Generating ilits.") if T(3);
127                system "$FIG_Config::bin/generate_ilits > $FIG_Config::data/Dlits/ilits";
128                Trace("Generating dlit codes.") if T(3);
129                system "$FIG_Config::bin/generate_dlit_ev_codes > $FIG_Config::data/Dlits/dlit.ev.codes";
130                Trace("Generating ilit codes.") if T(3);
131                system "$FIG_Config::bin/generate_ilit_ev_codes < $FIG_Config::data/Dlits/ilits > $FIG_Config::data/Dlits/ilit.ev.codes";
132                Trace("Sorting and writing literature codes.") if T(3);
133                system "cat $FIG_Config::data/Dlits/[di]lit.ev.codes | sort -u >> $fileName";
134            }
135            Trace("Evidence codes generated to file $fileName.") if T(2);
136        }
137        # Now we're ready to load. Get the attributes database.
138      my $attr = CustomAttributes->new();      my $attr = CustomAttributes->new();
139      # Check for a class file.      # Check for a class file.
140      if ($options->{classes}) {      if ($options->{classes}) {
# Line 125  Line 143 
143          $attr->LoadTable($options->{classes}, 'EvidenceClass', truncate => 1);          $attr->LoadTable($options->{classes}, 'EvidenceClass', truncate => 1);
144          Trace("Evidence classes loaded.") if T(2);          Trace("Evidence classes loaded.") if T(2);
145      }      }
146      # Verify that the input file exists.      # Now we convert the evidence code file into a load file for the IsEvidencedBy
147      my $fileName = $parameters[0];      # table. First, we open it.
     if (! $fileName) {  
         Trace("No input file name specified.") if T(2);  
     } else {  
         # Now we convert the input file into a load file. First, we open it.  
148          my $ih = Open(undef, "<$fileName");          my $ih = Open(undef, "<$fileName");
         # We'll count the number of codes in here.  
         my $stats = Stats->new();  
149          # Create the load file. We sort it to speed up the load.          # Create the load file. We sort it to speed up the load.
150          my $loadFileName = "$FIG_Config::temp/IsEvidencedBy$$.dtx";          my $loadFileName = "$FIG_Config::temp/IsEvidencedBy$$.dtx";
151          my $oh = Open(undef, "| sort >$loadFileName");          my $oh = Open(undef, "| sort >$loadFileName");
# Line 189  Line 201 
201                  $stats->Add(bad_class => 1);                  $stats->Add(bad_class => 1);
202              }              }
203          }          }
         # We are almost ready to reload the evidence codes. Set up the append option.  
         # It's passed in as the truncate option flag on the load.  
         my $truncate = ($options->{append} ? 0 : 1);  
204          # Now we load.          # Now we load.
205          Trace("Loading evidence codes.") if T(2);          Trace("Loading evidence codes.") if T(2);
206          $attr->LoadTable($loadFileName, 'IsEvidencedBy', truncate => $truncate,      $attr->LoadTable($loadFileName, 'IsEvidencedBy', truncate => 1,
207                           mode => 'concurrent');                           mode => 'concurrent');
208          # Tell the user we're done, and show the statistics.          # Tell the user we're done, and show the statistics.
209          Trace("Evidence codes loaded.\n" . $stats->Show()) if T(2);          Trace("Evidence codes loaded.\n" . $stats->Show()) if T(2);
     }  
210  };  };
211  if ($@) {  if ($@) {
212      Trace("Script failed with error: $@") if T(0);      Trace("Script failed with error: $@") if T(0);
# Line 216  Line 224 
224      }      }
225  }  }
226    
227    =head3 GetEvCodes
228    
229        my $fileName = GetEvCodes($fig, $dest, $stats);
230    
231    Create a tab-delimited file of evidence codes. The output file will be in
232    the specified destination directory, and it will contain three columns.
233    The first column will contain the name of a feature, and the last will
234    contain an associated evidence code. The second column is not used. It is
235    put in solely for compatibility with external scripts.
236    
237    =over 4
238    
239    =item fig
240    
241    A FIG-like object used to access the data store.
242    
243    =item dest
244    
245    Directory to contain the working files.
246    
247    =item stats
248    
249    A statistics object that can be used to track activity.
250    
251    =item RETURN
252    
253    Returns the name of a file in the destination directory. The file will contain the
254    new evidence codes.
255    
256    =back
257    
258    =cut
259    
260    sub GetEvCodes {
261        # Get the parameters.
262        my ($fig, $dest, $stats) = @_;
263        # The subsystem list goes in here.
264        my @all_subsystems = sort grep { $fig->usable_subsystem($_) } $fig->all_subsystems;
265        # This will contain all the pegs in subsystems.
266        my %insub;
267        # Create the output file.
268        my $retVal = "$dest/evidenceCodes.tbl";
269        my $oh = Open(undef, ">$retVal");
270        # Loop through the subsystems.
271        for my $sub (@all_subsystems) {
272            Trace("Processing $sub (" . $stats->Progress(subsystems => scalar @all_subsystems) . "%).") if T(3);
273            my $subsystem = new Subsystem($sub,$fig,0);
274            if (! $subsystem) {
275                Trace("Error loading subsystem $sub.") if T(1);
276                $stats->AddMessage("Could not load subsystem $sub.");
277                $stats->Add(badSubsystem => 1);
278            } else {
279                $stats->Add(subsystem => 1);
280                # Get ALL of the coupling data for the pegs in the subsystem.
281                my @cdata = $fig->coupled_to_batch($subsystem->get_all_pegs());
282                # returns triples [$peg1, $peg2, $score]. create %cdata that maps
283                # peg1 => [peg2]
284                my %cdata;
285                map { push(@{$cdata{$_->[0]}}, $_->[1]) } @cdata;
286                # Get the genomes and roles.
287                my @genomes = $subsystem->get_genomes();
288                my @roles = $subsystem->get_roles;
289                # Loop through the genomes.
290                for my $genome (@genomes) {
291                    Trace("Processing $genome for $sub.") if T(4);
292                    Trace($stats->Ask('subsystemRows') . " total spreadsheet rows processed.") if $stats->Check(subsystemRows => 500) && T(3);
293                    # Build hashes for the different types of evidence.
294                    my (%pegs_in_row, %isu, %icw, %isd);
295                    # Get the variant code. This tells us what roles to expect.
296                    my $vcode = $subsystem->get_variant_code($subsystem->get_genome_index($genome));
297                    # Only proceed if it's a real, known variant.
298                    if (($vcode ne "0") && ($vcode ne "-1")) {
299                        # Loop through the roles.
300                        for my $role (@roles) {
301                            # Get the PEGs for this genome and role (one cell in the spreadsheet).
302                            my @pegs = $subsystem->get_pegs_from_cell($genome,$role);
303                            # Mark each of the pegs as being in a subsystem.
304                            foreach my $peg (@pegs) {
305                                $insub{$peg} = 1;
306                            }
307                            # Accumulate the number of pegs.
308                            $stats->Add(pegRoles => scalar @pegs);
309                            # The evidence we'll deduce depends on the number of pegs in this cell.
310                            if (@pegs == 1) {
311                                # If there's only one, save this peg as possessing InSubsystemUnique (isu) evidence.
312                                $isu{$pegs[0]} = 1;
313                            } elsif (@pegs > 1) {
314                                # If there's more than one, then the evidence code is InSubsystemDuplicate (idu),
315                                # modified by the number of pegs in the cell.
316                                foreach my $peg (@pegs) {
317                                    $isd{$peg} = @pegs;
318                                }
319                            }
320                            # For each peg, note that it occurs in a this row.
321                            for my $peg (@pegs) {
322                                $pegs_in_row{$peg} = 1;
323                            }
324                        }
325                    }
326                    # Loop through all the pegs in the current row. Use a batch request for this.
327                    for my $peg (keys(%pegs_in_row)) {
328                        # Get a list of all the pegs in the current row that are coupled to the current peg.
329                        my $clist = $cdata{$peg};
330                        if ($clist) {
331                        my @coup = grep { $pegs_in_row{$_} } @$clist;
332                            my $n = @coup;
333                            if ($n > 0) {
334                                # Denote that we have IsCoupledWith(icw) evidence for this peg, modified by
335                                # the number of couplings.
336                                $icw{$peg} = $n;
337                            }
338                        }
339                    }
340                    # Convert the subsystem name to a subsystem ID (spaces to underscores).
341                    $sub =~ s/\s+/_/g;
342                    # Loop through all the pegs in the current row.
343                    for my $peg (keys(%pegs_in_row)) {
344                        # If this peg has ISU evidence, send it to the output.
345                        if ($isu{$peg} ) {
346                            Tracer::PutLine($oh, [$peg, '', "isu;$sub"]);
347                        }
348                        # If this peg has ICW evidence, send it to the output.
349                        if (my $n = $icw{$peg} ) {
350                            Tracer::PutLine($oh, [$peg, '', "icw($n);$sub"]);
351                        } else {
352                            # If there's no ICW evidence, ty IDU evidence.
353                            if ($n = $isd{$peg} ) {
354                                Tracer::PutLine($oh, [$peg, '', "idu($n);$sub"]);
355                            }
356                        }
357                    }
358                }
359            }
360        }
361    
362        # I have chosen to implement the
363        #
364        #     ff    - in FIGfam
365        #     cwn   - clustered with non-hypothetical
366        #     cwh   - clustered, but only with hypotheticals
367        #
368        # in a simple, fast manner.  Perhaps, I should be going through access routines,
369        # but I am not
370    
371        # ff
372        # Get the families file and sort out duplicates.
373        Trace("Processing fig-families.") if T(2);
374        my $ffdata = &FIG::get_figfams_data();
375        if ((-s "$ffdata/families.2c") &&
376            Open(\*FF,"cut -f2 $ffdata/families.2c | sort -u |")) {
377            $stats->Add(familyFile => 1);
378            # Read the file.
379            while (defined($_ = <FF>)) {
380                # If there is a PEG on this line, denote the PEG has fig-family evidence.
381                if ($_ =~ /^(fig\|\d+\.\d+\.peg\.\d+)/) {
382                    Tracer::PutLine($oh, [$1, '', 'ff']);
383                }
384            }
385            close(FF);
386        }
387        # cwn, cwh
388        # Get the coupling scores file.
389        Trace("Processing couplings.") if T(2);
390        if (open(FC,"<$FIG_Config::data/CouplingData/scores")) {
391            my $fc = <FC>;
392            # Loop until we hit end-of-file or a line beginning with a space.
393            while ($fc && ($fc =~ /^(\S+)/)) {
394                $stats->Add(couplingRows => 1);
395                my $curr = $1;
396                my @set  = ();
397                # We expect now to read a set of lines with the current peg in the
398                # first column, followed by the coupled peg and a score.
399                while ($fc && ($fc =~ /^(\S+)\t(\S+)\t(\d+)/) && ($curr eq $1)) {
400                    # If the score is high enough, add the coupled peg to the current set.
401                    if ($3 >= 5) {
402                        push(@set,$2);
403                    }
404                    $fc = <FC>;
405                }
406                # Now we're at the beginning of the next peg's couplings, but we need to
407                # look at the current peg's data. Only proceed if we found couplings
408                # and the PEG is not in a subsystem.
409                if (@set > 0 && ! $insub{$curr}) {
410                    # Find out if all of the coupled pegs are hypothetical.
411                    my $i;
412                    for ($i=0; ($i < @set) && &FIG::hypo(scalar $fig->function_of($set[$i])); $i++) {}
413                    # If we found a non-hypothetical PEG, we are CWN, else we're CWH.
414                    if ($i < @set) {
415                        Tracer::PutLine($oh, [$curr, '', 'cwn']);
416                    } else {
417                        Tracer::PutLine($oh, [$curr, '', 'cwh']);
418                    }
419                }
420            }
421            close(FC);
422        }
423        # Close the output file.
424        close $oh;
425        # Return its name.
426        return $retVal;
427    }
428    
429  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.3

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3