[Bio] / Sprout / EvCodeRefresh.pl Repository:
ViewVC logotype

Annotation of /Sprout/EvCodeRefresh.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     use strict;
21 :     use Tracer;
22 :     use CustomAttributes;
23 :     use Stats;
24 : parrello 1.3 use FIG;
25 : parrello 1.1
26 :     =head1 EvCodeRefresh Script
27 :    
28 :     EvCodeRefresh [options] <filename>
29 :    
30 : parrello 1.3 Recompute evidence codes.
31 : parrello 1.1
32 :     =head2 Introduction
33 :    
34 : parrello 1.3 This script generates and reloads evidence codes.
35 : parrello 1.1
36 :     =head2 Command-Line Options
37 :    
38 :     =over 4
39 :    
40 :     =item trace
41 :    
42 :     Specifies the tracing level. The higher the tracing level, the more messages
43 :     will appear in the trace log. Use E to specify emergency tracing.
44 :    
45 : parrello 1.3 =item load
46 : parrello 1.1
47 : parrello 1.3 Specifies a file containing evidence codes. The file should contain either 2 or 3
48 :     columns. Each record should contain a feature ID in the first column and a
49 :     corresponding evidence code in the last column. If this option is specified,
50 :     the evidence codes are loaded from the file. Otherwise, the evidence codes are
51 :     recomputed into a new temporary file.
52 : parrello 1.1
53 :     =item classes
54 :    
55 :     If this option is specified, it should be the name of a tab-delimited file
56 :     containing the evidence classes. The evidence class table will be deleted and
57 :     reloaded from the file, which should be a valid ERDB load file for the
58 :     B<EvidenceClass> table.
59 :    
60 :     =item user
61 :    
62 :     Name suffix to be used for log files. If omitted, the PID is used.
63 :    
64 :     =item sql
65 :    
66 :     If specified, turns on tracing of SQL activity.
67 :    
68 :     =item background
69 :    
70 :     Save the standard and error output to files. The files will be created
71 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
72 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
73 :     B<user> option above.
74 :    
75 :     =item help
76 :    
77 :     Display this command's parameters and options.
78 :    
79 :     =item warn
80 :    
81 :     Create an event in the RSS feed when an error occurs.
82 :    
83 :     =item phone
84 :    
85 :     Phone number to message when the script is complete.
86 :    
87 :     =back
88 :    
89 :     =cut
90 :    
91 :     # Get the command-line options and parameters.
92 :     my ($options, @parameters) = StandardSetup([qw(ERDB CustomAttributes) ],
93 :     {
94 : parrello 1.3 load => ["", "file containing evidence codes; if none specified, evidence codes will be computed"],
95 : parrello 1.1 trace => ["2", "tracing level"],
96 : parrello 1.3 classes => ["", "evidence class file name"],
97 : parrello 1.1 phone => ["", "phone number (international format) to call when load finishes"]
98 :     },
99 : parrello 1.3 "",
100 : parrello 1.1 @ARGV);
101 :     # Set a variable to contain return type information.
102 :     my $rtype;
103 :     # Insure we catch errors.
104 :     eval {
105 : parrello 1.3 # Create the FIG object.
106 :     my $fig = FIG->new();
107 :     # We'll count our progress in here.
108 :     my $stats = Stats->new();
109 :     # Create a temporary directory for our calculations.
110 :     my $dest=$FIG_Config::temp."/evidence_codes.".time;
111 :     mkdir ($dest, 0775);
112 :     # The file name for the evidence codes will be put in here.
113 :     my $fileName;
114 :     if ($options->{load}) {
115 :     # Here we're loading from a precomputed file.
116 :     $fileName = $options->{load};
117 :     Trace("Evidence codes will be loaded from precomputed file $fileName") if T(2);
118 :     } else {
119 :     # Here we need to create the evidence codes.
120 :     $fileName = GetEvCodes($fig, $dest, $stats);
121 :     # Check for literature data.
122 :     if (-s "$FIG_Config::data/Dlits/dlits") {
123 :     # We have some, so roll it in.
124 :     Trace("Loading dlits.") if T(3);
125 :     system "$FIG_Config::bin/load_dlits";
126 :     Trace("Generating ilits.") if T(3);
127 :     system "$FIG_Config::bin/generate_ilits > $FIG_Config::data/Dlits/ilits";
128 :     Trace("Generating dlit codes.") if T(3);
129 :     system "$FIG_Config::bin/generate_dlit_ev_codes > $FIG_Config::data/Dlits/dlit.ev.codes";
130 :     Trace("Generating ilit codes.") if T(3);
131 :     system "$FIG_Config::bin/generate_ilit_ev_codes < $FIG_Config::data/Dlits/ilits > $FIG_Config::data/Dlits/ilit.ev.codes";
132 :     Trace("Sorting and writing literature codes.") if T(3);
133 :     system "cat $FIG_Config::data/Dlits/[di]lit.ev.codes | sort -u >> $fileName";
134 :     }
135 :     Trace("Evidence codes generated to file $fileName.") if T(2);
136 :     }
137 :     # Now we're ready to load. Get the attributes database.
138 : parrello 1.1 my $attr = CustomAttributes->new();
139 :     # Check for a class file.
140 :     if ($options->{classes}) {
141 :     # We have one. Load it into the evidence class table.
142 :     Trace("Loading evidence classes from $options->{classes}.") if T(2);
143 :     $attr->LoadTable($options->{classes}, 'EvidenceClass', truncate => 1);
144 :     Trace("Evidence classes loaded.") if T(2);
145 :     }
146 : parrello 1.3 # Now we convert the evidence code file into a load file for the IsEvidencedBy
147 :     # table. First, we open it.
148 :     my $ih = Open(undef, "<$fileName");
149 :     # Create the load file. We sort it to speed up the load.
150 :     my $loadFileName = "$FIG_Config::temp/IsEvidencedBy$$.dtx";
151 :     my $oh = Open(undef, "| sort >$loadFileName");
152 :     # Finally, we use this hash to track all the evidence classes.
153 :     my %classes = ();
154 :     Trace("Reading evidence codes.") if T(3);
155 :     # Loop through the input file, writing load records.
156 :     while (! eof $ih) {
157 :     # Read the input record.
158 :     my @fields = Tracer::GetLine($ih);
159 :     my $line = $.;
160 :     Trace("$line input lines processed.") if T(3) && ($. % 10000 == 0);
161 :     # Insure it's valid.
162 :     my $last = $#fields;
163 :     if ($last >= 3 || $last < 1) {
164 :     Trace("Record $line in input file has incorrect number of columns.") if T(3);
165 :     $stats->Add(errors => 1);
166 :     } else {
167 :     # Get the feature ID and the code.
168 :     my ($fid, $code) = @fields[0, $last];
169 :     # Validate the feature ID and the evidence code.
170 :     if (! ($fid =~ /^fig\|\d+/)) {
171 :     Trace("Record $line in input file has invalid feature ID \"$fid\".") if T(3);
172 :     $stats->Add(errors => 1);
173 :     } elsif (! ($code =~ /^([a-z]+)(.*)/)) {
174 :     Trace("Record $line in input file has invalid evidence code \"$code\".") if T(3);
175 : parrello 1.1 $stats->Add(errors => 1);
176 :     } else {
177 : parrello 1.3 # We have a valid input row. Produce the output line. Note that as a
178 :     # result of the pattern match that validated the evidence code, $1
179 :     # contains the class and $2 the modifier.
180 :     my ($class, $modifier) = ($1, $2);
181 :     Tracer::PutLine($oh, [$fid, $class, $modifier]);
182 :     # Count this as an output row and as a member of the specified class.
183 :     $stats->Add(rows => 1);
184 :     $stats->Add($class => 1);
185 :     $classes{$class} = 1;
186 : parrello 1.1 }
187 :     }
188 : parrello 1.3 }
189 :     Trace("Evidence codes reformatted.") if T(2);
190 :     # Close the files.
191 :     close $oh;
192 :     close $ih;
193 :     Trace("Evidence codes will be loaded from $loadFileName.") if T(2);
194 :     # Now we need to verify the incoming evidence codes against the known
195 :     # evidence classes. We issue a message for every non-existent class. It's
196 :     # not a serious error, but it's something the user should know.
197 :     Trace("Verifying evidence classes.") if T(2);
198 :     for my $class (keys %classes) {
199 :     if (! $attr->Exists(EvidenceClass => $class)) {
200 :     Trace("Evidence class \"$class\" not found in database!") if T(2);
201 :     $stats->Add(bad_class => 1);
202 : parrello 1.1 }
203 :     }
204 : parrello 1.3 # Now we load.
205 :     Trace("Loading evidence codes.") if T(2);
206 :     $attr->LoadTable($loadFileName, 'IsEvidencedBy', truncate => 1,
207 :     mode => 'concurrent');
208 :     # Tell the user we're done, and show the statistics.
209 :     Trace("Evidence codes loaded.\n" . $stats->Show()) if T(2);
210 : parrello 1.1 };
211 :     if ($@) {
212 :     Trace("Script failed with error: $@") if T(0);
213 :     $rtype = "error";
214 :     } else {
215 :     Trace("Script complete.") if T(2);
216 :     $rtype = "no error";
217 :     }
218 :     if ($options->{phone}) {
219 :     my $msgID = Tracer::SendSMS($options->{phone}, "EvCodeRefresh terminated with $rtype.");
220 :     if ($msgID) {
221 :     Trace("Phone message sent with ID $msgID.") if T(2);
222 :     } else {
223 :     Trace("Phone message not sent.") if T(2);
224 :     }
225 :     }
226 :    
227 : parrello 1.3 =head3 GetEvCodes
228 :    
229 :     my $fileName = GetEvCodes($fig, $dest, $stats);
230 :    
231 :     Create a tab-delimited file of evidence codes. The output file will be in
232 :     the specified destination directory, and it will contain three columns.
233 :     The first column will contain the name of a feature, and the last will
234 :     contain an associated evidence code. The second column is not used. It is
235 :     put in solely for compatibility with external scripts.
236 :    
237 :     =over 4
238 :    
239 :     =item fig
240 :    
241 :     A FIG-like object used to access the data store.
242 :    
243 :     =item dest
244 :    
245 :     Directory to contain the working files.
246 :    
247 :     =item stats
248 :    
249 :     A statistics object that can be used to track activity.
250 :    
251 :     =item RETURN
252 :    
253 :     Returns the name of a file in the destination directory. The file will contain the
254 :     new evidence codes.
255 :    
256 :     =back
257 :    
258 :     =cut
259 :    
260 :     sub GetEvCodes {
261 :     # Get the parameters.
262 :     my ($fig, $dest, $stats) = @_;
263 :     # The subsystem list goes in here.
264 :     my @all_subsystems = sort grep { $fig->usable_subsystem($_) } $fig->all_subsystems;
265 :     # This will contain all the pegs in subsystems.
266 :     my %insub;
267 :     # Create the output file.
268 :     my $retVal = "$dest/evidenceCodes.tbl";
269 :     my $oh = Open(undef, ">$retVal");
270 :     # Loop through the subsystems.
271 :     for my $sub (@all_subsystems) {
272 :     Trace("Processing $sub (" . $stats->Progress(subsystems => scalar @all_subsystems) . "%).") if T(3);
273 :     my $subsystem = new Subsystem($sub,$fig,0);
274 :     if (! $subsystem) {
275 :     Trace("Error loading subsystem $sub.") if T(1);
276 :     $stats->AddMessage("Could not load subsystem $sub.");
277 :     $stats->Add(badSubsystem => 1);
278 :     } else {
279 :     $stats->Add(subsystem => 1);
280 :     # Get ALL of the coupling data for the pegs in the subsystem.
281 :     my @cdata = $fig->coupled_to_batch($subsystem->get_all_pegs());
282 :     # returns triples [$peg1, $peg2, $score]. create %cdata that maps
283 :     # peg1 => [peg2]
284 :     my %cdata;
285 :     map { push(@{$cdata{$_->[0]}}, $_->[1]) } @cdata;
286 :     # Get the genomes and roles.
287 :     my @genomes = $subsystem->get_genomes();
288 :     my @roles = $subsystem->get_roles;
289 :     # Loop through the genomes.
290 :     for my $genome (@genomes) {
291 :     Trace("Processing $genome for $sub.") if T(4);
292 :     Trace($stats->Ask('subsystemRows') . " total spreadsheet rows processed.") if $stats->Check(subsystemRows => 500) && T(3);
293 :     # Build hashes for the different types of evidence.
294 :     my (%pegs_in_row, %isu, %icw, %isd);
295 :     # Get the variant code. This tells us what roles to expect.
296 :     my $vcode = $subsystem->get_variant_code($subsystem->get_genome_index($genome));
297 :     # Only proceed if it's a real, known variant.
298 :     if (($vcode ne "0") && ($vcode ne "-1")) {
299 :     # Loop through the roles.
300 :     for my $role (@roles) {
301 :     # Get the PEGs for this genome and role (one cell in the spreadsheet).
302 :     my @pegs = $subsystem->get_pegs_from_cell($genome,$role);
303 :     # Mark each of the pegs as being in a subsystem.
304 :     foreach my $peg (@pegs) {
305 :     $insub{$peg} = 1;
306 :     }
307 :     # Accumulate the number of pegs.
308 :     $stats->Add(pegRoles => scalar @pegs);
309 :     # The evidence we'll deduce depends on the number of pegs in this cell.
310 :     if (@pegs == 1) {
311 :     # If there's only one, save this peg as possessing InSubsystemUnique (isu) evidence.
312 :     $isu{$pegs[0]} = 1;
313 :     } elsif (@pegs > 1) {
314 :     # If there's more than one, then the evidence code is InSubsystemDuplicate (idu),
315 :     # modified by the number of pegs in the cell.
316 :     foreach my $peg (@pegs) {
317 :     $isd{$peg} = @pegs;
318 :     }
319 :     }
320 :     # For each peg, note that it occurs in a this row.
321 :     for my $peg (@pegs) {
322 :     $pegs_in_row{$peg} = 1;
323 :     }
324 :     }
325 :     }
326 :     # Loop through all the pegs in the current row. Use a batch request for this.
327 :     for my $peg (keys(%pegs_in_row)) {
328 :     # Get a list of all the pegs in the current row that are coupled to the current peg.
329 :     my $clist = $cdata{$peg};
330 :     if ($clist) {
331 :     my @coup = grep { $pegs_in_row{$_} } @$clist;
332 :     my $n = @coup;
333 :     if ($n > 0) {
334 :     # Denote that we have IsCoupledWith(icw) evidence for this peg, modified by
335 :     # the number of couplings.
336 :     $icw{$peg} = $n;
337 :     }
338 :     }
339 :     }
340 :     # Convert the subsystem name to a subsystem ID (spaces to underscores).
341 :     $sub =~ s/\s+/_/g;
342 :     # Loop through all the pegs in the current row.
343 :     for my $peg (keys(%pegs_in_row)) {
344 :     # If this peg has ISU evidence, send it to the output.
345 :     if ($isu{$peg} ) {
346 :     Tracer::PutLine($oh, [$peg, '', "isu;$sub"]);
347 :     }
348 :     # If this peg has ICW evidence, send it to the output.
349 :     if (my $n = $icw{$peg} ) {
350 :     Tracer::PutLine($oh, [$peg, '', "icw($n);$sub"]);
351 :     } else {
352 :     # If there's no ICW evidence, ty IDU evidence.
353 :     if ($n = $isd{$peg} ) {
354 :     Tracer::PutLine($oh, [$peg, '', "idu($n);$sub"]);
355 :     }
356 :     }
357 :     }
358 :     }
359 :     }
360 :     }
361 :    
362 :     # I have chosen to implement the
363 :     #
364 :     # ff - in FIGfam
365 :     # cwn - clustered with non-hypothetical
366 :     # cwh - clustered, but only with hypotheticals
367 :     #
368 :     # in a simple, fast manner. Perhaps, I should be going through access routines,
369 :     # but I am not
370 :    
371 :     # ff
372 :     # Get the families file and sort out duplicates.
373 :     Trace("Processing fig-families.") if T(2);
374 :     my $ffdata = &FIG::get_figfams_data();
375 :     if ((-s "$ffdata/families.2c") &&
376 :     Open(\*FF,"cut -f2 $ffdata/families.2c | sort -u |")) {
377 :     $stats->Add(familyFile => 1);
378 :     # Read the file.
379 :     while (defined($_ = <FF>)) {
380 :     # If there is a PEG on this line, denote the PEG has fig-family evidence.
381 :     if ($_ =~ /^(fig\|\d+\.\d+\.peg\.\d+)/) {
382 :     Tracer::PutLine($oh, [$1, '', 'ff']);
383 :     }
384 :     }
385 :     close(FF);
386 :     }
387 :     # cwn, cwh
388 :     # Get the coupling scores file.
389 :     Trace("Processing couplings.") if T(2);
390 :     if (open(FC,"<$FIG_Config::data/CouplingData/scores")) {
391 :     my $fc = <FC>;
392 :     # Loop until we hit end-of-file or a line beginning with a space.
393 :     while ($fc && ($fc =~ /^(\S+)/)) {
394 :     $stats->Add(couplingRows => 1);
395 :     my $curr = $1;
396 :     my @set = ();
397 :     # We expect now to read a set of lines with the current peg in the
398 :     # first column, followed by the coupled peg and a score.
399 :     while ($fc && ($fc =~ /^(\S+)\t(\S+)\t(\d+)/) && ($curr eq $1)) {
400 :     # If the score is high enough, add the coupled peg to the current set.
401 :     if ($3 >= 5) {
402 :     push(@set,$2);
403 :     }
404 :     $fc = <FC>;
405 :     }
406 :     # Now we're at the beginning of the next peg's couplings, but we need to
407 :     # look at the current peg's data. Only proceed if we found couplings
408 :     # and the PEG is not in a subsystem.
409 :     if (@set > 0 && ! $insub{$curr}) {
410 :     # Find out if all of the coupled pegs are hypothetical.
411 :     my $i;
412 :     for ($i=0; ($i < @set) && &FIG::hypo(scalar $fig->function_of($set[$i])); $i++) {}
413 :     # If we found a non-hypothetical PEG, we are CWN, else we're CWH.
414 :     if ($i < @set) {
415 :     Tracer::PutLine($oh, [$curr, '', 'cwn']);
416 :     } else {
417 :     Tracer::PutLine($oh, [$curr, '', 'cwh']);
418 :     }
419 :     }
420 :     }
421 :     close(FC);
422 :     }
423 :     # Close the output file.
424 :     close $oh;
425 :     # Return its name.
426 :     return $retVal;
427 :     }
428 :    
429 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3