[Bio] / Sprout / EvCodeRefresh.pl Repository:
ViewVC logotype

Annotation of /Sprout/EvCodeRefresh.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     #
4 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
5 :     # for Interpretations of Genomes. All Rights Reserved.
6 :     #
7 :     # This file is part of the SEED Toolkit.
8 :     #
9 :     # The SEED Toolkit is free software. You can redistribute
10 :     # it and/or modify it under the terms of the SEED Toolkit
11 :     # Public License.
12 :     #
13 :     # You should have received a copy of the SEED Toolkit Public License
14 :     # along with this program; if not write to the University of Chicago
15 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
16 :     # Genomes at veronika@thefig.info or download a copy from
17 :     # http://www.theseed.org/LICENSE.TXT.
18 :     #
19 :    
20 :     use strict;
21 :     use Tracer;
22 :     use CustomAttributes;
23 :     use Stats;
24 : parrello 1.3 use FIG;
25 : parrello 1.1
26 :     =head1 EvCodeRefresh Script
27 :    
28 :     EvCodeRefresh [options] <filename>
29 :    
30 : parrello 1.3 Recompute evidence codes.
31 : parrello 1.1
32 :     =head2 Introduction
33 :    
34 : parrello 1.3 This script generates and reloads evidence codes.
35 : parrello 1.1
36 :     =head2 Command-Line Options
37 :    
38 :     =over 4
39 :    
40 :     =item trace
41 :    
42 :     Specifies the tracing level. The higher the tracing level, the more messages
43 :     will appear in the trace log. Use E to specify emergency tracing.
44 :    
45 : parrello 1.3 =item load
46 : parrello 1.1
47 : parrello 1.4 Specifies a file containing evidence codes. The file should contain 3
48 :     columns. Each record should contain a feature ID in the first column, the
49 :     literal C<evidence_code> in the middle column, and a corresponding evidence code
50 :     in the last column. If this option is specified, the evidence codes are loaded
51 :     from the file. Otherwise, the evidence codes are recomputed into a new temporary file.
52 : parrello 1.1
53 :     =item user
54 :    
55 :     Name suffix to be used for log files. If omitted, the PID is used.
56 :    
57 :     =item sql
58 :    
59 :     If specified, turns on tracing of SQL activity.
60 :    
61 :     =item background
62 :    
63 :     Save the standard and error output to files. The files will be created
64 :     in the FIG temporary directory and will be named C<err>I<User>C<.log> and
65 :     C<out>I<User>C<.log>, respectively, where I<User> is the value of the
66 :     B<user> option above.
67 :    
68 :     =item help
69 :    
70 :     Display this command's parameters and options.
71 :    
72 :     =item warn
73 :    
74 :     Create an event in the RSS feed when an error occurs.
75 :    
76 :     =item phone
77 :    
78 :     Phone number to message when the script is complete.
79 :    
80 :     =back
81 :    
82 :     =cut
83 :    
84 :     # Get the command-line options and parameters.
85 :     my ($options, @parameters) = StandardSetup([qw(ERDB CustomAttributes) ],
86 :     {
87 : parrello 1.3 load => ["", "file containing evidence codes; if none specified, evidence codes will be computed"],
88 : parrello 1.1 trace => ["2", "tracing level"],
89 :     phone => ["", "phone number (international format) to call when load finishes"]
90 :     },
91 : parrello 1.3 "",
92 : parrello 1.1 @ARGV);
93 :     # Set a variable to contain return type information.
94 :     my $rtype;
95 :     # Insure we catch errors.
96 :     eval {
97 : parrello 1.3 # Create the FIG object.
98 :     my $fig = FIG->new();
99 :     # We'll count our progress in here.
100 :     my $stats = Stats->new();
101 :     # Create a temporary directory for our calculations.
102 :     my $dest=$FIG_Config::temp."/evidence_codes.".time;
103 :     mkdir ($dest, 0775);
104 :     # The file name for the evidence codes will be put in here.
105 :     my $fileName;
106 :     if ($options->{load}) {
107 :     # Here we're loading from a precomputed file.
108 :     $fileName = $options->{load};
109 :     Trace("Evidence codes will be loaded from precomputed file $fileName") if T(2);
110 :     } else {
111 :     # Here we need to create the evidence codes.
112 :     $fileName = GetEvCodes($fig, $dest, $stats);
113 :     # Check for literature data.
114 :     if (-s "$FIG_Config::data/Dlits/dlits") {
115 :     # We have some, so roll it in.
116 :     Trace("Loading dlits.") if T(3);
117 :     system "$FIG_Config::bin/load_dlits";
118 :     Trace("Generating ilits.") if T(3);
119 :     system "$FIG_Config::bin/generate_ilits > $FIG_Config::data/Dlits/ilits";
120 :     Trace("Generating dlit codes.") if T(3);
121 :     system "$FIG_Config::bin/generate_dlit_ev_codes > $FIG_Config::data/Dlits/dlit.ev.codes";
122 :     Trace("Generating ilit codes.") if T(3);
123 :     system "$FIG_Config::bin/generate_ilit_ev_codes < $FIG_Config::data/Dlits/ilits > $FIG_Config::data/Dlits/ilit.ev.codes";
124 :     Trace("Sorting and writing literature codes.") if T(3);
125 :     system "cat $FIG_Config::data/Dlits/[di]lit.ev.codes | sort -u >> $fileName";
126 :     }
127 :     Trace("Evidence codes generated to file $fileName.") if T(2);
128 :     }
129 :     # Now we're ready to load. Get the attributes database.
130 : parrello 1.1 my $attr = CustomAttributes->new();
131 : parrello 1.4 # Load it.
132 : parrello 1.3 Trace("Loading evidence codes.") if T(2);
133 : parrello 1.4 $attr->LoadAttributesFrom($fileName, mode => 'concurrent');
134 : parrello 1.3 # Tell the user we're done, and show the statistics.
135 :     Trace("Evidence codes loaded.\n" . $stats->Show()) if T(2);
136 : parrello 1.1 };
137 :     if ($@) {
138 :     Trace("Script failed with error: $@") if T(0);
139 :     $rtype = "error";
140 :     } else {
141 :     Trace("Script complete.") if T(2);
142 :     $rtype = "no error";
143 :     }
144 :     if ($options->{phone}) {
145 :     my $msgID = Tracer::SendSMS($options->{phone}, "EvCodeRefresh terminated with $rtype.");
146 :     if ($msgID) {
147 :     Trace("Phone message sent with ID $msgID.") if T(2);
148 :     } else {
149 :     Trace("Phone message not sent.") if T(2);
150 :     }
151 :     }
152 :    
153 : parrello 1.3 =head3 GetEvCodes
154 :    
155 :     my $fileName = GetEvCodes($fig, $dest, $stats);
156 :    
157 :     Create a tab-delimited file of evidence codes. The output file will be in
158 :     the specified destination directory, and it will contain three columns.
159 :     The first column will contain the name of a feature, and the last will
160 :     contain an associated evidence code. The second column is not used. It is
161 :     put in solely for compatibility with external scripts.
162 :    
163 :     =over 4
164 :    
165 :     =item fig
166 :    
167 :     A FIG-like object used to access the data store.
168 :    
169 :     =item dest
170 :    
171 :     Directory to contain the working files.
172 :    
173 :     =item stats
174 :    
175 :     A statistics object that can be used to track activity.
176 :    
177 :     =item RETURN
178 :    
179 :     Returns the name of a file in the destination directory. The file will contain the
180 :     new evidence codes.
181 :    
182 :     =back
183 :    
184 :     =cut
185 :    
186 :     sub GetEvCodes {
187 :     # Get the parameters.
188 :     my ($fig, $dest, $stats) = @_;
189 :     # The subsystem list goes in here.
190 :     my @all_subsystems = sort grep { $fig->usable_subsystem($_) } $fig->all_subsystems;
191 :     # This will contain all the pegs in subsystems.
192 :     my %insub;
193 :     # Create the output file.
194 :     my $retVal = "$dest/evidenceCodes.tbl";
195 :     my $oh = Open(undef, ">$retVal");
196 :     # Loop through the subsystems.
197 :     for my $sub (@all_subsystems) {
198 :     Trace("Processing $sub (" . $stats->Progress(subsystems => scalar @all_subsystems) . "%).") if T(3);
199 :     my $subsystem = new Subsystem($sub,$fig,0);
200 :     if (! $subsystem) {
201 :     Trace("Error loading subsystem $sub.") if T(1);
202 :     $stats->AddMessage("Could not load subsystem $sub.");
203 :     $stats->Add(badSubsystem => 1);
204 :     } else {
205 :     $stats->Add(subsystem => 1);
206 :     # Get ALL of the coupling data for the pegs in the subsystem.
207 :     my @cdata = $fig->coupled_to_batch($subsystem->get_all_pegs());
208 :     # returns triples [$peg1, $peg2, $score]. create %cdata that maps
209 :     # peg1 => [peg2]
210 :     my %cdata;
211 :     map { push(@{$cdata{$_->[0]}}, $_->[1]) } @cdata;
212 :     # Get the genomes and roles.
213 :     my @genomes = $subsystem->get_genomes();
214 :     my @roles = $subsystem->get_roles;
215 :     # Loop through the genomes.
216 :     for my $genome (@genomes) {
217 :     Trace("Processing $genome for $sub.") if T(4);
218 :     Trace($stats->Ask('subsystemRows') . " total spreadsheet rows processed.") if $stats->Check(subsystemRows => 500) && T(3);
219 :     # Build hashes for the different types of evidence.
220 :     my (%pegs_in_row, %isu, %icw, %isd);
221 :     # Get the variant code. This tells us what roles to expect.
222 :     my $vcode = $subsystem->get_variant_code($subsystem->get_genome_index($genome));
223 :     # Only proceed if it's a real, known variant.
224 :     if (($vcode ne "0") && ($vcode ne "-1")) {
225 :     # Loop through the roles.
226 :     for my $role (@roles) {
227 :     # Get the PEGs for this genome and role (one cell in the spreadsheet).
228 :     my @pegs = $subsystem->get_pegs_from_cell($genome,$role);
229 :     # Mark each of the pegs as being in a subsystem.
230 :     foreach my $peg (@pegs) {
231 :     $insub{$peg} = 1;
232 :     }
233 :     # Accumulate the number of pegs.
234 :     $stats->Add(pegRoles => scalar @pegs);
235 :     # The evidence we'll deduce depends on the number of pegs in this cell.
236 :     if (@pegs == 1) {
237 :     # If there's only one, save this peg as possessing InSubsystemUnique (isu) evidence.
238 :     $isu{$pegs[0]} = 1;
239 :     } elsif (@pegs > 1) {
240 :     # If there's more than one, then the evidence code is InSubsystemDuplicate (idu),
241 :     # modified by the number of pegs in the cell.
242 :     foreach my $peg (@pegs) {
243 :     $isd{$peg} = @pegs;
244 :     }
245 :     }
246 :     # For each peg, note that it occurs in a this row.
247 :     for my $peg (@pegs) {
248 :     $pegs_in_row{$peg} = 1;
249 :     }
250 :     }
251 :     }
252 :     # Loop through all the pegs in the current row. Use a batch request for this.
253 :     for my $peg (keys(%pegs_in_row)) {
254 :     # Get a list of all the pegs in the current row that are coupled to the current peg.
255 :     my $clist = $cdata{$peg};
256 :     if ($clist) {
257 :     my @coup = grep { $pegs_in_row{$_} } @$clist;
258 :     my $n = @coup;
259 :     if ($n > 0) {
260 :     # Denote that we have IsCoupledWith(icw) evidence for this peg, modified by
261 :     # the number of couplings.
262 :     $icw{$peg} = $n;
263 :     }
264 :     }
265 :     }
266 :     # Convert the subsystem name to a subsystem ID (spaces to underscores).
267 :     $sub =~ s/\s+/_/g;
268 :     # Loop through all the pegs in the current row.
269 :     for my $peg (keys(%pegs_in_row)) {
270 :     # If this peg has ISU evidence, send it to the output.
271 :     if ($isu{$peg} ) {
272 : parrello 1.4 Tracer::PutLine($oh, [$peg, 'evidence_code', "isu;$sub"]);
273 : parrello 1.3 }
274 :     # If this peg has ICW evidence, send it to the output.
275 :     if (my $n = $icw{$peg} ) {
276 : parrello 1.4 Tracer::PutLine($oh, [$peg, 'evidence_code', "icw($n);$sub"]);
277 : parrello 1.3 } else {
278 :     # If there's no ICW evidence, ty IDU evidence.
279 :     if ($n = $isd{$peg} ) {
280 : parrello 1.4 Tracer::PutLine($oh, [$peg, 'evidence_code', "idu($n);$sub"]);
281 : parrello 1.3 }
282 :     }
283 :     }
284 :     }
285 :     }
286 :     }
287 :    
288 :     # I have chosen to implement the
289 :     #
290 :     # ff - in FIGfam
291 :     # cwn - clustered with non-hypothetical
292 :     # cwh - clustered, but only with hypotheticals
293 :     #
294 :     # in a simple, fast manner. Perhaps, I should be going through access routines,
295 :     # but I am not
296 :    
297 :     # ff
298 :     # Get the families file and sort out duplicates.
299 :     Trace("Processing fig-families.") if T(2);
300 :     my $ffdata = &FIG::get_figfams_data();
301 :     if ((-s "$ffdata/families.2c") &&
302 :     Open(\*FF,"cut -f2 $ffdata/families.2c | sort -u |")) {
303 :     $stats->Add(familyFile => 1);
304 :     # Read the file.
305 :     while (defined($_ = <FF>)) {
306 :     # If there is a PEG on this line, denote the PEG has fig-family evidence.
307 :     if ($_ =~ /^(fig\|\d+\.\d+\.peg\.\d+)/) {
308 : parrello 1.4 Tracer::PutLine($oh, [$1, 'evidence_code', 'ff']);
309 : parrello 1.3 }
310 :     }
311 :     close(FF);
312 :     }
313 :     # cwn, cwh
314 :     # Get the coupling scores file.
315 :     Trace("Processing couplings.") if T(2);
316 :     if (open(FC,"<$FIG_Config::data/CouplingData/scores")) {
317 :     my $fc = <FC>;
318 :     # Loop until we hit end-of-file or a line beginning with a space.
319 :     while ($fc && ($fc =~ /^(\S+)/)) {
320 :     $stats->Add(couplingRows => 1);
321 :     my $curr = $1;
322 :     my @set = ();
323 :     # We expect now to read a set of lines with the current peg in the
324 :     # first column, followed by the coupled peg and a score.
325 :     while ($fc && ($fc =~ /^(\S+)\t(\S+)\t(\d+)/) && ($curr eq $1)) {
326 :     # If the score is high enough, add the coupled peg to the current set.
327 :     if ($3 >= 5) {
328 :     push(@set,$2);
329 :     }
330 :     $fc = <FC>;
331 :     }
332 :     # Now we're at the beginning of the next peg's couplings, but we need to
333 :     # look at the current peg's data. Only proceed if we found couplings
334 :     # and the PEG is not in a subsystem.
335 :     if (@set > 0 && ! $insub{$curr}) {
336 :     # Find out if all of the coupled pegs are hypothetical.
337 :     my $i;
338 :     for ($i=0; ($i < @set) && &FIG::hypo(scalar $fig->function_of($set[$i])); $i++) {}
339 :     # If we found a non-hypothetical PEG, we are CWN, else we're CWH.
340 :     if ($i < @set) {
341 : parrello 1.4 Tracer::PutLine($oh, [$curr, 'evidence_code', 'cwn']);
342 : parrello 1.3 } else {
343 : parrello 1.4 Tracer::PutLine($oh, [$curr, 'evidence_code', 'cwh']);
344 : parrello 1.3 }
345 :     }
346 :     }
347 :     close(FC);
348 :     }
349 :     # Close the output file.
350 :     close $oh;
351 :     # Return its name.
352 :     return $retVal;
353 :     }
354 :    
355 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3