[Bio] / FortyEight / imp_salvage.pl Repository:
ViewVC logotype

Annotation of /FortyEight/imp_salvage.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : olson 1.1 #
2 :     # Salvage annotations for the organisms that are replacements of SEED genomes.
3 :     #
4 :     # This is true if the file REPLACES exists and containst the genome ID of an
5 :     # existing SEED genome.
6 :     #
7 :     # Note that for this to work the SEED that this script is invoked from has to
8 :     # have a current copy of the canonical SEED organism data.
9 :     #
10 :     # Salvage does the following:
11 :     #
12 :     # Determine the old->new mapping using make_peg_maps_from_fasta.
13 :     #
14 :     # Determine the set of pegs for which we will salvage assignments by finding
15 :     # the set of mapped pegs from the old organism that are in subsystems. We will
16 :     # carry across only those assignments; the others will be taken from the RAST
17 :     # pipeline. We will construct annotation file entries for the other mapped PEGs
18 :     # that have the old annotation data to keep the history, but they will be capped
19 :     # with the annotation for the RAST assignment.
20 :     #
21 :    
22 :    
23 :     use strict;
24 :     use FIG;
25 :     use Data::Dumper;
26 :     use FIG_Config;
27 :     use File::Copy;
28 :     use File::Basename;
29 :     use ImportJob;
30 :     use GenomeMeta;
31 :     use JobStage;
32 : olson 1.2 use POSIX;
33 : olson 1.1
34 :     @ARGV == 1 or die "Usage: $0 job-dir\n";
35 :    
36 :     my $fig = new FIG();
37 :    
38 :     my $max_hits = 300;
39 :    
40 :     my $jobdir = shift;
41 :    
42 :     -d $jobdir or die "$0: job dir $jobdir does not exist\n";
43 :    
44 :     my $stage = new JobStage('ImportJob', 'salvage', $jobdir);
45 :    
46 :     $stage or die "$0: Could not create job object";
47 :    
48 :     my $job = $stage->job;
49 :     my $job_id = $job->id;
50 :    
51 :     $stage->log("Running on " . $stage->hostname);
52 :    
53 :     $stage->set_status("running");
54 :     $stage->set_running("yes");
55 :    
56 :     $stage->set_qualified_metadata("host", $stage->hostname);
57 :    
58 :     #
59 :     #
60 :     # Begin
61 :     #
62 :     #
63 :    
64 :     open(JOBS, "<$jobdir/rast.jobs") or $stage->fatal("Cannot open $jobdir/rast.jobs: $!");
65 :    
66 :     while (my $rjdir = <JOBS>)
67 :     {
68 :     chomp $rjdir;
69 :    
70 :     my $rj = new Job48($rjdir);
71 : olson 1.2 my $rj_id = $rj->id;
72 :     my $orgdir = $rj->orgdir();
73 :     my $repfile = "$orgdir/REPLACES";
74 : olson 1.1
75 :     my $repl = &FIG::file_head($repfile);
76 : olson 1.2
77 :     my $salvage_msg;
78 :    
79 : olson 1.1 if ($repl)
80 :     {
81 :     chomp $repl;
82 : olson 1.2 my $n = do_salvage($rj, $repl);
83 :    
84 :     $salvage_msg = "$n function assignments salvaged from $repl " . $fig->genus_species($repl);
85 :     }
86 :     else
87 :     {
88 :     #
89 :     # We are not salvaging, but we need to do a little cleanup to make the two cases the same.
90 :     #
91 :     # Create imp_assigned_functions from the set of *_functions files we have, and copy
92 :     # the annotations over to imp_annotations.
93 :     #
94 :    
95 :     my $imp_af = $stage->open_file(">$orgdir/imp_assigned_functions");
96 :     for my $f (qw(assigned_functions proposed_non_ff_functions proposed_functions))
97 :     {
98 :     my $path = "$orgdir/$f";
99 :    
100 :     if (open(AF, "<$path"))
101 :     {
102 :     while (<AF>)
103 :     {
104 :     print $imp_af $_;
105 :     }
106 :     close(AF);
107 :     }
108 :     }
109 :     close($imp_af);
110 :    
111 :     if (-f "$orgdir/annotations")
112 :     {
113 :     copy("$orgdir/annotations", "$orgdir/imp_annotations") or
114 :     $stage->fatal("Cannot copy $orgdir/annotations to $orgdir/imp_annotations: $!");
115 :     }
116 : olson 1.1 }
117 : olson 1.2
118 :     #
119 :     # While we're here, we're going to also mark this genome directory as a RAST job.
120 :     #
121 :    
122 :     my $fh = $stage->open_file(">$orgdir/RAST");
123 :     my $submit_time = ctime($rj->meta->get_metadata("upload.timestamp"));
124 :     my $dtime = (stat("$rjdir/DONE"))[9];
125 :     my $finish_time = ctime($dtime);
126 :     my $import_time = ctime(time);
127 :     print $fh "Genome processed by RAST at $FIG_Config::fig\n";
128 :     print $fh "$salvage_msg\n" if $salvage_msg;
129 :     print $fh "RAST job number $rj_id from $rjdir\n";
130 :     print $fh "Upload at: $submit_time";
131 :     print $fh "Completion at: $finish_time";
132 :     print $fh "Import processing at: $import_time";
133 :     close($fh);
134 :    
135 : olson 1.1 }
136 :    
137 :     close(JOBS);
138 :    
139 :     sub do_salvage
140 :     {
141 :     my($job, $old_genome) = @_;
142 :    
143 :     print "Do replacement on " . $job->genome_name() . " from $old_genome\n";
144 :    
145 :     my $orgdir = $job->orgdir();
146 :    
147 :     #
148 : olson 1.2 # Compute mappings.
149 : olson 1.1 #
150 :    
151 : olson 1.2 my $n_salvaged = 0;
152 : olson 1.1
153 :     my $old_orgdir = "$FIG_Config::organisms/$old_genome";
154 :    
155 :     -d $old_orgdir or $stage->fatal("Old organism dir $old_orgdir does not exist");
156 :    
157 :     my $maps = "$orgdir/peg_maps";
158 :     #
159 :     # XXX - should rerun this every time, later.
160 :     if (! -f $maps)
161 :     {
162 :     $stage->run_process("make_peg_maps_from_fasta",
163 :     "$FIG_Config::bin/make_peg_maps_from_fasta",
164 :     $old_orgdir,
165 :     $orgdir,
166 :     $maps);
167 :     }
168 :    
169 :     #
170 :     # Ingest the map and mark each peg with its subsystem status.
171 :     #
172 :    
173 :     open(M, "<$maps") or $stage->fatal("Cannot open peg maps $maps: $!");
174 :     my @map;
175 :     my(%old_to_new, %new_to_old);
176 :     while (<M>)
177 :     {
178 :     chomp;
179 :     my($peg_old, $peg_new) = split(/\t/);
180 :     my $ss = [$fig->peg_to_subsystems($peg_old)];
181 :     my $ent = {
182 :     old => $peg_old,
183 :     new => $peg_new,
184 :     ss => $ss,
185 :     };
186 :    
187 :     push(@map, $ent);
188 :     $old_to_new{$peg_old} = $ent;
189 :     $new_to_old{$peg_new} = $ent;
190 :     }
191 :     close(M);
192 :    
193 :     #
194 :     # Given this map, we construct the new assigned_functions and annotations files.
195 :     #
196 : olson 1.2 # We start by copying annotations to imp_annotations, to get the initial history.
197 : olson 1.1 #
198 :     # Then we scan the old organism's annotations and assigned function files,
199 :     # remembering any of the pegs that show up in the map. Any that do not,
200 :     # we write to files unmapped.annotations and unmapped.assigned_functions, again
201 :     # to retain history.
202 :     #
203 :     # Once this scan is complete, we scan the new organism's assigned functions file.
204 :     # If a peg in there is mapped, we copy the set of annotations for the old
205 :     # peg across into the new annotations file, mapping pegs as we go.
206 :     #
207 :     # If the peg was in a subsystem, we then write
208 :     # an annotation declaring the set of subsystems the peg was in, and a final
209 :     # annotation with the function assignment from the old org. The old
210 : olson 1.2 # assignment is written to imp_assigned_functions.
211 : olson 1.1 #
212 :     # If the peg is not in a subsystmem, we write a final annotation with the
213 : olson 1.2 # rast annotation, and write the rast assignment to imp_assigned_functions.
214 : olson 1.1 #
215 : olson 1.2 # If the peg was not mapped, we just write the rast function to imp_assigned_functions.
216 : olson 1.1 #
217 :     # Note that we don't actually have to scan anything - all we need to do is
218 :     # walk over the entries in the old/new map, and write out the appropriate data.
219 :     # Anything that isn't in there was already copied from the rast version of the data.
220 :     #
221 :    
222 : olson 1.2 my $new_af = $stage->open_file(">$orgdir/imp_assigned_functions");
223 :    
224 :     #
225 :     # Read the RAST assigned functions files, write a single large file with
226 :     # all the data in it, and pull the assignments into the %rast hash as well.
227 :     #
228 :    
229 :     my %rast;
230 :     my $imp_af = $stage->open_file(">$orgdir/imp_assigned_functions");
231 :     for my $f (qw(assigned_functions proposed_non_ff_functions proposed_functions))
232 :     {
233 :     my $path = "$orgdir/$f";
234 :    
235 :     if (open(AF, "<$path"))
236 :     {
237 :     while (<AF>)
238 :     {
239 :     print $imp_af $_;
240 :     chomp;
241 :     my($peg, $fn) = split(/\t/);
242 :     $rast{$peg} = $fn;
243 :     }
244 :     close(AF);
245 :     }
246 :     }
247 :     close($imp_af);
248 :    
249 :     #
250 :     # Copy annotations to imp_annotations to initialize it; leave the
251 :     # filehandle open to add more later on.
252 :     #
253 :     my $new_anno = $stage->open_file(">$orgdir/imp_annotations");
254 :     my $orig_anno = $stage->open_file("<$orgdir/annotations");
255 :     my $buf;
256 :     while (read($orig_anno, $buf, 4096))
257 :     {
258 :     print $new_anno $buf;
259 :     }
260 :     close($orig_anno);
261 : olson 1.1
262 :     my $unmapped_af = $stage->open_file(">$orgdir/unmapped.assigned_functions");
263 :     my $unmapped_anno = $stage->open_file(">$orgdir/unmapped.annotations");
264 :    
265 :     my $old_af = $stage->open_file("<$old_orgdir/assigned_functions");
266 :     my $old_anno = $stage->open_file("<$old_orgdir/annotations");
267 :    
268 :     my(%old_anno, %old_af);
269 :    
270 : olson 1.2 #
271 :     # Scan the SEED annotations and assigned functions files and ingest.
272 :     #
273 :    
274 : olson 1.1 while (<$old_af>)
275 :     {
276 :     chomp;
277 :     my($peg, $fun) = split(/\t/);
278 :     if (my $ent = $old_to_new{$peg})
279 :     {
280 :     $ent->{old_func} = $fun;
281 :     }
282 :     else
283 :     {
284 :     print $unmapped_af "$_\n";
285 :     }
286 :     }
287 :     close($old_af);
288 :    
289 :     {
290 :     local $/;
291 :     $/ = "//\n";
292 :    
293 :     while (<$old_anno>)
294 :     {
295 :     chomp;
296 :     my($peg, $time, $who, $what, $val) = split(/\n/, $_, 5);
297 :    
298 :     if (my $ent = $old_to_new{$peg})
299 :     {
300 :     $val =~ s/\n*$//;
301 :     push @{$ent->{old_anno}}, [$peg, $time, $who, $what, $val];
302 :     }
303 :     else
304 :     {
305 :     print $unmapped_anno $ent . $/;
306 :     }
307 :     }
308 :     }
309 :     close($old_anno);
310 :    
311 :     for my $new_peg (sort { &FIG::by_fig_id($a, $b) } keys %new_to_old)
312 :     {
313 :     my $ent = $new_to_old{$new_peg};
314 :    
315 :     #
316 :     # Copy old annotations
317 :     #
318 :     for my $anno (@{$ent->{old_anno}})
319 :     {
320 :     $anno->[0] = $new_peg;
321 :     print $new_anno join("\n", @$anno), "\n//\n";
322 :     }
323 :    
324 :     my $old_func = $ent->{old_func};
325 :     my $new_func = $rast{$new_peg};
326 :    
327 :     #
328 :     # Determine if we need to update.
329 :     #
330 :     my @ss_list = @{$ent->{ss}};
331 :    
332 :     if ($old_func eq '')
333 :     {
334 :     print "$ent->{old} => $new_peg No existing function\n";
335 :     print $new_anno join("\n", $new_peg, time, "salvage",
336 :     "No function found in original organism $ent->{old}"), "\n//\n";
337 :    
338 :     }
339 :     elsif ($old_func eq $new_func)
340 :     {
341 :     print "$ent->{old} => $new_peg functions are the same\n";
342 :     print $new_anno join("\n", $new_peg, time, "salvage",
343 :     "Old and new assignments are the same", $old_func), "\n//\n";
344 :    
345 :     }
346 :     else
347 :     {
348 :     if (@ss_list > 0)
349 :     {
350 :     print "$ent->{old} => $new_peg is in a ss\n";
351 :    
352 :     print $new_anno join("\n", $new_peg, time, "salvage",
353 :     "Retaining old assignment due to membership in subsystems", "@ss_list", $old_func), "\n//\n";
354 : olson 1.2 print $new_anno join("\n", $new_peg, time, "salvage", "Set master function to", $old_func), "\n//\n";
355 : olson 1.1 print $new_af "$new_peg\t$old_func\n";
356 : olson 1.2
357 :     $n_salvaged++;
358 : olson 1.1 }
359 :     else
360 :     {
361 :     print "$ent->{old} => $new_peg is not in a ss\n";
362 :    
363 :     print $new_anno join("\n", $new_peg, time, "salvage",
364 :     "Using RAST assignment due to no subsystem membership", $new_func), "\n//\n";
365 : olson 1.2 print $new_anno join("\n", $new_peg, time, "salvage", "Set master function to", $new_func), "\n//\n";
366 : olson 1.1 }
367 :     }
368 :     }
369 :     close($new_af);
370 :     close($new_anno);
371 : olson 1.2
372 :     return $n_salvaged;
373 : olson 1.1 }
374 :    
375 :    
376 :    
377 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3