[Bio] / FortyEight / imp_salvage.pl Repository:
ViewVC logotype

Annotation of /FortyEight/imp_salvage.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : olson 1.1 #
2 :     # Salvage annotations for the organisms that are replacements of SEED genomes.
3 :     #
4 :     # This is true if the file REPLACES exists and containst the genome ID of an
5 :     # existing SEED genome.
6 :     #
7 :     # Note that for this to work the SEED that this script is invoked from has to
8 :     # have a current copy of the canonical SEED organism data.
9 :     #
10 :     # Salvage does the following:
11 :     #
12 :     # Determine the old->new mapping using make_peg_maps_from_fasta.
13 :     #
14 :     # Determine the set of pegs for which we will salvage assignments by finding
15 :     # the set of mapped pegs from the old organism that are in subsystems. We will
16 :     # carry across only those assignments; the others will be taken from the RAST
17 :     # pipeline. We will construct annotation file entries for the other mapped PEGs
18 :     # that have the old annotation data to keep the history, but they will be capped
19 :     # with the annotation for the RAST assignment.
20 :     #
21 :    
22 :    
23 :     use strict;
24 :     use FIG;
25 :     use Data::Dumper;
26 :     use FIG_Config;
27 :     use File::Copy;
28 :     use File::Basename;
29 :     use ImportJob;
30 :     use GenomeMeta;
31 :     use JobStage;
32 : olson 1.2 use POSIX;
33 : olson 1.1
34 :     @ARGV == 1 or die "Usage: $0 job-dir\n";
35 :    
36 :     my $fig = new FIG();
37 :    
38 :     my $max_hits = 300;
39 :    
40 :     my $jobdir = shift;
41 :    
42 :     -d $jobdir or die "$0: job dir $jobdir does not exist\n";
43 :    
44 :     my $stage = new JobStage('ImportJob', 'salvage', $jobdir);
45 :    
46 :     $stage or die "$0: Could not create job object";
47 :    
48 :     my $job = $stage->job;
49 :     my $job_id = $job->id;
50 :    
51 :     $stage->log("Running on " . $stage->hostname);
52 :    
53 :     $stage->set_status("running");
54 :     $stage->set_running("yes");
55 :    
56 :     $stage->set_qualified_metadata("host", $stage->hostname);
57 :    
58 :     #
59 :     #
60 :     # Begin
61 :     #
62 :     #
63 :    
64 :     open(JOBS, "<$jobdir/rast.jobs") or $stage->fatal("Cannot open $jobdir/rast.jobs: $!");
65 :    
66 :     while (my $rjdir = <JOBS>)
67 :     {
68 :     chomp $rjdir;
69 :    
70 :     my $rj = new Job48($rjdir);
71 : olson 1.2 my $rj_id = $rj->id;
72 :     my $orgdir = $rj->orgdir();
73 :     my $repfile = "$orgdir/REPLACES";
74 : olson 1.1
75 :     my $repl = &FIG::file_head($repfile);
76 : olson 1.2
77 :     my $salvage_msg;
78 :    
79 : olson 1.1 if ($repl)
80 :     {
81 :     chomp $repl;
82 : olson 1.2 my $n = do_salvage($rj, $repl);
83 :    
84 :     $salvage_msg = "$n function assignments salvaged from $repl " . $fig->genus_species($repl);
85 :     }
86 :     else
87 :     {
88 :     #
89 :     # We are not salvaging, but we need to do a little cleanup to make the two cases the same.
90 :     #
91 :     # Create imp_assigned_functions from the set of *_functions files we have, and copy
92 :     # the annotations over to imp_annotations.
93 :     #
94 :    
95 :     my $imp_af = $stage->open_file(">$orgdir/imp_assigned_functions");
96 :     for my $f (qw(assigned_functions proposed_non_ff_functions proposed_functions))
97 :     {
98 :     my $path = "$orgdir/$f";
99 :    
100 :     if (open(AF, "<$path"))
101 :     {
102 :     while (<AF>)
103 :     {
104 :     print $imp_af $_;
105 :     }
106 :     close(AF);
107 :     }
108 :     }
109 :     close($imp_af);
110 :    
111 :     if (-f "$orgdir/annotations")
112 :     {
113 :     copy("$orgdir/annotations", "$orgdir/imp_annotations") or
114 :     $stage->fatal("Cannot copy $orgdir/annotations to $orgdir/imp_annotations: $!");
115 :     }
116 : olson 1.1 }
117 : olson 1.3
118 :     #
119 :     # See if this is a NMPDR organism.
120 :     #
121 :    
122 :     my $group;
123 :     if ($rj->meta->get_metadata("submit.nmpdr"))
124 :     {
125 :     #
126 :     # Determine the group.
127 :     #
128 :    
129 :     my $genome = $rj->genome_name();
130 :     $group = `$FIG_Config::bin/get_nmpdr_group $genome`;
131 :     if ($? == 0)
132 :     {
133 :     print "Marking $genome as being in NMPDR group $group\n";
134 :     my $nfh = $stage->open_file(">$orgdir/NMPDR");
135 :     print $nfh $group;
136 :     close($nfh);
137 :     }
138 :     }
139 :    
140 : olson 1.2
141 :     #
142 :     # While we're here, we're going to also mark this genome directory as a RAST job.
143 :     #
144 :    
145 :     my $fh = $stage->open_file(">$orgdir/RAST");
146 :     my $submit_time = ctime($rj->meta->get_metadata("upload.timestamp"));
147 :     my $dtime = (stat("$rjdir/DONE"))[9];
148 :     my $finish_time = ctime($dtime);
149 :     my $import_time = ctime(time);
150 :     print $fh "Genome processed by RAST at $FIG_Config::fig\n";
151 :     print $fh "$salvage_msg\n" if $salvage_msg;
152 : olson 1.3 print $fh "NMPDR Group: $group\n" if $group;
153 : olson 1.2 print $fh "RAST job number $rj_id from $rjdir\n";
154 :     print $fh "Upload at: $submit_time";
155 :     print $fh "Completion at: $finish_time";
156 :     print $fh "Import processing at: $import_time";
157 :     close($fh);
158 :    
159 : olson 1.1 }
160 :    
161 :     close(JOBS);
162 :    
163 : olson 1.3
164 : olson 1.1 sub do_salvage
165 :     {
166 :     my($job, $old_genome) = @_;
167 :    
168 :     print "Do replacement on " . $job->genome_name() . " from $old_genome\n";
169 :    
170 :     my $orgdir = $job->orgdir();
171 :    
172 :     #
173 : olson 1.2 # Compute mappings.
174 : olson 1.1 #
175 :    
176 : olson 1.2 my $n_salvaged = 0;
177 : olson 1.1
178 :     my $old_orgdir = "$FIG_Config::organisms/$old_genome";
179 :    
180 :     -d $old_orgdir or $stage->fatal("Old organism dir $old_orgdir does not exist");
181 :    
182 :     my $maps = "$orgdir/peg_maps";
183 :     #
184 :     # XXX - should rerun this every time, later.
185 :     if (! -f $maps)
186 :     {
187 :     $stage->run_process("make_peg_maps_from_fasta",
188 :     "$FIG_Config::bin/make_peg_maps_from_fasta",
189 :     $old_orgdir,
190 :     $orgdir,
191 :     $maps);
192 :     }
193 :    
194 :     #
195 :     # Ingest the map and mark each peg with its subsystem status.
196 :     #
197 :    
198 :     open(M, "<$maps") or $stage->fatal("Cannot open peg maps $maps: $!");
199 :     my @map;
200 :     my(%old_to_new, %new_to_old);
201 :     while (<M>)
202 :     {
203 :     chomp;
204 :     my($peg_old, $peg_new) = split(/\t/);
205 :     my $ss = [$fig->peg_to_subsystems($peg_old)];
206 :     my $ent = {
207 :     old => $peg_old,
208 :     new => $peg_new,
209 :     ss => $ss,
210 :     };
211 :    
212 :     push(@map, $ent);
213 :     $old_to_new{$peg_old} = $ent;
214 :     $new_to_old{$peg_new} = $ent;
215 :     }
216 :     close(M);
217 :    
218 :     #
219 :     # Given this map, we construct the new assigned_functions and annotations files.
220 :     #
221 : olson 1.2 # We start by copying annotations to imp_annotations, to get the initial history.
222 : olson 1.1 #
223 :     # Then we scan the old organism's annotations and assigned function files,
224 :     # remembering any of the pegs that show up in the map. Any that do not,
225 :     # we write to files unmapped.annotations and unmapped.assigned_functions, again
226 :     # to retain history.
227 :     #
228 :     # Once this scan is complete, we scan the new organism's assigned functions file.
229 :     # If a peg in there is mapped, we copy the set of annotations for the old
230 :     # peg across into the new annotations file, mapping pegs as we go.
231 :     #
232 :     # If the peg was in a subsystem, we then write
233 :     # an annotation declaring the set of subsystems the peg was in, and a final
234 :     # annotation with the function assignment from the old org. The old
235 : olson 1.2 # assignment is written to imp_assigned_functions.
236 : olson 1.1 #
237 :     # If the peg is not in a subsystmem, we write a final annotation with the
238 : olson 1.2 # rast annotation, and write the rast assignment to imp_assigned_functions.
239 : olson 1.1 #
240 : olson 1.2 # If the peg was not mapped, we just write the rast function to imp_assigned_functions.
241 : olson 1.1 #
242 :     # Note that we don't actually have to scan anything - all we need to do is
243 :     # walk over the entries in the old/new map, and write out the appropriate data.
244 :     # Anything that isn't in there was already copied from the rast version of the data.
245 :     #
246 :    
247 : olson 1.2 my $new_af = $stage->open_file(">$orgdir/imp_assigned_functions");
248 :    
249 :     #
250 :     # Read the RAST assigned functions files, write a single large file with
251 :     # all the data in it, and pull the assignments into the %rast hash as well.
252 :     #
253 :    
254 :     my %rast;
255 :     my $imp_af = $stage->open_file(">$orgdir/imp_assigned_functions");
256 :     for my $f (qw(assigned_functions proposed_non_ff_functions proposed_functions))
257 :     {
258 :     my $path = "$orgdir/$f";
259 :    
260 :     if (open(AF, "<$path"))
261 :     {
262 :     while (<AF>)
263 :     {
264 :     print $imp_af $_;
265 :     chomp;
266 :     my($peg, $fn) = split(/\t/);
267 :     $rast{$peg} = $fn;
268 :     }
269 :     close(AF);
270 :     }
271 :     }
272 :     close($imp_af);
273 :    
274 :     #
275 :     # Copy annotations to imp_annotations to initialize it; leave the
276 :     # filehandle open to add more later on.
277 :     #
278 :     my $new_anno = $stage->open_file(">$orgdir/imp_annotations");
279 :     my $orig_anno = $stage->open_file("<$orgdir/annotations");
280 :     my $buf;
281 :     while (read($orig_anno, $buf, 4096))
282 :     {
283 :     print $new_anno $buf;
284 :     }
285 :     close($orig_anno);
286 : olson 1.1
287 :     my $unmapped_af = $stage->open_file(">$orgdir/unmapped.assigned_functions");
288 :     my $unmapped_anno = $stage->open_file(">$orgdir/unmapped.annotations");
289 :    
290 :     my $old_af = $stage->open_file("<$old_orgdir/assigned_functions");
291 :     my $old_anno = $stage->open_file("<$old_orgdir/annotations");
292 :    
293 :     my(%old_anno, %old_af);
294 :    
295 : olson 1.2 #
296 :     # Scan the SEED annotations and assigned functions files and ingest.
297 :     #
298 :    
299 : olson 1.1 while (<$old_af>)
300 :     {
301 :     chomp;
302 :     my($peg, $fun) = split(/\t/);
303 :     if (my $ent = $old_to_new{$peg})
304 :     {
305 :     $ent->{old_func} = $fun;
306 :     }
307 :     else
308 :     {
309 :     print $unmapped_af "$_\n";
310 :     }
311 :     }
312 :     close($old_af);
313 :    
314 :     {
315 :     local $/;
316 :     $/ = "//\n";
317 :    
318 :     while (<$old_anno>)
319 :     {
320 :     chomp;
321 :     my($peg, $time, $who, $what, $val) = split(/\n/, $_, 5);
322 :    
323 :     if (my $ent = $old_to_new{$peg})
324 :     {
325 :     $val =~ s/\n*$//;
326 :     push @{$ent->{old_anno}}, [$peg, $time, $who, $what, $val];
327 :     }
328 :     else
329 :     {
330 :     print $unmapped_anno $ent . $/;
331 :     }
332 :     }
333 :     }
334 :     close($old_anno);
335 :    
336 :     for my $new_peg (sort { &FIG::by_fig_id($a, $b) } keys %new_to_old)
337 :     {
338 :     my $ent = $new_to_old{$new_peg};
339 :    
340 :     #
341 :     # Copy old annotations
342 :     #
343 :     for my $anno (@{$ent->{old_anno}})
344 :     {
345 :     $anno->[0] = $new_peg;
346 :     print $new_anno join("\n", @$anno), "\n//\n";
347 :     }
348 :    
349 :     my $old_func = $ent->{old_func};
350 :     my $new_func = $rast{$new_peg};
351 :    
352 :     #
353 :     # Determine if we need to update.
354 :     #
355 :     my @ss_list = @{$ent->{ss}};
356 :    
357 :     if ($old_func eq '')
358 :     {
359 :     print "$ent->{old} => $new_peg No existing function\n";
360 :     print $new_anno join("\n", $new_peg, time, "salvage",
361 :     "No function found in original organism $ent->{old}"), "\n//\n";
362 :    
363 :     }
364 :     elsif ($old_func eq $new_func)
365 :     {
366 :     print "$ent->{old} => $new_peg functions are the same\n";
367 :     print $new_anno join("\n", $new_peg, time, "salvage",
368 :     "Old and new assignments are the same", $old_func), "\n//\n";
369 :    
370 :     }
371 :     else
372 :     {
373 :     if (@ss_list > 0)
374 :     {
375 :     print "$ent->{old} => $new_peg is in a ss\n";
376 :    
377 :     print $new_anno join("\n", $new_peg, time, "salvage",
378 :     "Retaining old assignment due to membership in subsystems", "@ss_list", $old_func), "\n//\n";
379 : olson 1.2 print $new_anno join("\n", $new_peg, time, "salvage", "Set master function to", $old_func), "\n//\n";
380 : olson 1.1 print $new_af "$new_peg\t$old_func\n";
381 : olson 1.2
382 :     $n_salvaged++;
383 : olson 1.1 }
384 :     else
385 :     {
386 :     print "$ent->{old} => $new_peg is not in a ss\n";
387 :    
388 :     print $new_anno join("\n", $new_peg, time, "salvage",
389 :     "Using RAST assignment due to no subsystem membership", $new_func), "\n//\n";
390 : olson 1.2 print $new_anno join("\n", $new_peg, time, "salvage", "Set master function to", $new_func), "\n//\n";
391 : olson 1.1 }
392 :     }
393 :     }
394 :     close($new_af);
395 :     close($new_anno);
396 : olson 1.2
397 :     return $n_salvaged;
398 : olson 1.1 }
399 :    
400 :    
401 :    
402 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3