[Bio] / FortyEight / imp_salvage.pl Repository:
ViewVC logotype

Annotation of /FortyEight/imp_salvage.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : olson 1.1 #
2 :     # Salvage annotations for the organisms that are replacements of SEED genomes.
3 :     #
4 :     # This is true if the file REPLACES exists and containst the genome ID of an
5 :     # existing SEED genome.
6 :     #
7 :     # Note that for this to work the SEED that this script is invoked from has to
8 :     # have a current copy of the canonical SEED organism data.
9 :     #
10 :     # Salvage does the following:
11 :     #
12 :     # Determine the old->new mapping using make_peg_maps_from_fasta.
13 :     #
14 :     # Determine the set of pegs for which we will salvage assignments by finding
15 :     # the set of mapped pegs from the old organism that are in subsystems. We will
16 :     # carry across only those assignments; the others will be taken from the RAST
17 :     # pipeline. We will construct annotation file entries for the other mapped PEGs
18 :     # that have the old annotation data to keep the history, but they will be capped
19 :     # with the annotation for the RAST assignment.
20 :     #
21 :    
22 :    
23 :     use strict;
24 :     use FIG;
25 :     use Data::Dumper;
26 :     use FIG_Config;
27 :     use File::Copy;
28 :     use File::Basename;
29 :     use ImportJob;
30 :     use GenomeMeta;
31 :     use JobStage;
32 : olson 1.2 use POSIX;
33 : olson 1.1
34 :     @ARGV == 1 or die "Usage: $0 job-dir\n";
35 :    
36 :     my $fig = new FIG();
37 :    
38 :     my $max_hits = 300;
39 :    
40 :     my $jobdir = shift;
41 :    
42 :     -d $jobdir or die "$0: job dir $jobdir does not exist\n";
43 :    
44 :     my $stage = new JobStage('ImportJob', 'salvage', $jobdir);
45 :    
46 :     $stage or die "$0: Could not create job object";
47 :    
48 :     my $job = $stage->job;
49 :     my $job_id = $job->id;
50 :    
51 :     $stage->log("Running on " . $stage->hostname);
52 :    
53 :     $stage->set_status("running");
54 :     $stage->set_running("yes");
55 :    
56 :     $stage->set_qualified_metadata("host", $stage->hostname);
57 :    
58 :     #
59 :     #
60 :     # Begin
61 :     #
62 :     #
63 :    
64 :     open(JOBS, "<$jobdir/rast.jobs") or $stage->fatal("Cannot open $jobdir/rast.jobs: $!");
65 :    
66 :     while (my $rjdir = <JOBS>)
67 :     {
68 :     chomp $rjdir;
69 :    
70 :     my $rj = new Job48($rjdir);
71 : olson 1.2 my $rj_id = $rj->id;
72 :     my $orgdir = $rj->orgdir();
73 : olson 1.1
74 : olson 1.4 my $repl = $rj->meta->get_metadata("replace.seedID");
75 : olson 1.2
76 :     my $salvage_msg;
77 : olson 1.4
78 :     if ($repl ne "")
79 :     {
80 :     my $repfile = "$orgdir/REPLACES";
81 :     my $fh = $stage->open_file(">$repfile");
82 :     print $fh "$repl\n";
83 :     close($fh);
84 : olson 1.2
85 :     my $n = do_salvage($rj, $repl);
86 :    
87 :     $salvage_msg = "$n function assignments salvaged from $repl " . $fig->genus_species($repl);
88 :     }
89 :     else
90 :     {
91 :     #
92 :     # We are not salvaging, but we need to do a little cleanup to make the two cases the same.
93 :     #
94 :     # Create imp_assigned_functions from the set of *_functions files we have, and copy
95 :     # the annotations over to imp_annotations.
96 :     #
97 :    
98 :     my $imp_af = $stage->open_file(">$orgdir/imp_assigned_functions");
99 :     for my $f (qw(assigned_functions proposed_non_ff_functions proposed_functions))
100 :     {
101 :     my $path = "$orgdir/$f";
102 :    
103 :     if (open(AF, "<$path"))
104 :     {
105 :     while (<AF>)
106 :     {
107 :     print $imp_af $_;
108 :     }
109 :     close(AF);
110 :     }
111 :     }
112 :     close($imp_af);
113 :    
114 :     if (-f "$orgdir/annotations")
115 :     {
116 :     copy("$orgdir/annotations", "$orgdir/imp_annotations") or
117 :     $stage->fatal("Cannot copy $orgdir/annotations to $orgdir/imp_annotations: $!");
118 :     }
119 : olson 1.1 }
120 : olson 1.3
121 :     #
122 :     # See if this is a NMPDR organism.
123 :     #
124 :    
125 :     my $group;
126 :     if ($rj->meta->get_metadata("submit.nmpdr"))
127 :     {
128 :     #
129 :     # Determine the group.
130 :     #
131 :    
132 :     my $genome = $rj->genome_name();
133 :     $group = `$FIG_Config::bin/get_nmpdr_group $genome`;
134 :     if ($? == 0)
135 :     {
136 :     print "Marking $genome as being in NMPDR group $group\n";
137 :     my $nfh = $stage->open_file(">$orgdir/NMPDR");
138 :     print $nfh $group;
139 :     close($nfh);
140 :     }
141 :     }
142 :    
143 : olson 1.2
144 :     #
145 :     # While we're here, we're going to also mark this genome directory as a RAST job.
146 :     #
147 :    
148 :     my $fh = $stage->open_file(">$orgdir/RAST");
149 :     my $submit_time = ctime($rj->meta->get_metadata("upload.timestamp"));
150 :     my $dtime = (stat("$rjdir/DONE"))[9];
151 :     my $finish_time = ctime($dtime);
152 :     my $import_time = ctime(time);
153 :     print $fh "Genome processed by RAST at $FIG_Config::fig\n";
154 :     print $fh "$salvage_msg\n" if $salvage_msg;
155 : olson 1.3 print $fh "NMPDR Group: $group\n" if $group;
156 : olson 1.2 print $fh "RAST job number $rj_id from $rjdir\n";
157 :     print $fh "Upload at: $submit_time";
158 :     print $fh "Completion at: $finish_time";
159 :     print $fh "Import processing at: $import_time";
160 :     close($fh);
161 :    
162 : olson 1.1 }
163 :    
164 :     close(JOBS);
165 :    
166 : olson 1.3
167 : olson 1.1 sub do_salvage
168 :     {
169 :     my($job, $old_genome) = @_;
170 :    
171 :     print "Do replacement on " . $job->genome_name() . " from $old_genome\n";
172 :    
173 :     my $orgdir = $job->orgdir();
174 :    
175 :     #
176 : olson 1.2 # Compute mappings.
177 : olson 1.1 #
178 :    
179 : olson 1.2 my $n_salvaged = 0;
180 : olson 1.1
181 :     my $old_orgdir = "$FIG_Config::organisms/$old_genome";
182 :    
183 :     -d $old_orgdir or $stage->fatal("Old organism dir $old_orgdir does not exist");
184 :    
185 :     my $maps = "$orgdir/peg_maps";
186 :     #
187 :     # XXX - should rerun this every time, later.
188 :     if (! -f $maps)
189 :     {
190 :     $stage->run_process("make_peg_maps_from_fasta",
191 :     "$FIG_Config::bin/make_peg_maps_from_fasta",
192 :     $old_orgdir,
193 :     $orgdir,
194 :     $maps);
195 :     }
196 :    
197 :     #
198 :     # Ingest the map and mark each peg with its subsystem status.
199 :     #
200 :    
201 :     open(M, "<$maps") or $stage->fatal("Cannot open peg maps $maps: $!");
202 :     my @map;
203 :     my(%old_to_new, %new_to_old);
204 :     while (<M>)
205 :     {
206 :     chomp;
207 :     my($peg_old, $peg_new) = split(/\t/);
208 :     my $ss = [$fig->peg_to_subsystems($peg_old)];
209 :     my $ent = {
210 :     old => $peg_old,
211 :     new => $peg_new,
212 :     ss => $ss,
213 :     };
214 :    
215 :     push(@map, $ent);
216 :     $old_to_new{$peg_old} = $ent;
217 :     $new_to_old{$peg_new} = $ent;
218 :     }
219 :     close(M);
220 :    
221 :     #
222 :     # Given this map, we construct the new assigned_functions and annotations files.
223 :     #
224 : olson 1.2 # We start by copying annotations to imp_annotations, to get the initial history.
225 : olson 1.1 #
226 :     # Then we scan the old organism's annotations and assigned function files,
227 :     # remembering any of the pegs that show up in the map. Any that do not,
228 :     # we write to files unmapped.annotations and unmapped.assigned_functions, again
229 :     # to retain history.
230 :     #
231 :     # Once this scan is complete, we scan the new organism's assigned functions file.
232 :     # If a peg in there is mapped, we copy the set of annotations for the old
233 :     # peg across into the new annotations file, mapping pegs as we go.
234 :     #
235 :     # If the peg was in a subsystem, we then write
236 :     # an annotation declaring the set of subsystems the peg was in, and a final
237 :     # annotation with the function assignment from the old org. The old
238 : olson 1.2 # assignment is written to imp_assigned_functions.
239 : olson 1.1 #
240 :     # If the peg is not in a subsystmem, we write a final annotation with the
241 : olson 1.2 # rast annotation, and write the rast assignment to imp_assigned_functions.
242 : olson 1.1 #
243 : olson 1.2 # If the peg was not mapped, we just write the rast function to imp_assigned_functions.
244 : olson 1.1 #
245 :     # Note that we don't actually have to scan anything - all we need to do is
246 :     # walk over the entries in the old/new map, and write out the appropriate data.
247 :     # Anything that isn't in there was already copied from the rast version of the data.
248 :     #
249 :    
250 : olson 1.2 my $new_af = $stage->open_file(">$orgdir/imp_assigned_functions");
251 :    
252 :     #
253 :     # Read the RAST assigned functions files, write a single large file with
254 :     # all the data in it, and pull the assignments into the %rast hash as well.
255 :     #
256 :    
257 :     my %rast;
258 : olson 1.4
259 : olson 1.2 for my $f (qw(assigned_functions proposed_non_ff_functions proposed_functions))
260 :     {
261 :     my $path = "$orgdir/$f";
262 :    
263 :     if (open(AF, "<$path"))
264 :     {
265 :     while (<AF>)
266 :     {
267 : olson 1.4 print $new_af $_;
268 : olson 1.2 chomp;
269 :     my($peg, $fn) = split(/\t/);
270 :     $rast{$peg} = $fn;
271 :     }
272 :     close(AF);
273 :     }
274 :     }
275 :    
276 :     #
277 :     # Copy annotations to imp_annotations to initialize it; leave the
278 :     # filehandle open to add more later on.
279 :     #
280 :     my $new_anno = $stage->open_file(">$orgdir/imp_annotations");
281 :     my $orig_anno = $stage->open_file("<$orgdir/annotations");
282 :     my $buf;
283 :     while (read($orig_anno, $buf, 4096))
284 :     {
285 :     print $new_anno $buf;
286 :     }
287 :     close($orig_anno);
288 : olson 1.1
289 :     my $unmapped_af = $stage->open_file(">$orgdir/unmapped.assigned_functions");
290 :     my $unmapped_anno = $stage->open_file(">$orgdir/unmapped.annotations");
291 :    
292 :     my $old_af = $stage->open_file("<$old_orgdir/assigned_functions");
293 :     my $old_anno = $stage->open_file("<$old_orgdir/annotations");
294 :    
295 :     my(%old_anno, %old_af);
296 :    
297 : olson 1.2 #
298 :     # Scan the SEED annotations and assigned functions files and ingest.
299 :     #
300 :    
301 : olson 1.1 while (<$old_af>)
302 :     {
303 :     chomp;
304 :     my($peg, $fun) = split(/\t/);
305 :     if (my $ent = $old_to_new{$peg})
306 :     {
307 :     $ent->{old_func} = $fun;
308 :     }
309 :     else
310 :     {
311 :     print $unmapped_af "$_\n";
312 :     }
313 :     }
314 :     close($old_af);
315 :    
316 :     {
317 :     local $/;
318 :     $/ = "//\n";
319 :    
320 :     while (<$old_anno>)
321 :     {
322 :     chomp;
323 :     my($peg, $time, $who, $what, $val) = split(/\n/, $_, 5);
324 :    
325 :     if (my $ent = $old_to_new{$peg})
326 :     {
327 :     $val =~ s/\n*$//;
328 :     push @{$ent->{old_anno}}, [$peg, $time, $who, $what, $val];
329 :     }
330 :     else
331 :     {
332 :     print $unmapped_anno $ent . $/;
333 :     }
334 :     }
335 :     }
336 :     close($old_anno);
337 :    
338 :     for my $new_peg (sort { &FIG::by_fig_id($a, $b) } keys %new_to_old)
339 :     {
340 :     my $ent = $new_to_old{$new_peg};
341 :    
342 :     #
343 :     # Copy old annotations
344 :     #
345 :     for my $anno (@{$ent->{old_anno}})
346 :     {
347 :     $anno->[0] = $new_peg;
348 :     print $new_anno join("\n", @$anno), "\n//\n";
349 :     }
350 :    
351 :     my $old_func = $ent->{old_func};
352 :     my $new_func = $rast{$new_peg};
353 :    
354 :     #
355 :     # Determine if we need to update.
356 :     #
357 :     my @ss_list = @{$ent->{ss}};
358 :    
359 :     if ($old_func eq '')
360 :     {
361 : olson 1.4 # print "$ent->{old} => $new_peg No existing function\n";
362 : olson 1.1 print $new_anno join("\n", $new_peg, time, "salvage",
363 :     "No function found in original organism $ent->{old}"), "\n//\n";
364 :    
365 :     }
366 :     elsif ($old_func eq $new_func)
367 :     {
368 : olson 1.4 # print "$ent->{old} => $new_peg functions are the same\n";
369 : olson 1.1 print $new_anno join("\n", $new_peg, time, "salvage",
370 :     "Old and new assignments are the same", $old_func), "\n//\n";
371 :    
372 :     }
373 :     else
374 :     {
375 :     if (@ss_list > 0)
376 :     {
377 : olson 1.4 # print "$ent->{old} => $new_peg is in a ss\n";
378 : olson 1.1
379 :     print $new_anno join("\n", $new_peg, time, "salvage",
380 :     "Retaining old assignment due to membership in subsystems", "@ss_list", $old_func), "\n//\n";
381 : olson 1.2 print $new_anno join("\n", $new_peg, time, "salvage", "Set master function to", $old_func), "\n//\n";
382 : olson 1.1 print $new_af "$new_peg\t$old_func\n";
383 : olson 1.2
384 :     $n_salvaged++;
385 : olson 1.1 }
386 :     else
387 :     {
388 : olson 1.4 # print "$ent->{old} => $new_peg is not in a ss\n";
389 : olson 1.1
390 :     print $new_anno join("\n", $new_peg, time, "salvage",
391 :     "Using RAST assignment due to no subsystem membership", $new_func), "\n//\n";
392 : olson 1.2 print $new_anno join("\n", $new_peg, time, "salvage", "Set master function to", $new_func), "\n//\n";
393 : olson 1.1 }
394 :     }
395 :     }
396 :     close($new_af);
397 :     close($new_anno);
398 : olson 1.2
399 :     return $n_salvaged;
400 : olson 1.1 }
401 :    
402 :    
403 :    
404 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3