[Bio] / DeJonghStuff / map_model_to_kegg.pl Repository:
ViewVC logotype

Annotation of /DeJonghStuff/map_model_to_kegg.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : dejongh 1.1 # -*- perl -*-
2 :    
3 :     ###########################################
4 :     use strict;
5 :    
6 :     use FIG;
7 :     my $fig = new FIG;
8 :    
9 :     my ($genome_id) = @ARGV;
10 :    
11 :     # map from model's compound abbreviations to KEGG compound ids
12 :     # and reaction abbreviations to KEGG reaction ids
13 :    
14 :     my (%compound_abbrev_2_name, %compound_id_2_abbrev, %reaction_abbrev_2_id, %reaction_abbrev_2_name, %reaction_2_substrates, %reaction_2_products);
15 :    
16 :     open(MODEL_COMPOUNDS, "<$FIG_Config::global/Models/$genome_id/compound_name") or die("Run parse_model first");
17 :     open(MODEL_REACTIONS, "<$FIG_Config::global/Models/$genome_id/reaction") or die("Run parse_model first");
18 :     open(MODEL_REACTION_NAMES, "<$FIG_Config::global/Models/$genome_id/reaction_to_role") or die("Run parse_model first");
19 :     open(MODEL_REACTION_COMPOUNDS, "<$FIG_Config::global/Models/$genome_id/reaction_to_compound") or die("Run parse_model first");
20 :    
21 :     open(COMPOUND_OUT, ">$FIG_Config::global/Models/$genome_id/compound_to_kegg");
22 :     open(REAC_OUT, ">$FIG_Config::global/Models/$genome_id/reaction_to_kegg");
23 :    
24 :     my %reversibility;
25 :    
26 :     while (my $line = <MODEL_REACTIONS>)
27 :     {
28 :     chomp $line;
29 :     my ($rabbrev, $rev) = split "\t", $line;
30 :     $reversibility{$rabbrev} = $rev;
31 :     }
32 :    
33 :     # hard code some KEGG compound to model compound abbreviations
34 :     # Ammonia for Ammonium
35 :     $compound_id_2_abbrev{"C00014"} = "nh4";
36 :     $compound_id_2_abbrev{"C00093"} = "glyc3p";
37 :     $compound_id_2_abbrev{"C00536"} = "pppi";
38 :    
39 :     while (my $line = <MODEL_COMPOUNDS>)
40 :     {
41 :     chomp($line);
42 :     my ($cabbrev, undef, $cname) = split /\t/, $line;
43 :     $compound_abbrev_2_name{$cabbrev} = $cname;
44 :    
45 :     my @cids = $fig->ids_of_compound_like_name($cname);
46 :    
47 :     # some common compounds like nadh match on the abbreviation
48 :     if (@cids == 0)
49 :     {
50 :     @cids = $fig->ids_of_compound_like_name($cabbrev);
51 :     }
52 :    
53 :     if (@cids > 0)
54 :     {
55 :     foreach my $cid (@cids)
56 :     {
57 :     $compound_id_2_abbrev{$cid} = $cabbrev;
58 :     }
59 :     }
60 :     }
61 :    
62 :     while (my $line = <MODEL_REACTION_NAMES>)
63 :     {
64 :     chomp $line;
65 :     my ($rabbrev, $rname) = split "\t", $line;
66 :     $reaction_abbrev_2_name{$rabbrev} = $rname;
67 :     }
68 :    
69 :     while (<MODEL_REACTION_COMPOUNDS>)
70 :     {
71 :     my ($rabbrev, $cabbrev, $setn) = split "\t", $_;
72 :    
73 :     if ($setn == 1)
74 :     {
75 :     $reaction_2_substrates{$rabbrev}{$cabbrev} = 1;
76 :     }
77 :     else
78 :     {
79 :     $reaction_2_products{$rabbrev}{$cabbrev} = 1;
80 :     }
81 :     }
82 :    
83 :     my %cabbrevs_used_in_reactions;
84 :    
85 :     foreach my $rabbrev (keys %reaction_abbrev_2_name)
86 :     {
87 :     my $rname = $reaction_abbrev_2_name{$rabbrev};
88 :    
89 :     # first look up the KEGG reaction id based on the given name
90 :     my $reaction_id = &get_reaction_id($rname);
91 :    
92 :     if (defined $reaction_id)
93 :     {
94 :     my @tmplist = ($reaction_id);
95 :     $reaction_abbrev_2_id{$rabbrev} = \@tmplist;
96 :     print "$rabbrev matches $reaction_id on KEGG name ($rname)\n";
97 :    
98 :     my $kegg_rev = $fig->reversible($reaction_id);
99 :     if ($kegg_rev == $reversibility{$rabbrev})
100 :     {
101 :     print REAC_OUT "$rabbrev\t$reaction_id\t1\n";
102 :     }
103 :     else
104 :     {
105 :     print REAC_OUT "$rabbrev\t$reaction_id\t0\n";
106 :     }
107 :     }
108 :     else
109 :     {
110 :     print "$rabbrev, $rname, ($reversibility{$rabbrev}): No reaction id matched on KEGG name\n";
111 :    
112 :     # Now see if the name is associated with an EC number
113 :     my @tmp = `grep -i "$rname" $FIG_Config::data/KEGG/ECtable`;
114 :    
115 :     if (@tmp == 0)
116 :     {
117 :     #try removing anything in parentheses to get rid of comments
118 :     my $tmp_name = $rname;
119 :     $tmp_name =~ s/\s+\(.*\)$//;
120 :     # replace "b-" with "beta-"
121 :     $tmp_name =~ s/^b-/beta-/;
122 :    
123 :     if ($tmp_name ne $rname)
124 :     {
125 :     print "\t Trying $tmp_name\n";
126 :     @tmp = `grep -i "$tmp_name" $FIG_Config::data/KEGG/ECtable`;
127 :     }
128 :     }
129 :    
130 :     if (@tmp > 0)
131 :     {
132 :     my %reaction_scores;
133 :    
134 :     ec_loop: foreach my $ec_line (@tmp)
135 :     {
136 :     $ec_line =~ /\s+(\d+\.\d+\.\d+\.\d+)/;
137 :     my $ec = $1;
138 :     print "\t Found EC: $ec\n";
139 :     my @reactions = $fig->catalyzes($ec);
140 :     print "\t Found reactions: @reactions\n";
141 :    
142 :     if (@reactions > 0)
143 :     {
144 :     # try to select reaction based on compounds
145 :     foreach $reaction_id (@reactions)
146 :     {
147 :     my $kegg_rev = $fig->reversible($reaction_id);
148 :     my $better_reversed = 0;
149 :     print "\t Checking reaction $reaction_id ($kegg_rev)\n";
150 :     my %substrates = %{$reaction_2_substrates{$rabbrev}};
151 :     my %products = %{$reaction_2_products{$rabbrev}};
152 :     my %substrates_as_products = %{$reaction_2_substrates{$rabbrev}};
153 :     my %products_as_substrates = %{$reaction_2_products{$rabbrev}};
154 :     my ($score, $reverse_score);
155 :    
156 :     my @kegg_substrate_info = $fig->reaction2comp($reaction_id, 0);
157 :     my @kegg_product_info = $fig->reaction2comp($reaction_id, 1);
158 :    
159 :     my (@unmatched_kegg_substrates, @unmatched_kegg_products);
160 :    
161 :     foreach my $kegg_substrate (@kegg_substrate_info)
162 :     {
163 :     my $cid = @$kegg_substrate[0];
164 :     my @cnames = $fig->names_of_compound($cid);
165 :     print "\t\t Checking $cid (@cnames[0]) ... ";
166 :     if (my $cabbrev = $compound_id_2_abbrev{$cid})
167 :     {
168 :     print "Found $cabbrev ($compound_abbrev_2_name{$cabbrev})\n";
169 :     my $in_substrates = delete $substrates{$cabbrev};
170 :     my $in_products_as_substrates = delete $products_as_substrates{$cabbrev};
171 :    
172 :     if (! $in_substrates && ! $in_products_as_substrates)
173 :     {
174 :     print "\t\t\t No match (main = @$kegg_substrate[2])\n";
175 :     push @unmatched_kegg_substrates, $cid;
176 :     }
177 :     }
178 :     else
179 :     {
180 :     print "No compound found (main = @$kegg_substrate[2])\n";
181 :     push @unmatched_kegg_substrates, $cid;
182 :     }
183 :     }
184 :    
185 :     foreach my $kegg_product (@kegg_product_info)
186 :     {
187 :     my $cid = @$kegg_product[0];
188 :     my @cnames = $fig->names_of_compound($cid);
189 :     print "\t\t Checking $cid (@cnames[0]) ... ";
190 :     if (my $cabbrev = $compound_id_2_abbrev{$cid})
191 :     {
192 :     print "Found $cabbrev ($compound_abbrev_2_name{$cabbrev})\n";
193 :     my $in_products = delete $products{$cabbrev};
194 :     my $in_substrates_as_products = delete $substrates_as_products{$cabbrev};
195 :    
196 :     if (! $in_products && ! $in_substrates_as_products)
197 :     {
198 :     print "\t\t\t No match (main = @$kegg_product[2])\n";
199 :     push @unmatched_kegg_products, $cid;
200 :     }
201 :     }
202 :     else
203 :     {
204 :     print "No compound found (main = @$kegg_product[2])\n";
205 :     push @unmatched_kegg_products, $cid;
206 :     }
207 :     }
208 :    
209 :     # remove left over hydrogen since KEGG doesn't always include it
210 :     delete $substrates{"h"};
211 :     delete $substrates_as_products{"h"};
212 :     delete $products{"h"};
213 :     delete $products_as_substrates{"h"};
214 :    
215 :     $score = scalar(keys(%substrates)) + scalar(keys(%products)) + scalar(@unmatched_kegg_substrates)
216 :     + scalar(@unmatched_kegg_products);
217 :     $reverse_score = scalar(keys(%substrates_as_products))
218 :     + scalar(keys(%products_as_substrates)) + scalar(@unmatched_kegg_substrates)
219 :     + scalar(@unmatched_kegg_products);
220 :    
221 :     if ($reverse_score < $score)
222 :     {
223 :     print "\t\t Better score reversing reaction ($reverse_score < $score)\n";
224 :     $better_reversed = 1;
225 :     $score = $reverse_score;
226 :     %substrates = %substrates_as_products;
227 :     %products = %products_as_substrates;
228 :     }
229 :     else
230 :     {
231 :     print "\t\t No better score reversing reaction ($reverse_score >= $score)\n";
232 :     }
233 :    
234 :     if (%substrates == 0 && scalar(@unmatched_kegg_substrates) == 0)
235 :     {
236 :     print "\t\t Matched all the substrates\n";
237 :     }
238 :     else
239 :     {
240 :     print "\t\t Remaining substrates: ";
241 :     foreach my $sabbrev (keys %substrates)
242 :     {
243 :     print "$sabbrev ($compound_abbrev_2_name{$sabbrev}) ";
244 :     }
245 :     print "\n\t\t Number of unmatched KEGG substrates: ", scalar(@unmatched_kegg_substrates), "\n";
246 :     }
247 :    
248 :     if (%products == 0 && scalar(@unmatched_kegg_products) == 0)
249 :     {
250 :     print "\t\t Matched all the products\n";
251 :     }
252 :     else
253 :     {
254 :     print "\t\t Remaining products: ";
255 :     foreach my $pabbrev (keys %products)
256 :     {
257 :     print "$pabbrev ($compound_abbrev_2_name{$pabbrev}) ";
258 :     }
259 :     print "\n\t\t Number of unmatched KEGG products: ", scalar(@unmatched_kegg_products), "\n";
260 :     }
261 :    
262 :     print "\t\t Score: $score\n";
263 :    
264 :     # found the reaction. throw out other contenders and keep track
265 :     # of compounds that were used to make the match
266 :     if ($score == 0)
267 :     {
268 :     undef %reaction_scores;
269 :     $reaction_scores{$reaction_id} = $score;
270 :    
271 :     foreach my $kegg_substrate (@kegg_substrate_info)
272 :     {
273 :     my $cid = @$kegg_substrate[0];
274 :     if (my $cabbrev = $compound_id_2_abbrev{$cid})
275 :     {
276 :     $cabbrevs_used_in_reactions{$cabbrev}{$cid}=1;
277 :     }
278 :     }
279 :    
280 :     foreach my $kegg_product (@kegg_product_info)
281 :     {
282 :     my $cid = @$kegg_product[0];
283 :     if (my $cabbrev = $compound_id_2_abbrev{$cid})
284 :     {
285 :     $cabbrevs_used_in_reactions{$cabbrev}{$cid}=1;
286 :     }
287 :     }
288 :    
289 :     if ($better_reversed && $reversibility{$rabbrev} == 0 && $kegg_rev == 0)
290 :     {
291 :     $reversibility{$rabbrev} = -1;
292 :     print "\t\t Model and KEGG reactions have different directions\n";
293 :     }
294 :    
295 :     last ec_loop;
296 :     }
297 :     else
298 :     {
299 :     $reaction_scores{$reaction_id} = $score;
300 :     }
301 :     }
302 :     }
303 :     }
304 :    
305 :     my @reaction_list = sort { $reaction_scores{$a} <=> $reaction_scores{$b} } keys %reaction_scores;
306 :    
307 :     if (@reaction_list > 0)
308 :     {
309 :     $reaction_abbrev_2_id{$rabbrev} = \@reaction_list;
310 :     print "\t reaction list: @reaction_list\n";
311 :    
312 :     if (@reaction_list == 1)
313 :     {
314 :     my $kegg_rev = $fig->reversible($reaction_list[0]);
315 :     if ($kegg_rev == $reversibility{$rabbrev})
316 :     {
317 :     print REAC_OUT "$rabbrev\t@reaction_list\t1\n";
318 :     }
319 :     else
320 :     {
321 :     print REAC_OUT "$rabbrev\t@reaction_list\t0\n";
322 :     }
323 :     }
324 :     else
325 :     {
326 :     print REAC_OUT "$rabbrev\t@reaction_list\n";
327 :     }
328 :     }
329 :     else
330 :     {
331 :     print "\t No matching reactions\n";
332 :     print REAC_OUT "$rabbrev\n";
333 :     }
334 :     }
335 :     else
336 :     {
337 :     print "\t No EC found\n";
338 :     print REAC_OUT "$rabbrev\n";
339 :     }
340 :     }
341 :     }
342 :    
343 :     close(REAC_OUT);
344 :    
345 :     print "\n\nNumber of reactions with matches: ", scalar(keys(%reaction_abbrev_2_id)), "\n";
346 :    
347 :     foreach my $cabbrev (keys %cabbrevs_used_in_reactions)
348 :     {
349 :     my @cids = keys %{$cabbrevs_used_in_reactions{$cabbrev}};
350 :     print COMPOUND_OUT "$cabbrev\t@cids\n";
351 :     }
352 :    
353 :     sub get_reaction_id
354 :     {
355 :     my $match_name = shift;
356 :    
357 :     my ($reaction_id, $entry, $name);
358 :    
359 :     open(KEGG_REACTIONS, "<$FIG_Config::data/KEGG/reaction") or die("Couldn't open KEGG reaction file");
360 :    
361 :     $/ = "\n///\n";
362 :     while (<KEGG_REACTIONS>)
363 :     {
364 :     if ($_ =~ /ENTRY\s+(\S+)\s+Reaction\nNAME\s+(\S[^\n]+\n)((\s+(\S[^\n]+\n))*)/s)
365 :     {
366 :     $reaction_id = $1;
367 :    
368 :     if ($3)
369 :     {
370 :     $name = "$2 $3";
371 :     }
372 :     else
373 :     {
374 :     $name = $2;
375 :     }
376 :     $name =~ s/(\s+\$|\s+)/ /g;
377 :     chop $name;
378 :    
379 :     if ($name eq $match_name)
380 :     {
381 :     $/ = "\n";
382 :     return $reaction_id;
383 :     }
384 :     }
385 :     }
386 :    
387 :     $/ = "\n";
388 :     return;
389 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3