Parent Directory
|
Revision Log
Revision 1.3 - (view) (download) (as text)
1 : | dejongh | 1.1 | |
# -*- perl -*-

###########################################
use strict;

# KEY RELATIONAL TABLES (conceptual model):
#
#   1. compound(Cid,Priority,Name)     % we have a prioritized set of names for a compound
#   2. comp_to_CAS(Cid,CASid)          % connection to the Chemical Abstracts Service registry [optional]
#   3. reaction(Rid,Reversible)        % nonreversible reactions go from substrates to products
#   4. reaction_to_compound(Rid,0/1[substrate or product],Cid,Stoich,main_compound [part of major transformation])
#   5. reaction_to_role(Rid,FunctionalRole)
#
# The tables physically (re)loaded by this script are named ec_map, map_name,
# comp_name, comp_cas, reaction_to_compound, reversible and reaction_to_enzyme.
#

use FIG;
# Direct method call instead of the indirect-object form "new FIG".
my $fig = FIG->new;

my $usage = "usage: load_kegg";

use Tracer;
#TSetup('2 *', 'WARN');

# Run the four load phases in order.  They are called with explicit (empty)
# argument lists; the old "&name;" form implicitly forwarded the current @_.
load_ec_and_map_data();
load_compounds();
load_reactions();
load_catalyzes();

undef $fig;
29 : | |||
# load_ec_and_map_data
#
# Reads $FIG_Config::data/KEGG/enzyme (records terminated by a "///" line) and
# rebuilds two database tables through $fig->reload_table:
#
#   ec_map   (ec, map)       one row per "PATH: mapNNNNN" line found in an EC record
#   map_name (map, mapname)  one row per distinct map id, with the name seen last
#
# Both tables are staged as tab-separated files under $FIG_Config::temp and the
# staging files are unlinked after loading.  Takes no arguments; the return
# value is not meaningful.
sub load_ec_and_map_data {

    Open(\*TMPIN, "<$FIG_Config::data/KEGG/enzyme");
    Open(\*ECMAP, ">$FIG_Config::temp/ec_map.table");

    Trace("Reading KEGG enzymes.") if T(2);
    my %name;
    {
        # Localize the record separator to this read loop only, so it is
        # restored even if something dies, and so the reload_table calls below
        # never see the "///" separator.
        local $/ = "\n///\n";
        while (defined(my $record = <TMPIN>))
        {
            # Only records with a complete four-part EC number are used.
            next unless $record =~ /ENTRY\s+EC\s+(\d+\.\d+\.\d+\.\d+)/s;
            my $ec = $1;

            # Every pathway reference yields an ec->map row and (re)defines
            # the display name of that map.
            while ($record =~ /PATH:\s+(map\d+)\s+(\S[^\n]+\S)/sg)
            {
                print ECMAP "$ec\t$1\n";
                $name{$1} = $2;
            }
        }
    }
    close(TMPIN);
    close(ECMAP);

    Trace("Writing map table.") if T(2);
    Open(\*MAP, ">$FIG_Config::temp/map_name.table");

    foreach my $map (keys(%name))
    {
        print MAP "$map\t$name{$map}\n";
    }
    close(MAP);

    $fig->reload_table('all', "ec_map",
                       "ec varchar(100), map varchar(100)",
                       { index_ec_map_ec => "ec", index_ec_map_map => "map" },
                       "$FIG_Config::temp/ec_map.table");
    unlink("$FIG_Config::temp/ec_map.table");

    $fig->reload_table('all', "map_name",
                       "map varchar(100) UNIQUE NOT NULL, mapname varchar(200), primary key ( map )",
                       { }, "$FIG_Config::temp/map_name.table");
    unlink("$FIG_Config::temp/map_name.table");
}
73 : | |||
# load_compounds
#
# Reads $FIG_Config::data/KEGG/compound (records terminated by a "///" line)
# and rebuilds two database tables through $fig->reload_table:
#
#   comp_name (cid, pos, name)  the ";"-separated names of each compound,
#                               numbered from 1 in the order they appear
#   comp_cas  (cid, cas)        the CAS identifier from the DBLINKS field
#
# Each record's compound id is taken from its own ENTRY line before anything
# is written.  Previously the id variable outlived the loop iteration, so a
# record whose NAME block could not be parsed wrote its CAS link under the
# previous record's id.
#
# Takes no arguments; the return value is not meaningful.
sub load_compounds {

    Trace("Loading compounds.") if T(2);

    Open(\*TMPIN, "<$FIG_Config::data/KEGG/compound");
    Open(\*COMP,  ">$FIG_Config::temp/comp_name.table");
    Open(\*CAS,   ">$FIG_Config::temp/comp_cas.table");

    {
        # Record separator is scoped to the read loop so the reload_table
        # calls below are unaffected and it is restored on any exit path.
        local $/ = "\n///\n";
        while (defined(my $entry = <TMPIN>))
        {
            # The id belongs to this record only; skip records without one.
            my ($cid) = $entry =~ /ENTRY\s+(C\d+)/s;
            next unless defined($cid);

            if ($entry =~ /ENTRY\s+(C\d+).*\nNAME\s+(\S[^\n]*)\n((\s+(\S[^\n]*\S)\n)*)/s)
            {
                my $names        = $2;
                my $continuation = $3;

                if ($continuation)
                {
                    # chop (not chomp): $/ is the "///" separator here, so
                    # chomp would not strip the trailing newline.
                    chop $continuation;
                    $continuation =~ s/^\s+/ /;
                    $names = $names . $continuation;
                    # Fold continuation lines into one line and re-join names
                    # that were broken after a hyphen.
                    $names =~ s/\n\s+/ /g;
                    $names =~ s/- /-/g;
                }

                my $n = 1;
                foreach my $raw (split(/;/, $names))
                {
                    (my $name = $raw) =~ s/^\s+//;
                    $name =~ s/\s+$//;
                    print COMP "$cid\t$n\t$name\n";
                    $n++;
                    # Report names that exceed the varchar(200) column below.
                    if (length $name > 200) { print "$cid, $name\n" }
                }
            }

            if ($entry =~ /DBLINKS\s+CAS:\s+(\S+)/s)
            {
                print CAS "$cid\t$1\n";
            }
        }
    }
    close(TMPIN);
    close(COMP);
    close(CAS);

    $fig->reload_table('all', "comp_name",
                       "cid varchar(7), pos integer, name varchar(200)",
                       { index_comp_name_cid  => "cid",
                         index_comp_name_name => "name" },
                       "$FIG_Config::temp/comp_name.table");
    unlink("$FIG_Config::temp/comp_name.table");

    $fig->reload_table('all', "comp_cas",
                       "cid varchar(7), cas varchar(100)",
                       { index_comp_cas_cid => "cid",
                         index_comp_cas_cas => "cas" },
                       "$FIG_Config::temp/comp_cas.table");
    unlink("$FIG_Config::temp/comp_cas.table");
}
134 : | |||
# load_reactions
#
# Builds the reaction_to_compound and reversible tables from two KEGG files:
#
#   reaction.lst       "Rnnnnn: <substrates> <=> <products>", giving every
#                      compound and its stoichiometry
#   reaction_main.lst  same reaction ids with a direction marker matched by
#                      /<?=>?/ and only the main compounds of each side
#
# For every reaction the substrates (setn 0) and products (setn 1) are written
# with their stoichiometry and a main-compound flag.  Reactions whose main
# entry is one-directional ("<=" or "=>") get a row with reversible = 0 in the
# reversible table; for "<=" the two sides are swapped so that the stored
# direction runs substrate -> product.
#
# A reaction is skipped (and traced as invalid) if any of its compound terms
# cannot be parsed.  Previously such a term silently reused regex captures
# left over from an earlier match and wrote bogus rows.
#
# Takes no arguments; the return value is not meaningful.
sub load_reactions {

    my %reaction;    # rid => [ \@substrates, \@products ], each item [cid, stoich, main]

    Trace("Loading reactions.") if T(2);
    Open(\*REACTION, "<$FIG_Config::data/KEGG/reaction.lst");
    Open(\*RMAIN,    "<$FIG_Config::data/KEGG/reaction_main.lst");
    Open(\*R2C,      ">$FIG_Config::temp/reaction_to_compound.table");
    Open(\*REV,      ">$FIG_Config::temp/rev.table");

    Trace("Reading reaction list file.") if T(2);
    while (defined(my $line = <REACTION>))
    {
        my ($subs, $prods);
        if ($line =~ /(R\d+):\s+(\S.*\S)\s+<=>\s+(\S.*\S)/)
        {
            # Copy the captures before any further pattern matching.
            my ($react, $lhs, $rhs) = ($1, $2, $3);
            $subs  = _parse_reaction_side($lhs);
            $prods = _parse_reaction_side($rhs);
            $reaction{$react} = [$subs, $prods] if ($subs && $prods);
        }

        unless ($subs && $prods)
        {
            Trace("Invalid reaction format: $line") if T(1);
        }
    }
    close(REACTION);

    Trace("Reading main reaction file.") if T(2);
    # $_ is deliberately the loop variable here: mark_main quotes the current
    # line via $_ in its error message.
    while (defined($_ = <RMAIN>))
    {
        next unless $_ =~ /^(R\d+):\s+(\S.*\S)\s(\<?=\>?)\s(\S.*\S)/;
        my ($react, $main_subs, $dir, $main_prods) = ($1, $2, $3, $4);
        next unless exists($reaction{$react});

        my ($subs, $prods) = @{$reaction{$react}};
        mark_main($main_subs,  $subs);
        mark_main($main_prods, $prods);

        if (($dir eq "<=") || ($dir eq "=>"))
        {
            print REV "$react\t0\n";
        }

        if ($dir eq "<=")
        {
            # Reaction runs right-to-left: store it substrate -> product.
            $reaction{$react}->[0] = $prods;
            $reaction{$react}->[1] = $subs;
        }
    }
    close(RMAIN);
    close(REV);

    Trace("Connecting reactions to compounds.") if T(2);
    foreach my $react (sort keys(%reaction))
    {
        my ($subs, $prods) = @{$reaction{$react}};
        foreach my $item (@$subs)
        {
            my ($cid, $n, $main) = @$item;
            print R2C "$react\t0\t$cid\t$n\t$main\n";
        }

        foreach my $item (@$prods)
        {
            my ($cid, $n, $main) = @$item;
            print R2C "$react\t1\t$cid\t$n\t$main\n";
        }
    }
    close(R2C);

    $fig->reload_table('all', "reaction_to_compound",
                       "rid varchar(8), setn char(1), cid varchar(8), stoich char(6), main char(1)",
                       { index_reaction_to_compound_rid => "rid",
                         index_reaction_to_compound_cid => "cid" },
                       "$FIG_Config::temp/reaction_to_compound.table");
    unlink("$FIG_Config::temp/reaction_to_compound.table");

    $fig->reload_table('all', "reversible",
                       "rid varchar(8) UNIQUE NOT NULL, reversible char(1), primary key(rid)",
                       { }, "$FIG_Config::temp/rev.table");
    unlink("$FIG_Config::temp/rev.table");
    Trace("Reactions processed.") if T(2);
}

# _parse_reaction_side($side)
#
# Splits one side of a reaction equation on " + " and parses each term as an
# optional stoichiometry token followed by a C/G compound id.  Returns a
# reference to a list of [cid, stoich, 0] triples (stoich defaults to 1, the
# trailing 0 is the not-yet-set main-compound flag), or undef if any term
# does not match.
sub _parse_reaction_side {
    my ($side) = @_;

    my @terms;
    foreach my $term (split(/\s+\+\s+/, $side))
    {
        return unless $term =~ /^(([\dmn\(]\S*)\s+)?([CG]\d+)/;
        push(@terms, [$3, ($2 ? $2 : 1), 0]);
    }
    return \@terms;
}
230 : | |||
# mark_main($main, $set)
#
#   $main  one side of a main-reaction line: compound ids joined by " + "
#   $set   reference to a list of [cid, stoich, main_flag] triples
#
# Sets the main flag (element 2) to 1 on the first triple in $set whose cid
# equals each id listed in $main.  If an id is not present in $set, Confess
# is called with the id, the caller's current $_ (expected to hold the input
# line being processed) and a dump of $set.
sub mark_main {
    my($main,$set) = @_;

    foreach my $cid (split(/\s+\+\s+/,$main))
    {
        # First triple carrying this compound id, if any.
        my ($hit) = grep { $_->[0] eq $cid } @$set;
        if (! defined($hit))
        {
            # Load the dumper explicitly and call it by its full name: this
            # file never imports Dumper itself.
            require Data::Dumper;
            Confess("Cannot handle $cid in $_\n" . Data::Dumper::Dumper($set));
        }
        else
        {
            $hit->[2] = 1;
        }
    }
}
248 : | |||
# load_catalyzes
#
# Reads $FIG_Config::data/KEGG/reaction (records terminated by a "///" line)
# and rebuilds the reaction_to_enzyme table (rid, role) through
# $fig->reload_table: one row per whitespace-separated token found after the
# ENZYME keyword of a reaction record.  The staging file under
# $FIG_Config::temp is unlinked after loading.
#
# Takes no arguments; the return value is not meaningful.
sub load_catalyzes {

    Open(\*REAC,     "<$FIG_Config::data/KEGG/reaction");
    Open(\*REAC2ENZ, ">$FIG_Config::temp/reaction_to_enzyme.table");

    Trace("Reading KEGG reaction file.") if T(2);
    {
        # Record separator is scoped to the read loop so reload_table below
        # is unaffected and it is restored on any exit path.
        local $/ = "\n///\n";
        while (defined(my $entry = <REAC>))
        {
            # The enzyme list runs from the first token after ENZYME up to
            # the next letter or "/" (i.e. the next field name or the record
            # terminator).
            next unless $entry =~ /ENTRY\s+(R\d+).*\nENZYME\s+(\S[^a-zA-Z\/]+)/s;
            my ($rid, $ecs) = ($1, $2);

            # Formerly an unconditional debug print of $ecs to STDOUT; now
            # only emitted when tracing at level 3.
            Trace("Enzymes for $rid: $ecs") if T(3);

            foreach my $ec (split(/\s+/, $ecs))
            {
                print REAC2ENZ "$rid\t$ec\n";
            }
        }
    }
    close(REAC);
    close(REAC2ENZ);

    $fig->reload_table('all', "reaction_to_enzyme",
                       "rid varchar(8), role varchar(100)",
                       { index_reaction_to_enzyme_rid  => "rid",
                         index_reaction_to_enzyme_role => "role" },
                       "$FIG_Config::temp/reaction_to_enzyme.table");
    unlink("$FIG_Config::temp/reaction_to_enzyme.table");
    Trace("Enzyme reactions loaded.") if T(2);
}
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |