Parent Directory
|
Revision Log
Revision 1.23 - (view) (download) (as text)
1 : | efrank | 1.1 | # -*- perl -*- |
2 : | olson | 1.20 | # |
3 : | # Copyright (c) 2003-2006 University of Chicago and Fellowship | ||
4 : | # for Interpretations of Genomes. All Rights Reserved. | ||
5 : | # | ||
6 : | # This file is part of the SEED Toolkit. | ||
7 : | # | ||
8 : | # The SEED Toolkit is free software. You can redistribute | ||
9 : | # it and/or modify it under the terms of the SEED Toolkit | ||
10 : | # Public License. | ||
11 : | # | ||
12 : | # You should have received a copy of the SEED Toolkit Public License | ||
13 : | # along with this program; if not write to the University of Chicago | ||
14 : | # at info@ci.uchicago.edu or the Fellowship for Interpretation of | ||
15 : | # Genomes at veronika@thefig.info or download a copy from | ||
16 : | # http://www.theseed.org/LICENSE.TXT. | ||
17 : | # | ||
18 : | |||
19 : | efrank | 1.1 | |
20 : | use strict; | ||
21 : | use FIG; | ||
# FIG object; used below for boundaries_of() and reload_table().
my $fig = FIG->new;

use Tracer;
# Configure tracing when VERBOSE is set in the environment.
if ($ENV{'VERBOSE'}) { TSetup("3 FIG DBKernel","TEXT") }

Trace("Preparing to load features.") if T(2);

# usage: load_features [G1 G2 G3 ... ]
#
# parse_genome_args yields the load mode followed by the genomes to process;
# the mode is compared against 'all' further down and handed to reload_table.
my ($mode, @genomes) = FIG::parse_genome_args(@ARGV);
my $temp_dir      = "$FIG_Config::temp";
my $organisms_dir = "$FIG_Config::organisms";

my($genome,@types,$type,$id,$loc,@aliases,$aliases,$contig);

# Intermediate load files, each suffixed with this process id ($$):
#   tmpfeat  - rows for the features table
#   tmpalias - alias rows, passed through sort -u
#   tmpdel   - deleted feature ids, passed through sort -u
#   tmprep   - replaced feature ids, passed through sort -u
# The replaced-fids output has to be named tmprep$$: that is the file the
# replaced_fids reload reads and the cleanup removes. Writing it to tmprel$$
# (as was done before) left the table load without its data and leaked the file.
Open(\*REL,     ">$temp_dir/tmpfeat$$");
Open(\*ALIAS,   "| sort -T $temp_dir -u > $temp_dir/tmpalias$$");
Open(\*DELFIDS, "| sort -u > $temp_dir/tmpdel$$");
Open(\*REPFIDS, "| sort -u > $temp_dir/tmprep$$");
40 : | |||
# Work that is only done for a full reload: pick up the global
# deleted/replaced feature lists and derive external aliases from
# the peg.synonyms file.
if ($mode eq 'all') {

    # Process any remaining deleted.features or replaced.features in Global.
    # Each line is expected to start with fig|<genome>.<n>; the genome id is
    # written in front of the line as an extra column. Reading stops at the
    # first line that does not start that way (unchanged from the original).
    if (open(GLOBDEL,"<$FIG_Config::global/deleted.features"))
    {
        while (defined($_ = <GLOBDEL>) && ($_ =~ /^fig\|(\d+\.\d+)/))
        {
            print DELFIDS "$1\t$_";
        }
        close(GLOBDEL);
    }

    if (open(GLOBREP,"<$FIG_Config::global/replaced.features"))
    {
        while (defined($_ = <GLOBREP>) && ($_ =~ /^fig\|(\d+\.\d+)/))
        {
            print REPFIDS "$1\t$_";
        }
        close(GLOBREP);
    }

    # Here we extract external aliases from the peg.synonyms table, when they
    # can be inferred accurately.
    Trace("Extracting external aliases from the peg.synonyms table.") if T(2);
    Open(\*SYN, "<$FIG_Config::global/peg.synonyms");
    while (defined(my $line = <SYN>))
    {
        # chomp (not chop) so a final line without a newline keeps its last character.
        chomp $line;
        my ($head, $rest) = split(/\t/, $line);
        next unless defined $head;

        # The line is "id,len<TAB>id,len;id,len;...". Entries that do not have
        # the "id,len" shape are dropped; previously they silently picked up
        # the id and length captured from an earlier entry.
        my @entries = ($head, defined($rest) ? split(/;/, $rest) : ());
        my @ids     = map { /^([^,]+),(\d+)/ ? [$1, $2] : () } @entries;

        # Separate fig| ids from ids of every other source.
        my (@fig, @nonfig);
        foreach my $entry (@ids)
        {
            if ($entry->[0] =~ /^fig\|/)
            {
                push(@fig, $entry);
            }
            else
            {
                push(@nonfig, $entry);
            }
        }

        # A non-fig id becomes an alias of a fig id when that fig id is the
        # only one on the line, or when the two recorded lengths are equal.
        foreach my $fig_entry (@fig)
        {
            my ($peg, $peg_ln) = @$fig_entry;
            my $genome = &FIG::genome_of($peg);
            foreach my $other (@nonfig)
            {
                if ((@fig == 1) || ($peg_ln == $other->[1]))
                {
                    print ALIAS "$peg\t$other->[0]\t$genome\n";
                    Trace("Alias record $peg, $other->[0] for $genome.") if T(4);
                }
            }
        }
    }
    close(SYN);
}
103 : | |||
# For every requested genome, walk each feature-type directory under
# <organisms>/<genome>/Features and emit:
#   - deleted / replaced feature ids (DELFIDS / REPFIDS),
#   - one features row per id found in the tbl file (REL),
#   - OVERRIDE alias rows for aliases with a recognised prefix (ALIAS).
foreach my $genome (@genomes)
{
    Trace("Processing $genome.") if T(3);
    my $features_dir = "$organisms_dir/$genome/Features";
    opendir(FEAT, $features_dir)
        || die "could not open $genome/Features";
    # Only purely alphabetic directory entries are treated as feature types.
    my @types = grep { $_ =~ /^[a-zA-Z]+$/ } readdir(FEAT);
    closedir(FEAT);

    foreach my $type (@types)
    {
        my $type_dir = "$features_dir/$type";

        # Reading stops at the first line that does not start with a fig id
        # (unchanged from the original).
        if ((-s "$type_dir/deleted.features") &&
            open(TMP,"<$type_dir/deleted.features"))
        {
            while (defined($_ = <TMP>) && ($_ =~ /^fig\|(\d+\.\d+)/))
            {
                print DELFIDS "$1\t$_";
            }
            close(TMP);
        }

        if ((-s "$type_dir/replaced.features") &&
            open(TMP,"<$type_dir/replaced.features"))
        {
            while (defined($_ = <TMP>) && ($_ =~ /^fig\|(\d+\.\d+)/))
            {
                print REPFIDS "$1\t$_";
            }
            close(TMP);
        }

        if ((-s "$type_dir/tbl") &&
            open(TBL,"<$type_dir/tbl"))
        {
            Trace("Loading $genome/Features/$type/tbl") if T(4);
            my @tbl = <TBL>;
            close(TBL);
            my %seen;

            # Lines are consumed from the end of the file backwards, so when an
            # id occurs more than once the entry closest to the end wins.
            # The loop tests definedness so a last line consisting of "0"
            # does not end it early.
            while (defined(my $line = pop @tbl))
            {
                # chomp (not chop): a final line without a newline must not
                # lose its last character.
                chomp $line;
                my ($id, $loc, @aliases) = split(/\t/, $line);
                next unless $id && ! $seen{$id};
                $seen{$id} = 1;

                # These are scoped to the feature. Previously $contig lived at
                # file scope, so a feature without a location inherited the
                # contig of the feature processed just before it.
                my ($contig, $minloc, $maxloc);
                if ($loc)
                {
                    $loc =~ s/\s+$//;
                    ($contig,$minloc,$maxloc) = $fig->boundaries_of($loc);
                    if ($minloc && $maxloc)
                    {
                        # Store the boundaries in ascending order.
                        ($minloc < $maxloc) || (($minloc,$maxloc) = ($maxloc,$minloc));
                    }
                }

                if (! $contig)
                {
                    $loc = $contig = $minloc = $maxloc = "";
                }

                my $aliases = "";
                if (@aliases > 0)
                {
                    $aliases = join(",",grep(/\S/,@aliases));
                    foreach my $alias (@aliases)
                    {
                        # Aliases with one of these prefixes are written as
                        # OVERRIDE rows. The pattern used to read "sp\|\tr\|"
                        # (a literal tab), which could never match sp| or tr|.
                        if ($alias =~ /^([NXYZA]P_|gi\||sp\||tr\||kegg\||uni\|)/)
                        {
                            print ALIAS "$id\t$alias\t$genome\tOVERRIDE\n";
                            Trace("$id override alias $alias for $genome") if T(4);
                        }
                    }
                }

                $minloc = (! $minloc) ? 0 : $minloc;
                $maxloc = (! $maxloc) ? 0 : $maxloc;

                # The numeric tail of the id becomes the second column. Rows
                # whose id, contig or location reach the length limits checked
                # here, or whose id has no numeric tail, are not written.
                my ($id_number) = $id =~ /(\d+)$/;
                if ((length($loc) < 5000) && (length($contig) < 96) && (length($id) < 32) && defined($id_number))
                {
                    print REL "$id\t$id_number\t$type\t$genome\t$loc\t$contig\t$minloc\t$maxloc\t$aliases\n";
                }
            }
        }
    }
}
# Finish the intermediate files. Closing a piped handle waits for the sort
# process behind it, so the sorted files are complete before they are read.
close(REPFIDS);
close(DELFIDS);
close(REL);
close(ALIAS);

# Second pass over the sorted alias file. Rows are
#     <fid> TAB <alias> TAB <genome> [TAB OVERRIDE]
# and, because of the sort, all rows for one fid are adjacent. For each fid:
#   - every OVERRIDE alias is written out;
#   - any other alias is written only if no OVERRIDE alias of the same fid
#     belongs to the same class (see same_class).
Open(\*ALIASIN,  "<$temp_dir/tmpalias$$");
Open(\*ALIASOUT, ">$temp_dir/tmpalias$$.1");
Trace("Parsing alias file.") if T(2);

# Writes the surviving aliases of one fid to ALIASOUT.
my $emit_alias_group = sub {
    my ($fid, $group) = @_;
    my $genome    = &FIG::genome_of($fid);
    my @overrides = grep { $_->[1] } @$group;
    foreach my $entry (@$group)
    {
        next if (! $entry->[1])
             && (grep { &same_class($entry->[0], $_->[0]) } @overrides);
        print ALIASOUT "$fid\t$entry->[0]\t$genome\n";
    }
};

my $curr;
my @group;
while (defined(my $line = <ALIASIN>))
{
    # Rows that do not have at least "<fid> TAB <alias>" are skipped. The
    # earlier lookahead loop never consumed such a row and could spin forever.
    next unless $line =~ /^(\S+)\t(\S+)(?:\t(\S*)(?:\t(\S+))?)?/;
    my ($fid, $alias, $flag) = ($1, $2, $4);

    if (defined($curr) && ($fid ne $curr))
    {
        $emit_alias_group->($curr, \@group);
        @group = ();
    }
    $curr = $fid;

    # The override marker is the FOURTH column. The flag used to be derived
    # from the third column (the genome), which is always present, so every
    # alias counted as an override and the same-class filter never ran.
    push(@group, [$alias, (defined($flag) && ($flag eq 'OVERRIDE')) ? 1 : 0]);
}
$emit_alias_group->($curr, \@group) if @group;

close(ALIASIN);
close(ALIASOUT);
unlink("$temp_dir/tmpalias$$");
234 : | efrank | 1.1 | |
# Load each intermediate file into its table and remove the file afterwards.
# Every entry is: table name, column definitions, index-name => indexed
# column(s), and the load file. The tables are processed in the order listed.
my @table_loads = (
    [ 'deleted_fids',
      "genome varchar(16), fid varchar(32)",
      { deleted_fids_fid_ix => 'fid', deleted_fids_genome_ix => 'genome' },
      "$temp_dir/tmpdel$$" ],

    [ 'replaced_fids',
      "genome varchar(16), from_fid varchar(32), to_fid varchar(32)",
      { replaced_fids_from_ix   => 'from_fid',
        replaced_fids_to_ix     => 'to_fid',
        replaced_fids_genome_ix => 'genome' },
      "$temp_dir/tmprep$$" ],

    [ 'features',
      join(",",
           "id varchar(32), idN INTEGER, type varchar(16),genome varchar(16)",
           "location TEXT",
           "contig varchar(96), minloc INTEGER, maxloc INTEGER",
           "aliases TEXT"),
      { features_id_ix   => "id",   features_org_ix => "genome",
        features_type_ix => "type", features_beg_ix => "genome, contig, minloc" },
      "$temp_dir/tmpfeat$$" ],

    [ 'ext_alias',
      "id varchar(32), alias varchar(32), genome varchar(16)",
      { ext_alias_alias_ix => "alias", ext_alias_genome_ix => "genome",
        ext_alias_ix_id    => "id" },
      "$temp_dir/tmpalias$$.1" ],
);

foreach my $load (@table_loads)
{
    my ($table, $columns, $indexes, $load_file) = @$load;
    $fig->reload_table($mode, $table, $columns, $indexes, $load_file, \@genomes);
    unlink($load_file);
}
Trace("Features loaded.") if T(2);
268 : | overbeek | 1.14 | |
# same_class($x, $y)
#
# True when both aliases fall into the same non-empty class as computed by
# classA. When the class of $x is empty (or otherwise false) that false value
# itself is returned.
sub same_class {
    my ($x, $y) = @_;

    my $class_of_x = classA($x);
    return $class_of_x unless $class_of_x;
    return ($class_of_x eq classA($y));
}
276 : | |||
# classA($alias)
#
# Returns the class of an alias:
#   - the text in front of the first '|' when the alias contains one
#     with at least one character before it;
#   - "refseq" for aliases made of N/X/Y/Z/A, "P_", then digits and dots;
#   - the empty string for anything else.
sub classA {
    my ($alias) = @_;

    return $1       if $alias =~ /^([^\|]+)\|/;
    return "refseq" if $alias =~ /^[NXYZA]P_[0-9\.]+$/;
    return "";
}
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |