Parent Directory
|
Revision Log
Revision 1.5 - (view) (download) (as text)
1 : | overbeek | 1.1 | use strict; |
2 : | use Data::Dumper; | ||
3 : | use SeedEnv; | ||
4 : | overbeek | 1.4 | use FIG; |
5 : | my $fig = new FIG; | ||
6 : | overbeek | 1.1 | |
7 : | overbeek | 1.2 | my $usage = "usage: build_data_directory DataDir K"; |
8 : | overbeek | 1.1 | my($genus,$dataD,$k); |
9 : | ( | ||
10 : | ($dataD = shift @ARGV) && | ||
11 : | ($k = shift @ARGV) && ($k =~ /^\d{1,2}$/) | ||
12 : | ) | ||
13 : | || die $usage; | ||
14 : | print STDERR "building $dataD for Kmers of size $k\n"; | ||
15 : | mkdir($dataD,0777); | ||
16 : | |||
17 : | overbeek | 1.4 | my %seen; |
18 : | my $existing = $ENV{'SAS_SERVER'}; | ||
19 : | $ENV{'SAS_SERVER'} = 'PUBSEED'; | ||
20 : | open(G,">$dataD/all.genomes") || die "could not open $dataD/all.genomes"; | ||
21 : | foreach $_ (`svr_all_genomes`) | ||
22 : | { | ||
23 : | if (($_ !~ /[pP]hage\b/) && ($_ !~ /[pP]lasmid\b/)) | ||
24 : | { | ||
25 : | if (($_ =~ /^(\S+)\s(\S+).*\s(\d+\.\d+)$/) && ($2 ne "sp.")) | ||
26 : | { | ||
27 : | my $genus = $1; | ||
28 : | my $species = $2; | ||
29 : | my $genome = $3; | ||
30 : | if ($species !~ /^sp\.?$/i) | ||
31 : | { | ||
32 : | my $md5 = $fig->genome_md5sum($genome); | ||
33 : | if ((! $seen{$md5}) && $fig->is_prokaryotic($genome)) | ||
34 : | { | ||
35 : | $seen{$md5} = 1; | ||
36 : | print G $_; | ||
37 : | } | ||
38 : | } | ||
39 : | } | ||
40 : | } | ||
41 : | } | ||
42 : | close(G); | ||
43 : | $ENV{'SAS_SERVER'} = $existing; | ||
44 : | |||
45 : | overbeek | 1.2 | &build_function_index($dataD); |
46 : | &build_otu_index($dataD); | ||
47 : | overbeek | 1.1 | &build_reduced_kmers($dataD,$k); |
48 : | &extend_otu_index($dataD); | ||
49 : | &update_kmers_with_extended_OTUs($dataD); | ||
50 : | |||
51 : | sub build_function_index { | ||
52 : | my($dataD) = @_; | ||
53 : | |||
54 : | open(ASSIGNMENTS,"cut -f2 $dataD/all.genomes | svr_all_features peg | svr_function_of |") | ||
55 : | || die "cannot access assignments"; | ||
56 : | |||
57 : | my %funcs; | ||
58 : | while (defined($_ = <ASSIGNMENTS>)) | ||
59 : | { | ||
60 : | overbeek | 1.5 | if ($_ =~ /^\S+\t(\S.*\S+)/) |
61 : | overbeek | 1.1 | { |
62 : | overbeek | 1.5 | my $stripped = &SeedUtils::strip_func($1); #### CHECK THIS |
63 : | $funcs{$stripped}++; | ||
64 : | overbeek | 1.1 | } |
65 : | } | ||
66 : | close(ASSIGNMENTS); | ||
67 : | open(FI,">$dataD/function.index") || die "could not open $dataD/function.index"; | ||
68 : | |||
69 : | my $nxt = 0; | ||
70 : | foreach my $f (sort keys(%funcs)) | ||
71 : | { | ||
72 : | if (($funcs{$f} > 5) && ((! &SeedUtils::hypo($f)) || ($f =~ /FIG/))) | ||
73 : | { | ||
74 : | print FI "$nxt\t$f\n"; | ||
75 : | $nxt++; | ||
76 : | } | ||
77 : | } | ||
78 : | close(FI); | ||
79 : | } | ||
80 : | |||
81 : | sub build_otu_index { | ||
82 : | my($dataD) = @_; | ||
83 : | |||
84 : | my %otus; | ||
85 : | foreach $_ (`cat $dataD/all.genomes`) | ||
86 : | { | ||
87 : | if ($_ =~ /(\S+\s\S+)/) | ||
88 : | { | ||
89 : | $otus{$1}++; | ||
90 : | } | ||
91 : | } | ||
92 : | |||
93 : | open(OI,">$dataD/otu.index") || die "could not open $dataD/otu.index"; | ||
94 : | my $nxt = 0; | ||
95 : | foreach my $otu (keys(%otus)) | ||
96 : | { | ||
97 : | if ($otu !~ / sp\.$/) | ||
98 : | { | ||
99 : | print OI "$nxt\t$otu\n"; | ||
100 : | $nxt++; | ||
101 : | } | ||
102 : | } | ||
103 : | close(OI); | ||
104 : | } | ||
105 : | |||
106 : | sub build_reduced_kmers { | ||
107 : | my($dataD,$k) = @_; | ||
108 : | |||
109 : | my %to_oI; | ||
110 : | foreach $_ (`cat $dataD/otu.index`) | ||
111 : | { | ||
112 : | if ($_ =~ /^(\S+)\t(\S.*\S)/) | ||
113 : | { | ||
114 : | $to_oI{$2} = $1; | ||
115 : | } | ||
116 : | } | ||
117 : | |||
118 : | my %g_to_oI; | ||
119 : | |||
120 : | foreach $_ (`cat $dataD/all.genomes`) | ||
121 : | { | ||
122 : | if ($_ =~ /^(\S[^\t]+)\t(\S.*\S)/) | ||
123 : | { | ||
124 : | my $g = $2; | ||
125 : | my $gs = $1; | ||
126 : | if (($gs =~ /^(\S+ \S+)/) && defined($to_oI{$1})) | ||
127 : | { | ||
128 : | $g_to_oI{$g} = $to_oI{$1}; | ||
129 : | } | ||
130 : | } | ||
131 : | } | ||
132 : | |||
133 : | my %to_fI; | ||
134 : | foreach $_ (`cat $dataD/function.index`) | ||
135 : | { | ||
136 : | if ($_ =~ /^(\S+)\t(\S.*\S)/) | ||
137 : | { | ||
138 : | $to_fI{$2} = $1; | ||
139 : | } | ||
140 : | } | ||
141 : | overbeek | 1.4 | # |
142 : | # we take assignments from both the pubSEED and the ASEED. ASEED functions | ||
143 : | # override PSEED functions | ||
144 : | # | ||
145 : | overbeek | 1.1 | my %peg_to_fI; |
146 : | overbeek | 1.4 | &load_peg_to_fI($dataD,\%to_fI,'PUBSEED',\%peg_to_fI); |
147 : | &load_peg_to_fI($dataD,\%to_fI,'SEED',\%peg_to_fI); | ||
148 : | overbeek | 1.1 | |
149 : | open(RAW,"| sort -T . > $dataD/sorted.kmers") || die "could not open $dataD/sorted.kmers"; | ||
150 : | foreach my $g (`cut -f2 $dataD/all.genomes`) | ||
151 : | { | ||
152 : | overbeek | 1.2 | # next if (! $g_to_oI{$g}); |
153 : | overbeek | 1.1 | foreach $_ (`echo '$g' | svr_all_features peg | svr_translations_of`) |
154 : | { | ||
155 : | if ($_ =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(\S.*\S)$/) | ||
156 : | { | ||
157 : | chomp; | ||
158 : | my $seq = $2; | ||
159 : | my $id = $1; | ||
160 : | ($id =~ /^fig\|(\d+\.\d+)/) || die "bad peg $_"; | ||
161 : | my $g = $1; | ||
162 : | my $oI = $g_to_oI{$g}; | ||
163 : | my $fI = $peg_to_fI{$id}; | ||
164 : | for (my $i=0; ($i < (length($seq) - $k)); $i++) | ||
165 : | { | ||
166 : | my $kmer = uc substr($seq,$i,$k); | ||
167 : | if ($kmer !~ /[^ACDEFGHIKLMNPQRSTVWY]/) | ||
168 : | { | ||
169 : | print RAW join("\t",($kmer,$fI,$oI,length($seq)-$i)),"\n"; | ||
170 : | } | ||
171 : | } | ||
172 : | } | ||
173 : | } | ||
174 : | } | ||
175 : | close(RAW); | ||
176 : | |||
177 : | open(RAW,"<$dataD/sorted.kmers") || die "could not open sorted.kmers"; | ||
178 : | open(REDUCED,">$dataD/reduced.kmers") || die "could not open reduced kmers"; | ||
179 : | my $last = <RAW>; | ||
180 : | while ($last && ($last =~ /^(\S+)/)) | ||
181 : | { | ||
182 : | my $curr = $1; | ||
183 : | my @set; | ||
184 : | while ($last && ($last =~ /^(\S+)\t(\S*)\t(\S*)\t(\S*)$/) && ($1 eq $curr)) | ||
185 : | { | ||
186 : | push(@set,[$2,$3,$4]); | ||
187 : | $last = <RAW>; | ||
188 : | } | ||
189 : | &process_set($curr,\@set,\*REDUCED); | ||
190 : | } | ||
191 : | close(REDUCED); | ||
192 : | } | ||
193 : | |||
194 : | overbeek | 1.4 | sub load_peg_to_fI { |
195 : | my($dataD,$to_fI,$which_seed,$peg_to_fI) = @_; | ||
196 : | my $existing = $ENV{'SAS_SERVER'}; | ||
197 : | $ENV{'SAS_SERVER'} = $which_seed; | ||
198 : | |||
199 : | foreach $_ (`cut -f2 $dataD/all.genomes | svr_all_features peg | svr_function_of`) | ||
200 : | { | ||
201 : | if ($_ =~ /^(\S+)\t(\S.*\S)/) | ||
202 : | { | ||
203 : | my $peg = $1; | ||
204 : | overbeek | 1.5 | my $func = &SeedUtils::strip_func($2); |
205 : | overbeek | 1.4 | if ($to_fI->{$func}) |
206 : | { | ||
207 : | $peg_to_fI->{$peg} = $to_fI->{$func}; | ||
208 : | } | ||
209 : | } | ||
210 : | } | ||
211 : | $ENV{'SAS_SERVER'} = $existing; | ||
212 : | } | ||
213 : | |||
214 : | |||
215 : | overbeek | 1.1 | sub process_set { |
216 : | my($kmer,$set,$fh) = @_; | ||
217 : | |||
218 : | my %funcs; | ||
219 : | foreach my $tuple (@$set) | ||
220 : | { | ||
221 : | my($fI,$oI,$off) = @$tuple; | ||
222 : | $funcs{$fI}++; | ||
223 : | } | ||
224 : | my @tmp = sort { $funcs{$b} <=> $funcs{$a} } keys(%funcs); | ||
225 : | overbeek | 1.3 | if ($tmp[0] && ($funcs{$tmp[0]} > (0.5 * @$set))) |
226 : | overbeek | 1.1 | { |
227 : | my $best_fI = $tmp[0]; | ||
228 : | my %otus; | ||
229 : | my $otu = ''; | ||
230 : | foreach my $tuple (@$set) | ||
231 : | { | ||
232 : | my($fI,$oI,$off) = @$tuple; | ||
233 : | if (($fI == $best_fI) && $oI) | ||
234 : | { | ||
235 : | $otus{$oI}++; | ||
236 : | } | ||
237 : | } | ||
238 : | @tmp = sort { $otus{$b} <=> $otus{$a} } keys(%otus); | ||
239 : | if ($tmp[0] && ($_ = &pickable_otu(\@tmp,\%otus,scalar @$set))) | ||
240 : | { | ||
241 : | $otu = $_; | ||
242 : | } | ||
243 : | my $sum_off = 0; | ||
244 : | my $n = 0; | ||
245 : | foreach my $tuple (@$set) | ||
246 : | { | ||
247 : | my($fI,$oI,$off) = @$tuple; | ||
248 : | if ($fI == $best_fI) | ||
249 : | { | ||
250 : | $sum_off += $off; | ||
251 : | $n++; | ||
252 : | } | ||
253 : | } | ||
254 : | print $fh join("\t",($kmer,int($sum_off/$n),$best_fI,$otu)),"\n"; | ||
255 : | } | ||
256 : | } | ||
257 : | |||
258 : | sub pickable_otu { | ||
259 : | my($poss_otus,$counts,$in_set) = @_; | ||
260 : | |||
261 : | my @called; | ||
262 : | if (@$poss_otus == 1) { return $poss_otus->[0] } | ||
263 : | my $best; | ||
264 : | while (($best = shift @$poss_otus) && ($counts->{$best} >= (0.2 * $in_set))) | ||
265 : | { | ||
266 : | push(@called,$best); | ||
267 : | } | ||
268 : | if (@called > 0) | ||
269 : | { | ||
270 : | overbeek | 1.2 | return join(",",sort { $a <=> $b } @called); |
271 : | overbeek | 1.1 | } |
272 : | return ''; | ||
273 : | } | ||
274 : | |||
275 : | sub extend_otu_index { | ||
276 : | my($DataD) = @_; | ||
277 : | |||
278 : | my @otus = map { chop; [split(/\t/,$_)] } `cat $dataD/otu.index`; | ||
279 : | my $nxt = $otus[-1]->[0] + 1; | ||
280 : | open(EXT,">>$DataD/otu.index") || die "could not extend otu.index"; | ||
281 : | foreach $_ (`cut -f4 $DataD/reduced.kmers | grep ',' | sort -u`) | ||
282 : | { | ||
283 : | print EXT join("\t",($nxt++,$_)); | ||
284 : | } | ||
285 : | close(EXT); | ||
286 : | } | ||
287 : | |||
288 : | sub update_kmers_with_extended_OTUs { | ||
289 : | my($dataD) = @_; | ||
290 : | |||
291 : | my %to_oI; | ||
292 : | foreach $_ (`cat $dataD/otu.index`) | ||
293 : | { | ||
294 : | if ($_ =~ /^(\d+)\t(\S+)/) | ||
295 : | { | ||
296 : | $to_oI{$2} = $1; | ||
297 : | } | ||
298 : | } | ||
299 : | open(IN,"<$dataD/reduced.kmers") || die "bad"; | ||
300 : | open(OUT,">$dataD/final.kmers") || die "could not open final.kmers"; | ||
301 : | while (defined($_ = <IN>)) | ||
302 : | { | ||
303 : | if ($_ !~ /,/) { print OUT $_ } | ||
304 : | else | ||
305 : | { | ||
306 : | chop; | ||
307 : | my($kmer,$off,$fI,$otus) = split(/\t/,$_); | ||
308 : | my $otu = $to_oI{$otus}; | ||
309 : | print OUT join("\t",($kmer,$off,$fI,$otu)),"\n"; | ||
310 : | } | ||
311 : | } | ||
312 : | close(IN); | ||
313 : | close(OUT); | ||
314 : | } | ||
315 : | |||
316 : | |||
317 : |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |