Parent Directory
|
Revision Log
Revision 1.18 - (view) (download) (as text)
1 : | overbeek | 1.1 | |
2 : | olson | 1.10 | # This is a SAS component |
3 : | |||
4 : | overbeek | 1.1 | # |
5 : | # Copyright (c) 2003-2006 University of Chicago and Fellowship | ||
6 : | # for Interpretations of Genomes. All Rights Reserved. | ||
7 : | # | ||
8 : | # This file is part of the SEED Toolkit. | ||
9 : | # | ||
10 : | # The SEED Toolkit is free software. You can redistribute | ||
11 : | # it and/or modify it under the terms of the SEED Toolkit | ||
12 : | # Public License. | ||
13 : | # | ||
14 : | # You should have received a copy of the SEED Toolkit Public License | ||
15 : | # along with this program; if not write to the University of Chicago | ||
16 : | # at info@ci.uchicago.edu or the Fellowship for Interpretation of | ||
17 : | # Genomes at veronika@thefig.info or download a copy from | ||
18 : | # http://www.theseed.org/LICENSE.TXT. | ||
19 : | # | ||
20 : | package ProtSims; | ||
21 : | |||
22 : | use strict; | ||
23 : | olson | 1.4 | use Sim; |
24 : | overbeek | 1.1 | use Data::Dumper; |
25 : | use Carp; | ||
26 : | use gjoseqlib; | ||
27 : | olson | 1.15 | use DB_File; |
28 : | overbeek | 1.1 | |
29 : | olson | 1.13 | use SeedAware; |
30 : | |||
31 : | my $blat_cmd = SeedAware::executable_for("blat"); | ||
32 : | my $blastall_cmd = SeedAware::executable_for("blastall"); | ||
33 : | my $formatdb_cmd = SeedAware::executable_for("formatdb"); | ||
34 : | olson | 1.10 | |
35 : | olson | 1.17 | # |
36 : | # Need to call temps different names on each invocation - | ||
37 : | # if a subprocess hung on windows, we get problems with | ||
38 : | # "file in use" errors. | ||
39 : | # | ||
40 : | my $tmp_serial = 0; | ||
41 : | |||
42 : | overbeek | 1.1 | sub blastP { |
43 : | overbeek | 1.12 | my($q,$db,$min_hits,$use_blast) = @_; |
44 : | overbeek | 1.1 | |
45 : | overbeek | 1.2 | my(@q,@db); |
46 : | |||
47 : | olson | 1.13 | my $tmp_dir = SeedAware::location_of_tmp(); |
48 : | |||
49 : | olson | 1.18 | my $tmp_suffix = $$ . "." . $tmp_serial++ . "." . time; |
50 : | olson | 1.17 | |
51 : | overbeek | 1.1 | my $qF; |
52 : | if (ref $q) | ||
53 : | { | ||
54 : | olson | 1.17 | $qF = "$tmp_dir/query.$tmp_suffix"; |
55 : | overbeek | 1.1 | &gjoseqlib::print_alignment_as_fasta($qF,$q); |
56 : | } | ||
57 : | else | ||
58 : | { | ||
59 : | $qF = $q; | ||
60 : | if (-e $qF) | ||
61 : | { | ||
62 : | overbeek | 1.2 | @q = &gjoseqlib::read_fasta($qF); |
63 : | overbeek | 1.1 | $q = \@q; |
64 : | } | ||
65 : | else | ||
66 : | { | ||
67 : | die "$qF is missing"; | ||
68 : | } | ||
69 : | } | ||
70 : | |||
71 : | olson | 1.15 | my $db_lengths = {}; |
72 : | |||
73 : | overbeek | 1.1 | my $dbF; |
74 : | if (ref $db) | ||
75 : | { | ||
76 : | olson | 1.17 | $dbF = "$tmp_dir/db.$tmp_suffix"; |
77 : | overbeek | 1.1 | &gjoseqlib::print_alignment_as_fasta($dbF,$db); |
78 : | olson | 1.13 | system($formatdb_cmd, '-i', $dbF); |
79 : | olson | 1.15 | $db_lengths->{$_->[0]} = length($_->[2]) for @$db; |
80 : | overbeek | 1.1 | } |
81 : | else | ||
82 : | { | ||
83 : | $dbF = $db; | ||
84 : | if (-e $dbF) | ||
85 : | { | ||
86 : | if (! ((-e "$dbF.psq") && ((-M "$dbF.psq") < (-M $dbF)))) | ||
87 : | { | ||
88 : | olson | 1.13 | system($formatdb_cmd, '-i', $dbF);; |
89 : | overbeek | 1.1 | } |
90 : | olson | 1.15 | |
91 : | my $db_tie; | ||
92 : | if (-f "$dbF.lengths") | ||
93 : | { | ||
94 : | $db_tie = tie %$db_lengths, 'DB_File', "$dbF.lengths", O_RDONLY, 0, $DB_BTREE; | ||
95 : | $db_tie or warn "Cannot tie $dbF.lengths: $!"; | ||
96 : | } | ||
97 : | if (!defined($db_tie)) | ||
98 : | { | ||
99 : | if (open(my $dbfh, "<", $dbF)) | ||
100 : | { | ||
101 : | while (my($id, $def, $seq) = gjoseqlib::read_next_fasta_seq($dbfh)) | ||
102 : | { | ||
103 : | $db_lengths->{$id} = length($seq); | ||
104 : | } | ||
105 : | close($dbfh); | ||
106 : | } | ||
107 : | else | ||
108 : | { | ||
109 : | warn "Cannot open $dbF: $!"; | ||
110 : | } | ||
111 : | } | ||
112 : | overbeek | 1.1 | } |
113 : | else | ||
114 : | { | ||
115 : | die "$dbF is missing"; | ||
116 : | } | ||
117 : | } | ||
118 : | |||
119 : | olson | 1.17 | my $tmpF = "$tmp_dir/sim.out.$tmp_suffix"; |
120 : | olson | 1.13 | |
121 : | if ($use_blast) | ||
122 : | { | ||
123 : | open(my $fh, ">", $tmpF); | ||
124 : | close($fh); | ||
125 : | } | ||
126 : | else | ||
127 : | olson | 1.4 | { |
128 : | olson | 1.13 | my @cmd = ($blat_cmd, $dbF, $qF, "-prot", "-out=blast8", $tmpF); |
129 : | |||
130 : | olson | 1.16 | # |
131 : | # When running under FCGI, the system_with_redirect fails due to | ||
132 : | # the FCGI library redefining open. | ||
133 : | # | ||
134 : | |||
135 : | my $rc; | ||
136 : | if (defined($ENV{FCGI_ROLE})) | ||
137 : | { | ||
138 : | $rc = system(@cmd); | ||
139 : | } | ||
140 : | else | ||
141 : | { | ||
142 : | $rc = system_with_redirect(\@cmd, { stdout => '/dev/null' } ); | ||
143 : | } | ||
144 : | |||
145 : | olson | 1.13 | print STDERR "Blat returns $rc: @cmd\n"; |
146 : | if ($rc != 0) | ||
147 : | { | ||
148 : | warn "Blat run failed with rc=$rc\n"; | ||
149 : | open(my $fh, ">", $tmpF); | ||
150 : | close($fh); | ||
151 : | } | ||
152 : | olson | 1.4 | } |
153 : | overbeek | 1.1 | |
154 : | olson | 1.13 | # my $cmd = $use_blast ? "touch $tmpF" : "$blat_cmd $dbF $qF -prot -out=blast8 $tmpF > /dev/null"; |
155 : | # #print STDERR "$cmd\n"; | ||
156 : | # my $rc = system $cmd; | ||
157 : | # if ($rc != 0) | ||
158 : | # { | ||
159 : | # die "ProtSims::blastP: blat run failed with rc=$rc: $cmd\n"; | ||
160 : | # } | ||
161 : | |||
162 : | overbeek | 1.5 | my @sims1 = (); |
163 : | open(BLAT,"<$tmpF") || die "could not open $tmpF"; | ||
164 : | my $sim = <BLAT>; | ||
165 : | while ($sim && ($sim=~ /^(\S+\t\S+)/)) | ||
166 : | { | ||
167 : | my $pegs = $1; | ||
168 : | my @pieces = (); | ||
169 : | while ($sim && ($sim=~ /^((\S+\t\S+).*\S)/) && ($2 eq $pegs)) | ||
170 : | { | ||
171 : | push(@pieces,[split(/\t/,$1)]); | ||
172 : | $sim = <BLAT>; | ||
173 : | } | ||
174 : | push(@sims1,&condense(\@pieces)); | ||
175 : | } | ||
176 : | close(BLAT); | ||
177 : | overbeek | 1.1 | unlink $tmpF; |
178 : | olson | 1.4 | #print STDERR &Dumper(sims1 => \@sims1); |
179 : | overbeek | 1.1 | |
180 : | my @rerun = (); | ||
181 : | my @sims = (); | ||
182 : | my $qI = 0; | ||
183 : | my $simI = 0; | ||
184 : | while ($qI < @$q) | ||
185 : | { | ||
186 : | my $peg = $q->[$qI]->[0]; | ||
187 : | olson | 1.4 | #print STDERR "processing $peg\n"; |
188 : | overbeek | 1.1 | my $hits = 0; |
189 : | my $simI1 = $simI; | ||
190 : | while (($simI1 < @sims1) && ($sims1[$simI1]->[0] eq $peg)) | ||
191 : | { | ||
192 : | if (($simI1 == $#sims1) || ($sims1[$simI1]->[1] ne $sims1[$simI1+1]->[1])) | ||
193 : | { | ||
194 : | $hits++; | ||
195 : | } | ||
196 : | $simI1++; | ||
197 : | } | ||
198 : | olson | 1.4 | #print STDERR "hits=$hits min_hits=$min_hits\n"; |
199 : | overbeek | 1.1 | if ($hits >= $min_hits) |
200 : | { | ||
201 : | olson | 1.4 | push(@sims,@sims1[$simI..$simI1-1]); |
202 : | overbeek | 1.1 | } |
203 : | else | ||
204 : | { | ||
205 : | push(@rerun,$q->[$qI]); | ||
206 : | } | ||
207 : | $simI = $simI1; | ||
208 : | $qI++; | ||
209 : | } | ||
210 : | olson | 1.4 | #print STDERR &Dumper(rerun => \@rerun); |
211 : | overbeek | 1.1 | |
212 : | if (@rerun > 0) | ||
213 : | { | ||
214 : | olson | 1.17 | my $tmpQ = "$tmp_dir/tmpQ.$tmp_suffix"; |
215 : | overbeek | 1.1 | &gjoseqlib::print_alignment_as_fasta($tmpQ,\@rerun); |
216 : | olson | 1.16 | |
217 : | |||
218 : | # | ||
219 : | # If we're under FCGI (and thus under the servers), and the loadavg is low, do a small parallel run. | ||
220 : | # | ||
221 : | |||
222 : | my @par = (); | ||
223 : | if (defined($ENV{FCGI_ROLE}) && open(my $la, "/proc/loadavg")) | ||
224 : | { | ||
225 : | my $l = <$la>; | ||
226 : | chomp $l; | ||
227 : | my @vals = split(/\s+/, $l); | ||
228 : | my @procs = split(/\//, $vals[3]); | ||
229 : | if ($vals[0] < 4 && $procs[0] < 8) | ||
230 : | { | ||
231 : | @par = ("-a", 4); | ||
232 : | } | ||
233 : | } | ||
234 : | |||
235 : | |||
236 : | olson | 1.13 | # my $cmd = "$blastall_cmd -m 8 -i $tmpQ -d $dbF -FF -p blastp -e 1e-5"; |
237 : | olson | 1.16 | my @cmd = ($blastall_cmd, '-m', 8, '-i', $tmpQ, '-d', $dbF, '-FF', '-p', 'blastp', '-e', '1e-5', @par); |
238 : | olson | 1.4 | #print STDERR "$cmd\n"; |
239 : | olson | 1.13 | #open(BL, "$cmd|") or die "ProtSims::blastP: pipe to blast failed with $!: $cmd\n"; |
240 : | |||
241 : | olson | 1.14 | # |
242 : | # It'd be nice to do this but windows doesn't support it. | ||
243 : | # | ||
244 : | #open(BL, "-|", @cmd) or die "ProtSims::blastP: pipe to blast failed with $!: @cmd\n"; | ||
245 : | olson | 1.17 | my $out_tmp = "$tmp_dir/blast_out.$tmp_suffix"; |
246 : | olson | 1.14 | push(@cmd, "-o", $out_tmp); |
247 : | print STDERR "@cmd\n"; | ||
248 : | my $rc = system(@cmd); | ||
249 : | if ($rc != 0) | ||
250 : | { | ||
251 : | warn "Error rc=$rc running blast: @cmd\n"; | ||
252 : | } | ||
253 : | else | ||
254 : | { | ||
255 : | if (open(BL, "<", $out_tmp)) | ||
256 : | { | ||
257 : | while (<BL>) | ||
258 : | { | ||
259 : | chomp; | ||
260 : | push @sims, [ split(/\s+/, $_) ]; | ||
261 : | } | ||
262 : | #my @blastout = map { chomp; [split(/\s+/,$_)] } <BL>; | ||
263 : | # push(@sims,@blastout); | ||
264 : | close(BL); | ||
265 : | } | ||
266 : | } | ||
267 : | unlink $out_tmp; | ||
268 : | overbeek | 1.1 | unlink $tmpQ; |
269 : | } | ||
270 : | |||
271 : | overbeek | 1.3 | my %lnQ = map { $_->[0] => length($_->[2]) } @$q; |
272 : | olson | 1.15 | |
273 : | @sims = map { push(@$_, $lnQ{$_->[0]}, $db_lengths->{$_->[1]}); bless($_,'Sim') } @sims; | ||
274 : | overbeek | 1.1 | |
275 : | olson | 1.17 | if ($qF eq "$tmp_dir/query.$tmp_suffix") { unlink $qF } |
276 : | if ($dbF eq "$tmp_dir/db.$tmp_suffix") { unlink($dbF,"$dbF.psq","$dbF.pin","$dbF.phr") } | ||
277 : | overbeek | 1.9 | return sort { ($a->id1 cmp $b->id1) or ($a->psc <=> $b->psc) or ($a->id2 cmp $b->id2) } @sims; |
278 : | overbeek | 1.5 | } |
279 : | |||
280 : | sub condense { | ||
281 : | my($pieces) = @_; | ||
282 : | |||
283 : | overbeek | 1.6 | my $best_sc = $pieces->[0]->[10]; |
284 : | overbeek | 1.5 | my @sims = sort { ($a->[6] <=> $b->[6]) } @$pieces; |
285 : | overbeek | 1.7 | while (@sims > 1) |
286 : | overbeek | 1.5 | { |
287 : | overbeek | 1.7 | my $gap1 = $sims[1]->[6] - $sims[0]->[7]; |
288 : | my $gap2 = $sims[1]->[8] - $sims[0]->[9]; | ||
289 : | my $diff = abs($gap1 - $gap2); | ||
290 : | if (($gap1 <= 300) && ($gap2 <= 300) && ($diff <= (0.1 * &min($gap1,$gap2)))) | ||
291 : | { | ||
292 : | $sims[0]->[7] = $sims[1]->[7]; | ||
293 : | $sims[0]->[9] = $sims[1]->[9]; | ||
294 : | } | ||
295 : | overbeek | 1.5 | splice(@sims,1,1); |
296 : | } | ||
297 : | overbeek | 1.6 | $sims[0]->[10] = $best_sc; |
298 : | overbeek | 1.5 | return $sims[0]; |
299 : | overbeek | 1.1 | } |
300 : | |||
301 : | overbeek | 1.7 | sub min { |
302 : | my($x,$y) = @_; | ||
303 : | return ($x <= $y) ? $x : $y; | ||
304 : | } | ||
305 : | |||
306 : | overbeek | 1.1 | 1; |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |