[Bio] / FigKernelPackages / AlignTree.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/AlignTree.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.37 - (view) (download) (as text)

1 : fangfang 1.1 # This is a SAS component
2 :     #
3 :     # Copyright (c) 2003-2010 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     #
18 :    
19 :     package AlignTree;
20 :    
21 :     use strict;
22 : fangfang 1.13
23 :     use Carp;
24 :     use Data::Dumper;
25 :    
26 :     use SeedAware;
27 :     use SeedUtils;
28 :    
29 :     use ffxtree;
30 : fangfang 1.1 use gjoseqlib;
31 :     use gjoalignment;
32 :     use gjoalignandtree;
33 :     use gjoparseblast;
34 : fangfang 1.6 use representative_sequences;
35 : fangfang 1.1
36 :     require Exporter;
37 :    
38 :     our @ISA = qw(Exporter);
39 : fangfang 1.6 our @EXPORT = qw( align_sequences
40 :     trim_alignment
41 : fangfang 1.11 psiblast_search
42 :     make_tree );
43 : fangfang 1.6
44 :    
45 :     #-------------------------------------------------------------------------------
46 :     #
47 :     # @align = align_sequences( \@seqs, \%opts )
48 :     # \@align = align_sequences( \@seqs, \%opts )
49 :     # @align = align_sequences( \%opts ) # \@seqs = $opts{seqs}
50 :     # \@align = align_sequences( \%opts ) # \@seqs = $opts{seqs}
51 :     #
52 :     # Options:
53 :     #
54 : fangfang 1.21 # tool => program # MAFFT (d), Muscle or clustal
55 :     # clustal_ends => bool # use clustal to align ends (zero end gap penalty) (D = 0)
56 : fangfang 1.6 #
57 :     # Other options supported in align_with_clustal, align_with_muscle or align_with_mafft
58 :     #
59 :     #-------------------------------------------------------------------------------
60 :    
61 :     sub align_sequences {
62 :     my ($seqs, $opts);
63 :    
64 :     if (@_ == 1 && ref $_[0] eq 'HASH') {
65 :     $opts = $_[0];
66 :     $seqs = $opts->{seqs};
67 :     } else {
68 :     ($seqs, $opts) = @_;
69 :     }
70 :     $opts->{version} or $seqs && ref $seqs eq 'ARRAY' && @$seqs
71 :     or die "align_sequences called with invalid sequences.\n";
72 :    
73 :     my $program;
74 :    
75 : fangfang 1.21 $opts->{tool} ||= 'mafft';
76 : fangfang 1.6
77 : fangfang 1.21 $opts->{muscle} = "/home/fangfang/bin/muscle" if -x "/home/fangfang/bin/muscle";
78 :     $opts->{mafft} = "/home/fangfang/bin/mafft" if -x "/home/fangfang/bin/mafft";
79 :    
80 :     if ($opts->{tool} =~ /muscle/i) { $program = \&gjoalignment::align_with_muscle }
81 :     elsif ($opts->{tool} =~ /mafft/i) { $program = \&gjoalignment::align_with_mafft }
82 :     elsif ($opts->{tool} =~ /clustal/i) { $program = \&gjoalignment::align_with_clustal }
83 : fangfang 1.6
84 :     if ($opts->{version}) {
85 :     if ($opts->{tool} =~ /clustal/i) { # version option not supported in gjoalignment::align_with_clustal
86 :     my $clustal = SeedAware::executable_for($opts->{clustalw} || $opts->{program} || 'clustalw');
87 :     my $tmpdir = SeedAware::location_of_tmp($opts);
88 :     my $tmpF = SeedAware::new_file_name("$tmpdir/version", '');
89 :    
90 :     SeedAware::system_with_redirect($clustal, "--version", {stdout => $tmpF});
91 :    
92 :     open(F, $tmpF) or die "Could not open $tmpF";
93 :     my @info = <F>;
94 :     close(F);
95 :     unlink($tmpF);
96 :     my $version = $info[3]; # fourth line of Clustal usage info
97 :     chomp($version);
98 :     return $version;
99 :     }
100 :     return $program->($seqs, $opts);
101 :     }
102 :    
103 : fangfang 1.35 my $ali = @$seqs > 1 ? $program->($seqs, $opts) : $seqs;
104 : fangfang 1.21
105 : fangfang 1.35 if ($opts->{clustal_ends} && $opts->{tool} != ~/clustal/i && @$ali > 1) {
106 : fangfang 1.21 my $opts2 = { global_coords => 1 };
107 :     trim_ali_to_conserved_domains($ali, $opts2);
108 :     my ($b, $e) = ($opts2->{global_beg}, $opts2->{global_end});
109 :     # print STDERR "b, e, l = $b, $e, ". length($ali->[0]->[2])."\n";
110 :     if (defined $b && defined $e && $b < $e) {
111 :     my %inner;
112 :     my @padded = @$ali;
113 :     my $padstr = 'PADDINGPADDING';
114 :     for (@padded) {
115 :     $inner{$_->[0]} = substr($_->[2], $b, $e-$b+1);
116 :     substr($_->[2], $b, $e-$b+1) = $padstr;
117 :     }
118 :     $ali = gjoalignment::align_with_clustal(@padded);
119 :     for (@$ali) {
120 :     $_->[2] =~ s/$padstr/$inner{$_->[0]}/e;
121 :     }
122 :     }
123 :     }
124 :    
125 : fangfang 1.6 wantarray ? @$ali : $ali;
126 :     }
127 :    
128 :    
129 :     #-------------------------------------------------------------------------------
130 :     #
131 : fangfang 1.21 # @trimmed_ali = trim_ali_to_conserved_domains( \@ali, \%opts )
132 :     # \@trimmed_ali = trim_ali_to_conserved_domains( \@ali, \%opts )
133 :     #
134 :     # Options
135 :     #
136 :     # win_size => size # size of sliding window used in scoring domain conservation
137 :     # domain_conserv => thresh # min mean domain conservation (D = 0.3)
138 :     # residue_conserv => thresh # min residule conservation for trimmed end sites (D = 0.1)
139 :     # global_coords => bool # return coordinates of the internal region
140 :     # between the relatively conserved end domains:
141 :     # $opts->{global_beg} = first site
142 :     # $opts->{global_end} = last site
143 :     #
144 :     #-------------------------------------------------------------------------------
145 :    
146 :     sub trim_ali_to_conserved_domains {
147 :     my ($ali, $opts) = @_;
148 :    
149 :     my $winsize = $opts->{win_size} || 10;
150 :     my $domain = $opts->{domain_conserv} || 0.3;
151 :     my $residue = $opts->{residue_conserv} || 0.1;
152 :    
153 :     my $conserv = residue_conserv_scores($ali);
154 :     my $len = length($ali->[0]->[2]);
155 :    
156 :     my ($b, $e);
157 :     my ($s1, $s2);
158 :     my ($r1, $r2);
159 :     for ($b = 0; $b < $len; $b++) {
160 :     $s1 += $conserv->[$b];
161 :     $s1 -= $conserv->[$b-$winsize] if $b >= $winsize;
162 :     last if $s1 >= $winsize * $domain && $b+1 >= $winsize;
163 :     }
164 :     for ($e = $len-1; $e > $b; $e--) {
165 :     $s2 += $conserv->[$e];
166 :     $s2 -= $conserv->[$e+$winsize] if $e+$winsize < $len;
167 :     last if $s2 >= $winsize * $domain && $len-$e >= $winsize;
168 :     }
169 :     if ($opts->{global_coords}) {
170 :     $opts->{global_beg} = $b+1;
171 :     $opts->{global_end} = $e-1;
172 :     }
173 :     $b = max($b-$winsize+1, 0); $b++ while $conserv->[$b] < $residue;
174 :     $e = min($e+$winsize-1, $len-1); $e-- while $conserv->[$e] < $residue;
175 :    
176 :     my @ali2 = map { [@$_[0,1], substr($_->[2], $b, $e-$b+1)] } @$ali;
177 :    
178 :     wantarray ? @ali2 : \@ali2;
179 :     }
180 :    
181 :     #-------------------------------------------------------------------------------
182 :     #
183 :     # Calculate conservation scores for a protein alignment:
184 :     #
185 :     # @conserv = residue_conserv_scores( \@ali);
186 :     # \@conserv = residue_conserv_scores( \@ali);
187 :     #
188 :     # The function returns an array of scores that correspond to the
189 :     # degree of conservation of each column in a protein alignment.
190 :     #
191 :     # Scores are in the range [0, 1] (1 indicates a column with all idential AAs)
192 :     #
193 :     # This metric is based on ClustalX's conservation score (Thompson,
194 :     # NAR 1997): One vector per sequence is defined for an alignment
195 :     # site. The vectors are defined in the space of amino acids using a
196 :     # substitution matrix. A mean vector is then calculated for the site
197 :     # and the final score is the average euclidean distance to the mean
198 :     # vector.
199 :     #
200 :     # It is unclear how the original method converts a distance to a
201 :     # similarity score. Here we use the following simple conversion:
202 :     #
203 :     # sim = min( 0, (1 - distance/MAX_DISTANCE) )
204 :     #
205 :     # where MAX_DISTANCE is hardcoded to 10.
206 :     #
207 :     #-------------------------------------------------------------------------------
208 :    
209 :     sub residue_conserv_scores {
210 :     my ($ali) = @_;
211 :    
212 :     my @seq = map { uc $_->[2] } @$ali;
213 :     my $len = length($seq[0]);
214 :     my $nseq = scalar @seq;
215 :     my $chars = qr/^[A-Za-z]$/;
216 :    
217 :     my ($aa_list, $blosum62) = gjoalign2html::raw_blosum62();
218 :     my $n_aa = scalar @$aa_list;
219 :     my $aa_str = join('', @$aa_list);
220 :     my %aa_index = map { $_ => index($aa_str, $_) } @$aa_list;
221 :    
222 :     my @conserv;
223 :    
224 :     for (my $i = 0; $i < $len; $i++) {
225 :     my %cnt;
226 :     my $nongap;
227 :     my @col = map { substr($_, $i, 1) } @seq;
228 :     foreach (@col) { if (/$chars/) { $cnt{$_}++; $nongap++ } }
229 :     if ($nongap == 0) {
230 :     push @conserv, 0;
231 :     next;
232 :     }
233 :     my @center;
234 :     my $ind;
235 :     my @dist;
236 :     my $total;
237 :     for my $ia (0..$n_aa-1) {
238 :     for my $c (keys %cnt) {
239 :     $ind = $aa_index{$c};
240 :     $center[$ia] += $cnt{$c} * $blosum62->[$ind]->[$ia];
241 :     }
242 :     $center[$ia] /= $nongap;
243 :     for my $c (keys %cnt) {
244 :     $ind = $aa_index{$c};
245 :     $dist[$ia] += $cnt{$c} * ($blosum62->[$ind]->[$ia] - $center[$ia]) ** 2;
246 :     }
247 :     $total += $dist[$ia];
248 :     }
249 :     $total = sqrt($total/$nongap);
250 :     $total = 1 - $total/10;
251 :     $total = 0 if $total < 0;
252 :     $total *= ($nongap / $nseq);
253 :    
254 :     push @conserv, $total;
255 :     }
256 :    
257 :     wantarray ? @conserv : \@conserv;
258 :     }
259 :    
260 :     #-------------------------------------------------------------------------------
261 :     #
262 : fangfang 1.6 # @align = trim_alignment( \@align, \%opts )
263 :     # \@align = trim_alignment( \@align, \%opts )
264 :     # @align = trim_alignment( \%opts ) # input \@align = $opts{ali}
265 :     # \@align = trim_alignment( \%opts ) # input \@align = $opts{ali}
266 :     #
267 :     # Options:
268 :     #
269 : fangfang 1.8 # align_opts => HASH # options for all alignment operations in trimming. Use Clustal by default.
270 :     # fract_cov => fract # fraction of sequences to be covered in initial trimming of ends (D: 0.75)
271 :     # fract_ends => fract # minimum fraction of ends to be considered significant for uncov cutoff (D = 0.1)
272 :     # keep_def => bool # do not append trimming coordinates to description fields in seqs
273 :     # log_dir => dir # directory for log files
274 :     # log_prefix => string # prefix for log file names
275 :     # max_reps_sim => thresh # threshold used to collapse seqs into representatives (D = 0.9)
276 :     # single_round => bool # if set to false, additional rounds of psiblast are attempted to incorporate seqs with multiple hsps.
277 :     # skip_psiblast => bool # trim to median ends only
278 : fangfang 1.21 # to_domain => bool # trim to end conserved domains
279 : fangfang 1.8 # use_reps => bool # first collapse seqs into representatives
280 :     # win_size => size # size of sliding window used in calculating uncov cutoff (D = 10)
281 : fangfang 1.6 #
282 : fangfang 1.21 # domain_conserv => thresh # min mean domain conservation (D = 0.3)
283 :     # residue_conserv => thresh # min residule conservation for trimmed end sites (D = 0.1)
284 :     #
285 : fangfang 1.6 #-------------------------------------------------------------------------------
286 :    
287 :     sub trim_alignment {
288 :     my ($ali, $opts);
289 :    
290 :     if (@_ == 1 && ref $_[0] eq 'HASH') {
291 :     $opts = $_[0];
292 :     $ali = $opts->{ali} || $opts->{seqs};
293 :     } else {
294 :     ($ali, $opts) = @_;
295 :     }
296 :     $ali && ref $ali eq 'ARRAY' && @$ali
297 :     or die "trim_alignment called with invalid alignment.\n";
298 :    
299 :     my $skip_psiblast = $opts->{skip_psiblast} ? 1 : 0;
300 :     my $single_round = $opts->{single_round} ? 1 : 0;
301 : fangfang 1.21 my $to_domain = $opts->{to_domain} ? 1 : 0;
302 : fangfang 1.6 my $log_dir = $opts->{log_dir} || SeedAware::location_of_tmp();
303 :     my $keep_log = $opts->{keep_log} || $opts->{log_dir} ? 1 : 0;
304 :     my $log_prefix = $opts->{log_prefix};
305 :     my $use_reps = $opts->{use_reps} || $opts->{max_reps_sim} ? 1 : 0;
306 :     my $max_reps_sim = $opts->{max_reps_sim} || 0.90;
307 :     my $fract_cov = $opts->{fract_cov} || 0.75;
308 : fangfang 1.8 my $fract_ends = $opts->{fract_ends} || 0.10;
309 :     my $win_size = $opts->{win_size} || 10;
310 : fangfang 1.21 my $keep_def = $opts->{keep_def};
311 : fangfang 1.6 my $align_opts = $opts->{align_opts};
312 :    
313 : fangfang 1.21 my $trim0 = $to_domain ? trim_ali_to_conserved_domains($ali, $opts) :
314 :     gjoalignandtree::trim_align_to_median_ends($ali, {begin => 1, end => 1, fract_cov => $fract_cov});
315 :    
316 :     if (!$keep_def && $skip_psiblast) {
317 :     my %full = map { $_->[0] => $_->[2] } @$ali;
318 :     for (@$trim0) {
319 :     my $s0 = $full{$_->[0]}; $s0 =~ tr/A-Za-z//cd; # remove gaps
320 :     my $s = $_->[2]; $s =~ tr/A-Za-z//cd;
321 :    
322 :     my $l = length($s0);
323 :     my $b = index(lc $s0, lc $s) + 1;
324 :     my $e = $b + length($s) - 1;
325 :     $_->[1] .= " ($b-$e/$l)" if $b > 1 || $e < $l;
326 :     }
327 :     }
328 : fangfang 1.6
329 :     return wantarray ? @$trim0 : $trim0 if $skip_psiblast;
330 :    
331 :     SeedUtils::verify_dir($log_dir);
332 :     my $log_stderr = $keep_log ? SeedAware::new_file_name("$log_dir/$log_prefix" . "psiblast.stderr") : "/dev/null";
333 :     my $log_report = $keep_log ? SeedAware::new_file_name("$log_dir/$log_prefix" . "psiblast.report") : "/dev/null";
334 :    
335 :     my $reps = $use_reps ? representative_sequences::rep_seq_2($trim0, {max_sim => $max_reps_sim}) : [@$trim0];
336 :     my $db = gjoseqlib::pack_sequences($ali);
337 :     my $profile = align_sequences($reps, $align_opts);
338 :     my $blast = gjoalignandtree::blastpgp($db, $profile, {stderr => $log_stderr});
339 :     my ($trimmed, $report) = process_psiblast_v2($blast, $opts);
340 :    
341 :     my $report_string = join("\n", map { join "\t", @$_ } values %$report) . "\n";
342 :    
343 :     print_string($log_report, $report_string) if $keep_log;
344 :    
345 :     my $new_ali = align_sequences($trimmed, $align_opts);
346 :     return wantarray ? @$new_ali : $new_ali if $single_round;
347 :    
348 :     my @to_trim = grep { $report->{$_->[0]} =~ /multiple hsps/ } @$trim0;
349 :    
350 :     my $round = 2;
351 :     while (@to_trim >= max(2, @$ali/10)) {
352 :     my $log_stderr = $keep_log ? SeedAware::new_file_name("$log_dir/$log_prefix" . "psiblast.stderr.$round") : "/dev/null";
353 :     my $log_report = $keep_log ? SeedAware::new_file_name("$log_dir/$log_prefix" . "psiblast.report.$round") : "/dev/null";
354 :     my $blast = gjoalignandtree::blastpgp($db, \@to_trim, {stderr => $log_stderr});
355 :     my ($trm, $rpt) = process_psiblast_v2($blast, $opts);
356 :     my @new_hits = grep { $report->{$_->[0]} =~ /multiple hsps/ } @$trm;
357 :     @to_trim = grep { $rpt->{$_->[0]} =~ /multiple hsps/ } @$trm;
358 :    
359 :     push @$trimmed, $_ for @new_hits;
360 :     }
361 : fangfang 1.7
362 : fangfang 1.6 $new_ali = align_sequences($trimmed, $align_opts);
363 :     wantarray ? @$new_ali : $new_ali;
364 :     }
365 : fangfang 1.1
366 :    
367 : fangfang 1.8 #-------------------------------------------------------------------------------
368 :     #
369 :     # \@seq = psiblast_search( $db, $profile, \%opts )
370 :     # ( \@seq, \%report ) = psiblast_search( $db, $profile, \%opts )
371 :     # \@seq = psiblast_search( $profile, \%opts )
372 :     # ( \@seq, \%report ) = psiblast_search( $profile, \%opts )
373 :     # \@seq = psiblast_search( \%opts )
374 :     # ( \@seq, \%report ) = psiblast_search( \%opts )
375 :     #
376 :     # $profile can be $profile_file_name or \@profile_seqs
377 :     # $db can be $db_file_name or \@db_seqs, or
378 : fangfang 1.25 # 'SEED', 'PSEED' or 'PPSEED', for protein seqs in complete genomes in annotator's SEED, PSEED or PUBLIC-PSEED
379 : fangfang 1.8 #
380 :     # If supplied as file, profile is pseudoclustal
381 :     #
382 :     # Report records:
383 :     #
384 : fangfang 1.23 # [ $sid, $scr, $e_value, $slen, $status,
385 :     # $frac_id, $frac_pos,
386 : fangfang 1.8 # $q_uncov_n_term, $q_uncov_c_term,
387 :     # $s_uncov_n_term, $s_uncov_c_term ]
388 :     #
389 :     # Options:
390 :     #
391 :     # e_value => $max_e_value # maximum blastpgp E-value (D = 0.01)
392 : fangfang 1.15 # incremental => bool # expand an initial set with multiple rounds of psiblast
393 : fangfang 1.8 # max_e_val => $max_e_value # maximum blastpgp E-value (D = 0.01)
394 :     # max_q_uncov => $aa_per_end # maximum unmatched query (D = min{20, qlen/3})
395 :     # max_q_uncov_c => $aa_per_end # maximum unmatched query, c-term (D = min{20, qlen/3})
396 :     # max_q_uncov_n => $aa_per_end # maximum unmatched query, n-term (D = min{20, qlen/3})
397 :     # min_ident => $frac_ident # minimum fraction identity (D = 0.15)
398 :     # min_positive => $frac_positive # minimum fraction positive scoring (D = 0.20)
399 : fangfang 1.21 # min_frac_cov => $frac_cov # minimum fraction coverage of query and subject sequence (D = 0.20)
400 :     # min_q_cov => $frac_cov # minimum fraction coverage of query sequence (D = 0.50)
401 :     # min_s_cov => $frac_cov # minimum fraction coverage of subject sequence (D = 0.20)
402 : fangfang 1.18 # max_reps_sim => $thresh # threshold used to collapse seqs into representatives (D = 0.8)
403 : fangfang 1.8 # nresult => $max_seq # maximim matching sequences (D = 5000)
404 : fangfang 1.26 # nthread => $n_thread # number of blastpgp threads (D = 2)
405 : fangfang 1.8 # query => $q_file_name # query sequence file (D = most complete)
406 :     # query => \@q_seq_entry # query sequence (D = most complete)
407 :     # stderr => $file # blastpgp stderr (D = /dev/stderr)
408 :     #
409 : fangfang 1.18 # Options for incremental search:
410 :     #
411 :     # max_reps_sim => $thresh # threshold used to collapse seqs into representatives (D = 0.9)
412 : fangfang 1.20 # min_seqs_for_reps => $n # only use representative seqs if number of seqs exceeds this threshold (D = 100)
413 : fangfang 1.18 # stop_round => $n # for incremental search, stop after a specified number of psiblast rounds (D = unlimited)
414 :     # use_reps => bool # collapse profile seqs into representatives before submitting to psiblast
415 :     #
416 : fangfang 1.8 # If supplied, query must be identical to a profile sequence, but no gaps.
417 :     #
418 :     #-------------------------------------------------------------------------------
419 :    
420 : fangfang 1.2 sub psiblast_search {
421 : fangfang 1.8 my ($db, $profile, $opts);
422 :    
423 :     if (@_ >= 2 && ref $_[1] eq 'ARRAY') { ($db, $profile, $opts) = @_ }
424 :     elsif (@_ >= 1 && ref $_[0] eq 'ARRAY') { ($profile, $opts) = @_ }
425 :     elsif (@_ == 1 && ref $_[0] eq 'HASH') { ($opts) = @_ }
426 :    
427 : fangfang 1.20 $opts->{nresult} ||= 5000;
428 : fangfang 1.26 $opts->{nthread} ||= 2;
429 : fangfang 1.20 $opts->{stderr} ||= '/dev/null';
430 : fangfang 1.8
431 :     $profile ||= $opts->{profile};
432 :     $db ||= $opts->{db} || 'SEED';
433 :    
434 : fangfang 1.28 # my $org_dir = "/home/fangfang/FIGdisk/FIG/Data/Organisms";
435 :     # my $org_dir = ${FIG_Config::data}; # does not work on bio-big
436 :     my $org_dir = "/vol/public-pseed/FIGdisk/FIG/Data/Organisms"; # complete genomes only
437 : fangfang 1.37 my $psi_dir = $ENV{PsiblastDB} || "/home/fangfang/WB/PsiblastDB";
438 : fangfang 1.28
439 : fangfang 1.8 if (ref $db ne 'ARRAY') {
440 :     if ($db =~ /^(\d+\.\d+)$/) { $db = "$org_dir/$1/Features/peg/fasta" }
441 : fangfang 1.25 elsif (uc $db eq 'PPSEED') { $db = "$psi_dir/public-pseed.complete" }
442 : fangfang 1.22 elsif (uc $db eq 'PSEED') { $db = "$psi_dir/ppseed.NR" }
443 : fangfang 1.14 elsif (uc $db eq 'SEED') { $db = "$psi_dir/SEED.complete.fasta" }
444 : fangfang 1.8 }
445 :    
446 : fangfang 1.15 my $inc = $opts->{incremental} || $opts->{inc};
447 :    
448 :     my ($hits, $report, $history) = $inc ? incremental_psiblast_search($db, $profile, $opts)
449 :     : gjoalignandtree::extract_with_psiblast($db, $profile, $opts);
450 : fangfang 1.8
451 : fangfang 1.15 wantarray ? ($hits, $report, $history) : $hits;
452 :     }
453 :    
454 : fangfang 1.21
455 : fangfang 1.32 sub db_name_to_file {
456 :     my ($db) = @_;
457 :     my $org_dir = "/vol/public-pseed/FIGdisk/FIG/Data/Organisms"; # complete genomes only
458 :     my $psi_dir = "/home/fangfang/WB/PsiblastDB/";
459 :     if ($db =~ /^(\d+\.\d+)$/) { $db = "$org_dir/$1/Features/peg/fasta" }
460 :     elsif (uc $db eq 'PPSEED') { $db = "$psi_dir/public-pseed.complete" }
461 :     elsif (uc $db eq 'PSEED') { $db = "$psi_dir/ppseed.NR" }
462 :     elsif (uc $db eq 'SEED') { $db = "$psi_dir/SEED.complete.fasta" }
463 :     return $db;
464 :     }
465 :    
466 : fangfang 1.21 #-------------------------------------------------------------------------------
467 :     #
468 :     # Incremental psiblast search from a small set of initial sequences,
469 :     # which may be unaligned.
470 :     #
471 :     # \@seq = incremental_psiblast_search( $db, $profile, \%opts )
472 :     # ( \@seq, \%report, \@history ) = incremental_psiblast_search( $db, $profile, \%opts )
473 :     #
474 :     # Options
475 :     #
476 :     # max_query_nseq => $n # stop when the number of query sequences exceeds this threshold
477 : fangfang 1.24 # max_reps_sim => $thresh # threshold used to collapse seqs into representatives (D = 0.95)
478 :     # min_seqs_for_reps => $n # only use representative seqs if number of seqs exceeds this threshold (D = 10)
479 : fangfang 1.21 # stop_round => $n # stop after a specified number of psiblast rounds (D = unlimited)
480 :     # use_reps => bool # always collapse profile seqs into representatives before submitting to psiblast
481 :     #
482 :     # Other options only affect the final round of psiblast.
483 :     #
484 :     # Report records are documented in psiblast_search().
485 :     #
486 :     # History consists a list 4-tuples, corresponding to search status
487 :     # at each psiblast round:
488 :     #
489 :     # [ profile_length, num_starting_seqs, num_trimmed_reps, num_psiblast_hits ]
490 :     #
491 :     # The psiblast hits at the end of each round are aligned, trimmed,
492 :     # and sorted. The top hits are then selected to form the set of
493 :     # profile sequences for the next round. The algorithm tries to
494 :     # expand the set cautiously. Unless the psiblast hits share high
495 :     # identity (~75%) with the profile, the set grows by no more than a
496 :     # factor of 2 each round. If neither stop_round nor max_query_nseq
497 :     # is specified, the process runs to convergence or until the number
498 :     # of profile sequences reaches 500.
499 :     #
500 :     #-------------------------------------------------------------------------------
501 :    
502 :     sub incremental_psiblast_search {
503 :     my ($db, $profile, $opts) = @_;
504 :    
505 :     my $stop_round = $opts->{stop_round};
506 :     my $max_query_nseq = $opts->{max_query_nseq} || 500;
507 : fangfang 1.24 my $max_reps_sim = $opts->{max_reps_sim} || 0.95;
508 :     my $min_reps_seqs = $opts->{min_seqs_for_reps} || 10;
509 : fangfang 1.26 my $nthread = $opts->{nthread} || 2;
510 : fangfang 1.21 my $fast = $opts->{fast_trimming} || $opts->{fast} ? 1 : 0;
511 :     my $use_reps = $opts->{use_reps} || $opts->{max_reps_sim} ? 1 : 0;
512 :    
513 :     my $initial_set_keep = 0.8;
514 :     my $significant_score = 150; # 150 = ~ (+ 75% id + %80 pos - 25 uncov)
515 :     my $expansion_factor = 2; # query set doubles each round
516 :     my $frac_id_weight = 100;
517 :     my $frac_pos_weight = 100;
518 :     my $uncov_penalty_q = 0.2;
519 :     my $uncov_penalty_s = 0.02;
520 :    
521 :     my @history;
522 :     my @prof = @$profile;
523 :    
524 : fangfang 1.26 my $opts2 = { tool => 'mafft', auto => 1, 'clustal_ends' => !$fast }; # align
525 :     my $opts3 = { query => $opts->{query}, e_value => 0.01, max_q_uncov => 500, nresult => 5000, stderr => '/dev/null', nthread => $nthread }; # psiblast
526 :     my $opts4 = $fast || $initial_set_keep >= 1 ? { to_domain => 1, skip_psiblast => 1 } : { align_opts => $opts2 }; # trim
527 : fangfang 1.21
528 :     # in case profile seqs are unaligned
529 :     my @lens = sort { $a <=> $b } map { length $_->[2] } @$profile;
530 :     if ($lens[0] != $lens[-1]) {
531 :     my $reps = $use_reps && @prof >= $min_reps_seqs ? representative_sequences::rep_seq_2(\@prof, {max_sim => $max_reps_sim}) : [@prof];
532 :     my $ali = align_sequences($reps, $opts2);
533 :     my $trim = trim_alignment($ali, $opts4);
534 :     my $redo = (@$trim / @$ali) < $initial_set_keep;
535 :     $trim = trim_alignment($ali, { skip_psiblast => 1 }) if $redo;
536 :     @prof = @$trim;
537 :    
538 :     my $record = [ length $ali->[0]->[2], scalar @$profile, scalar @$trim ];
539 :     push @history, $record;
540 :     print STDERR join("\t", @$record). "\n";
541 :     }
542 :    
543 :     $max_query_nseq = min($max_query_nseq, $opts->{nresult}) if $opts->{nresult} > 0;
544 :     my @seqs = @$profile;
545 : fangfang 1.25 my $query;
546 : fangfang 1.21 my $i = 0;
547 :     while (@seqs < $max_query_nseq) {
548 :     my ($hits, $report) = gjoalignandtree::extract_with_psiblast($db, \@prof, $opts3);
549 :     my $record = [ length $prof[0]->[2], scalar @seqs, scalar @prof, scalar keys %$report ];
550 :     push @history, $record;
551 :     print STDERR join("\t", @$record)."\n";
552 : fangfang 1.25 last unless @$hits > 0;
553 :    
554 : fangfang 1.21 my %score;
555 :     for (keys %$report) {
556 : fangfang 1.23 my ($scr, $exp, $slen, $status, $frac_id, $frac_pos, $uncov_q1, $uncov_q2, $uncov_s1, $uncov_s2) = @{$report->{$_}}[1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
557 : fangfang 1.21 next unless $status =~ /included/i;
558 :     my $uncov_q = $uncov_q1 + $uncov_q2;
559 :     my $uncov_s = $uncov_s1 + $uncov_s2;
560 :     $score{$_} = $frac_id * $frac_id_weight + $frac_pos * $frac_pos_weight
561 :     - $uncov_q * $uncov_penalty_q - $uncov_s * $uncov_penalty_s;
562 :     }
563 :     my @keep;
564 :     for (sort { $score{$b} <=> $score{$a} } keys %score) {
565 :     last if @keep / @prof >= $expansion_factor && $score{$_} < $significant_score;
566 :     push @keep, $_;
567 :     }
568 :     my %seen;
569 :     $seen{$_} = 1 for @keep;
570 :     @seqs = grep { $seen{$_->[0]} } @$hits;
571 : fangfang 1.25
572 :     my $qid = $opts3->{query_used}->[0];
573 :     my $reps = $use_reps && @seqs >= $min_reps_seqs ? representative_sequences::rep_seq_2(\@seqs, {max_sim => $max_reps_sim, keep_id => [$qid]}) : [@seqs];
574 : fangfang 1.21 my $ali = align_sequences($reps, $opts2);
575 :     my $trim = trim_alignment($ali, $opts4);
576 :    
577 : fangfang 1.25 my ($query) = grep { $_->[0] eq $qid } @$trim;
578 :     $opts3->{query} = $query;
579 :     # print STDERR '$reps = '. Dumper($reps);
580 :     # print STDERR '$query = '. Dumper($query);
581 :     # print STDERR "hits, seqs, trim = ". scalar@$hits. "\t". scalar@seqs. "\t". scalar@$trim . "\n";
582 :    
583 : fangfang 1.21 last if @$trim <= @prof || @$trim >= $max_query_nseq || ($stop_round && ++$i >= $stop_round);
584 :    
585 :     @prof = @$trim;
586 :     }
587 :    
588 : fangfang 1.25 my ($hits, $report) = gjoalignandtree::extract_with_psiblast($db, \@prof, $opts);
589 : fangfang 1.21 my $record = [ length $prof[0]->[2], scalar @seqs, scalar @prof, scalar @$hits ];
590 :     push @history, $record;
591 :     print STDERR join("\t", @$record). "\n";
592 :    
593 :     wantarray ? ($hits, $report, \@history) : $hits;
594 :     }
595 :    
596 : fangfang 1.32
597 :     #-------------------------------------------------------------------------------
598 :     #
599 :     # Blast wrapper
600 :     #
601 :     #-------------------------------------------------------------------------------
602 :    
603 :     sub blast {
604 :     my ($db, $query, $opts);
605 :    
606 :     if (@_ >= 2 && ref $_[1] eq 'ARRAY') { ($db, $query, $opts) = @_ }
607 :     elsif (@_ >= 1 && ref $_[0] eq 'ARRAY') { ($query, $opts) = @_ }
608 :     elsif (@_ == 1 && ref $_[0] eq 'HASH') { ($opts) = @_ }
609 :    
610 :     $db ||= $opts->{db} || $opts->{seqs} || db_name_to_file($db);
611 :     $query ||= $opts->{query};
612 :     $query = [ grep { $_->[0] eq $query } @$db ]->[0] if $db && (ref $db eq 'ARRAY') && @$db > 0;
613 :    
614 :     my $tmp = SeedAware::location_of_tmp( $opts );
615 :    
616 :     my ( $dbfile, $rm_db );
617 :     if ( defined $db && ref $db )
618 :     {
619 :     ref $db eq 'ARRAY'
620 :     && @$db
621 :     || print STDERR "blast requires one or more database sequences.\n"
622 :     && return undef;
623 :     $dbfile = SeedAware::new_file_name( "$tmp/blast_db" );
624 :     gjoseqlib::print_alignment_as_fasta( $dbfile, $db );
625 :     $rm_db = 1;
626 :     }
627 :     elsif ( defined $db && -f $db )
628 :     {
629 :     $dbfile = $db;
630 :     }
631 :     else
632 :     {
633 :     die "blastpgp requires database.";
634 :     }
635 :     verify_db( $dbfile, 'P' ); # protein
636 :    
637 :    
638 :     my ( $qfile, $rm_query );
639 :     if ( defined $query && ref $query )
640 :     {
641 :     ref $query eq 'ARRAY' && @$query == 3
642 :     or print STDERR "blast invalid query sequence.\n"
643 :     and return undef;
644 :    
645 :     $qfile = SeedAware::new_file_name( "$tmp/blast_query" );
646 :     gjoseqlib::print_alignment_as_fasta( $qfile, [$query] );
647 :     $rm_query = 1;
648 :     }
649 :     elsif ( defined $query && -f $query )
650 :     {
651 :     $qfile = $query;
652 :     ( $query ) = gjoseqlib::read_fasta( $qfile );
653 :     }
654 :     else
655 :     {
656 :     die "blast requires query.";
657 :     }
658 :    
659 :     my $e_val = $opts->{ e_value } || $opts->{ max_e_val } || $opts->{ max_e_value } || 0.01;
660 :     my $n_cpu = $opts->{ n_thread } || $opts->{ nthread } || $opts->{ n_cpu } || $opts->{ ncpu } || 2;
661 :     my $nkeep = $opts->{ n_result } || $opts->{ nresult } || 1000;
662 :     my $prog = $opts->{ program } || $opts->{ blast_program } || 'blastp';
663 :     my $stderr = $opts->{ stderr } || '/dev/null';
664 :    
665 :     my $blastall = SeedAware::executable_for( 'blastall' )
666 :     or print STDERR "Could not find executable for program 'blastall'.\n"
667 :     and return undef;
668 :    
669 :     my @cmd = ( $blastall,
670 :     '-p' => $prog,
671 :     '-d' => $dbfile,
672 :     '-i' => $qfile,
673 :     '-e' => $e_val,
674 :     '-b' => $nkeep,
675 :     '-v' => $nkeep,
676 : fangfang 1.33 '-F' => 'F'
677 : fangfang 1.32 );
678 :     push @cmd, ( '-a' => $n_cpu ) if $n_cpu;
679 :    
680 :     my $blastfh = SeedAware::read_from_pipe_with_redirect( @cmd, { stderr => $stderr } )
681 :     or print STDERR "Failed to open: '" . join( ' ', @cmd ), "'.\n"
682 :     and return undef;
683 :    
684 :     my $out = gjoparseblast::blast_hsp_list( $blastfh, 1 );
685 :     close $blastfh;
686 :    
687 :     if ( $rm_db )
688 :     {
689 :     my @files = grep { -f $_ } map { ( $_, "$_.psq", "$_.pin", "$_.phr" ) } $dbfile;
690 :     unlink @files if @files;
691 :     }
692 :     unlink $qfile if $rm_query;
693 :    
694 :     if ($opts->{m8} && $out && @$out > 0) {
695 :     my @records = map {
696 :     my ($qid, $qdef, $qlen, $sid, $sdef, $slen, $scr,
697 :     $e_val, $p_n, $p_val, $n_mat, $n_id, $n_pos, $n_gap,
698 :     $dir, $q1, $q2, $qseq, $s1, $s2, $sseq) = @$_;
699 :     join("\t", $qid, $sid, sprintf("%.2f", 100 * $n_id / $n_mat),
700 :     $n_mat, $n_mat - $n_id - $n_gap, $n_gap, $q1, $q2,
701 :     $s1, $s2, $e_val, $scr, $qlen, $slen) ."\n";
702 :     } @$out;
703 :     $out = join('', @records);
704 :     }
705 :    
706 :     return $out;
707 :     }
708 :    
709 :    
710 :    
711 : fangfang 1.15 #-------------------------------------------------------------------------------
712 :     #
713 :     # Incremental psiblast search from a small set of initial sequences,
714 :     # which may be unaligned.
715 :     #
716 :     # \@seq = incremental_psiblast_search( $db, $profile, \%opts )
717 :     # ( \@seq, \%report, \@history ) = incremental_psiblast_search( $db, $profile, \%opts )
718 :     #
719 :     # Options
720 :     #
721 : fangfang 1.18 # max_reps_sim => $thresh # threshold used to collapse seqs into representatives (D = 0.9)
722 :     # min_seqs_for_reps => $n # only use representative seqs if number of seqs exceeds this threshold (D = 100)
723 :     # stop_round => $n # stop after a specified number of psiblast rounds (D = unlimited)
724 :     # use_reps => bool # always collapse profile seqs into representatives before submitting to psiblast
725 : fangfang 1.15 #
726 :     # Other options only affect the final round of psiblast.
727 :     #
728 :     # Report records are documented in psiblast_search().
729 :     #
730 : fangfang 1.19 # History consists a list 4-tuples, corresponding to search status
731 : fangfang 1.15 # at each psiblast round:
732 :     #
733 : fangfang 1.19 # [ profile_length, num_starting_seqs, num_trimmed_reps, num_psiblast_hits ]
734 : fangfang 1.15 #
735 :     # The psiblast hits at the end of each round are aligned, trimmed,
736 :     # and sorted. The top hits are then selected to form the set of
737 :     # profile sequences for the next round. The algorithm tries to
738 :     # expand the set cautiously. Unless the psiblast hits share high
739 :     # identity (~75%) with the profile, the set grows by no more than a
740 :     # factor of 2 each round. If stop_round is not specified, the
741 :     # process runs to convergence or until the number of profile
742 :     # sequences reaches 500.
743 :     #
744 :     #-------------------------------------------------------------------------------
745 :    
746 : fangfang 1.21 sub incremental_psiblast_search_0 {
747 : fangfang 1.15 my ($db, $profile, $opts) = @_;
748 :    
749 :     my $stop_round = $opts->{stop_round};
750 : fangfang 1.18 my $max_reps_sim = $opts->{max_reps_sim} || 0.9;
751 :     my $min_reps_seqs = $opts->{min_seqs_for_reps} || 100;
752 :     my $use_reps = $opts->{use_reps} || $opts->{max_reps_sim} ? 1 : 0;
753 :    
754 : fangfang 1.15 my $max_nseq_clustal = 100;
755 :     my $initial_set_keep = 0.8;
756 :     my $max_query_nseq = 500;
757 :     my $significant_score = 150; # 150 = ~ (+ 75% id + %80 pos - 25 uncov)
758 :     my $expansion_factor = 2; # query set doubles each round
759 :     my $frac_id_weight = 100;
760 :     my $frac_pos_weight = 100;
761 :     my $uncov_penalty = 0.2;
762 :    
763 :     my @history;
764 :     my @prof = @$profile;
765 : fangfang 1.18
766 :     # in case profile seqs are unaligned
767 : fangfang 1.15 my @lens = sort { $a <=> $b } map { length $_->[2] } @$profile;
768 : fangfang 1.18 if ($lens[0] != $lens[-1]) {
769 :     my $reps = $use_reps && @prof >= $min_reps_seqs ? representative_sequences::rep_seq_2(\@prof, {max_sim => $max_reps_sim}) : [@prof];
770 : fangfang 1.20 my $opts2 = @$reps < $max_nseq_clustal ? { tool => 'clustal' } : { tool => 'mafft' };
771 : fangfang 1.18 my $ali = align_sequences($reps, $opts2);
772 : fangfang 1.15 my $trim = trim_alignment($ali, { align_opts => $opts2 });
773 :     my $redo = (@$trim / @$ali) < $initial_set_keep;
774 :     $trim = trim_alignment($ali, { skip_psiblast => 1 }) if $redo;
775 :     @prof = @$trim;
776 :    
777 : fangfang 1.18 my $record = [ length $ali->[0]->[2], scalar @$profile, scalar @$trim ];
778 :     push @history, $record;
779 :     print STDERR join("\t", @$record). "\n";
780 : fangfang 1.15 }
781 :    
782 :     my $opts3 = { e_value => 0.01, max_q_uncov => 1000, nresult => 1000, stderr => '/dev/null' };
783 :    
784 :     $max_query_nseq = min($max_query_nseq, $opts->{nresult}) if $opts->{nresult} > 0;
785 : fangfang 1.18 my @seqs = @$profile;
786 : fangfang 1.15 my $i = 0;
787 : fangfang 1.18 while (@seqs < $max_query_nseq) {
788 : fangfang 1.15 my ($hits, $report) = gjoalignandtree::extract_with_psiblast($db, \@prof, $opts3);
789 : fangfang 1.18 my $record = [ length $prof[0]->[2], scalar @seqs, scalar @prof, scalar keys %$report ];
790 :     push @history, $record;
791 :     print STDERR join("\t", @$record)."\n";
792 : fangfang 1.15 my %score;
793 :     for (keys %$report) {
794 : fangfang 1.23 my ($scr, $exp, $status, $frac_id, $frac_pos, $uncov_q1, $uncov_q2) = @{$report->{$_}}[1, 2, 4, 5, 6, 7, 8];
795 : fangfang 1.15 next unless $status =~ /included/i;
796 :     my $uncov = $uncov_q1 + $uncov_q2;
797 :     $score{$_} = $frac_id * $frac_id_weight + $frac_pos * $frac_pos_weight - $uncov * $uncov_penalty;
798 :     }
799 :     my @keep;
800 :     for (sort { $score{$b} <=> $score{$a} } keys %score) {
801 :     last if @keep / @prof >= $expansion_factor && $score{$_} < $significant_score;
802 :     push @keep, $_;
803 :     }
804 :     my %seen;
805 :     $seen{$_} = 1 for @keep;
806 : fangfang 1.18 @seqs = grep { $seen{$_->[0]} } @$hits;
807 :     my $reps = $use_reps && @seqs >= $min_reps_seqs ? representative_sequences::rep_seq_2(\@seqs, {max_sim => $max_reps_sim}) : [@seqs];
808 : fangfang 1.20 my $opts2 = @$reps < $max_nseq_clustal ? { tool => 'clustal' } : { tool => 'mafft' };
809 : fangfang 1.18 my $ali = align_sequences($reps, $opts2);
810 : fangfang 1.15 my $trim = trim_alignment($ali, { align_opts => $opts2 });
811 :    
812 :     last if @$trim <= @prof || ($stop_round && ++$i >= $stop_round);
813 : fangfang 1.20 # last if subset_ratio([map($_->[0], @$trim)], [map($_->[0], @$profile)]) < $initial_set_keep;
814 :    
815 : fangfang 1.15 @prof = @$trim;
816 :     }
817 :    
818 :     my ($hits, $report) = gjoalignandtree::extract_with_psiblast($db, \@prof, $opts);
819 : fangfang 1.18 my $record = [ length $prof[0]->[2], scalar @seqs, scalar @prof, scalar @$hits ];
820 :     push @history, $record;
821 :     print STDERR join("\t", @$record). "\n";
822 : fangfang 1.15
823 :     wantarray ? ($hits, $report, \@history) : $hits;
824 : fangfang 1.8 }
825 :    
826 : fangfang 1.15
827 : fangfang 1.8 #-------------------------------------------------------------------------------
828 : fangfang 1.20 # [ S1 \cap S2 ] / [ S2 ]
829 :     #-------------------------------------------------------------------------------
830 :     sub subset_ratio {
831 :     my ($set1, $set2) = @_;
832 :     my %seen; $seen{$_}++ for @$set2;
833 :     my $cnt; $seen{$_} && $cnt++ for @$set1;
834 :     return @$set1 ? $cnt / @$set1 : 0;
835 :     }
836 :    
837 :    
838 :     #-------------------------------------------------------------------------------
839 : fangfang 1.8 #
840 :     # Options:
841 :     #
842 :     # keep_def => bool # do not append trimming coordinates to description fields in seqs
843 :     # fract_ends => $fract_ends # minimum fraction of ends to be considered significant for uncov cutoff (D = 0.1)
844 :     # max_q_uncov => $aa_per_end # maximum unmatched query (D = min{20, qlen/3})
845 :     # max_q_uncov_c => $aa_per_end # maximum unmatched query, c-term (D = min{20, qlen/3})
846 :     # max_q_uncov_n => $aa_per_end # maximum unmatched query, n-term (D = min{20, qlen/3})
847 :     # min_ident => $frac_ident # minimum fraction identity (D = 0.15)
848 :     # min_positive => $frac_positive # minimum fraction positive scoring (D = 0.20)
849 :     # min_q_cov => $aa_covered # minimum number of query characters covered (D = max{10, qlen/5})
850 :     # win_size => size # size of sliding window used in calculating uncov cutoff (D = 10)
851 :     #
852 :     #-------------------------------------------------------------------------------
853 :    
854 :     sub process_psiblast_v2 {
855 :     my ( $blast, $opts ) = @_;
856 :    
857 :     $blast && ref $blast eq 'ARRAY' or return ();
858 :     $opts && ref $opts eq 'HASH' or $opts = {};
859 :    
860 :     my( $qid, $qdef, $qlen, $qhits ) = @{ $blast->[0] };
861 :    
862 :     my $keep_def = $opts->{ keep_def } || $opts->{ keep_seq_def } ? 1 : 0;
863 :     my $fract_ends = $opts->{ fract_ends } || $opts->{ fraction_ends } || 0.1; # fraction of sequences ending in window
864 :     my $max_q_uncov_c = $opts->{ max_q_uncov_c } || $opts->{ max_q_uncov } || min(20, $qlen/3);
865 :     my $max_q_uncov_n = $opts->{ max_q_uncov_n } || $opts->{ max_q_uncov } || min(20, $qlen/3);
866 :     my $min_q_cov = $opts->{ min_q_cov } || $opts->{ min_nongaps } || max(10, $qlen/5);
867 :     my $min_ident = $opts->{ min_ident } || 0.15;
868 :     my $min_pos = $opts->{ min_positive } || 0.20;
869 :     my $win_size = $opts->{ win_size } || $opts->{ window_size } || 10;
870 :    
871 :     my (@uncov1, @uncov2);
872 :    
873 :     foreach my $sdata ( @$qhits ) {
874 :     my( $sid, $sdef, $slen, $hsps ) = @$sdata;
875 :     if ( one_real_hsp($hsps) ) {
876 :     # [ scr, exp, p_n, pval, nmat, nid, nsim, ngap, dir, q1, q2, qseq, s1, s2, sseq ]
877 :     # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
878 :     my( $q1, $q2, $s1, $s2 ) = ( @{ $hsps->[0] } )[9, 10, 12, 13];
879 :     push @uncov1, [ $q1-1, $s1-1 ];
880 :     push @uncov2, [ $qlen-$q2, $slen-$s2 ];
881 :     }
882 :     }
883 :    
884 :     my $q1_cut = calc_uncov_cut(\@uncov1, $fract_ends, $win_size) + 1;
885 :     my $q2_cut = $qlen - calc_uncov_cut(\@uncov2, $fract_ends, $win_size);
886 :    
887 :     my (@trimmed, %report, $status);
888 :    
889 :     foreach my $sdata ( @$qhits ) {
890 :     my( $sid, $sdef, $slen, $hsps ) = @$sdata;
891 :    
892 :     if ( one_real_hsp($hsps) ) {
893 :    
894 :     my $hsp0 = $hsps->[0];
895 : fangfang 1.23 my ($scr, $exp, $nmat, $nid, $npos, $q1, $q2, $qseq, $s1, $s2, $sseq) = (@$hsp0)[ 0, 1, 4, 5, 6, 9, 10, 11, 12, 13, 14 ];
896 : fangfang 1.8
897 :     my $uncov1 = $q1 <= $q1_cut ? 0 : $q1 - $q1_cut;
898 :     my $uncov2 = $q2 >= $q2_cut ? 0 : $q2_cut - $q2;
899 :    
900 :     if ( $q1-1 > $max_q_uncov_n ) { $status = 'missing start' }
901 :     elsif ( $qlen-$q2 > $max_q_uncov_c ) { $status = 'missing end' }
902 :     elsif ( $nid / $nmat < $min_ident ) { $status = 'low identity' }
903 :     elsif ( $npos / $nmat < $min_pos ) { $status = 'low positives' }
904 :     else {
905 :     my ($t1, $t2, $s1t, $s2t);
906 :    
907 :     ($sseq, $t1) = trim_5( $q1_cut - $q1, $qseq, $sseq );
908 :     ($sseq, $t2) = trim_3( $q2 - $q2_cut, $qseq, $sseq );
909 :    
910 :     $s1t = $s1 + $t1;
911 :     $s2t = $s2 - $t2;
912 :    
913 :     $sseq =~ s/-+//g;
914 :    
915 :     if ( length $sseq < $min_q_cov ) {
916 :     $status = 'tiny coverage'
917 :     } else {
918 :     $sdef .= " ($s1t-$s2t/$slen)" if !$keep_def && ($s1t > 1 || $s2t < $slen );
919 :     push @trimmed, [ $sid, $sdef, $sseq ];
920 :     $status = 'included';
921 :     }
922 :     }
923 : fangfang 1.23
924 :     my $frac_id = sprintf("%.3f", $nid/$nmat);
925 :     my $frac_pos = sprintf("%.3f", $npos/$nmat);
926 :    
927 :     $report{ $sid } = [ $sid, $scr, $exp, $slen, $status, $nid/$nmat, $npos/$nmat, $q1-1, $qlen-$q2, $s1-1, $slen-$s2 ];
928 : fangfang 1.8
929 :     } else { $status = 'multiple hsps' }
930 :     }
931 :    
932 :     wantarray ? ( \@trimmed, \%report ) : \@trimmed;
933 :     }
934 :    
935 :     sub trim_5 {
936 :     my ( $ntrim, $qseq, $sseq ) = @_;
937 :     return $sseq if $ntrim <= 0;
938 :     my ( $to_trim ) = $qseq =~ /^(([^-]-*){$ntrim})/;
939 :     my $trimmed = substr( $sseq, length($to_trim) );
940 :     wantarray ? ( $trimmed, length($to_trim) ) : $trimmed;
941 :     }
942 :    
943 :    
944 :     sub trim_3 {
945 :     my ( $ntrim, $qseq, $sseq ) = @_;
946 :     return $sseq if $ntrim <= 0;
947 :     my ( $to_trim ) = $qseq =~ /((-*[^-]){$ntrim})$/;
948 :     my $trimmed = substr( $sseq, 0, length($sseq) - length($to_trim) );
949 :     wantarray ? ( $trimmed, length($to_trim) ) : $trimmed;
950 :     }
951 :    
952 :     #-------------------------------------------------------------------------------
953 :     #
954 :     # Allow fragmentary matches inside of the highest-scoring hsp:
955 :     #
956 :     #-------------------------------------------------------------------------------
957 :    
958 :     sub one_real_hsp {
959 :     my ( $hsps ) = @_;
960 :     return 0 if ! ( $hsps && ( ref( $hsps ) eq 'ARRAY' ) && @$hsps );
961 :     return 1 if @$hsps == 1;
962 :    
963 :     my ( $q1_0, $q2_0 ) = ( @{ $hsps->[0] } )[9, 10];
964 :     for ( my $i = 1; $i < @$hsps; $i++ )
965 :     {
966 :     my ($q1, $q2) = ( @{ $hsps->[$i] } )[9, 10];
967 :     return 0 if $q1 < $q1_0 || $q2 > $q2_0;
968 :     }
969 :    
970 :     return 1;
971 :     }
972 :    
973 :     #-------------------------------------------------------------------------------
974 :     #
975 :     # Calculate the best cutoff from an array of uncov numbers:
976 :     #
977 :     # 1. Sort the uncov numbers
978 :     # 2. Use a sliding window to count the instances of uncovs in a specified range
979 :     # 3. Look for the first significant peak of uncovs from the highest uncov
980 :     #
981 :     #-------------------------------------------------------------------------------
982 :    
983 :     sub calc_uncov_cut {
984 :     my ($uncovs, $thresh, $winsize) = @_;
985 :     return undef unless $uncovs && ref $uncovs eq 'ARRAY' && @$uncovs;
986 :    
987 :     my @uncov = sort { $a->[0] <=> $b->[0] } @$uncovs;
988 :     return $uncov[0]->[0] if $uncov[0]->[0] == $uncov[-1]->[0];
989 :    
990 :     $thresh ||= 0.10;
991 :     $winsize ||= 10;
992 :    
993 :     my $min_count = int($thresh * @uncov) + 1;
994 :     my $imax = $uncov[-1]->[0];
995 :    
996 :     my $j1 = @uncov - 1;
997 :     my $j2 = @uncov - 1;
998 :     my $cnt = 0;
999 :     my $max_cnt = 0; # maximum seqs in uncov window range
1000 :     my $i_of_max_cnt; # center of moving window
1001 :    
1002 :     for (my $i = $imax; $j2 >= 0 && $i >= 0; $i--) {
1003 :     while ($j1 >= 0 && $uncov[$j1]->[0] >= $i - $winsize) {
1004 :     $cnt++;
1005 :     $j1--;
1006 :     }
1007 :     while ($j2 >= 0 && $uncov[$j2]->[0] > $i + $winsize) {
1008 :     $cnt--;
1009 :     $j2--;
1010 :     }
1011 :     if ($cnt > $max_cnt) {
1012 :     $max_cnt = $cnt;
1013 :     $i_of_max_cnt = $i;
1014 :     }
1015 :     last if ($cnt < $max_cnt && $max_cnt >= $min_count); # just past peak
1016 :     }
1017 :     if ($max_cnt >= $min_count) {
1018 :     return ($i_of_max_cnt > $winsize) ? $i_of_max_cnt - $winsize : 0;
1019 :     }
1020 :    
1021 :     return calc_uncov_cut($uncovs, $thresh, max($winsize+1, int($winsize*1.43)));
1022 :     }
1023 :    
1024 :     sub print_string {
1025 :     my ( $fh, $close, $unused ) = gjoseqlib::output_filehandle( shift );
1026 :     ( unshift @_, $unused ) if $unused; # modeled after gjoseqlib::print_alignment_as_fasta
1027 :    
1028 :     my $str = shift;
1029 :     return unless $str;
1030 :    
1031 :     print $fh $str;
1032 :     close $fh if $close;
1033 :     }
1034 :    
1035 : fangfang 1.11
1036 :    
1037 :     #-------------------------------------------------------------------------------
1038 :     #
1039 :     # $tree = make_tree( \@ali, \%opts )
1040 :     # ( $tree, \%stats ) = make_tree( \@ali, \%opts )
1041 :     # $tree = make_tree( \%opts ) # \@ali = $opts{ali}
1042 :     # ( $tree, \%stats ) = make_tree( \%opts ) # \@ali = $opts{ali}
1043 :     #
1044 :     # Options:
1045 :     #
1046 : fangfang 1.29 # bootstrap => n # bootstrap samples (D = 0)
1047 : fangfang 1.17 # tool => program # fasttree (d), phyml, raxml
1048 : fangfang 1.11 #
1049 : fangfang 1.17 # params => parameter string for tree tool
1050 :     # (can be superseded by the following common options)
1051 : fangfang 1.11 #
1052 :     # search => topolog_search_method # NNI (d), SPR
1053 :     # model => substitution_model # nt: HKY85, JC69, K80, F81, F84, TN93
1054 :     # # aa: LG, JTT, WAG, MtREV, Dayhoff, DCMut
1055 :     # rate => rate_distribution # Gamma (d), Uniform
1056 :     # nclasses => num_subst_categories # 4 (d)
1057 :     # optimize => all (d, topology && brlen && parameters), eval (optimize model parameters only)
1058 :     # input => input tree # tree file name
1059 : fangfang 1.34 # nproc => number of processors to use for bootstrap
1060 : fangfang 1.11 #
1061 :     # Option default values depend the tool used. See details in:
1062 :     # ffxtree:: tree_with_fasttree, tree_with_phyml, tree_with_raxml
1063 :     #
1064 :     #-------------------------------------------------------------------------------
1065 :    
1066 :     sub make_tree {
1067 : fangfang 1.12 my ($ali, $opts) = ffxtree::process_input_args_w_ali(@_);
1068 : fangfang 1.11
1069 : fangfang 1.23 return unless @$ali >= 3;
1070 :    
1071 : fangfang 1.11 my $program;
1072 :    
1073 :     $opts->{tool} ||= 'fasttree';
1074 :    
1075 :     if ($opts->{tool} =~ /fasttree/i) { $program = \&ffxtree::tree_with_fasttree; $opts->{fasttree} = "/home/fangfang/bin/fasttree" }
1076 :     elsif ($opts->{tool} =~ /phyml/i) { $program = \&ffxtree::tree_with_phyml; $opts->{phyml} = "/home/fangfang/bin/phyml" }
1077 :     elsif ($opts->{tool} =~ /raxml/i) { $program = \&ffxtree::tree_with_raxml; $opts->{raxml} = "/home/fangfang/bin/raxmlHPC" }
1078 :    
1079 : fangfang 1.34 my $nb = $opts->{bootstrap};
1080 :     my $np = $opts->{nproc};
1081 : fangfang 1.36 my $in = $opts->{input};
1082 :    
1083 :     my ($tree, $stats);
1084 : fangfang 1.34
1085 : fangfang 1.36 if ($nb > 0 && $in) {
1086 :     $tree = $in;
1087 :     } else {
1088 :     ($tree, $stats) = $program->($ali, $opts);
1089 :     }
1090 :    
1091 : fangfang 1.34 if ($nb > 0) {
1092 :     my @samples = map { my $a = gjoalignment::bootstrap_sample($ali); $a } 1..$nb;
1093 :     my @trees;
1094 :    
1095 :     if ($np >= 2) {
1096 :     eval {require Proc::ParallelLoop};
1097 :     my $tmpdir = SeedAware::location_of_tmp($opts);
1098 :     my @output = map { SeedAware::new_file_name("$tmpdir/tree_pareach_$_", 'newick') } 1..$nb;
1099 :     my @tuples; push @tuples, [$samples[$_], $output[$_], $opts] for 0..$nb-1;
1100 :     Proc::ParallelLoop::pareach(\@tuples, sub { my ($a, $f, $o) = @{$_[0]}; $o->{treefile} = $f; my $t = $program->($a, $o);}, { Max_Workers => $np });
1101 :     @trees = map { gjonewicklib::read_newick_tree($_) } @output;
1102 :     unlink $_ for grep { -e $_ } @output;
1103 :     } else {
1104 :     @trees = map { my $t = $program->($_, $opts); $t } @samples;
1105 : fangfang 1.17 }
1106 : fangfang 1.34
1107 : fangfang 1.36 $tree = gjonewicklib::reroot_newick_to_midpoint_w($tree) unless $in;
1108 : fangfang 1.34 $tree = gjophylip::bootstrap_label_nodes($tree, \@trees);
1109 : fangfang 1.17 }
1110 : fangfang 1.11
1111 :     wantarray() ? ($tree, $stats) : $tree;
1112 :     }
1113 :    
1114 :    
1115 : fangfang 1.20 sub pfam_scan {
1116 :     my ($seqs, $opts) = ffxtree::process_input_args_w_ali(@_);
1117 :    
1118 :     $seqs = gjoseqlib::pack_sequences($seqs);
1119 :    
1120 : fangfang 1.27 my $ncpu = $opts->{ncpu} || 2;
1121 : fangfang 1.20 my $scan = "/home/fangfang/bin/pfam_scan.pl";
1122 :     my $pfamdir = "/home/fangfang/Pfam";
1123 :     my $tmpdir = SeedAware::location_of_tmp($opts);
1124 :     my $tmpin = SeedAware::new_file_name("$tmpdir/pfam_input_seqs", 'fa');
1125 :    
1126 :     gjoseqlib::print_alignment_as_fasta($tmpin, $seqs);
1127 :    
1128 : fangfang 1.27 my @lines = SeedAware::run_gathering_output($scan, '-fasta', $tmpin, '-dir', $pfamdir, '-cpu', $ncpu);
1129 : fangfang 1.20
1130 :     my %pfam;
1131 :     for (@lines) {
1132 :     next if /^#/ || /^\s/;
1133 :     my ($id, $fam) = @{[split(/\s+/)]}[0,6];
1134 :     if ($pfam{$id}) {
1135 :     push @{$pfam{$id}}, $fam;
1136 :     } else {
1137 :     $pfam{$id} = [$fam];
1138 :     }
1139 :     }
1140 :    
1141 :     [ map { $pfam{$_} ? [ $_, $pfam{$_} ] : () } map { $_->[0] } @$seqs ];
1142 :     }
1143 :    
1144 :     sub print_pfam {
1145 :     my ($pfam) = @_;
1146 :    
1147 :     # print STDERR '$pfam = '. Dumper($pfam);
1148 :    
1149 :     my @lines = map { $_->[0] ."\t". join(" ", @{$_->[1]} ) } @$pfam;
1150 :    
1151 :     print join("\n", @lines). "\n";
1152 :     # wantarray ? @lines : join("\n", @lines). "\n";
1153 :     }
1154 :    
1155 : fangfang 1.8 #-------------------------------------------------------------------------------
1156 :     #
1157 :     # Obsolete routines
1158 :     #
1159 :     #-------------------------------------------------------------------------------
1160 :    
1161 :     sub psiblast_search_old {
1162 : fangfang 1.2 my ($profile, $opts) = @_;
1163 :    
1164 : fangfang 1.3 my $min_ident = $opts->{ min_ident } || 0.15;
1165 :     $opts->{ e_value } ||= $opts->{ max_e_val } ||= 0.01;
1166 :    
1167 : fangfang 1.4 # print STDERR Dumper($opts);
1168 :    
1169 : fangfang 1.2 my $dir = "/home/fangfang/WB/tmpsvr";
1170 :     my $logdir = "$dir/logs";
1171 :    
1172 :     my $complete = "/home/fangfang/WB/svr_complete.fasta"; # db (fasta or gjo seqs)
1173 :    
1174 :     my %trimmedH;
1175 :     my %rejectedH;
1176 :    
1177 :     my $i = 1;
1178 :     my $suffix = -s "$dir/trim2-$i" ? "-$i" : "";
1179 :     my $fprof = "$dir/trim2$suffix";
1180 :    
1181 :    
1182 :     # print "Running psiblast, querying trim2$suffix against complete genomes...\n";
1183 :    
1184 : fangfang 1.3 my $blast = gjoalignandtree::blastpgp($complete, $profile, { max_exp => $opts->{e_value}, stderr => "$logdir/psiblast2$suffix.stderr" });
1185 : fangfang 1.2 my($qid, $qdef, $qlen, $qhits) = @$blast;
1186 :     my @trimmed;
1187 :    
1188 :     open PSI, ">$logdir/psiblast2$suffix.dump" or die "could not open psiblast2$suffix.dump\n";
1189 :     print PSI Dumper($blast);
1190 :     close PSI;
1191 :    
1192 :     my $rej = "$dir/psiblast2$suffix.rejects";
1193 :     open REJ, ">$rej" or die "Could not open $rej";
1194 :    
1195 :     my @scores;
1196 :    
1197 :     my $max_uncov = 20;
1198 :     my $max_s_uncov = min($qlen*5, 500);
1199 :     foreach my $sdata (@$qhits) {
1200 :     my($sid, $sdef, $slen, $hsps) = @$sdata;
1201 :     my $explanation;
1202 :    
1203 :     # if (@$hsps == 1)
1204 :     if ( one_real_hsp($hsps) ) {
1205 :     # [ scr, exp, p_n, pval, nmat, nid, nsim, ngap, dir, q1, q2, qseq, s1, s2, sseq ]
1206 :     # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
1207 : fangfang 1.3 my($scr, $nmat, $nid, $q1, $q2, $qseq, $s1, $s2, $sseq) = (@{$hsps->[0]})[0, 4, 5, 9, 10, 11, 12, 13, 14];
1208 : fangfang 1.2
1209 :     if (($q1-1) + ($qlen-$q2) > 2 * $max_uncov) {
1210 :     $explanation = [ $sid, "short coverage", $q1, $qlen-$q2, $s1, $slen-$s2 ];
1211 :     } elsif ($s1-1 + $slen-$s2 > $max_s_uncov) {
1212 :     $explanation = [ $sid, "long subject", $q1, $qlen-$q2, $s1, $slen-$s2 ];
1213 : fangfang 1.3 } elsif ($nid / $nmat < $min_ident) {
1214 :     $explanation = [ $sid, "low identity", $nid / $nmat ];
1215 : fangfang 1.2 } else {
1216 :     $sseq =~ s/-+//g;
1217 :     my $entry = [$sid, $sdef, $sseq];
1218 :     push(@trimmed, $entry);
1219 :     $trimmedH{$sid} = [$entry, $scr] unless $trimmedH{$sid} && $trimmedH{$sid}->[1] >= $scr;
1220 :     }
1221 :     } else {
1222 :     $explanation = [ $sid, "multiple hsps" ];
1223 :     }
1224 :     if ($explanation) {
1225 :     print REJ join("\t", @$explanation)."\n";
1226 :     $rejectedH{$sid} ||= $explanation;
1227 :     }
1228 :     }
1229 :     close REJ;
1230 :     # &gjoseqlib::print_alignment_as_fasta("$dir/trim2$suffix-complete", \@trimmed);
1231 :     # printf "trim2$suffix-complete contains %d sequences\n", &num_seqs_in_fasta("$dir/trim2$suffix-complete");
1232 :    
1233 :     return \@trimmed;
1234 :    
1235 :     }
1236 :    
1237 : fangfang 1.6 sub trim_alignment_old {
1238 : fangfang 1.1 my ($ali, $opts) = @_;
1239 :     my $rv;
1240 :    
1241 :     my $profile;
1242 :    
1243 : fangfang 1.6 # my $dir = SeedAware::location_of_tmp( $opts );
1244 : fangfang 1.1 my $dir = "/home/fangfang/WB/tmpsvr";
1245 :    
1246 :     my $logdir = "$dir/logs";
1247 :     &SeedUtils::verify_dir($logdir);
1248 :    
1249 :     my $trim1 = gjoalignandtree::trim_align_to_median_ends($ali, { begin => 1, end => 1 });
1250 :     my $reps = [@$trim1];
1251 :    
1252 :     my (@trims, %rejectH, %successH);
1253 :    
1254 :     my $i = 0;
1255 :     my $nseq = @$trim1;
1256 :    
1257 :     while (@$trim1 >= 0.1 * $nseq && ($i == 0 || @$trim1 >= 2)) {
1258 :     $i++;
1259 :     my $suffix = $i > 1 ? "-$i" : "";
1260 :    
1261 :     # print "Running psiblast, querying trim1 [$i] against reps-0.8...\n";
1262 :    
1263 :     my $blast = gjoalignandtree::blastpgp($reps, $trim1, { stderr => "$logdir/psiblast1.stderr$suffix" });
1264 :     open PSI, ">$logdir/psiblast1.dump$suffix" or die "could not open psiblast1.dump$suffix\n";
1265 :     print PSI Dumper($blast);
1266 :     close PSI;
1267 :    
1268 :     my $opts = {};
1269 :     my ( $trimmed, $reject, $scoreH ) = process_psiblast1( $blast, $opts );
1270 :    
1271 :     push @trims, [ $_->[0], $_, $scoreH->{$_->[0]}, $i ] for @$trimmed;
1272 :     $successH{$_->[0]} = 1 for @$trimmed;
1273 :     push @{$rejectH{$_->[0]}}, $_ for @$reject;
1274 :    
1275 :     my $rej = "$dir/psiblast1.rejects$suffix";
1276 :     open REJ, ">", $rej or die "Could not open $rej";
1277 :     foreach ( @$reject ) {
1278 :     print REJ join( "\t", @$_ ), "\n";
1279 :     # print "REJ\t", join( "\t", @$_ ), "\n";
1280 :     }
1281 :     close REJ;
1282 :    
1283 :     # foreach (@$trimmed) {
1284 :     # print "GOOD\t", join("\t", $_->[0], $_->[2])."\n";
1285 :     # }
1286 :     # print "\n";
1287 :    
1288 :     &gjoseqlib::print_alignment_as_fasta("$dir/trim1-reps-0.8$suffix", $trimmed);
1289 :     # printf "trim1-reps-0.8$suffix contains %d sequences\n", scalar@$trimmed;
1290 :     $profile = [@$trimmed];
1291 :    
1292 :     my $ntrim1 = @$trim1;
1293 :    
1294 :     # take only the rejected seqs marked as multiple hsps
1295 :     my %multiple = map { $_->[0] => 1 } grep { $_->[1] =~ /multiple/i } @$reject;
1296 :     @$trim1 = grep { $multiple{$_->[0]} } @$trim1;
1297 :    
1298 :     # previous (wrong): take all the rejected seqs
1299 :     # @$trim1 = grep { !$successH{$_->[0]} } @$trim1;
1300 :    
1301 :     # take all the rejected seqs except tiny ones
1302 :     # my %nontiny = map { $_->[0] => 1 } grep { $_->[1] !~ /tiny/i } @$reject;
1303 :     # @$trim1 = grep { $nontiny{$_->[0]} } @$trim1;
1304 :    
1305 :     last if $ntrim1 == @$trim1;
1306 :     }
1307 :    
1308 :     if ($i > 1) {
1309 :     run("mv $dir/trim1-reps-0.8 $dir/trim1-reps-0.8-1");
1310 :     run("mv $dir/psiblast1.rejects $dir/psiblast1.rejects-1");
1311 :    
1312 :     my %seen;
1313 :     @trims = grep { !$seen{$_->[0]}++ } sort { $b->[2] <=> $a->[2] } @trims;
1314 :    
1315 :     my @fhs;
1316 :     foreach my $j (1 .. $i) {
1317 :     open($fhs[$j], ">$dir/trim1-reps-0.8-$j-ids") or die "Could not open $dir/trim1-reps-0.8-$j-ids";
1318 :     }
1319 :     foreach my $trm (@trims) {
1320 :     my $j = $trm->[3];
1321 :     my $fh = $fhs[$j];
1322 :     print $fh join("\t", $trm->[0], $trm->[2])."\n";
1323 :     }
1324 :     close $_ for @fhs[1..$i];
1325 :    
1326 :     @trims = map { $_->[1] } @trims;
1327 :     &gjoseqlib::print_alignment_as_fasta("$dir/trim1-reps-0.8", \@trims);
1328 :     $profile = [@trims];
1329 :    
1330 :     printf "After merging, trim1-reps-0.8 contains %d sequences\n", &num_seqs_in_fasta("$dir/trim1-reps-0.8");
1331 :    
1332 :     my $rej = "$dir/psiblast1.rejects";
1333 :     open REJ, ">", $rej or die "Could not open $rej";
1334 :     for (grep { @{$rejectH{$_}} == $i } keys %rejectH) {
1335 :     print REJ map { join("\t", @$_), "\n" } @{$rejectH{$_}};
1336 :     }
1337 :     close REJ;
1338 :     }
1339 :    
1340 :     # my $profile1 = &gjoseqlib::read_fasta("$dir/trim1-reps-0.8");
1341 :     my $profile1 = $profile;
1342 : fangfang 1.2 my $align2 = &gjoalignment::align_with_mafft($profile1);
1343 : fangfang 1.1 my $trim2 = &gjoalignandtree::trim_align_to_median_ends($align2, { begin => 1, end => 1 } );
1344 :    
1345 :     return $trim2;
1346 :     }
1347 :    
1348 :    
1349 : fangfang 1.6 sub process_psiblast1 {
1350 : fangfang 1.1 my ( $blast, $opts ) = @_;
1351 :    
1352 :     $blast && ref $blast eq 'ARRAY' or return ();
1353 :     $opts && ref $opts eq 'HASH' or $opts = {};
1354 :    
1355 :     my( $qid, $qdef, $qlen, $qhits ) = @$blast;
1356 :    
1357 :     my $fract_ends = $opts->{ fract_ends } || $opts->{ fraction_ends } || 0.1; # fraction of sequences ending in window
1358 :     my $max_uncov = $opts->{ max_uncov } || $opts->{ max_uncovered } || 20 > $qlen/3 ? ($qlen/3 + 1) : 20;
1359 :     my $min_cov = $opts->{ min_cov } || $opts->{ min_nongaps } || 10 < $qlen/5 ? ($qlen/5 + 1) : 10;
1360 :     my $win_size = $opts->{ win_size } || $opts->{ window_size } || 10;
1361 :    
1362 :     # print "max_uncovered = $max_uncov\n";
1363 :    
1364 :     my @uncov1;
1365 :     my @uncov2;
1366 :     foreach my $sdata ( @$qhits )
1367 :     {
1368 :     my( $sid, $sdef, $slen, $hsps ) = @$sdata;
1369 :     # if ( @$hsps == 1 )
1370 :     if ( one_real_hsp($hsps) )
1371 :     {
1372 :     # [ scr, exp, p_n, pval, nmat, nid, nsim, ngap, dir, q1, q2, qseq, s1, s2, sseq ]
1373 :     # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
1374 :     my( $q1, $q2, $s1, $s2 ) = ( @{ $hsps->[0] } )[9, 10, 12, 13];
1375 :     push @uncov1, [ $q1-1, $s1-1 ];
1376 :     push @uncov2, [ $qlen-$q2, $slen-$s2 ];
1377 :     }
1378 :     }
1379 :    
1380 :     my $q1_cut = calc_uncov_cut(\@uncov1, $fract_ends, $win_size) + 1;
1381 :     my $q2_cut = $qlen - calc_uncov_cut(\@uncov2, $fract_ends, $win_size);
1382 :    
1383 :     # print $qlen - $q2_cut, "\n";
1384 :     # print "q1_cut = $q1_cut q2_cut = $q2_cut qlen = $qlen\n";
1385 :    
1386 :     my @trimmed;
1387 :     my @rejected;
1388 :     my %scoreH;
1389 :     foreach my $sdata ( @$qhits ) {
1390 :     my( $sid, $sdef, $slen, $hsps ) = @$sdata;
1391 :     # if ( @$hsps == 1 )
1392 :     if ( one_real_hsp($hsps) ) {
1393 :     my( $scr, $q1, $q2, $qseq, $s1, $s2, $sseq ) = ( @{ $hsps->[0] } )[ 0, 9 .. 14 ];
1394 :     my $uncov1 = $q1 <= $q1_cut ? 0 : $q1 - $q1_cut;
1395 :     my $uncov2 = $q2 >= $q2_cut ? 0 : $q2_cut - $q2;
1396 :     if ( $uncov1 <= $max_uncov && $uncov2 <= $max_uncov ) {
1397 :     $sseq = trim_5( $q1_cut - $q1, $qseq, $sseq );
1398 :     $sseq = trim_3( $q2 - $q2_cut, $qseq, $sseq );
1399 :     $sseq =~ s/-+//g;
1400 :     if ( length($sseq) >= $min_cov ) {
1401 :     push @trimmed, [ $sid, $sdef, $sseq ];
1402 :     $scoreH{$sid} = $scr;
1403 :     } else {
1404 :     push @rejected, [ $sid, 'tiny coverage', $uncov1, $uncov2, $s1-1, $slen-$s2, length($sseq) ];
1405 :     }
1406 :     } else {
1407 :     push @rejected, [ $sid, 'short coverage', $uncov1, $uncov2, $s1-1, $slen-$s2 ];
1408 :     }
1409 :     } else {
1410 :     push @rejected, [ $sid, 'multiple hsps' ];
1411 :     }
1412 :     }
1413 :     return wantarray ? ( \@trimmed, \@rejected, \%scoreH ) : \@trimmed;
1414 :     }
1415 :    
1416 :    
1417 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3