[Bio] / FigKernelScripts / svr_trim_ali.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/svr_trim_ali.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (view) (download) (as text)

1 : fangfang 1.1 #
2 :     # This is a SAS Component
3 :     #
4 :    
5 :     #
6 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
7 :     # for Interpretations of Genomes. All Rights Reserved.
8 :     #
9 :     # This file is part of the SEED Toolkit.
10 :     #
11 :     # The SEED Toolkit is free software. You can redistribute
12 :     # it and/or modify it under the terms of the SEED Toolkit
13 :     # Public License.
14 :     #
15 :     # You should have received a copy of the SEED Toolkit Public License
16 :     # along with this program; if not write to the University of Chicago
17 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
18 :     # Genomes at veronika@thefig.info or download a copy from
19 :     # http://www.theseed.org/LICENSE.TXT.
20 :     #
21 :    
22 :     use strict;
23 :     use Data::Dumper;
24 :     use Carp;
25 :     use Getopt::Long;
26 :    
27 :     =head1 svr_trim_ali
28 :    
29 :     svr_trim_ali [options] < ali.fa > trim.fa
30 :    
31 :     This script takes a FASTA file of aligned sequences, trims the
32 : fangfang 1.3 alignment by running PSIBLAST against the sequences themselves, and
33 :     writes the trimmed alignment to the standard output.
34 :    
35 :     =head2 Introduction
36 :    
37 :     usage: svr_trim_ali [options] < ali.fa > trim.fa
38 :    
39 : fangfang 1.4 -a align_tool - alignment tool to use: Clustal (D), MAFFT, Muscle.
40 : fangfang 1.3 -c - append trimming coordinates to description fields in FASTA
41 :     -d log_dir - direcotry for log files
42 :     -e log_prefix - prefix for log file names
43 :     -f fract_cov - fraction of sequences to be covered in initial trimming (D = 0.75)
44 :     -g - attempt no more than a single round of psiblast
45 :     -l - run trimming locally
46 :     -m - trim to median ends only
47 :     -r - first collapse seqs into representatives
48 :     -s max_reps_sim - threshold used to collapse seqs into representatives (D = 0.9)
49 : fangfang 1.8 -cd - trim to conserved domains
50 :     -html file - show clipped ends in lower letters in an html alignment
51 : fangfang 1.3
52 :     =head2 Command-Line options
53 :    
54 :     =over 4
55 :    
56 :     =item -a align_tool
57 :    
58 :     Alignment tool to use. The default is Clustal, which seems to deal
59 :     with end gaps better. If MAFFT is chosen, automatically selects an
60 :     appropriate strategy from L-INS-i, FFT-NS-i and FFT-NS-2, according to
61 :     data size.
62 :    
63 :     =item -c
64 :    
65 :     With the -c option, the coordiates of the trimmed sequences are
66 :     appended to the comment field of the output FASTA.
67 :    
68 :     =item -d log_dir
69 :    
70 :     Directory name for trimming log files. Without the -d option, log
71 :     files are not saved.
72 :    
73 :     =item -e log_prefix
74 :    
75 :     Prefix for log file names. Random digits are appended to the file
76 :     names so that existing files will not be clobbered.
77 :    
78 :     =item -f fract_cov (D = 0.75)
79 :    
80 :     Fraction of sequences to be covered in initial trimming. Use 0.5 for
81 :     trimming to medien ends.
82 :    
83 :     =item -g
84 :    
85 :     Without the -g option, more than one rounds of psiblast search may be
86 :     attempted to incorporate seqs with multiple hsps.
87 :    
88 :     =item -l
89 :    
90 :     Run trimming and psiblast locally.
91 :    
92 :     =item -m
93 :    
94 :     Trim to median ends (or a specified coverage fraction in the -f option) only.
95 :    
96 :     =item -r
97 :    
98 :     Use represetative sequences to reduce data size and over-represented sequences.
99 :    
100 :     =item -s max_reps_sim (D = 0.9)
101 :    
102 :     The similarity threshold used to collapse seqs into representatives.
103 :    
104 : fangfang 1.5 =item -t fract_ends (D = 0.1)
105 :    
106 :     The minimum fraction of ends falling in the same window of uncovered
107 :     amino acids that are considered significant for determining the
108 :     trimming cutoff. A smaller fraction value indicates more aggressive
109 :     trimming.
110 :    
111 :     =item -w window_size (D = 10)
112 :    
113 :     The size of the initial sliding window used to count instances of
114 :     sequences whose ends have similar number of uncovered amino acids. If
115 :     no cutoff value is found, additional rounds of calculation are carried
116 :     out with increasing window sizes. The effect of starting window size
117 :     on trimming is uncertain. A narrower starting window size usually
118 :     indicates less aggressive trimming, but it may have the opposite
119 :     effect when fract_ends is very small.
120 :    
121 : fangfang 1.8 =item -cd
122 :    
123 :     Trim to conserved domains. No psiblast search is attempted.
124 :    
125 :     =item -html file
126 :    
127 :     Generate an HTML file for visualizing trimmed alignment with clipped
128 :     ends in lower case.
129 :    
130 : fangfang 1.3 =back
131 :    
132 :     =head2 Input
133 :    
134 :     The input set of aligned sequences is read from STDIN.
135 :    
136 :     =head2 Output
137 :    
138 :     The set of trimmed sequences is written to STDOUT.
139 : fangfang 1.1
140 :     =cut
141 :    
142 : fangfang 1.3 use AlignTree;
143 :     use ATserver;
144 : fangfang 1.1 use SeedUtils;
145 : fangfang 1.3
146 : fangfang 1.1 use gjoalignment;
147 :     use gjoseqlib;
148 :    
149 : fangfang 1.3 my $usage = <<"End_of_Usage";
150 : fangfang 1.1
151 : fangfang 1.3 usage: svr_trim_ali [options] < ali.fa > trim.fa
152 : fangfang 1.1
153 : fangfang 1.5 -a align_tool - alignment tool to use: Clustal (D), MAFFT, Muscle
154 :     -c - append trimming coordinates to description fields in FASTA
155 : fangfang 1.6 -d log_dir - directory for log files
156 : fangfang 1.5 -e log_prefix - prefix for log file names
157 :     -f fract_cov - fraction of sequences to be covered in initial trimming (D = 0.75)
158 :     -g - attempt no more than a single round of psiblast
159 :     -l - run trimming locally
160 :     -m - trim to median ends only
161 :     -r - first collapse seqs into representatives
162 :     -s max_reps_sim - threshold used to collapse seqs into representatives (D = 0.9)
163 :     -t fract_ends - minimum fraction of ends to be considered significant for uncov cutoff (D = 0.1)
164 :     -w win_size - size of sliding window used in calculating uncov cutoff (D = 10)
165 : fangfang 1.8 -cd - trim to conserved domains
166 :     -html file - show clipped ends in lower letters in an html alignment
167 : fangfang 1.3
168 :     End_of_Usage
169 :    
170 : fangfang 1.8 my ($help, $local, $tool, $coord, $dir, $prefix, $fc, $fe,
171 :     $single, $median, $reps, $sim, $win, $cd, $html, $short);
172 : fangfang 1.3
173 : fangfang 1.4 GetOptions("h|help" => \$help,
174 :     "l|local" => \$local,
175 :     "a|tool=s" => \$tool,
176 :     "c" => \$coord,
177 :     "d=s" => \$dir,
178 :     "e=s" => \$prefix,
179 : fangfang 1.5 "f|fc=f" => \$fc,
180 : fangfang 1.4 "g" => \$single,
181 :     "m" => \$median,
182 : fangfang 1.7 "r|rep" => \$reps,
183 :     "s|sim=f" => \$sim,
184 : fangfang 1.5 "t|fe=f" => \$fe,
185 : fangfang 1.8 "w=i" => \$win,
186 :     "cd" => \$cd,
187 :     "html=s" => \$html,
188 :     "short" => \$short);
189 : fangfang 1.3
190 :     $help and die $usage;
191 :    
192 :     my $opts;
193 :    
194 : fangfang 1.8 $opts->{keep_def} = 1 if !$coord && !$html;
195 :     $opts->{single_round} = 1 if $single;
196 :     $opts->{skip_psiblast} = 1 if $median || $cd;
197 :     $opts->{to_domain} = 1 if $cd;
198 :     $opts->{use_reps} = 1 if $reps;
199 :     $opts->{fract_cov} = $fc if $fc;
200 :     $opts->{fract_ends} = $fe if $fe;
201 :     $opts->{log_dir} = $dir if $dir;
202 :     $opts->{log_prefix} = $prefix if $prefix;
203 :     $opts->{max_reps_sim} = $sim if $sim;
204 :     $opts->{win_size} = $win if $win;
205 :     $opts->{align_opts} = { tool => $tool } if $tool;
206 : fangfang 1.4
207 :     $opts->{align_opts}->{auto} = 1 if $tool =~ /mafft/i;
208 : fangfang 1.3
209 :     $opts->{ali} = gjoseqlib::read_fasta();
210 :    
211 :     my $AT;
212 :     my $trim;
213 :    
214 :     if ($local) {
215 :     $trim = AlignTree::trim_alignment($opts);
216 :     } else {
217 :     $AT = ATserver->new();
218 :     $trim = $AT->trim_ali($opts)->{rv};
219 :     }
220 : fangfang 1.1
221 : fangfang 1.3 gjoseqlib::print_alignment_as_fasta($trim);
222 : fangfang 1.1
223 : fangfang 1.8 if ($html) {
224 : fangfang 1.10 my @ali2 = map { [@$_[0,1], uc($_->[2])] } @{$opts->{ali}};
225 :     my %desc = map { $_->[0] => $_->[1] } @ali2;
226 :     my %coord = map { $_->[0] => substr($_->[1], length($desc{$_->[0]})) } @$trim;
227 : fangfang 1.9
228 : fangfang 1.8 my ($beg, $end) = (0, length($ali2[0]->[2]));
229 :     for (@ali2) {
230 : fangfang 1.10 $_->[1] = $desc{$_->[0]}. $coord{$_->[0]};
231 : fangfang 1.8 my $i = 0;
232 :     my $j = length($_->[2]) - 1;
233 :     if ($coord{$_->[0]} =~ /.* \((\d+)-(\d+)\/(\d+)\)/) {
234 :     my ($b, $e, $l) = ($1, $2, $3);
235 :     my $nongap = 0;
236 :     while ($nongap < $b-1) {
237 :     $nongap++ if substr($_->[2], $i++, 1) =~ /[A-Z]/i;
238 :     }
239 :     $nongap = 0;
240 :     while ($nongap < $l-$e) {
241 :     $nongap++ if substr($_->[2], $j--, 1) =~ /[A-Z]/i;
242 :     }
243 :     substr($_->[2], 0, $i) = lc substr($_->[2], 0, $i) if $i > 0;
244 :     substr($_->[2], $j+1) = lc substr($_->[2], $j+1) if $j+1 < length($_->[2]);
245 :     $beg = $i if $i > $beg;
246 :     $end = $j if $j < $end;
247 :     }
248 :     }
249 :     my $show = 20;
250 :     if ($short && $end-$beg > 2*$show) {
251 :     for (@ali2) {
252 :     $_->[2] = join(' ', substr($_->[2],0, $beg),
253 :     substr($_->[2],$beg, $show),
254 :     '---OMITTED---',
255 :     substr($_->[2], $end-$show+1, $show),
256 :     substr($_->[2], $end+1));
257 :     }
258 :     }
259 :     open(HTML, ">$html") or die "Could not write to $html";
260 :     my ($seqs2, $legend ) = gjoalign2html::color_alignment_by_consensus( { align => \@ali2 } );
261 :     print HTML gjoalign2html::alignment_2_html_page($seqs2, { legend => $legend, title => "Alignment showing trimmed regions" });
262 :     close(HTML);
263 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3