[Bio] / FigKernelScripts / svr_trim_ali.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/svr_trim_ali.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : fangfang 1.1 #
2 :     # This is a SAS Component
3 :     #
4 :    
5 :     #
6 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
7 :     # for Interpretations of Genomes. All Rights Reserved.
8 :     #
9 :     # This file is part of the SEED Toolkit.
10 :     #
11 :     # The SEED Toolkit is free software. You can redistribute
12 :     # it and/or modify it under the terms of the SEED Toolkit
13 :     # Public License.
14 :     #
15 :     # You should have received a copy of the SEED Toolkit Public License
16 :     # along with this program; if not write to the University of Chicago
17 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
18 :     # Genomes at veronika@thefig.info or download a copy from
19 :     # http://www.theseed.org/LICENSE.TXT.
20 :     #
21 :    
22 :     use strict;
23 :     use Data::Dumper;
24 :     use Carp;
25 :     use Getopt::Long;
26 :    
27 :     =head1 svr_trim_ali
28 :    
29 :     svr_trim_ali [options] < ali.fa > trim.fa
30 :    
31 :     This script takes a FASTA file of aligned sequences, trims the
32 : fangfang 1.3 alignment by running PSIBLAST against the sequences themselves, and
33 :     writes the trimmed alignment to the standard output.
34 :    
35 :     =head2 Introduction
36 :    
37 :     usage: svr_trim_ali [options] < ali.fa > trim.fa
38 :    
39 : fangfang 1.4 -a align_tool - alignment tool to use: Clustal (D), MAFFT, Muscle.
40 : fangfang 1.3 -c - append trimming coordinates to description fields in FASTA
41 :     -d log_dir - direcotry for log files
42 :     -e log_prefix - prefix for log file names
43 :     -f fract_cov - fraction of sequences to be covered in initial trimming (D = 0.75)
44 :     -g - attempt no more than a single round of psiblast
45 :     -l - run trimming locally
46 :     -m - trim to median ends only
47 :     -r - first collapse seqs into representatives
48 :     -s max_reps_sim - threshold used to collapse seqs into representatives (D = 0.9)
49 :    
50 :     =head2 Command-Line options
51 :    
52 :     =over 4
53 :    
54 :     =item -a align_tool
55 :    
56 :     Alignment tool to use. The default is Clustal, which seems to deal
57 :     with end gaps better. If MAFFT is chosen, automatically selects an
58 :     appropriate strategy from L-INS-i, FFT-NS-i and FFT-NS-2, according to
59 :     data size.
60 :    
61 :     =item -c
62 :    
63 :     With the -c option, the coordiates of the trimmed sequences are
64 :     appended to the comment field of the output FASTA.
65 :    
66 :     =item -d log_dir
67 :    
68 :     Directory name for trimming log files. Without the -d option, log
69 :     files are not saved.
70 :    
71 :     =item -e log_prefix
72 :    
73 :     Prefix for log file names. Random digits are appended to the file
74 :     names so that existing files will not be clobbered.
75 :    
76 :     =item -f fract_cov (D = 0.75)
77 :    
78 :     Fraction of sequences to be covered in initial trimming. Use 0.5 for
79 :     trimming to medien ends.
80 :    
81 :     =item -g
82 :    
83 :     Without the -g option, more than one rounds of psiblast search may be
84 :     attempted to incorporate seqs with multiple hsps.
85 :    
86 :     =item -l
87 :    
88 :     Run trimming and psiblast locally.
89 :    
90 :     =item -m
91 :    
92 :     Trim to median ends (or a specified coverage fraction in the -f option) only.
93 :    
94 :     =item -r
95 :    
96 :     Use represetative sequences to reduce data size and over-represented sequences.
97 :    
98 :     =item -s max_reps_sim (D = 0.9)
99 :    
100 :     The similarity threshold used to collapse seqs into representatives.
101 :    
102 : fangfang 1.5 =item -t fract_ends (D = 0.1)
103 :    
104 :     The minimum fraction of ends falling in the same window of uncovered
105 :     amino acids that are considered significant for determining the
106 :     trimming cutoff. A smaller fraction value indicates more aggressive
107 :     trimming.
108 :    
109 :     =item -w window_size (D = 10)
110 :    
111 :     The size of the initial sliding window used to count instances of
112 :     sequences whose ends have similar number of uncovered amino acids. If
113 :     no cutoff value is found, additional rounds of calculation are carried
114 :     out with increasing window sizes. The effect of starting window size
115 :     on trimming is uncertain. A narrower starting window size usually
116 :     indicates less aggressive trimming, but it may have the opposite
117 :     effect when fract_ends is very small.
118 :    
119 : fangfang 1.3 =back
120 :    
121 :     =head2 Input
122 :    
123 :     The input set of aligned sequences is read from STDIN.
124 :    
125 :     =head2 Output
126 :    
127 :     The set of trimmed sequences is written to STDOUT.
128 : fangfang 1.1
129 :     =cut
130 :    
131 : fangfang 1.3 use AlignTree;
132 :     use ATserver;
133 : fangfang 1.1 use SeedUtils;
134 : fangfang 1.3
135 : fangfang 1.1 use gjoalignment;
136 :     use gjoseqlib;
137 :    
138 : fangfang 1.3 my $usage = <<"End_of_Usage";
139 : fangfang 1.1
140 : fangfang 1.3 usage: svr_trim_ali [options] < ali.fa > trim.fa
141 : fangfang 1.1
142 : fangfang 1.5 -a align_tool - alignment tool to use: Clustal (D), MAFFT, Muscle
143 :     -c - append trimming coordinates to description fields in FASTA
144 :     -d log_dir - direcotry for log files
145 :     -e log_prefix - prefix for log file names
146 :     -f fract_cov - fraction of sequences to be covered in initial trimming (D = 0.75)
147 :     -g - attempt no more than a single round of psiblast
148 :     -l - run trimming locally
149 :     -m - trim to median ends only
150 :     -r - first collapse seqs into representatives
151 :     -s max_reps_sim - threshold used to collapse seqs into representatives (D = 0.9)
152 :     -t fract_ends - minimum fraction of ends to be considered significant for uncov cutoff (D = 0.1)
153 :     -w win_size - size of sliding window used in calculating uncov cutoff (D = 10)
154 : fangfang 1.3
155 :     End_of_Usage
156 :    
157 : fangfang 1.5 my ($help, $local, $tool, $coord, $dir, $prefix, $fc, $fe, $single, $median, $reps, $sim, $win);
158 : fangfang 1.3
159 : fangfang 1.4 GetOptions("h|help" => \$help,
160 :     "l|local" => \$local,
161 :     "a|tool=s" => \$tool,
162 :     "c" => \$coord,
163 :     "d=s" => \$dir,
164 :     "e=s" => \$prefix,
165 : fangfang 1.5 "f|fc=f" => \$fc,
166 : fangfang 1.4 "g" => \$single,
167 :     "m" => \$median,
168 :     "r" => \$reps,
169 : fangfang 1.5 "s=f" => \$sim,
170 :     "t|fe=f" => \$fe,
171 :     "w=i" => \$win);
172 : fangfang 1.3
173 :     $help and die $usage;
174 :    
175 :     my $opts;
176 :    
177 : fangfang 1.4 $opts->{keep_def} = 1 if !$coord;
178 :     $opts->{single_round} = 1 if $single;
179 :     $opts->{skip_psiblast} = 1 if $median;
180 :     $opts->{use_reps} = 1 if $reps;
181 : fangfang 1.5 $opts->{fract_cov} = $fc if $fc;
182 :     $opts->{fract_ends} = $fe if $fe;
183 : fangfang 1.4 $opts->{log_dir} = $dir if $dir;
184 :     $opts->{log_prefix} = $prefix if $prefix;
185 :     $opts->{max_reps_sim} = $sim if $sim;
186 : fangfang 1.5 $opts->{win_size} = $win if $win;
187 : fangfang 1.4 $opts->{align_opts} = { tool => $tool } if $tool;
188 :    
189 :     $opts->{align_opts}->{auto} = 1 if $tool =~ /mafft/i;
190 : fangfang 1.3
191 :     $opts->{ali} = gjoseqlib::read_fasta();
192 :    
193 :     my $AT;
194 :     my $trim;
195 :    
196 :     if ($local) {
197 :     $trim = AlignTree::trim_alignment($opts);
198 :     } else {
199 :     $AT = ATserver->new();
200 :     $trim = $AT->trim_ali($opts)->{rv};
201 :     }
202 : fangfang 1.1
203 : fangfang 1.3 gjoseqlib::print_alignment_as_fasta($trim);
204 : fangfang 1.1

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3