[Bio] / FigKernelScripts / svr_representative_sequences.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/svr_representative_sequences.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : overbeek 1.1 #! /usr/bin/perl
2 :    
3 :     #
4 :     # This is a SAS Component
5 :     #
6 :    
7 :     =head1 svr_representative_sequences [ opts ] [ rep_seqs_0 ] < new_seqs > rep_seqs
8 :    
9 :     usage: representative_sequences [ opts ] [ rep_seqs_0 ] < new_seqs > rep_seqs
10 :    
11 :     -b - order input sequences by size (long to short)
12 :     -c cluster_type - behavior of clustering algorithm (0 or 1, D=1)
13 :     -d seq_clust_dir - directory for files of clustered sequencees
14 :     -f id_clust_file - file with one line per cluster, listing its ids
15 :     -g keep_gid_list - list of genome IDs to keep
16 :     -i keep_id_list - list of sequence IDs to keep
17 :     -l log_file - real-time record of clustering, one line per seq
18 :     -m measure_of_sim - measure of similarity to use:
19 :     identity_fraction (default),
20 :     positive_fraction (proteins only), or
21 :     score_per_position (0-2 bits)
22 :     -s similarity - similarity required to be clustered (D = 0.8)
23 :    
24 :     Sequences are clustered, with one representative sequence reported for
25 :     each cluster. rep_seqs_0 is an optional file of sequences to be assigned
26 :     to unique clusters, regardless of their similarities. Each new sequence
27 :     is added to the cluster with the most similar representative sequence, or,
28 :     if its similarity to any existing representative is less than 'similarity',
29 :     it becomes the representative of a new cluster. With the -d option,
30 :     each cluster of sequences is written to a distinct file in the specified
31 :     directory. With the -f option, for each cluster, a tab-separated list
32 :     of ids is written to the specified file. With the -l option, the id of
33 :     each sequence analyzed is written to the log file, followed by the id of
34 :     the sequence that represents it (when appropriate).
35 :    
36 :     cluster_type 0 is the original method, which has only the representative
37 :     for each group in the blast database. This can randomly segregate
38 :     distant members of groups, regardless of the placement of other very
39 :     similar sequences.
40 :    
41 :     cluster_type 1 adds more diverse representatives of a group in the blast
42 :     database. This is slightly more expensive, but is much less likely to
43 :     split close relatives into different groups.
44 :    
45 :     =head2 Command-Line options
46 :    
47 :     =over 4
48 :    
49 :     =item -b
50 :    
51 :     order input sequences by size (long to short)
52 :    
53 :     =item -c cluster_type
54 :    
55 :     behavior of clustering algorithm (0 or 1, D=1)
56 :    
57 :     cluster_type 0 is the original method, which has only the representative
58 :     for each group in the blast database. This can randomly segregate
59 :     distant members of groups, regardless of the placement of other very
60 :     similar sequences.
61 :    
62 :     cluster_type 1 adds more diverse representatives of a group in the blast
63 :     database. This is slightly more expensive, but is much less likely to
64 :     split close relatives into different groups.
65 :    
66 :     =item -d seq_clust_dir - directory for files of clustered sequencees
67 :    
68 :     With the -d option, each cluster of sequences is written to a
69 :     distinct file in the specified directory.
70 :    
71 :     =item -f id_clust_file - file with one line per cluster, listing its ids
72 :    
73 :     With the -f option, for each cluster, a tab-separated list
74 :     of ids is written to the specified file.
75 :    
76 :     =item -g keep_gid_list - list of genome IDs to keep (i.e., keep all ids from these genomes)
77 :    
78 :     The file specified contains lines beginning with a genome ID. Any IDs for these genomes are
79 :     always kept.
80 :    
81 :     =item -i keep_id_list - list of sequence IDs to keep
82 :    
83 :     The specified file contains lines, each of which is a list of comma-separated FIG IDs.
84 :     Sequences with these IDs are always kept.
85 :    
86 :     =item -l log_file - real-time record of clustering, one line per seq
87 :    
88 :     This is used to see the details of the clustering process. We doubt that most users
89 :     should find it necessary.
90 :    
91 :     =item -m measure_of_sim - measure of similarity to use:
92 :    
93 :     Sequences are removed if there similarity to a "kept" sequence exceeds a specified
94 :     threshold (see -similarity below)
95 :    
96 :     The possible measures of similarity that you can specify are as follows:
97 :    
98 :     identity_fraction (default),
99 :     positive_fraction (proteins only), or
100 :     score_per_position (0-2 bits)
101 :    
102 :     =item -s similarity - similarity required to be clustered (D = 0.8)
103 :    
104 :     The similarity threshhold used to determine when sequences are deleted (but
105 :     represented by a kept sequence).
106 :    
107 :     =back
108 :    
109 :     =head2 Command-Line Arguments
110 :    
111 :     You have the option of reading all of the sequences from STDIN, but you can also
112 :     specify a set of files as arguments on the command line. All of these files (plus
113 :     STDIN) are sources for the input sequences.
114 :    
115 :     =head2 Output
116 :    
117 :     The set of retained sequences is written to STDOUT. Which sequences are represented
118 :     by each retained sequence can be determined by the output indicated in -f (a file
119 :     of grouped sequences, one group per line) or in -d (a directory in which each file
120 :     represents a single group).
121 :    
122 :     =cut
123 :    
124 :    
125 :     use gjoseqlib;
126 :     use representative_sequences;
127 :     use strict;
128 :    
129 :     my $usage = <<"End_of_Usage";
130 :    
131 :     usage: representative_sequences [ opts ] [ rep_seqs_0 ] < new_seqs > rep_seqs
132 :    
133 :     -b - order input sequences by size (long to short)
134 :     -c cluster_type - behavior of clustering algorithm (0 or 1, D=1)
135 :     -d seq_clust_dir - directory for files of clustered sequencees
136 :     -f id_clust_file - file with one line per cluster, listing its ids
137 :     -g keep_gid_list - list of genome IDs to keep
138 :     -i keep_id_list - list of sequence IDs to keep
139 :     -l log_file - real-time record of clustering, one line per seq
140 :     -m measure_of_sim - measure of similarity to use:
141 :     identity_fraction (default),
142 :     positive_fraction (proteins only), or
143 :     score_per_position (0-2 bits)
144 :     -s similarity - similarity required to be clustered (D = 0.8)
145 :    
146 :     Sequences are clustered, with one representative sequence reported for
147 :     each cluster. rep_seqs_0 is an optional file of sequences to be assigned
148 :     to unique clusters, regardless of their similarities. Each new sequence
149 :     is added to the cluster with the most similar representative sequence, or,
150 :     if its similarity to any existing representative is less than 'similarity',
151 :     it becomes the representative of a new cluster. With the -d option,
152 :     each cluster of sequences is written to a distinct file in the specified
153 :     directory. With the -f option, for each cluster, a tab-separated list
154 :     of ids is written to the specified file. With the -l option, the id of
155 :     each sequence analyzed is written to the log file, followed by the id of
156 :     the sequence that represents it (when appropriate).
157 :    
158 :     cluster_type 0 is the original method, which has only the representative
159 :     for each group in the blast database. This can randomly segregate
160 :     distant members of groups, regardless of the placement of other very
161 :     similar sequences.
162 :    
163 :     cluster_type 1 adds more diverse representatives of a group in the blast
164 :     database. This is slightly more expensive, but is much less likely to
165 :     split close relatives into different groups.
166 :    
167 :     End_of_Usage
168 :    
169 :     my $by_size = undef;
170 :     my $cluster_type = 1;
171 :     my $seq_clust_dir = undef;
172 :     my $id_clust_file = undef;
173 :     my $log = undef;
174 :     my $threshold = 0.80;
175 :     my $measure = 'identity_fraction';
176 :     my $keep_id_file = undef;
177 :     my $keep_gid_file = undef;
178 :    
179 :     while ( $ARGV[0] =~ /^-/ )
180 :     {
181 :     $_ = shift @ARGV;
182 :     if ($_ =~ s/^-b//) { $by_size = 1 }
183 :     elsif ($_ =~ s/^-c//) { $cluster_type = ($_ || shift @ARGV) }
184 :     elsif ($_ =~ s/^-d//) { $seq_clust_dir = ($_ || shift @ARGV) }
185 :     elsif ($_ =~ s/^-f//) { $id_clust_file = ($_ || shift @ARGV) }
186 :     elsif ($_ =~ s/^-g//) { $keep_gid_file = ($_ || shift @ARGV) }
187 :     elsif ($_ =~ s/^-i//) { $keep_id_file = ($_ || shift @ARGV) }
188 :     elsif ($_ =~ s/^-l//) { $log = ($_ || shift @ARGV) }
189 :     elsif ($_ =~ s/^-m//) { $measure = ($_ || shift @ARGV) }
190 :     elsif ($_ =~ s/^-s//) { $threshold = ($_ || shift @ARGV) }
191 :     else { print STDERR "Bad flag: '$_'\n$usage"; exit 1 }
192 :     }
193 :    
194 :     # Is there a starting set of representative sequences?
195 :    
196 :     my $repF = undef;
197 :     my @reps = ();
198 :    
199 :     if ( @ARGV )
200 :     {
201 :     ( $repF = shift @ARGV )
202 :     && ( -f $repF )
203 :     && ( @reps = &gjoseqlib::read_fasta( $repF ) )
204 :     && ( @reps )
205 :     or print STDERR "Bad representative sequences starting file: $repF\n"
206 :     and print STDERR $usage
207 :     and exit 1;
208 :     }
209 :    
210 :     if ( $log )
211 :     {
212 :     open LOG, ">$log"
213 :     or print STDERR "Unable to open log file '$log'\n$usage"
214 :     and exit 1;
215 :     }
216 :    
217 :     my @seqs = &gjoseqlib::read_fasta( \*STDIN );
218 :     @seqs or print STDERR "Failed to read sequences from stdin\n$usage"
219 :     and exit 1;
220 :    
221 :     my %options = ( max_sim => $threshold,
222 :     sim_meas => $measure
223 :     );
224 :    
225 :     $options{ by_size } = 1 if $by_size;
226 :     $options{ logfile } = \*LOG if $log;
227 :    
228 :    
229 :     my @keep_ids;
230 :     my @keep_gids;
231 :    
232 :     if (defined $keep_gid_file)
233 :     {
234 :     open F, "<$keep_gid_file"
235 :     or die "Unable to open $keep_gid_file";
236 :     while (<F>) {
237 :     chomp;
238 :     push @keep_gids, map { /(\d+\.\d+)/ ? "fig\|$1" : () } split /[\s,]+/, $_;
239 :     }
240 :     close F;
241 :     }
242 :    
243 :     if (defined $keep_id_file)
244 :     {
245 :     open F, "<$keep_id_file"
246 :     or die "Unable to open $keep_id_file";
247 :     while (<F>) {
248 :     chomp;
249 :     push @keep_ids, split /[\s,]+/, $_;
250 :     }
251 :     close F;
252 :     }
253 :    
254 :     $options{ keep_gid } = \@keep_gids if @keep_gids;
255 :     $options{ keep_id } = \@keep_ids if @keep_ids;
256 :    
257 :     my ( $rep, $reping );
258 :    
259 :     if ( $cluster_type == 1 )
260 :     {
261 :     ( $rep, $reping ) = &representative_sequences::rep_seq( ( @reps ? \@reps : () ),
262 :     \@seqs,
263 :     \%options
264 :     );
265 :     }
266 :     else
267 :     {
268 :     ( $rep, $reping ) = &representative_sequences::rep_seq_2( ( @reps ? \@reps : () ),
269 :     \@seqs,
270 :     \%options
271 :     );
272 :     }
273 :    
274 :     close( LOG ) if $log;
275 :    
276 :     &gjoseqlib::print_alignment_as_fasta( $rep );
277 :    
278 :     if ( $id_clust_file )
279 :     {
280 :     open FILE, ">$id_clust_file"
281 :     or print STDERR "Could not open id_clust_file '$id_clust_file'\n$usage"
282 :     and exit 1;
283 :     foreach ( map { $_->[0] } @$rep )
284 :     {
285 :     print FILE join( "\t", $_, @{ $reping->{$_} } ), "\n";
286 :     }
287 :     close FILE;
288 :     }
289 :    
290 :     if ( $seq_clust_dir )
291 :     {
292 :     mkdir $seq_clust_dir if ! -d $seq_clust_dir;
293 :     -d $seq_clust_dir
294 :     or print STDERR "Could not make seq_clust_dir '$seq_clust_dir'\n$usage"
295 :     and exit 1;
296 :     chdir $seq_clust_dir;
297 :    
298 :     my %index = map { $_->[0] => $_ } @reps, @seqs;
299 :    
300 :     my $file = 'group00000';
301 :     foreach ( map { $_->[0] } @$rep )
302 :     {
303 :     my $cluster = [ map { $index{$_} } $_, @{ $reping->{$_} } ];
304 :     &gjoseqlib::print_alignment_as_fasta( ++$file, $cluster );
305 :     }
306 :     }
307 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3