[Bio] / FigKernelPackages / representative_sequences.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/representative_sequences.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.12 - (view) (download) (as text)

1 : golsen 1.1 package representative_sequences;
2 :    
3 : overbeek 1.10 # This is a SAS component
4 :     #
5 : golsen 1.1 use strict;
6 :     use gjoparseblast;
7 :    
8 : golsen 1.5 eval { use Data::Dumper }; # Not in all installations
9 :    
10 :     # These variables are used as globals below. They have been made SEED-aware,
11 :     # but not SEED-dependent. formatdb is run as a system command, and might
12 : golsen 1.1 # be encapsulated in a run() construct for the SEED. blastall is in an
13 :     # input pipe.
14 :    
15 : fangfang 1.12 my $ext_bin = $FIG_Config::ext_bin;
16 :    
17 :     my $formatdb = $ext_bin && -x "$ext_bin/formatdb" ? "$ext_bin/formatdb"
18 :     : "formatdb";
19 :     my $blastall = $ext_bin && -x "$ext_bin/blastall" ? "$ext_bin/blastall"
20 :     : "blastall";
21 : golsen 1.1
22 :     require Exporter;
23 :    
24 :     our @ISA = qw(Exporter);
25 :     our @EXPORT = qw( representative_sequences
26 :     rep_seq_2
27 : golsen 1.5 rep_seq
28 : golsen 1.1 );
29 :    
30 :     #===============================================================================
31 : golsen 1.8 # Build or add to a set of representative sequences (if you do not want an
32 : golsen 1.5 # enrichment of sequences around a focus sequence (called the reference), this
33 :     # is probably the subroutine that you want).
34 : golsen 1.1 #
35 : golsen 1.5 # \@reps = rep_seq( \@reps, \@new, \%options );
36 :     # \@reps = rep_seq( \@new, \%options );
37 : golsen 1.1 #
38 :     # or
39 :     #
40 : golsen 1.5 # ( \@reps, \%representing ) = rep_seq( \@reps, \@new, \%options );
41 :     # ( \@reps, \%representing ) = rep_seq( \@new, \%options );
42 : golsen 1.1 #
43 : golsen 1.5 # or
44 : golsen 1.2 #
45 :     # \@reps = rep_seq_2( \@reps, \@new, \%options );
46 :     # \@reps = rep_seq_2( \@new, \%options );
47 :     #
48 :     # or
49 :     #
50 :     # ( \@reps, \%representing ) = rep_seq_2( \@reps, \@new, \%options );
51 :     # ( \@reps, \%representing ) = rep_seq_2( \@new, \%options );
52 : golsen 1.1 #
53 : golsen 1.5 # Construct a representative set of related sequences:
54 :     #
55 :     # \@repseqs = representative_sequences( $ref, \@seqs, $max_sim, \%options );
56 :     #
57 :     # or
58 :     #
59 :     # ( \@repseqs, \%representing, \@low_sim ) = representative_sequences( $ref,
60 :     # \@seqs, $max_sim, \%options );
61 :     #
62 : golsen 1.1 # Output:
63 :     #
64 :     # \@repseqs Reference to the list of retained (representative subset)
65 :     # of sequence entries. Sequence entries have the form
66 :     # [ $id, $def, $seq ]
67 :     #
68 :     # \%representing
69 :     # Reference to a hash in which the keys are the ids of the
70 :     # representative sequences, for which the corresponding value
71 :     # is a list of ids of other sequences that are represented by
72 :     # that representive.
73 :     #
74 :     #
75 :     # Arguments (only \@seqs is required):
76 :     #
77 :     # $ref A reference sequence as [ $id, $def, $seq ]. If present, the
78 :     # reference sequence defines a focal point for the analysis. A
79 :     # representative sequence from each lineage in its vicinity will
80 :     # be retained, even though they are more similar than max_sim to
81 :     # the reference, or to each other. The reference will always be
82 :     # included in the representative set. A limit is put on the
83 :     # similarity of lineages retained by the reference sequence with
84 :     # the max_ref_sim option (default = 0.99). The reference sequence
85 : golsen 1.2 # should not be repeated in the set of other sequences. (Only
86 :     # applies to representative_sequences; there is no equivalent for
87 :     # rep_seq_2.)
88 :     #
89 :     # \@reps In rep_seq_2, these sequences will each be placed in their own
90 :     # cluster, regardless of their similarity to one another. Each
91 :     # remaining sequence is added to the cluster to which it is
92 :     # most similar, unless it is less simililar than max_sim, in
93 :     # which case it represents a new cluster.
94 : golsen 1.1 #
95 :     # \@seqs Set of sequences to be pruned. If there is no reference
96 :     # sequence, the fist sequence in this list will be the starting
97 :     # point for the analysis and will be retained, but all sequences
98 :     # more similar than max_sim to it will be removed (in contrast to
99 :     # a reference sequence, which retains a representative of each
100 :     # lineage in its vicinity). Sequences that fail the E-value test
101 :     # relative to the reference (or the fist sequence if there is no
102 :     # reference) are dropped.
103 :     #
104 : golsen 1.2 # $max_sim (representative_sequences only; an option for rep_seq_2)
105 :     # Sequences with a higher similarity than max_sim to an existing
106 :     # representative sequence will not be included in the @reps
107 :     # output. Their ids are associated with the identifier of the
108 :     # sequence representing them in \%representing. The details of
109 :     # the behaviour are modified by other options. (default = 0.80)
110 : golsen 1.1 #
111 : golsen 1.8 # \%options Key => Value pairs that modify the behaviour:
112 :     #
113 :     # by_size (rep_seq and rep_seq_2 only)
114 :     # By default, sequences are analyzed in input order. This
115 :     # option set to true will sort from longest to shortest.
116 : golsen 1.1 #
117 :     # logfile Filehandle for a logfile of the progress. As each
118 : golsen 1.2 # sequence is analyzed, its disposition in recorded.
119 :     # In representative_sequences(), the id of each new
120 :     # representative is followed by a tab separated list of the
121 :     # ids that it represents. In rep_seq_2(), as each sequence
122 :     # is analyzed, it is recorded, followed by the id of the
123 :     # sequence representing it, if it is not the first member
124 :     # of a new cluster. Autoflush is set for the logfile.
125 : golsen 1.1 # If the value supplied is not a reference to a GLOB, then
126 :     # the log is sent to STDOUT (which is probably not what you
127 :     # want in most cases). The behavior is intended to aid in
128 :     # following prgress, and in recovery of interupted runs.
129 :     #
130 : golsen 1.2 # max_ref_sim (representative_sequences only)
131 : golsen 1.1 # Maximum similarity of any sequence to the reference. If
132 :     # max_ref_sim is less than max_sim, it is silently reset to
133 :     # max_sim. (default = 0.99, because 1.0 can be annoying)
134 :     #
135 : golsen 1.2 # max_e_val Maximum E-value for blastall. Probably moot, but will help
136 :     # with performance. (default = 0.01)
137 :     #
138 :     # max_sim Sequences with a higher similarity than max_sim to a
139 :     # retained sequence will be deleted. The details of the
140 :     # behaviour is modified by other options. (default = 0.80)
141 :     # (a parameter for representative_sequences, but an option
142 :     # for rep_seq_2).
143 : golsen 1.1 #
144 : golsen 1.5 # save_tmp Do not delete temporary files upon completion (for debug)
145 :     #
146 : golsen 1.1 # sim_meas Measure similarity for inclusion or exclusion by
147 :     # 'identity_fraction' (default), 'positive_fraction', or
148 :     # 'score_per_position'
149 :     #
150 : golsen 1.2 # save_exp (representative_sequences only)
151 :     # When there is a reference sequence, lineages more similar
152 : golsen 1.1 # than max_sim will be retained near the reference. The
153 :     # default goal is to save one member of each lineage. If
154 :     # the initial representative of the lineage is seq1, we
155 :     # pose the question, "Are there sufficiently deep divisions
156 :     # within the lineage to seq1 that it they might be viewed
157 :     # as independent? That is, might there be another sequence,
158 :     # seq2 that so different from seq1 that we might want to see
159 :     # it also?
160 :     #
161 :     # +---------------------- ref
162 :     # |
163 :     # ---+ +-------------------- seq1
164 :     # +-+
165 :     # +-------------------- seq2
166 :     #
167 :     # Without any special treatment, if the similarity of seq1
168 :     # to ref ( S(seq1,ref) ) is greater than max_sim, seq1 would
169 :     # be the sole representative of thelineage containing both
170 :     # seq1 and seq2, because the similarity of seq1 to seq2
171 :     # ( S(seq1,seq2) ) is greater than S(seq1,ref). This can
172 :     # be altered by the value of save_exp. In terms of
173 :     # similarity, seq2 will be discarded if:
174 :     #
175 :     # S(seq1,seq2) > S(seq1,ref) ** save_exp, and
176 :     # S(seq1,seq2) > S(seq2,ref) ** save_exp
177 :     #
178 :     # The default behavior described above occurs when save_exp
179 :     # is 1. If save_exp < 1, then greater similarities between
180 :     # seq1 and seq2 are allowed. Reasonable values of save_exp
181 :     # are roughly 0.7 to 1.0. (At save_exp = 0, any similarity
182 :     # would be allowed; yuck.)
183 :     #
184 : golsen 1.2 # stable (representative_sequences only; always true for rep_seq_2)
185 :     # If true (not undef, '', or 0), then the representatives
186 : golsen 1.1 # will be chosen from as early in the list as possible (this
187 : golsen 1.2 # facilitates augmentation of an existing list).
188 : golsen 1.1 #
189 :     #-------------------------------------------------------------------------------
190 :     #
191 : golsen 1.2 # Diagram of the pruning behavior of representative_sequences():
192 : golsen 1.1 #
193 :     # 0.5 0.6 0.7 0.8 0.9 1.0 Similarity
194 :     # |---------|---------|---------|---------|---------|
195 :     # .
196 :     # . + A
197 :     # . +---+
198 :     # . | + B
199 :     # . +---+
200 :     # . | +---- C
201 :     # . +----------+
202 :     # . | +-------- D
203 :     # . |
204 :     # +-----------+ +----------------- E
205 :     # | . +-+
206 :     # | . +----------------- F
207 :     # +----------------+ .
208 :     # | | . +--------------------------- G
209 :     # | +---+
210 :     # | . | +--------------------- H
211 :     # --+ . +-----+
212 :     # | . +--------------------- I
213 :     # | .
214 :     # | +------------------------------- J
215 :     # +----------------+ .
216 :     # | . +--------------------------- K
217 :     # +---+
218 :     # . +--------------------------- L
219 :     # .
220 :     # |---------|---------|---------|---------|---------|
221 :     # 0.5 0.6 0.7 0.8 0.9 1.0 Similarity
222 :     #
223 :     # In the above tree and max_sim = 0.70 and max_ref_sim = 0.99:
224 :     #
225 :     # With no reference sequence, and A first in the list, the representative
226 :     # sequences will be A, G, J and K.
227 :     #
228 :     # With A as the reference sequence and save_exp left at its default, the
229 :     # representative sequences will be A, C, D, E, G, J and K. B is excluded
230 :     # because it is more similar than max_ref_sim to A.
231 :     #
232 :     # With A as the reference sequence and save_exp = 0.8, the representative
233 :     # sequences will be A, C, D, E, F (comparably similar to A and E), G,
234 :     # H (comparably similar to A and G), J and K. The sequence L will be
235 :     # represented by K because L is much closer to K than to A.
236 :     #
237 :     # This oversimplifies the choice of representative of a cluster of related
238 :     # sequences. For example, whether G, H or I would represent the group of
239 :     # three actually depends on relative clock speed (slower is better) and
240 :     # sequence coverage (more complete is better). The actual order is by BLAST
241 :     # bit score (possibly combining independent segments).
242 :     #
243 :     # In addition, this discussion is in terms of a tree, but the calculations
244 :     # are based on a (partial) set of pairwise sequence similarities. Thus, the
245 :     # precise behavior is hard to predict, but should be similar to that described
246 :     # above.
247 :     #
248 :     #-------------------------------------------------------------------------------
249 :     #
250 :     # To construct a representative set of sequences relative to a reference
251 :     # sequence:
252 :     #
253 :     # 1. Prioritize sequences for keeping, from highest to lowest scoring
254 :     # relative to reference, as measured by blast score (bits).
255 :     # When stable is set, priority is from first to last in input file
256 :     # (a reference sequence should not be supplied).
257 :     #
258 :     # 2. Based on the similarity of each sequence to the reference and save_exp,
259 :     # compute sequence-specific values of max_sim:
260 :     #
261 :     # max_sim( seq_i ) = exp( save_exp * ln( seq_i_ref_sim ) )
262 :     #
263 :     # 3. Examine the next prioritized sequence (seq1).
264 :     #
265 :     # 4. If seq1 has been vetoed, go to 7.
266 :     #
267 :     # 5. Mark seq1 to keep.
268 :     #
269 :     # 6. Use blast to find similarities of seq1 to other sequences.
270 :     #
271 :     # 7. For each similar sequence (seq2):
272 :     #
273 :     # 7a. Skip if seq2 is marked to keep, or marked for veto
274 :     #
275 :     # 7b. Compute the maximum simiarity of seq1 and seq2 for retaining seq2:
276 :     #
277 :     # max_sim_1_2 = max( max_sim, max_sim( seq1 ), max_sim( seq2 ) )
278 :     #
279 :     # 7c. If similarity of seq1 and seq2 > max_sim, veto seq2
280 :     #
281 :     # 7d. Next seq2
282 :     #
283 :     # 8. If there are more sequences to examine, go to 3.
284 :     #
285 :     # 9. Collect the sequences marked for keeping.
286 :     #
287 :     #===============================================================================
288 :    
289 : golsen 1.5 #===============================================================================
290 :     # Build or add to a set of representative sequences. The difference of
291 :     # rep_seq_2 and rep_seq is that rep_seq can have multiple representatives
292 :     # in the blast database for a given group. This helps prevent fragmentation
293 :     # of clusters.
294 :     #
295 :     # \@reps = rep_seq( \@reps, \@new, \%options );
296 :     # \@reps = rep_seq( \@new, \%options );
297 :     #
298 :     # or
299 :     #
300 :     # ( \@reps, \%representing ) = rep_seq( \@reps, \@new, \%options );
301 :     # ( \@reps, \%representing ) = rep_seq( \@new, \%options );
302 :     #
303 :     #===============================================================================
304 :    
305 :     sub rep_seq
306 :     {
307 :     ref $_[0] eq 'ARRAY'
308 :     or print STDERR "First parameter of rep_seq must be an ARRAY reference\n"
309 :     and return undef;
310 :    
311 :     my ( $reps, $seqs, $options ) = ( [], undef, {} );
312 : golsen 1.1
313 : golsen 1.5 if ( @_ == 1 )
314 : golsen 1.1 {
315 : golsen 1.5 $seqs = shift;
316 : golsen 1.1 }
317 : golsen 1.5
318 :     elsif ( @_ == 2 )
319 : golsen 1.1 {
320 : golsen 1.5 if ( ref $_[1] eq 'ARRAY' ) { ( $reps, $seqs ) = @_ }
321 :     elsif ( ref $_[1] eq 'HASH' ) { ( $seqs, $options ) = @_ }
322 :     else
323 :     {
324 :     print STDERR "Second parameter of rep_seq must be an ARRAY or HASH reference\n";
325 :     return undef;
326 :     }
327 : golsen 1.1 }
328 :    
329 : golsen 1.5 elsif ( @_ == 3 )
330 :     {
331 :     if ( ref $_[1] ne 'ARRAY' )
332 :     {
333 :     print STDERR "Second parameter of 3 parameter rep_seq must be an ARRAY reference\n";
334 :     return undef;
335 :     }
336 :     if ( ref $_[2] ne 'HASH' )
337 :     {
338 :     print STDERR "Third parameter of 3 parameter rep_seq must be a HASH reference\n";
339 :     return undef;
340 :     }
341 : golsen 1.1
342 : golsen 1.5 ( $reps, $seqs, $options ) = @_;
343 :     }
344 :     else
345 : golsen 1.1 {
346 : golsen 1.5 print STDERR "rep_seq called with @{[scalar @_]} parameters\n";
347 :     return undef;
348 : golsen 1.1 }
349 :    
350 :     # ---------------------------------------# Default values for options
351 :    
352 : golsen 1.8 my $by_size = undef; # Analyze sequences in order provided
353 : golsen 1.5 my $max_sim = 0.80; # Retain 80% identity of less
354 :     my $logfile = undef; # Log file of sequences processed
355 :     my $max_e_val = 0.01; # Blast E-value to decrease output
356 :     my $sim_meas = 'identity_fraction'; # Use sequence identity as measure
357 : golsen 1.1
358 :     # Two questionable decisions:
359 :     # 1. Be painfully flexible on option names.
360 :     # 2. Silently fix bad parameter values.
361 :    
362 :     foreach ( keys %$options )
363 :     {
364 : golsen 1.5 my $value = $options->{ $_ };
365 :     if ( m/^log/i ) # logfile
366 :     {
367 :     next if ! $value;
368 :     $logfile = ( ref $value eq "GLOB" ? $value : \*STDOUT );
369 :     select( ( select( $logfile ), $| = 1 )[0] ); # autoflush on
370 :     }
371 :     elsif ( m/max/i && m/sim/i ) # max(imum)_sim(ilarity)
372 :     {
373 :     $value += 0;
374 :     $value = 0 if $value < 0;
375 :     $value = 1 if $value > 1;
376 :     $max_sim = $value;
377 :     }
378 :     elsif ( m/max/i || m/[ep]_?val/i ) # Other "max" tests must come first
379 :     {
380 :     $value += 0;
381 :     $value = 0 if $value < 0;
382 :     $max_e_val = $value;
383 :     }
384 :     elsif ( m/sim/i || m/meas/i ) # sim(ilarity)_meas(ure)
385 :     {
386 :     $sim_meas = standardize_similarity_measure( $value );
387 :     }
388 :     elsif ( m/save_?te?mp/i ) # group temporary files
389 :     {
390 :     $options->{ savetmp } = 1;
391 :     }
392 : golsen 1.8 elsif ( m/size/i ) # add longest to shortest
393 :     {
394 :     $by_size = 1;
395 :     }
396 : golsen 1.5 else
397 :     {
398 :     print STDERR "WARNING: rep_seq bad option ignored: '$_' => '$value'\n";
399 :     }
400 : golsen 1.1 }
401 :    
402 : golsen 1.5 # Check sequence ids for duplicates:
403 : golsen 1.1
404 : golsen 1.5 my $reps2 = [];
405 :     my $seen = {};
406 : golsen 1.1
407 : golsen 1.5 my $id;
408 :     foreach ( @$reps )
409 :     {
410 :     $id = $_->[0];
411 :     if ( $seen->{ $id }++ )
412 :     {
413 :     print STDERR "Duplicate sequence id '$id' skipped by rep_seq\n";
414 :     }
415 :     else
416 :     {
417 :     push @$reps2, $_;
418 :     }
419 :     }
420 : golsen 1.1
421 : golsen 1.5 my $seqs2 = [];
422 :     foreach ( @$seqs )
423 :     {
424 :     $id = $_->[0];
425 :     if ( $seen->{ $id }++ )
426 :     {
427 : golsen 1.6 print STDERR "Duplicate sequence id '$id' skipped by rep_seq\n";
428 : golsen 1.5 }
429 :     else
430 :     {
431 :     push @$seqs2, $_;
432 :     }
433 :     }
434 : golsen 1.1
435 : golsen 1.8 if ( $by_size )
436 :     {
437 :     @$reps2 = sort { length( $b->[2] ) <=> length( $a->[2] ) } @$reps2;
438 :     }
439 :    
440 : golsen 1.5 # If no preexisting representatives, then take first sequence:
441 : golsen 1.1
442 : golsen 1.5 @$reps2 or ( @$reps2 = shift @$seqs2 );
443 : golsen 1.1
444 : golsen 1.5 if ( $logfile ) { foreach ( @$reps2 ) { print $logfile "$_->[0]\n" } }
445 : golsen 1.1
446 : golsen 1.5 # Search each rep sequence against itself to get max_bpp
447 : golsen 1.1
448 : golsen 1.5 my ( $tmp_dir, $save_tmp ) = &tmp_dir( $options );
449 :     $tmp_dir or print STDERR "Unable to locate temporary file directory\n"
450 :     and return;
451 :     my $db = "$tmp_dir/tmp_blast_db_$$";
452 :     my $protein = are_protein( $reps2 );
453 :     my %max_bpp;
454 :     my $entry;
455 :     foreach $entry ( @$reps2 )
456 :     {
457 :     $max_bpp{ $entry->[0] } = self_bpp( $db, $entry, $protein );
458 :     }
459 : golsen 1.1
460 : golsen 1.5 my $naln = 10; # Alignments to make
461 :     my $self = 1; # Self match will never occur
462 :     my $prog = $protein ? 'blastp' : 'blastn';
463 :     my $blast_opt = "-e $max_e_val -v $naln -b $naln -F F -a 2";
464 :     $blast_opt .= " -r 1 -q -1" if ! $protein;
465 : golsen 1.1
466 : golsen 1.5 # List of who is represented by a sequence:
467 : golsen 1.1
468 : golsen 1.5 my %group = map { $_->[0] => [] } @$reps2;
469 : golsen 1.1
470 : golsen 1.5 # Groups can have more than one representative in the blast database:
471 : golsen 1.1
472 : golsen 1.5 my $rep4blast = [ @$reps2 ]; # initial reps
473 :     my %group_id = map { $_->[0] => $_->[0] } @$reps2; # represent self
474 : golsen 1.1
475 : golsen 1.5 # Search each sequence against the database.
476 : golsen 1.1
477 : golsen 1.5 my ( $seq, $bpp_max, $sid, $gid );
478 :     my $newdb = 1;
479 : golsen 1.1
480 : golsen 1.5 foreach $entry ( @$seqs2 )
481 :     {
482 :     # Is it time to rebuild a BLAST database?
483 : golsen 1.1
484 : golsen 1.5 if ( $newdb )
485 :     {
486 :     make_blast_db( $db, $rep4blast, $protein );
487 :     $newdb = 0;
488 :     }
489 : golsen 1.1
490 : golsen 1.5 # Do the blast analysis. Returned records are of the form:
491 :     #
492 :     # 0 1 2 3 4 5 6 7 8 9 10 11
493 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
494 :     #
495 :     # $tophit = [ $score, $blast_record, $surething ]
496 : golsen 1.1
497 : golsen 1.5 $id = $entry->[0];
498 :     $seq = $entry->[2];
499 :     my ( $tophit ) = sort { $b->[0] <=> $a->[0] }
500 :     map { in_group( $_, $max_sim, $sim_meas, $max_bpp{ $_->[3] } ) }
501 :     top_blast_per_subject( $prog, $db, $id, $seq, $self, $blast_opt );
502 : golsen 1.1
503 : golsen 1.5 # It matches an existing representative
504 : golsen 1.1
505 : golsen 1.5 if ( $tophit )
506 :     {
507 :     $sid = $tophit->[1]->[3]; # id of the best matching sequence
508 :     $gid = $group_id{ $sid }; # look up representative for group
509 :     push @{ $group{ $gid } }, $id; # add sequence to list in group
510 :     $group_id{ $id } = $gid; # record group for id
511 :     print $logfile "$id\t$gid\n" if $logfile;
512 :    
513 :     # Add sequence to blast database if it is not a 'surething'
514 :    
515 :     if ( ! $tophit->[2] )
516 :     {
517 :     push @$rep4blast, $entry;
518 :     $max_bpp{ $id } = self_bpp( $db, $entry, $protein ) if $sim_meas =~ /^sc/;
519 :     $newdb = 1;
520 :     }
521 :     }
522 : golsen 1.1
523 : golsen 1.5 # It is a new representative
524 : golsen 1.1
525 : golsen 1.5 else
526 :     {
527 :     push @$reps2, $entry;
528 :     push @$rep4blast, $entry;
529 :     $group{ $id } = [];
530 :     $group_id{ $id } = $id; # represent self
531 :     $max_bpp{ $id } = self_bpp( $db, $entry, $protein ) if $sim_meas =~ /^sc/;
532 :     $newdb = 1;
533 :     print $logfile "$id\n" if $logfile;
534 :     }
535 : golsen 1.1 }
536 :    
537 : golsen 1.5 if ( $save_tmp ) {} # This is not used, might be in future
538 :     elsif ( $protein ) { unlink $db, "$db.psq", "$db.pin", "$db.phr" }
539 :     else { unlink $db, "$db.nsq", "$db.nin", "$db.nhr" }
540 : golsen 1.1
541 : golsen 1.5 # Return the surviving sequence entries, and optionally the hash of
542 :     # ids represented by each survivor:
543 :    
544 :     wantarray ? ( $reps2, \%group ) : $reps2;
545 :     }
546 : golsen 1.1
547 :    
548 : golsen 1.5 #===============================================================================
549 :     # Caluculate sequence similarity according to the requested measure, and return
550 :     # empty list if lower than max_sim. Otherwise, return the hit and and
551 :     # whether the hit is really strong:
552 :     #
553 :     # [ $score, $hit, $surething ] = in_group( $hit, $max_sim, $measure, $bpp_max )
554 :     # () = in_group( $hit, $max_sim, $measure, $bpp_max )
555 :     #
556 :     # $hit is a structure with blast information:
557 :     #
558 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
559 :     #
560 :     # The surething is the similarity for which $max_sim is 4 standard deviations
561 :     # lower.
562 :     #===============================================================================
563 : golsen 1.1
564 : golsen 1.5 sub in_group
565 :     { my ( $hit, $max_sim, $measure, $bpp_max ) = @_;
566 : golsen 1.1
567 : golsen 1.5 my $n = $hit->[8]; # aligned positions
568 :     return () if ( $n <= 0 );
569 : golsen 1.1
570 : golsen 1.5 my $m; # matched positions
571 : golsen 1.1
572 : golsen 1.5 if ( $measure =~ /^sc/ ) { $m = $hit->[ 6] / ( $bpp_max || 2 ) } # score/pos
573 :     elsif ( $measure =~ /^po/ ) { $m = $hit->[10] } # positives
574 :     else { $m = $hit->[ 9] } # identities
575 : golsen 1.1
576 : golsen 1.5 return () if $m < ( $max_sim * $n );
577 : golsen 1.1
578 : golsen 1.5 my $u = ( $n > $m ) ? ( $n - $m ) : 0; # differing positions
579 :     my $stddev = sqrt( $m * $u / $n );
580 :     my $conf = 4; # standard deviations for "surething"
581 :     $max_sim = 0.01 if $max_sim < 0.01;
582 :     my $surething = ( $u + $conf * $stddev ) <= ( ( 1 - $max_sim ) * $n ) ? 1 : 0;
583 : golsen 1.1
584 : golsen 1.5 [ $m/$n, $hit, $surething ]
585 : golsen 1.1 }
586 :    
587 :    
588 :     #===============================================================================
589 :     # Build or add to a set of representative sequences.
590 :     #
591 :     # \@reps = rep_seq_2( \@reps, \@new, \%options );
592 :     # \@reps = rep_seq_2( \@new, \%options );
593 :     #
594 :     # or
595 :     #
596 :     # ( \@reps, \%representing ) = rep_seq_2( \@reps, \@new, \%options );
597 :     # ( \@reps, \%representing ) = rep_seq_2( \@new, \%options );
598 :     #
599 :     #===============================================================================
600 :    
601 :     sub rep_seq_2
602 :     {
603 :     ref $_[0] eq 'ARRAY'
604 :     or print STDERR "First parameter of rep_seq_2 must be an ARRAY reference\n"
605 :     and return undef;
606 :    
607 :     my ( $reps, $seqs, $options ) = ( [], undef, {} );
608 :    
609 :     if ( @_ == 1 )
610 :     {
611 :     $seqs = shift;
612 :     }
613 :    
614 :     elsif ( @_ == 2 )
615 :     {
616 :     if ( ref $_[1] eq 'ARRAY' ) { ( $reps, $seqs ) = @_ }
617 :     elsif ( ref $_[1] eq 'HASH' ) { ( $seqs, $options ) = @_ }
618 :     else
619 :     {
620 :     print STDERR "Second parameter of rep_seq_2 must be an ARRAY or HASH reference\n";
621 :     return undef;
622 :     }
623 :     }
624 :    
625 :     elsif ( @_ == 3 )
626 :     {
627 :     if ( ref $_[1] ne 'ARRAY' )
628 :     {
629 :     print STDERR "Second parameter of 3 parameter rep_seq_2 must be an ARRAY reference\n";
630 :     return undef;
631 :     }
632 :     if ( ref $_[2] ne 'HASH' )
633 :     {
634 :     print STDERR "Third parameter of 3 parameter rep_seq_2 must be a HASH reference\n";
635 :     return undef;
636 :     }
637 :    
638 :     ( $reps, $seqs, $options ) = @_;
639 :     }
640 :     else
641 :     {
642 :     print STDERR "rep_seq_2 called with @{[scalar @_]} parameters\n";
643 :     return undef;
644 :     }
645 :    
646 :     # ---------------------------------------# Default values for options
647 :    
648 : golsen 1.8 my $by_size = undef; # Analyze sequences in order provided
649 :     my $max_sim = 0.80; # Retain 80% identity of less
650 : golsen 1.1 my $logfile = undef; # Log file of sequences processed
651 :     my $max_e_val = 0.01; # Blast E-value to decrease output
652 :     my $sim_meas = 'identity_fraction'; # Use sequence identity as measure
653 :    
654 :     # Two questionable decisions:
655 :     # 1. Be painfully flexible on option names.
656 :     # 2. Silently fix bad parameter values.
657 :    
658 :     foreach ( keys %$options )
659 :     {
660 : golsen 1.5 my $value = $options->{ $_ };
661 :     if ( m/^log/i ) # logfile
662 :     {
663 :     next if ! $value;
664 :     $logfile = ( ref $value eq "GLOB" ? $value : \*STDOUT );
665 :     select( ( select( $logfile ), $| = 1 )[0] ); # autoflush on
666 :     }
667 :     elsif ( m/max/i && m/sim/i ) # max(imum)_sim(ilarity)
668 :     {
669 :     $value += 0;
670 :     $value = 0 if $value < 0;
671 :     $value = 1 if $value > 1;
672 :     $max_sim = $value;
673 :     }
674 :     elsif ( m/max/i || m/[ep]_?val/i ) # Other "max" tests must come first
675 :     {
676 :     $value += 0;
677 :     $value = 0 if $value < 0;
678 :     $max_e_val = $value;
679 :     }
680 :     elsif ( m/sim/i || m/meas/i ) # sim(ilarity)_meas(ure)
681 :     {
682 :     $sim_meas = standardize_similarity_measure( $value );
683 :     }
684 : golsen 1.8 elsif ( m/by_?size/i || m/size/i ) # add longest to shortest
685 :     {
686 :     $by_size = 1;
687 :     }
688 : golsen 1.5 else
689 :     {
690 :     print STDERR "WARNING: rep_seq_2 bad option ignored: '$_' => '$value'\n";
691 :     }
692 : golsen 1.1 }
693 :    
694 :     # Check sequence ids for duplicates:
695 :    
696 :     my $reps2 = [];
697 :     my $seen = {};
698 :    
699 :     my $id;
700 :     foreach ( @$reps )
701 :     {
702 :     $id = $_->[0];
703 :     if ( $seen->{ $id }++ )
704 :     {
705 :     print STDERR "Duplicate sequence id '$id' skipped by rep_seq_2\n";
706 :     }
707 :     else
708 :     {
709 :     push @$reps2, $_;
710 :     }
711 :     }
712 :    
713 :     my $seqs2 = [];
714 :     foreach ( @$seqs )
715 :     {
716 :     $id = $_->[0];
717 :     if ( $seen->{ $id }++ )
718 :     {
719 : golsen 1.6 print STDERR "Duplicate sequence id '$id' skipped by rep_seq_2\n";
720 : golsen 1.1 }
721 :     else
722 :     {
723 :     push @$seqs2, $_;
724 :     }
725 :     }
726 :    
727 : golsen 1.8 if ( $by_size )
728 :     {
729 :     @$reps2 = sort { length( $b->[2] ) <=> length( $a->[2] ) } @$reps2;
730 :     }
731 :    
732 : golsen 1.1 # If no preexisting representatives, then take first sequence:
733 :    
734 :     @$reps2 or @$reps2 = ( shift @$seqs2 );
735 :    
736 :     if ( $logfile ) { foreach ( @$reps2 ) { print $logfile "$_->[0]\n" } }
737 :    
738 :     # Search each rep sequence against itself for max_bpp
739 :    
740 : golsen 1.5 my ( $tmp_dir, $save_tmp ) = &tmp_dir( $options );
741 :     $tmp_dir or print STDERR "Unable to locate temporary file directory\n"
742 :     and return;
743 :     my $db = "$tmp_dir/tmp_blast_db_$$";
744 : golsen 1.1 my $protein = are_protein( $reps2 );
745 :     my %max_bpp;
746 :     my $entry;
747 :     foreach $entry ( @$reps2 )
748 :     {
749 :     $max_bpp{ $entry->[0] } = self_bpp( $db, $entry, $protein );
750 :     }
751 :    
752 :     my $naln = 10; # Alignments to make
753 :     my $self = 1; # Self match will never occur
754 :     my $prog = $protein ? 'blastp' : 'blastn';
755 :     my $blast_opt = "-e $max_e_val -v $naln -b $naln -F F -a 2";
756 :     $blast_opt .= " -r 1 -q -1" if ! $protein;
757 :    
758 :     # List of who is represented by a sequence:
759 :    
760 : golsen 1.5 my %group = map { $_->[0] => [] } @$reps2;
761 : golsen 1.1
762 :     # Search each sequence against the database.
763 :    
764 :     my ( $seq, $bpp_max );
765 :     my $newdb = 1;
766 :    
767 :     foreach $entry ( @$seqs2 )
768 :     {
769 : golsen 1.5 # Is it time to rebuild a BLAST database?
770 :    
771 :     if ( $newdb )
772 :     {
773 :     make_blast_db( $db, $reps2, $protein );
774 :     $newdb = 0;
775 :     }
776 : golsen 1.1
777 :     # Do the blast analysis. Returned records are of the form:
778 :     #
779 :     # 0 1 2 3 4 5 6 7 8 9 10 11
780 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
781 :    
782 : golsen 1.5 $id = $entry->[0];
783 :     $seq = $entry->[2];
784 : golsen 1.1 my ( $tophit ) = sort { $b->[0] <=> $a->[0] }
785 :     grep { $_->[0] >= $max_sim }
786 :     map { [ seq_similarity( $_, $sim_meas, $max_bpp{ $_->[3] } ), $_ ] }
787 :     top_blast_per_subject( $prog, $db, $id, $seq, $self, $blast_opt );
788 :    
789 :    
790 :     # It matches an existing representative
791 :    
792 : golsen 1.5 if ( $tophit )
793 :     {
794 :     # print STDERR join(", ", $tophit->[0], @{$tophit->[1]} ), "\n";
795 :     push @{ $group{ $tophit->[1]->[3] } }, $id;
796 :     print $logfile "$id\t$tophit->[1]->[3]\n" if $logfile;
797 :     }
798 :    
799 :     # It is a new representative
800 :    
801 :     else
802 :     {
803 :     push @$reps2, $entry;
804 :     $group{ $id } = [];
805 :     $max_bpp{ $id } = self_bpp( $db, $entry, $protein );
806 :     $newdb = 1;
807 :     print $logfile "$id\n" if $logfile;
808 :     }
809 : golsen 1.1 }
810 :    
811 : golsen 1.5 if ( $save_tmp ) {} # This is not used, might be in future
812 :     elsif ( $protein ) { unlink $db, "$db.psq", "$db.pin", "$db.phr" }
813 :     else { unlink $db, "$db.nsq", "$db.nin", "$db.nhr" }
814 : golsen 1.1
815 :     # Return the surviving sequence entries, and optionally the hash of
816 :     # ids represented by each survivor:
817 :    
818 : golsen 1.5 wantarray ? ( $reps2, \%group ) : $reps2;
819 :     }
820 :    
821 :    
822 :     #===============================================================================
823 :     # Construct a representative set of related sequences:
824 :     #
825 :     # \@repseqs = representative_sequences( $ref, \@seqs, $max_sim, \%options );
826 :     #
827 :     # or
828 :     #
829 :     # ( \@repseqs, \%representing, \@low_sim ) = representative_sequences( $ref,
830 :     # \@seqs, $max_sim, \%options );
831 :     #
832 :     #===============================================================================
833 :     sub representative_sequences {
834 :     my $seqs = ( shift @_ || shift @_ ); # If $ref is undef, shift again
835 :     ref( $seqs ) eq "ARRAY"
836 :     or die "representative_sequences called with bad first argument\n";
837 :    
838 :     my ( $ref, $use_ref );
839 :     if ( ! ref( $seqs->[0] ) ) # First item was sequence entry, not list of entries
840 :     {
841 :     $ref = $seqs;
842 :     $seqs = shift @_;
843 :     ref( $seqs ) eq "ARRAY"
844 :     and ref( $seqs->[0] ) eq "ARRAY"
845 :     or die "representative_sequences called with bad sequences list\n";
846 :     $use_ref = 1;
847 :     }
848 :     else # First item was list of entries, split off first
849 :     {
850 :     ref( $seqs->[0] ) eq "ARRAY"
851 :     or die "representative_sequences called with bad sequences list\n";
852 :     $ref = shift @$seqs;
853 :     $use_ref = 0;
854 :     }
855 :    
856 :     my $max_sim = shift @_;
857 :     my $options;
858 :    
859 :     # Undocumented feature: skip max_sim (D = 0.8)
860 :    
861 :     if ( ref( $max_sim ) eq "HASH" )
862 :     {
863 :     $options = $max_sim;
864 :     $max_sim = undef;
865 :     }
866 :    
867 :     # If the above did not give us options, get them now:
868 :    
869 :     $options ||= ( shift @_ ) || {};
870 :    
871 :     # ---------------------------------------# Default values for options
872 :    
873 :     $max_sim ||= 0.80; # Retain 80% identity of less
874 :     my $logfile = undef; # Log file of sequences processed
875 :     my $max_ref_sim = 0.99; # Get rid of identical sequences
876 :     my $max_e_val = 0.01; # Blast E-value to decrease output
877 :     my $sim_meas = 'identity_fraction'; # Use sequence identity as measure
878 :     my $save_exp = 1.0; # Don't retain near equivalents
879 :     my $stable = 0; # Pick reps input order
880 :    
881 :     # Two questionable decisions:
882 :     # 1. Be painfully flexible on option names.
883 :     # 2. Silently fix bad parameter values.
884 :    
885 :     foreach ( keys %$options )
886 :     {
887 :     my $value = $options->{ $_ };
888 :     if ( m/^log/i ) # logfile
889 :     {
890 :     next if ! $value;
891 :     $logfile = ( ref $value eq "GLOB" ? $value : \*STDOUT );
892 :     select( ( select( $logfile ), $| = 1 )[0] ); # autoflush on
893 :     }
894 :     elsif ( m/ref/i ) # max_ref_sim
895 :     {
896 :     $value += 0;
897 :     $value = 0 if $value < 0;
898 :     $value = 1 if $value > 1;
899 :     $max_ref_sim = $value;
900 :     }
901 :     elsif ( m/max/i && m/sim/i ) # max(imum)_sim(ilarity)
902 :     {
903 :     $value += 0;
904 :     $value = 0 if $value < 0;
905 :     $value = 1 if $value > 1;
906 :     $max_sim = $value;
907 :     }
908 :     elsif ( m/max/i || m/[ep]_?val/i ) # Other "max" tests must come first
909 :     {
910 :     $value += 0;
911 :     $value = 0 if $value < 0;
912 :     $max_e_val = $value;
913 :     }
914 :     elsif ( m/sim/i || m/meas/i ) # sim(ilarity)_meas(ure)
915 :     {
916 :     $sim_meas = standardize_similarity_measure( $value );
917 :     }
918 :     elsif ( m/sav/i || m/exp/i ) # save_exp(onent)
919 :     {
920 :     $value += 0;
921 :     $value = 0 if $value < 0;
922 :     $value = 1 if $value > 1;
923 :     $save_exp = $value;
924 :     }
925 :     elsif ( m/stab/i ) # stable order
926 :     {
927 :     $stable = $value ? 1 : 0;
928 :     }
929 :     else
930 :     {
931 :     print STDERR "WARNING: representative_sequences bad option ignored: '$_' => '$value'\n";
932 :     }
933 :     }
934 :    
935 :     # Silent sanity check. This should not happen, as it is almost equivalent
936 :     # to making no reference sequence.
937 :    
938 :     $max_ref_sim = $max_sim if ( $max_ref_sim < $max_sim );
939 :    
940 :     # Do the analysis
941 :    
942 :     my $ref_id = $ref->[0];
943 :     my $ref_seq = $ref->[2];
944 :    
945 :     # Build a list of the ids (without ref) and an index for the sequence entries:
946 :    
947 :     my @seq_id = map { $_->[0] } @$seqs;
948 :     my $seq_ind = { map { @{$_}[0] => $_ } ( $ref, @$seqs ) };
949 :    
950 :     # Make a lookup table of the sequence number, for use in reording
951 :     # sequences later:
952 :    
953 :     my $n = 0;
954 :     my %ord = ( map { @$_[0] => ++$n } @$seqs );
955 :    
956 :     # Build blast database (it includes the reference):
957 :    
958 :     my $protein = are_protein( $seqs );
959 :     my ( $tmp_dir, $save_tmp ) = &tmp_dir( $options );
960 :     $tmp_dir or print STDERR "Unable to locate temporary file directory\n"
961 :     and return;
962 :     my $db = "$tmp_dir/tmp_blast_db_$$";
963 :     make_blast_db( $db, [ $ref, @$seqs ], $protein );
964 :    
965 :     # Search query against new database
966 :    
967 :     my $max = 3 * @$seqs; # Alignments to keep
968 :     my $self = 1; # Keep self match (for its bit score)
969 :    
970 :     my $blast_opt = "-e $max_e_val -v $max -b $max -F F -a 2";
971 :    
972 :     # Do the blast analysis. Returned records are of the form:
973 :     #
974 :     # 0 1 2 3 4 5 6 7 8 9 10 11
975 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
976 :    
977 :     my $prog = $protein ? 'blastp' : 'blastn';
978 :     $blast_opt .= " -r 1 -q -1" if ! $protein;
979 :     my @ref_hits = top_blast_per_subject( $prog, $db, $ref_id, $ref_seq, $self, $blast_opt );
980 :    
981 :     # First hit is always a perfect match, so we get bits per position:
982 :     # This is only used if the measure is bits per position
983 :    
984 :     my $ref_bpp = $ref_hits[0]->[6] / $ref_hits[0]->[8];
985 :    
986 :     # Remove self match (might not be first if there are identical sequences):
987 :    
988 :     my %hit = ();
989 :     @ref_hits = grep { my $sid = $_->[3]; $hit{ $sid } = 1; ( $sid ne $ref_id ) } @ref_hits;
990 :    
991 :     my %group = ();
992 :     $group{ $ref_id } = [];
993 :     my %veto = ();
994 :     my $n_to_do = @ref_hits;
995 :     my $rebuild_d_n = 40;
996 :     my $last_rebuild = 1.5 * $rebuild_d_n;
997 :     my $rebuild = ( $n_to_do > $last_rebuild ) ? $n_to_do - $rebuild_d_n : 0;
998 :    
999 :     # Sequence-specific maximum similarities:
1000 :    
1001 :     my %max_sim = map { ( $_ => $max_sim ) } @seq_id;
1002 :    
1003 :     foreach ( @ref_hits )
1004 :     {
1005 :     my $id = $_->[3];
1006 :     my $sim = seq_similarity( $_, $sim_meas, $ref_bpp );
1007 :    
1008 :     if ( $sim > ( $use_ref ? $max_ref_sim : $max_sim ) )
1009 :     {
1010 :     $veto{ $id } = 1;
1011 :     push @{ $group{ $ref_id } }, $id; # Log the sequences represented
1012 :     $n_to_do--;
1013 :     }
1014 :     else
1015 :     {
1016 :     my $max_sim_i = exp( $save_exp * log( $sim ) );
1017 :     $max_sim{ $id } = $max_sim_i if ( $max_sim_i > $max_sim );
1018 :     }
1019 :     }
1020 :    
1021 :    
1022 :     if ( $logfile )
1023 :     {
1024 :     print $logfile join( "\t", $ref_id, @{ $group{ $ref_id } } ), "\n";
1025 :     }
1026 :    
1027 :     # Search each sequence against the database.
1028 :     # If the order is to be stable, reorder hits to match input order.
1029 :    
1030 :     my ( $id1, $seq1, $max_sim_1, $id2, $max_sim_2, $bpp_max );
1031 :     my @ids_to_do = map { $_->[3] } @ref_hits;
1032 :     @ids_to_do = sort { $ord{ $a } <=> $ord{ $b } } @ids_to_do if $stable;
1033 :    
1034 :     while ( $id1 = shift @ids_to_do )
1035 :     {
1036 :     next if $veto{ $id1 };
1037 :    
1038 :     # Is it time to rebuild a smaller BLAST database? This helps
1039 :     # significantly in the overall performance.
1040 :    
1041 :     if ( $n_to_do <= $rebuild )
1042 :     {
1043 :     if ( $protein ) { unlink $db, "$db.psq", "$db.pin", "$db.phr" }
1044 :     else { unlink $db, "$db.nsq", "$db.nin", "$db.nhr" }
1045 :     make_blast_db( $db, [ map { $seq_ind->{ $_ } } # id to sequence entry
1046 :     grep { ! $veto{ $_ } } # id not vetoed
1047 :     ( $id1, @ids_to_do ) # remaining ids
1048 :     ],
1049 :     $protein
1050 :     );
1051 :     $rebuild = ( $n_to_do > $last_rebuild ) ? $n_to_do - $rebuild_d_n : 0;
1052 :     }
1053 :    
1054 :     $n_to_do--;
1055 :     $group{ $id1 } = [];
1056 :    
1057 :     $max_sim_1 = $max_sim{ $id1 };
1058 :     $bpp_max = undef;
1059 :     $seq1 = $seq_ind->{$id1}->[2];
1060 :     foreach ( top_blast_per_subject( $prog, $db, $id1, $seq1, $self, $blast_opt ) )
1061 :     {
1062 :     $bpp_max ||= $_->[6] / $_->[8];
1063 :     $id2 = $_->[3];
1064 :     next if ( $veto{ $id2 } || $group{ $id2 } );
1065 :     $max_sim_2 = $max_sim{ $id2 };
1066 :     $max_sim_2 = $max_sim_1 if ( $max_sim_1 > $max_sim_2 );
1067 :     if ( seq_similarity( $_, $sim_meas, $bpp_max ) > $max_sim_2 )
1068 :     {
1069 :     $veto{ $id2 } = 1;
1070 :     push @{ $group{ $id1 } }, $id2; # Log the sequences represented
1071 :     $n_to_do--;
1072 :     }
1073 :     }
1074 :    
1075 :     if ( $logfile )
1076 :     {
1077 :     print $logfile join( "\t", $id1, @{ $group{ $id1 } } ), "\n";
1078 :     }
1079 :     }
1080 :    
1081 :     if ( $save_tmp ) {} # This is not used, might be in future
1082 :     elsif ( $protein ) { unlink $db, "$db.psq", "$db.pin", "$db.phr" }
1083 :     else { unlink $db, "$db.nsq", "$db.nin", "$db.nhr" }
1084 :    
1085 :     # Return the surviving sequence entries, and optionally the hash of
1086 :     # ids represented by each survivor:
1087 :    
1088 :     my $kept = [ $ref, grep { $group{ $_->[0] } } @$seqs ];
1089 :    
1090 :     wantarray ? ( $kept, \%group, [ grep { ! $hit{ $_->[0] } } @$seqs ] ) : $kept;
1091 : golsen 1.1 }
1092 :    
1093 :    
1094 :     #===============================================================================
1095 :     # Try to figure out the sequence similarity measure that is being requested:
1096 :     #
1097 :     # $type = standardize_similarity_measure( $requested_type )
1098 :     #
1099 :     #===============================================================================
1100 :    
1101 :     sub standardize_similarity_measure
1102 :     { my ( $req_meas ) = @_;
1103 :     return ( ! $req_meas ) ? 'identity_fraction'
1104 :     : ( $req_meas =~ /id/i ) ? 'identity_fraction'
1105 :     : ( $req_meas =~ /sc/i ) ? 'score_per_position'
1106 :     : ( $req_meas =~ /spp/i ) ? 'score_per_position'
1107 :     : ( $req_meas =~ /bit/i ) ? 'score_per_position'
1108 :     : ( $req_meas =~ /bpp/i ) ? 'score_per_position'
1109 :     : ( $req_meas =~ /tiv/i ) ? 'positive_fraction'
1110 :     : ( $req_meas =~ /pos_/i ) ? 'positive_fraction'
1111 :     : ( $req_meas =~ /ppp/i ) ? 'positive_fraction'
1112 :     : 'identity_fraction';
1113 :     }
1114 :    
1115 :    
1116 :     #===============================================================================
1117 :     # Caluculate sequence similarity according to the requested measure:
1118 :     #
1119 :     # $similarity = seq_similarity( $hit, $measure, $bpp_max )
1120 :     #
1121 :     # $hit is a structure with blast information:
1122 :     #
1123 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
1124 :     #===============================================================================
1125 :    
1126 :     sub seq_similarity
1127 :     { my ( $hit, $measure, $bpp_max ) = @_;
1128 :     return ( @$hit < 11 ) ? undef
1129 :     : ( $measure =~ /^sc/ ) ? $hit->[ 6] / ( $hit->[8] * ( $bpp_max || 2 ) )
1130 :     : ( $measure =~ /^po/ ) ? $hit->[10] / $hit->[8]
1131 :     : $hit->[ 9] / $hit->[8]
1132 :     }
1133 :    
1134 :    
1135 :     #===============================================================================
1136 :     # Caluculate self similarity of a sequence in bits per position:
1137 :     #
1138 :     # $max_bpp = self_bpp( $db_name, $entry, $protein )
1139 :     #
1140 :     #===============================================================================
1141 :    
1142 :     sub self_bpp
1143 :     {
1144 :     my ( $db, $entry, $protein ) = @_;
1145 :    
1146 :     # Build blast database:
1147 :    
1148 :     make_blast_db( $db, [ $entry ], $protein );
1149 :    
1150 :     # Search sequence against the database
1151 :    
1152 :     my $id = $entry->[0];
1153 :     my $seq = $entry->[2];
1154 :     my $self = 1; # Self match will never occur
1155 :    
1156 :     my $prog = $protein ? 'blastp' : 'blastn';
1157 :     my $blast_opt = "-v 1 -b 1 -F F -a 2";
1158 :     $blast_opt .= " -r 1 -q -1" if ! $protein;
1159 :    
1160 :     # Do the blast analysis. Returned records are of the form:
1161 :     #
1162 :     # 0 1 2 3 4 5 6 7 8 9 10 11
1163 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
1164 :    
1165 :     my ( $hit ) = top_blast_per_subject( $prog, $db, $id, $seq, $self, $blast_opt );
1166 :     # print STDERR join( ", ", @$hit ), "\n";
1167 :    
1168 :     # First hit is always a perfect match, so we get bits per position:
1169 :     # This is only used if the measure is bits per position
1170 : golsen 1.5
1171 : golsen 1.1 $hit->[6] / $hit->[8];
1172 :     }
1173 :    
1174 :    
1175 :     #===============================================================================
1176 :     # Make a blast databse from a set of sequence entries. The type of database
1177 :     # (protein or nucleic acid) is quessed from the sequence data.
1178 :     #
1179 :     # make_blast_db( $db_filename, \@seq_entries, $protein )
1180 :     #
1181 :     # Sequence entries have the form: [ $id, $def, $seq ]
1182 :     #===============================================================================
1183 :    
1184 :     sub make_blast_db
1185 :     { my ( $db, $seqs, $protein ) = @_;
1186 :    
1187 :     open FH, ">$db" or die "Could not create blast database file \"$db\"";
1188 :     foreach ( @$seqs )
1189 :     {
1190 : golsen 1.5 print FH ">$_->[0]", ( $_->[1] ? " $_->[1]" : () ), "\n$_->[2]\n";
1191 : golsen 1.1 }
1192 :     close FH;
1193 :    
1194 :     my $is_prot = $protein ? 'T' : 'F';
1195 :     system "$formatdb -p $is_prot -i '$db'";
1196 :     }
1197 :    
1198 :    
1199 :     #===============================================================================
1200 :     # The type of data (protein or nucleic acid) is quessed from the sequences.
1201 :     #
1202 :     # are_protein( \@seq_entries )
1203 :     #
1204 :     # Sequence entries have the form: [ $id, $def, $seq ]
1205 :     #===============================================================================
1206 :    
1207 :     sub are_protein
1208 :     { my ( $seqs ) = @_;
1209 :     my ( $nt, $aa ) = ( 0, 0 );
1210 :     foreach ( @$seqs )
1211 :     {
1212 : golsen 1.5 my $s = $_->[2];
1213 :     $nt += $s =~ tr/ACGTacgt//d;
1214 :     $aa += $s =~ tr/A-Za-z//d;
1215 : golsen 1.1 }
1216 :     ( $nt < 3 * $aa ) ? 1 : 0;
1217 :     }
1218 :    
1219 :    
1220 :     #===============================================================================
1221 :     # Blast a subject against a datbase, saving only top hit per subject
1222 :     #
1223 :     # Return:
1224 :     #
1225 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
1226 :     #
1227 :     #===============================================================================
1228 :    
1229 :     # Use a bare block to compartmentalize a counter:
1230 :    
1231 :     { my $cnt = 0;
1232 :    
1233 :     sub top_blast_per_subject
1234 :     { my ( $prog, $db, $qid, $qseq, $self, $blast_opt, $sort, $no_merge ) = @_;
1235 :    
1236 :     $cnt++;
1237 : golsen 1.5 my ( $tmp_dir, $save_tmp ) = &tmp_dir( );
1238 :     $tmp_dir or print STDERR "Unable to locate temporary file directory\n"
1239 :     and return;
1240 :     my $query_file = "$tmp_dir/tmp_blast_seq_${$}_$cnt";
1241 : golsen 1.11 open QFILE, ">$query_file" or die "Could not create sequence file \"$query_file\"";
1242 :     print QFILE ">$qid\n$qseq\n";
1243 :     close QFILE;
1244 : golsen 1.1
1245 :     my $blast_cmd = "$blastall -p $prog -d '$db' -i '$query_file' $blast_opt";
1246 : golsen 1.5
1247 : golsen 1.11 open BPIPE, "$blast_cmd |" or die "Could open blast pipe\n";
1248 :     my $sims = integrate_blast_segments( \*BPIPE, $sort, $no_merge, $self );
1249 :     close BPIPE;
1250 : golsen 1.5 unlink $query_file if ! $save_tmp;
1251 : golsen 1.1
1252 :     my $pq = ""; # Previous query id
1253 :     my $ps = ""; # Previous subject id
1254 :     my $keep;
1255 :    
1256 :     grep { $keep = ( $pq ne $_->[0] ) || ( $ps ne $_->[3] );
1257 :     $pq = $_->[0];
1258 :     $ps = $_->[3];
1259 :     $keep && ( $self || ( $pq ne $ps ) );
1260 :     } @$sims;
1261 :     }
1262 :     }
1263 :    
1264 :    
1265 :     #===============================================================================
1266 :     # Read output of rationalize blast and assemble minimally overlapping segments
1267 :     # into a total score for each subject sequence. For each query, sort matches
1268 :     # into user-chosen order (D = total score):
1269 :     #
1270 :     # @sims = integrate_blast_segments( \*FILEHANDLE, $sort_order, $no_merge )
1271 :     # \@sims = integrate_blast_segments( \*FILEHANDLE, $sort_order, $no_merge )
1272 :     #
1273 :     # Allowed sort orders are 'score', 'score_per_position', 'identity_fraction',
1274 :     # and 'positive_fraction' (matched very flexibly).
1275 :     #
1276 :     # Returned sims (e_val is only for best HSP, not any composite):
1277 :     #
1278 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
1279 :     #
1280 :     # There is a strategic decision to not read the blast output from memory;
1281 :     # it could be enormous. This cuts the flexibility some.
1282 :     #===============================================================================
1283 :     #
1284 :     # coverage fields:
1285 :     #
1286 :     # [ scr, e_val, n_mat, n_id, n_pos, n_gap, dir, [ intervals_covered ] ]
1287 :     #
1288 :     #===============================================================================
1289 :    
1290 :     sub integrate_blast_segments
1291 :     { my ( $fh, $order, $no_merge, $self ) = @_;
1292 :     $fh ||= \*STDIN;
1293 :     ( ref( $fh ) eq "GLOB" ) || die "integrate_blast_segments called without a filehandle\n";
1294 :    
1295 :     $order = ( ! $order ) ? 'score'
1296 :     : ( $order =~ /sc/i ) ? ( $order =~ /p/i ? 'score_per_position' : 'score' )
1297 :     : ( $order =~ /bit/i ) ? ( $order =~ /p/i ? 'score_per_position' : 'score' )
1298 :     : ( $order =~ /spp/i ) ? 'score_per_position'
1299 :     : ( $order =~ /id/i ) ? 'identity_fraction'
1300 :     : ( $order =~ /tiv/i ) ? 'positive_fraction'
1301 :     : 'score';
1302 :    
1303 :     my $max_frac_overlap = 0.2;
1304 :    
1305 :     my ( $qid, $qdef, $qlen, $sid, $sdef, $slen );
1306 :     my ( $scr, $e_val, $n_mat, $n_id, $n_pos, $n_gap );
1307 :     my ( $ttl_scr, $ttl_mat, $ttl_id, $ttl_pos, $ttl_gap );
1308 :     my @sims = ();
1309 :     my @qsims = ();
1310 :     my $coverage = undef;
1311 :     my $record;
1312 :    
1313 :     while ( $_ = next_blast_record( $fh, $self ) )
1314 :     {
1315 : golsen 1.5 chomp;
1316 :     if ( $_->[0] eq 'Query=' )
1317 :     {
1318 :     if ( $coverage )
1319 :     {
1320 :     push @qsims, [ $sid, $sdef, $slen, @$coverage[ 0 .. 5 ] ];
1321 :     $coverage = undef;
1322 :     }
1323 :     if ( @qsims ) { push @sims, order_query_sims( $qid, $qdef, $qlen, \@qsims, $order ) }
1324 :     ( undef, $qid, $qdef, $qlen ) = @$_;
1325 :     $sid = undef;
1326 :     @qsims = ();
1327 :     }
1328 :     elsif ( $_->[0] eq '>' )
1329 :     {
1330 :     if ( $coverage )
1331 :     {
1332 :     push @qsims, [ $sid, $sdef, $slen, @$coverage[ 0 .. 5 ] ];
1333 :     $coverage = undef;
1334 :     }
1335 :     next if ! $qid;
1336 :     ( undef, $sid, $sdef, $slen ) = @$_;
1337 :     }
1338 :     elsif ( $_->[0] eq 'HSP' && $sid )
1339 :     {
1340 :     $coverage = integrate_HSP( $coverage, $_, $max_frac_overlap, $no_merge );
1341 :     }
1342 : golsen 1.1 }
1343 :    
1344 :     if ( $coverage ) { push @qsims, [ $sid, $sdef, $slen, @$coverage[ 0 .. 5 ] ] }
1345 :    
1346 :     if ( @qsims ) { push @sims, order_query_sims( $qid, $qdef, $qlen, \@qsims, $order ) }
1347 :    
1348 :     wantarray ? @sims : \@sims;
1349 :     }
1350 :    
1351 :    
1352 :     #===============================================================================
1353 :     #
1354 :     # Try to integrate non-conflicting HSPs for the same subject sequence. The
1355 :     # conflicts are only assessed from the standpoint of the query, at least for
1356 :     # now. We could track the subject sequence coverage as well (to avoid a direct
1357 :     # repeat in the query from matching the same subject twice).
1358 :     #
1359 :     # $new_coverage = integrate_HSP( $coverage, $hsp, $max_frac_overlap, $no_merge )
1360 :     #
1361 :     # 0 1 2 3 4 5 6 7
1362 :     # $coverage = [ scr, e_val, n_mat, n_id, n_pos, n_gap, dir, [ intervals_covered ] ]
1363 :     #
1364 :     # $coverage should be undefined at the first call; the function intiallizes
1365 :     # all of the fields from the first HSP. scr, n_mat, n_id, n_pos, and n_gap
1366 :     # are sums over the combined HSPs. e_val is based only of the first HSP.
1367 :     #
1368 :     # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1369 :     # $hsp = [ 'HSP', scr, e_val, n_seg, e_val2, n_mat, n_id, n_pos, n_gap, dir, s1, e1, sq1, s2, e2, sq2 ]
1370 :     #
1371 :     # $max_frac_overlap Amount of the new HSP that is allowed to overlap already
1372 :     # incorporated HSPs
1373 :     #
1374 :     # $no_merge Disable the merging of multiple HSPs. The structure will
1375 :     # be filled in from the first HSP and left unchanged though
1376 :     # subsequence calls. This simplifies the program structure.
1377 :     #
1378 :     # Fitting a new HSP into covered intervals:
1379 :     #
1380 :     # 1 qlen
1381 :     # |---------------------------------------------------------------| query
1382 :     # ------------ --------------- covered
1383 :     # ------------- new match
1384 :     # l r
1385 :     #
1386 :     #===============================================================================
1387 :    
1388 :     sub integrate_HSP
1389 :     { my ( $coverage, $hsp, $max_frac_overlap, $no_merge ) = @_;
1390 :     my ( undef, $scr, $e_val, undef, undef, $n_mat, $n_id, $n_pos, $n_gap, $dir, $s1, $e1 ) = @$hsp;
1391 :    
1392 :     # Ignore frame; just use direction of match:
1393 :    
1394 :     $dir = substr( $dir, 0, 1 );
1395 :    
1396 :     # Orient by left and right ends:
1397 :    
1398 :     my ( $l, $r ) = ( $e1 > $s1 ) ? ( $s1, $e1 ) : ( $e1, $s1 );
1399 :    
1400 :     # First HSP for the subject sequence:
1401 :    
1402 :     if ( ! $coverage )
1403 :     {
1404 : golsen 1.5 return [ $scr, $e_val, $n_mat, $n_id, $n_pos, $n_gap, $dir, [ [ $s1, $e1 ] ] ];
1405 : golsen 1.1 }
1406 :    
1407 :     # Not first; must be same direction to combine (also test no_merge here):
1408 :    
1409 :     return $coverage if ( $no_merge || ( $dir ne $coverage->[6] ) );
1410 :    
1411 :     # Not first; must fall in a gap of query sequence coverage:
1412 :    
1413 :     my @intervals = @{ $coverage->[7] };
1414 :     my $max_overlap = $max_frac_overlap * ( $r - $l + 1 );
1415 :     my $prev_end = 0;
1416 :     my $next_beg = $intervals[0]->[0];
1417 :     my @used = ();
1418 :     while ( $next_beg <= $l ) # *** Sequential search could be made binary
1419 :     {
1420 : golsen 1.5 $prev_end = $intervals[0]->[1];
1421 :     push @used, scalar shift @intervals;
1422 :     $next_beg = @intervals ? $intervals[0]->[0] : 1e10;
1423 : golsen 1.1 }
1424 :    
1425 :     my $overlap = ( ( $l <= $prev_end ) ? ( $prev_end - $l + 1 ) : 0 )
1426 :     + ( ( $r >= $next_beg ) ? ( $r - $next_beg + 1 ) : 0 );
1427 :     return $coverage if ( $overlap > $max_overlap );
1428 :    
1429 :     # Okay, we have passed the overlap test. We need to integrate the
1430 :     # match into the coverage description. Yes, I know that this counts
1431 :     # the overlap region. We could pro rate it, but that is messy too:
1432 :    
1433 :     $coverage->[0] += $scr;
1434 :     $coverage->[2] += $n_mat;
1435 :     $coverage->[3] += $n_id;
1436 :     $coverage->[4] += $n_pos;
1437 :     $coverage->[5] += $n_gap;
1438 :    
1439 :     # Refigure the covered intervals, fusing intervals separated by a
1440 :     # gap of less than 10:
1441 :    
1442 :     my $min_gap = 10;
1443 :     if ( $l <= $prev_end + $min_gap )
1444 :     {
1445 : golsen 1.5 if ( @used ) { $l = $used[-1]->[0]; pop @used }
1446 :     else { $l = 1 }
1447 : golsen 1.1 }
1448 :     if ( $r >= $next_beg - $min_gap )
1449 :     {
1450 : golsen 1.5 if ( @intervals ) { $r = $intervals[0]->[1]; shift @intervals }
1451 :     else { $r = 1e10 }
1452 : golsen 1.1 }
1453 :    
1454 :     $coverage->[7] = [ @used, [ $l, $r ], @intervals ];
1455 :    
1456 :     return $coverage;
1457 :     }
1458 :    
1459 :    
1460 :     #===============================================================================
1461 :     # Sort the blast matches by the desired criterion:
1462 :     #
1463 :     # @sims = order_query_sims( $qid, $qdef, $qlen, \@qsims, $order )
1464 :     #
1465 :     # Allowed sort orders are 'score', 'score_per_position', 'identity_fraction',
1466 :     # and 'positive_fraction'
1467 :     #
1468 :     # @qsims fields:
1469 :     #
1470 :     # 0 1 2 3 4 5 6 7 8
1471 :     # [ sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
1472 :     #
1473 :     #===============================================================================
1474 :    
1475 :     sub order_query_sims
1476 :     { my ( $qid, $qdef, $qlen, $qsims, $order ) = @_;
1477 :    
1478 :     my @sims;
1479 :     if ( $order eq 'score_per_position' )
1480 :     {
1481 : golsen 1.5 @sims = map { [ $_->[5] ? $_->[3]/$_->[5] : 0, $_ ] } @$qsims;
1482 : golsen 1.1 }
1483 :     elsif ( $order eq 'identity_fraction' )
1484 :     {
1485 : golsen 1.5 @sims = map { [ $_->[5] ? $_->[6]/$_->[5] : 0, $_ ] } @$qsims;
1486 : golsen 1.1 }
1487 :     elsif ( $order eq 'positive_fraction' )
1488 :     {
1489 : golsen 1.5 @sims = map { [ $_->[5] ? $_->[7]/$_->[5] : 0, $_ ] } @$qsims;
1490 : golsen 1.1 }
1491 :     else # Default is by 'score'
1492 :     {
1493 : golsen 1.5 @sims = map { [ $_->[3], $_ ] } @$qsims;
1494 : golsen 1.1 }
1495 :    
1496 :     map { [ $qid, $qdef, $qlen, @{$_->[1]} ] } sort { $b->[0] <=> $a->[0] } @sims;
1497 :     }
1498 :    
1499 :    
1500 : golsen 1.5 #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1501 :     # Locate the directory for temporary files in a SEED-aware, but not SEED-
1502 :     # dependent manner:
1503 :     #
1504 :     # ( $tmp_dir, $save_tmp ) = tmp_dir( \%options )
1505 :     #
1506 :     # $save_tmp is true if the option 'savetmp' is true.
1507 :     #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
1508 :     sub tmp_dir
1509 :     {
1510 :     my $options = ref( $_[0] ) eq 'HASH' ? shift : {};
1511 :    
1512 :     my $tmp = $options->{ tmp } && -d $options->{ tmp } ? $options->{ tmp }
1513 :     : $FIG_Config::temp && -d $FIG_Config::temp ? $FIG_Config::temp
1514 :     : -d '/tmp' ? '/tmp'
1515 :     : '.';
1516 :    
1517 :     my $save_tmp = $options->{ savetmp } || 0;
1518 :    
1519 :     return ( $tmp, $save_tmp );
1520 :     }
1521 :    
1522 : overbeek 1.7 ###############################################################################
1523 : golsen 1.5
1524 : overbeek 1.7 sub n_rep_seqs {
1525 :     my(%args) = (ref($_[0]) eq 'HASH') ? %{$_[0]} : @_;
1526 :    
1527 :     my($seqs) = $args{seqs} || return undef;
1528 :     my($reps) = $args{reps} || undef;
1529 :     my($max_iden) = $args{max_iden} || 0.9; # we don't keep seqs more than 90% identical
1530 :     my($max_rep) = $args{max_rep} || 50; # maximum number of seqs in returned set
1531 :    
1532 : overbeek 1.9 if ($args{by_size}) { $seqs = [sort { length($b->[2]) <=> length($a->[2]) } @$seqs] };
1533 :    
1534 : overbeek 1.7 my($lost) = {};
1535 :     my($repseqs,$representing) = &rep_seq_2($reps ? $reps : (), $seqs, { max_sim => $max_iden });
1536 :     if ($max_rep >= @$repseqs)
1537 :     {
1538 :     return ($repseqs,$representing);
1539 :     }
1540 :     my $n_rep = $reps ? @$reps : 0;
1541 :     my $incr = $max_iden / 2;
1542 :     # print STDERR "max_iden=$max_iden, ", scalar @$repseqs,"\n";
1543 :     my $iterations_left = 7;
1544 :    
1545 :     my @seqs2;
1546 :     while ($iterations_left && ($max_rep != @$repseqs))
1547 :     {
1548 :     if ($max_rep > @$repseqs)
1549 :     {
1550 :     $max_iden += $incr;
1551 :     }
1552 :     else
1553 :     {
1554 :     @seqs2 = @$repseqs[$n_rep..(@$repseqs - 1)];
1555 :     &add_to_lost($lost,$representing);
1556 :     $max_iden -= $incr;
1557 :     }
1558 :     ($repseqs,$representing) = &rep_seq_2($reps ? $reps : (), \@seqs2, { max_sim => $max_iden });
1559 :     # print STDERR "max_iden=$max_iden, ", scalar @$repseqs,"\n";
1560 :     $iterations_left--;
1561 :     $incr = $incr / 2;
1562 :     }
1563 :    
1564 :     foreach my $id (keys(%$lost))
1565 :     {
1566 :     my $rep_by = $lost->{$id};
1567 :     while ($lost->{$rep_by})
1568 :     {
1569 :     $rep_by = $lost->{$rep_by};
1570 :     }
1571 :     push(@{$representing->{$rep_by}},$id);
1572 :     }
1573 :     return ($repseqs,$representing);
1574 :     }
1575 :    
1576 :     sub add_to_lost {
1577 :     my($lost,$representing) = @_;
1578 :    
1579 :     foreach my $id (keys(%$representing))
1580 :     {
1581 :     my $x = $representing->{$id};
1582 :     foreach my $lost_id (@$x)
1583 :     {
1584 :     $lost->{$lost_id} = $id;
1585 :     }
1586 :     }
1587 :     }
1588 :    
1589 :     ###############################################################################
1590 : golsen 1.5
1591 : golsen 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3