[Bio] / FigKernelPackages / representative_sequences.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/representative_sequences.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.22 - (view) (download) (as text)

1 : golsen 1.1 package representative_sequences;
2 :    
3 : golsen 1.18 #
4 : overbeek 1.10 # This is a SAS component
5 :     #
6 : golsen 1.1 use strict;
7 :     use gjoparseblast;
8 : golsen 1.17 use gjoseqlib;
9 :     use SeedAware;
10 : golsen 1.18 use Data::Dumper;
11 : golsen 1.1
12 :     require Exporter;
13 : golsen 1.17 our @ISA = qw( Exporter );
14 : golsen 1.1 our @EXPORT = qw( representative_sequences
15 :     rep_seq_2
16 : golsen 1.5 rep_seq
17 : golsen 1.17 n_rep_seqs
18 : golsen 1.1 );
19 :    
20 : golsen 1.17
21 : golsen 1.1 #===============================================================================
22 : golsen 1.8 # Build or add to a set of representative sequences (if you do not want an
23 : golsen 1.21 # enrichment of sequences around a focus sequence (called the reference),
24 :     # these are the subroutines that you want). rep_seq() can include diverse
25 :     # representatives of a group in the blast database, making the clustering
26 :     # more reliable (i.e., this is probably what you want).
27 : golsen 1.1 #
28 : golsen 1.5 # \@reps = rep_seq( \@reps, \@new, \%options );
29 :     # \@reps = rep_seq( \@new, \%options );
30 : golsen 1.1 #
31 :     # or
32 :     #
33 : golsen 1.5 # ( \@reps, \%representing ) = rep_seq( \@reps, \@new, \%options );
34 :     # ( \@reps, \%representing ) = rep_seq( \@new, \%options );
35 : golsen 1.1 #
36 : golsen 1.5 # or
37 : golsen 1.2 #
38 :     # \@reps = rep_seq_2( \@reps, \@new, \%options );
39 :     # \@reps = rep_seq_2( \@new, \%options );
40 :     #
41 :     # or
42 :     #
43 :     # ( \@reps, \%representing ) = rep_seq_2( \@reps, \@new, \%options );
44 :     # ( \@reps, \%representing ) = rep_seq_2( \@new, \%options );
45 : golsen 1.1 #
46 : golsen 1.5 #
47 : golsen 1.21 # Construct a representative set of related sequences, with an enrichment
48 :     # in representation surrounding the initial reference sequence.
49 :     #
50 :     # \@reps = representative_sequences( $ref, \@seqs, $max_sim, \%options );
51 : golsen 1.5 #
52 :     # or
53 :     #
54 : golsen 1.21 # ( \@reps, \%representing, \@low_sim ) = representative_sequences( $ref,
55 : golsen 1.5 # \@seqs, $max_sim, \%options );
56 :     #
57 : golsen 1.21 # Outputs:
58 : golsen 1.1 #
59 : golsen 1.21 # \@reps Reference to the list of retained (representative subset)
60 : golsen 1.22 # of sequence entries. Sequence entries have the form
61 : golsen 1.1 # [ $id, $def, $seq ]
62 :     #
63 :     # \%representing
64 :     # Reference to a hash in which the keys are the ids of the
65 :     # representative sequences, for which the corresponding value
66 :     # is a list of ids of other sequences that are represented by
67 :     # that representive.
68 :     #
69 :     #
70 :     # Arguments (only \@seqs is required):
71 :     #
72 : golsen 1.22 # $ref A reference sequence as [ $id, $def, $seq ]. If present, the
73 :     # reference sequence defines a focal point for the analysis. A
74 : golsen 1.1 # representative sequence from each lineage in its vicinity will
75 :     # be retained, even though they are more similar than max_sim to
76 : golsen 1.22 # the reference, or to each other. The reference will always be
77 :     # included in the representative set. A limit is put on the
78 : golsen 1.1 # similarity of lineages retained by the reference sequence with
79 : golsen 1.22 # the max_ref_sim option (default = 0.99). The reference sequence
80 :     # should not be repeated in the set of other sequences. (Only
81 : golsen 1.2 # applies to representative_sequences; there is no equivalent for
82 :     # rep_seq_2.)
83 :     #
84 :     # \@reps In rep_seq_2, these sequences will each be placed in their own
85 : golsen 1.22 # cluster, regardless of their similarity to one another. Each
86 : golsen 1.2 # remaining sequence is added to the cluster to which it is
87 :     # most similar, unless it is less simililar than max_sim, in
88 :     # which case it represents a new cluster.
89 : golsen 1.1 #
90 : golsen 1.22 # \@seqs Set of sequences to be pruned. If there is no reference
91 : golsen 1.1 # sequence, the fist sequence in this list will be the starting
92 :     # point for the analysis and will be retained, but all sequences
93 :     # more similar than max_sim to it will be removed (in contrast to
94 :     # a reference sequence, which retains a representative of each
95 : golsen 1.22 # lineage in its vicinity). Sequences that fail the E-value test
96 : golsen 1.1 # relative to the reference (or the fist sequence if there is no
97 :     # reference) are dropped.
98 :     #
99 : golsen 1.2 # $max_sim (representative_sequences only; an option for rep_seq_2)
100 :     # Sequences with a higher similarity than max_sim to an existing
101 :     # representative sequence will not be included in the @reps
102 : golsen 1.22 # output. Their ids are associated with the identifier of the
103 :     # sequence representing them in \%representing. The details of
104 : golsen 1.2 # the behaviour are modified by other options. (default = 0.80)
105 : golsen 1.1 #
106 : golsen 1.8 # \%options Key => Value pairs that modify the behaviour:
107 :     #
108 :     # by_size (rep_seq and rep_seq_2 only)
109 : golsen 1.22 # By default, sequences are analyzed in input order. This
110 :     # option set to 1 will order from longest to shortest, or
111 :     # set to 2 will order from median outward.
112 :     #
113 :     # dup_id_ok (rep_seq and rep_seq_2 only)
114 :     # Duplicate ids in the input will be silently ignored. This
115 :     # is useful when continuing a run.
116 : golsen 1.1 #
117 : golsen 1.21 # extra_rep (rep_seq only)
118 :     # Filehandle for a file of the sequences included in the
119 :     # blast db, but are not themselves representing a group.
120 :     # In the case of rep_seq, the database can include diverse
121 :     # representatives of a cluster. These data are intended for
122 :     # use in continuations of a clustering by inclusion at the
123 :     # head of the new data.
124 :     #
125 : golsen 1.22 # logfile Filehandle for a logfile of the progress. As each
126 : golsen 1.2 # sequence is analyzed, its disposition in recorded.
127 :     # In representative_sequences(), the id of each new
128 :     # representative is followed by a tab separated list of the
129 : golsen 1.22 # ids that it represents. In rep_seq_2(), as each sequence
130 : golsen 1.2 # is analyzed, it is recorded, followed by the id of the
131 :     # sequence representing it, if it is not the first member
132 : golsen 1.22 # of a new cluster. Autoflush is set for the logfile.
133 : golsen 1.1 # If the value supplied is not a reference to a GLOB, then
134 :     # the log is sent to STDOUT (which is probably not what you
135 : golsen 1.22 # want in most cases). The behavior is intended to aid in
136 : golsen 1.1 # following prgress, and in recovery of interupted runs.
137 :     #
138 : golsen 1.2 # max_ref_sim (representative_sequences only)
139 : golsen 1.22 # Maximum similarity of any sequence to the reference. If
140 : golsen 1.1 # max_ref_sim is less than max_sim, it is silently reset to
141 : golsen 1.22 # max_sim. (default = 0.99, because 1.0 can be annoying)
142 : golsen 1.1 #
143 : golsen 1.22 # max_e_val Maximum E-value for blastall. Probably moot, but will help
144 :     # with performance. (default = 0.01)
145 : golsen 1.2 #
146 :     # max_sim Sequences with a higher similarity than max_sim to a
147 : golsen 1.22 # retained sequence will be deleted. The details of the
148 : golsen 1.2 # behaviour is modified by other options. (default = 0.80)
149 :     # (a parameter for representative_sequences, but an option
150 :     # for rep_seq_2).
151 : golsen 1.1 #
152 : golsen 1.22 # n_thread (rep_seq and rep_seq_2 only)
153 :     # Number of threads used by blastall. (default = 2)
154 :     #
155 : golsen 1.18 # n_query (rep_seq and rep_seq_2 only)
156 :     # Blast serveral sequences at a time to decrease process
157 : golsen 1.22 # creation overhead. (default = 64)
158 : golsen 1.18 #
159 :     # rep_seq_2 (rep_seq only)
160 :     # Use rep_seq_2() behavior (only on representative in the
161 :     # blast database per cluster.
162 :     #
163 : golsen 1.5 # save_tmp Do not delete temporary files upon completion (for debug)
164 :     #
165 : golsen 1.1 # sim_meas Measure similarity for inclusion or exclusion by
166 :     # 'identity_fraction' (default), 'positive_fraction', or
167 :     # 'score_per_position'
168 :     #
169 : golsen 1.2 # save_exp (representative_sequences only)
170 :     # When there is a reference sequence, lineages more similar
171 : golsen 1.22 # than max_sim will be retained near the reference. The
172 :     # default goal is to save one member of each lineage. If
173 : golsen 1.1 # the initial representative of the lineage is seq1, we
174 :     # pose the question, "Are there sufficiently deep divisions
175 :     # within the lineage to seq1 that it they might be viewed
176 :     # as independent? That is, might there be another sequence,
177 :     # seq2 that so different from seq1 that we might want to see
178 :     # it also?
179 :     #
180 :     # +---------------------- ref
181 :     # |
182 :     # ---+ +-------------------- seq1
183 :     # +-+
184 :     # +-------------------- seq2
185 :     #
186 :     # Without any special treatment, if the similarity of seq1
187 :     # to ref ( S(seq1,ref) ) is greater than max_sim, seq1 would
188 :     # be the sole representative of thelineage containing both
189 :     # seq1 and seq2, because the similarity of seq1 to seq2
190 : golsen 1.22 # ( S(seq1,seq2) ) is greater than S(seq1,ref). This can
191 :     # be altered by the value of save_exp. In terms of
192 : golsen 1.1 # similarity, seq2 will be discarded if:
193 :     #
194 :     # S(seq1,seq2) > S(seq1,ref) ** save_exp, and
195 :     # S(seq1,seq2) > S(seq2,ref) ** save_exp
196 :     #
197 :     # The default behavior described above occurs when save_exp
198 : golsen 1.22 # is 1. If save_exp < 1, then greater similarities between
199 :     # seq1 and seq2 are allowed. Reasonable values of save_exp
200 :     # are roughly 0.7 to 1.0. (At save_exp = 0, any similarity
201 : golsen 1.1 # would be allowed; yuck.)
202 :     #
203 : golsen 1.2 # stable (representative_sequences only; always true for rep_seq_2)
204 :     # If true (not undef, '', or 0), then the representatives
205 : golsen 1.1 # will be chosen from as early in the list as possible (this
206 : golsen 1.2 # facilitates augmentation of an existing list).
207 : golsen 1.1 #
208 : golsen 1.18 # tmp Location for temporary blast files.
209 :     #
210 : golsen 1.1 #-------------------------------------------------------------------------------
211 :     #
212 : golsen 1.2 # Diagram of the pruning behavior of representative_sequences():
213 : golsen 1.1 #
214 :     # 0.5 0.6 0.7 0.8 0.9 1.0 Similarity
215 :     # |---------|---------|---------|---------|---------|
216 :     # .
217 :     # . + A
218 :     # . +---+
219 :     # . | + B
220 :     # . +---+
221 :     # . | +---- C
222 :     # . +----------+
223 :     # . | +-------- D
224 :     # . |
225 :     # +-----------+ +----------------- E
226 :     # | . +-+
227 :     # | . +----------------- F
228 :     # +----------------+ .
229 :     # | | . +--------------------------- G
230 :     # | +---+
231 :     # | . | +--------------------- H
232 :     # --+ . +-----+
233 :     # | . +--------------------- I
234 :     # | .
235 :     # | +------------------------------- J
236 :     # +----------------+ .
237 :     # | . +--------------------------- K
238 :     # +---+
239 :     # . +--------------------------- L
240 :     # .
241 :     # |---------|---------|---------|---------|---------|
242 :     # 0.5 0.6 0.7 0.8 0.9 1.0 Similarity
243 :     #
244 :     # In the above tree and max_sim = 0.70 and max_ref_sim = 0.99:
245 :     #
246 :     # With no reference sequence, and A first in the list, the representative
247 :     # sequences will be A, G, J and K.
248 :     #
249 :     # With A as the reference sequence and save_exp left at its default, the
250 : golsen 1.22 # representative sequences will be A, C, D, E, G, J and K. B is excluded
251 : golsen 1.1 # because it is more similar than max_ref_sim to A.
252 :     #
253 :     # With A as the reference sequence and save_exp = 0.8, the representative
254 :     # sequences will be A, C, D, E, F (comparably similar to A and E), G,
255 : golsen 1.22 # H (comparably similar to A and G), J and K. The sequence L will be
256 : golsen 1.1 # represented by K because L is much closer to K than to A.
257 :     #
258 :     # This oversimplifies the choice of representative of a cluster of related
259 : golsen 1.22 # sequences. For example, whether G, H or I would represent the group of
260 : golsen 1.1 # three actually depends on relative clock speed (slower is better) and
261 : golsen 1.22 # sequence coverage (more complete is better). The actual order is by BLAST
262 : golsen 1.1 # bit score (possibly combining independent segments).
263 :     #
264 :     # In addition, this discussion is in terms of a tree, but the calculations
265 : golsen 1.22 # are based on a (partial) set of pairwise sequence similarities. Thus, the
266 : golsen 1.1 # precise behavior is hard to predict, but should be similar to that described
267 :     # above.
268 :     #
269 :     #-------------------------------------------------------------------------------
270 :     #
271 :     # To construct a representative set of sequences relative to a reference
272 :     # sequence:
273 :     #
274 :     # 1. Prioritize sequences for keeping, from highest to lowest scoring
275 :     # relative to reference, as measured by blast score (bits).
276 :     # When stable is set, priority is from first to last in input file
277 :     # (a reference sequence should not be supplied).
278 :     #
279 :     # 2. Based on the similarity of each sequence to the reference and save_exp,
280 :     # compute sequence-specific values of max_sim:
281 :     #
282 :     # max_sim( seq_i ) = exp( save_exp * ln( seq_i_ref_sim ) )
283 :     #
284 :     # 3. Examine the next prioritized sequence (seq1).
285 :     #
286 :     # 4. If seq1 has been vetoed, go to 7.
287 :     #
288 :     # 5. Mark seq1 to keep.
289 :     #
290 :     # 6. Use blast to find similarities of seq1 to other sequences.
291 :     #
292 :     # 7. For each similar sequence (seq2):
293 :     #
294 :     # 7a. Skip if seq2 is marked to keep, or marked for veto
295 :     #
296 :     # 7b. Compute the maximum simiarity of seq1 and seq2 for retaining seq2:
297 :     #
298 :     # max_sim_1_2 = max( max_sim, max_sim( seq1 ), max_sim( seq2 ) )
299 :     #
300 :     # 7c. If similarity of seq1 and seq2 > max_sim, veto seq2
301 :     #
302 :     # 7d. Next seq2
303 :     #
304 :     # 8. If there are more sequences to examine, go to 3.
305 :     #
306 :     # 9. Collect the sequences marked for keeping.
307 :     #
308 :     #===============================================================================
309 :    
310 : golsen 1.5 #===============================================================================
311 : golsen 1.22 # Build or add to a set of representative sequences. The difference of
312 : golsen 1.5 # rep_seq_2 and rep_seq is that rep_seq can have multiple representatives
313 : golsen 1.22 # in the blast database for a given group. This helps prevent fragmentation
314 : golsen 1.5 # of clusters.
315 :     #
316 :     # \@reps = rep_seq( \@reps, \@new, \%options );
317 :     # \@reps = rep_seq( \@new, \%options );
318 :     #
319 :     # or
320 :     #
321 :     # ( \@reps, \%representing ) = rep_seq( \@reps, \@new, \%options );
322 :     # ( \@reps, \%representing ) = rep_seq( \@new, \%options );
323 :     #
324 : golsen 1.18 # January 28, 2011:
325 :     #
326 :     # rep_seq_2() is now implimented by the option: rep_seq_2 => 1
327 :     #
328 :     # The code now allows batching of multiple blast queries to see if that
329 :     # helps cut down on process creation overhead: n_query => n (D = 64)
330 :     #
331 : golsen 1.5 #===============================================================================
332 :    
333 :     sub rep_seq
334 :     {
335 : golsen 1.17 # Are there options?
336 : golsen 1.5
337 : golsen 1.17 my $options = ( $_[-1] && ref $_[-1] eq 'HASH' ) ? pop @_ : {};
338 : golsen 1.1
339 : golsen 1.17 my ( $reps, $seqs ) = @_ < 2 ? ( [], shift ) : @_;
340 : golsen 1.5
341 : golsen 1.17 $reps && ref $reps eq 'ARRAY'
342 :     or print STDERR "Representative sequences for rep_seq() must be an ARRAY reference.\n"
343 :     and return undef;
344 :    
345 :     $seqs && ref $seqs eq 'ARRAY'
346 :     or print STDERR "Sequences for rep_seq() must be an ARRAY reference.\n"
347 :     and return undef;
348 : golsen 1.1
349 :     # ---------------------------------------# Default values for options
350 :    
351 : golsen 1.22 my $by_size = 0; # Analyze sequences in order provided
352 :     my $dup_id_ok = 0; # Warn when duplicate ids are seen
353 :     my $extra_rep = undef; # File of nonrep sequences in blastdb
354 :     my $keep_gid = [];
355 : fangfang 1.13 my $keep_id = [];
356 : golsen 1.22 my $logfile = undef; # Log file of sequences processed
357 :     my $max_e_val = 0.01; # Blast E-value to decrease output
358 :     my $max_sim = 0.80; # Retain 80% identity of less
359 :     my $n_thread = 2; # Number of threads in blastall
360 :     my $n_query = 64; # Blast sequences 64 at a time
361 :     my $rep_seq_2 = 0; # Not call to rep_seq_2;
362 :     my $sim_meas = 'identity_fraction'; # Use sequence identity as measure
363 : golsen 1.1
364 :     # Two questionable decisions:
365 :     # 1. Be painfully flexible on option names.
366 :     # 2. Silently fix bad parameter values.
367 :    
368 :     foreach ( keys %$options )
369 :     {
370 : golsen 1.5 my $value = $options->{ $_ };
371 : fangfang 1.13
372 : golsen 1.18 if ( m/by_?size/i ) # add longest to shortest
373 : fangfang 1.13 {
374 : golsen 1.22 $by_size = $value;
375 :     }
376 :     elsif ( m/dup_?id/i )
377 :     {
378 :     $dup_id_ok = $value;
379 : golsen 1.18 }
380 :     elsif ( m/keep_?gid/i )
381 :     {
382 :     $keep_gid = $value if $value && ref( $value ) eq 'ARRAY';
383 : fangfang 1.13 }
384 :     elsif ( m/keep_?id/i )
385 :     {
386 : golsen 1.18 $keep_id = $value if $value && ref( $value ) eq 'ARRAY';
387 : fangfang 1.13 }
388 :     elsif ( m/^log/i ) # logfile
389 : golsen 1.5 {
390 :     next if ! $value;
391 :     $logfile = ( ref $value eq "GLOB" ? $value : \*STDOUT );
392 :     select( ( select( $logfile ), $| = 1 )[0] ); # autoflush on
393 :     }
394 :     elsif ( m/max/i && m/sim/i ) # max(imum)_sim(ilarity)
395 :     {
396 :     $value += 0;
397 :     $value = 0 if $value < 0;
398 :     $value = 1 if $value > 1;
399 :     $max_sim = $value;
400 :     }
401 :     elsif ( m/max/i || m/[ep]_?val/i ) # Other "max" tests must come first
402 :     {
403 :     $value += 0;
404 :     $value = 0 if $value < 0;
405 :     $max_e_val = $value;
406 :     }
407 : golsen 1.22 elsif ( m/cpu/i || m/proc/i || m/thread/i )
408 :     {
409 :     $n_thread = $value || 2;
410 :     }
411 : golsen 1.18 elsif ( m/n_?quer/i )
412 :     {
413 : golsen 1.21 $n_query = $value || 64;
414 :     }
415 :     elsif ( m/^extra_?rep/i ) # File of nonrep seqs in blastdb
416 :     {
417 :     next if ! ( $value && ref $value eq "GLOB" );
418 :     $extra_rep = $value;
419 :     select( ( select( $extra_rep ), $| = 1 )[0] ); # autoflush on
420 : golsen 1.18 }
421 :     elsif ( m/^rep_seq_2$/ ) # rep_seq_2 behavior
422 :     {
423 :     $rep_seq_2 = $value;
424 :     }
425 : golsen 1.5 elsif ( m/sim/i || m/meas/i ) # sim(ilarity)_meas(ure)
426 :     {
427 :     $sim_meas = standardize_similarity_measure( $value );
428 :     }
429 :     elsif ( m/save_?te?mp/i ) # group temporary files
430 :     {
431 :     $options->{ savetmp } = 1;
432 :     }
433 :     else
434 :     {
435 : golsen 1.18 # print STDERR "WARNING: rep_seq bad option ignored: '$_' => '$value'\n";
436 : golsen 1.5 }
437 : golsen 1.1 }
438 :    
439 : golsen 1.5 my $reps2 = [];
440 : golsen 1.22 my $seqs2 = [];
441 : golsen 1.1
442 : golsen 1.5 {
443 : golsen 1.22 # Check sequence ids for duplicates (use bare block to scope $seen):
444 :    
445 :     my $seen = {};
446 :     foreach ( @$reps )
447 : golsen 1.5 {
448 : golsen 1.22 my $id = $_->[0];
449 :     if ( $seen->{ $id }++ )
450 :     {
451 :     print STDERR "Duplicate sequence id '$id' skipped by rep_seq\n" if ! $dup_id_ok;
452 :     }
453 :     else
454 :     {
455 :     push @$reps2, $_;
456 :     }
457 : golsen 1.5 }
458 : golsen 1.1
459 : golsen 1.22 my %keep_gid_hash = map { $_ => 1 } @$keep_gid;
460 :     my %keep_id_hash = map { $_ => 1 } @$keep_id;
461 : golsen 1.18
462 : golsen 1.22 # Filter sequences to be added;
463 : golsen 1.18
464 : golsen 1.22 foreach ( @$seqs )
465 : golsen 1.5 {
466 : golsen 1.22 my $id = $_->[0];
467 :     if ( $seen->{ $id }++ )
468 :     {
469 :     print STDERR "Duplicate sequence id '$id' skipped by rep_seq\n" if ! $dup_id_ok;
470 :     }
471 :     elsif ( $keep_id_hash{ $id } || ( $id =~ /^(fig\|\d+\.\d+)\./ && $keep_gid_hash{ $1 } ) )
472 :     {
473 :     push @$reps2, $_;
474 :     }
475 :     else
476 :     {
477 :     push @$seqs2, $_;
478 :     }
479 : golsen 1.5 }
480 :     }
481 : golsen 1.1
482 : golsen 1.18 #
483 :     # Do the analysis.
484 :     #
485 : golsen 1.22 # Begin by eliminating indels from the input sequences.
486 :     # pack_sequences() only rewrites sequences that will be changed.
487 : overbeek 1.14 #
488 : golsen 1.18 $reps2 = &gjoseqlib::pack_sequences( $reps2 ) || $reps2;
489 :     $seqs2 = &gjoseqlib::pack_sequences( $seqs2 ) || $seqs2;
490 : overbeek 1.14
491 : golsen 1.22 if ( $by_size == 2 )
492 :     {
493 :     my @s = sort { length( $b->[2] ) <=> length( $a->[2] ) } @$seqs2;
494 :     @$seqs2 = ();
495 :     # index of mid point, rounded down
496 :     my $i = int( (@s - 1) / 2 );
497 :     # step direction
498 :     # even => first step pos, e.g., (1,2,0,3)
499 :     # odd => first step neg, e.g., (2,1,3,0,4)
500 :     my $s = @s % 2 ? -1 : 1;
501 :     for ( my $di = 1; $di <= @s; $di++ )
502 :     {
503 :     push @$seqs2, $s[$i];
504 :     $i += $s * $di;
505 :     $s *= -1;
506 :     }
507 :     }
508 :     elsif ( $by_size )
509 : golsen 1.8 {
510 : golsen 1.18 @$seqs2 = sort { length( $b->[2] ) <=> length( $a->[2] ) } @$seqs2;
511 : golsen 1.8 }
512 :    
513 : golsen 1.5 # If no preexisting representatives, then take first sequence:
514 : golsen 1.1
515 : golsen 1.18 ( $reps2 && @$reps2 ) or ( @$reps2 = ( shift @$seqs2 ) );
516 : golsen 1.1
517 : golsen 1.5 if ( $logfile ) { foreach ( @$reps2 ) { print $logfile "$_->[0]\n" } }
518 : golsen 1.1
519 : golsen 1.5 # Search each rep sequence against itself to get max_bpp
520 : golsen 1.1
521 : golsen 1.17 my $tmp_dir = &SeedAware::location_of_tmp( $options );
522 : golsen 1.18 $tmp_dir or print STDERR "Unable to locate temporary file directory.\n"
523 : golsen 1.5 and return;
524 : golsen 1.17
525 :     my $db = SeedAware::new_file_name( "$tmp_dir/tmp_blast_db" );
526 : golsen 1.5 my $protein = are_protein( $reps2 );
527 : golsen 1.18
528 :     my %max_bpp; # Used in evaluating bit per position score
529 :     if ( $sim_meas =~ /^sc/ )
530 : golsen 1.5 {
531 : golsen 1.18 foreach my $entry ( @$reps2 )
532 :     {
533 :     $max_bpp{ $entry->[0] } = self_bpp( $db, $entry, $protein, $options );
534 :     }
535 : golsen 1.5 }
536 : golsen 1.1
537 : golsen 1.18 my $naln = $n_query + 9; # Alignments to make
538 :     my $self = 0; # Self match is never wanted
539 : golsen 1.5 my $prog = $protein ? 'blastp' : 'blastn';
540 : golsen 1.18 my $blast_opt = [ -e => $max_e_val,
541 :     -v => $naln,
542 :     -b => $naln,
543 :     -F => 'F',
544 : golsen 1.22 -a => $n_thread
545 : golsen 1.17 ];
546 : golsen 1.21 push @$blast_opt, ( -r => 1, -q => -1 ) if ! $protein;
547 : golsen 1.1
548 : golsen 1.18 # List of whom is represented by a sequence:
549 : golsen 1.1
550 : golsen 1.5 my %group = map { $_->[0] => [] } @$reps2;
551 : golsen 1.1
552 : golsen 1.5 # Groups can have more than one representative in the blast database:
553 : golsen 1.1
554 : golsen 1.21 my @new4blast = @$reps2; # initial reps
555 :     my %group_id = map { $_->[0] => $_->[0] } @$reps2; # represent self
556 : golsen 1.1
557 : golsen 1.21 # When we add multiple sequences to blast db at a time, we need to
558 : golsen 1.18 # know which are really in there as reps of groups.
559 :    
560 :     my %match_ok = map { $_->[0] => 1 } @$reps2; # hash of blast reps
561 :    
562 : golsen 1.5 # Search each sequence against the database.
563 : golsen 1.1
564 : golsen 1.18 my ( $bpp_max, $sid, $gid );
565 : golsen 1.21 my $offset = 0; # Where to start writing in blastdb
566 : golsen 1.1
567 : golsen 1.18 while ( @$seqs2 )
568 : golsen 1.5 {
569 : golsen 1.18 $n_query = @$seqs2 if @$seqs2 < $n_query; # Number to blast
570 :     my @queries = splice @$seqs2, 0, $n_query;
571 :    
572 : golsen 1.21 # Is it time to rebuild a BLAST database? This is now done by appending
573 :     # two groups of sequences: permanent additions and temporary additions.
574 : golsen 1.1
575 : golsen 1.21 if ( @new4blast || $n_query > 1 )
576 : golsen 1.5 {
577 : golsen 1.18 my $last = pop @queries;
578 : golsen 1.21 ( undef, $offset ) = update_blast_db( $db, \@new4blast, \@queries, $protein, $offset );
579 : golsen 1.18 push @queries, $last;
580 : golsen 1.21 @new4blast = ();
581 : golsen 1.5 }
582 : golsen 1.1
583 : golsen 1.22 # Do the blast analysis. Returned records are of the form:
584 : golsen 1.5 #
585 :     # 0 1 2 3 4 5 6 7 8 9 10 11
586 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
587 :     #
588 :     # $tophit = [ $score, $blast_record, $surething ]
589 : golsen 1.1
590 : golsen 1.18 my @results = top_blast_per_subject_2( $prog, $db, \@queries, $self, $blast_opt, $options );
591 : golsen 1.1
592 : golsen 1.18 foreach my $result ( @results )
593 :     {
594 :     my ( $qid, $hits ) = @$result;
595 : golsen 1.1
596 : golsen 1.18 my ( $tophit ) = sort { $b->[0] <=> $a->[0] }
597 :     map { in_group( $_, $max_sim, $sim_meas, $max_bpp{ $_->[3] } ) }
598 :     grep { $match_ok{ $_->[3] } }
599 :     @$hits;
600 :    
601 :     my $entry = shift @queries;
602 :    
603 :     # It matches an existing representative
604 :    
605 :     if ( $tophit )
606 :     {
607 :     $sid = $tophit->[1]->[3]; # id of the best matching sequence
608 :     $gid = $group_id{ $sid }; # look up representative for group
609 :     push @{ $group{ $gid } }, $qid; # add sequence to list in group
610 :     $group_id{ $qid } = $gid; # record group for id
611 :     print $logfile "$qid\t$gid\n" if $logfile;
612 :    
613 :     # Add sequence to blast database if it is not a 'surething'
614 :    
615 :     if ( ! $tophit->[2] && ! $rep_seq_2 )
616 :     {
617 : golsen 1.21 push @new4blast, $entry;
618 :     gjoseqlib::write_fasta( $extra_rep, $entry ) if $extra_rep; # db archive
619 : golsen 1.18 $match_ok{ $qid } = 1;
620 :     $max_bpp{ $qid } = self_bpp( $db, $entry, $protein, $options ) if $sim_meas =~ /^sc/;
621 :     }
622 :     }
623 : golsen 1.5
624 : golsen 1.18 # It is a new representative
625 : golsen 1.5
626 : golsen 1.18 else
627 : golsen 1.5 {
628 : golsen 1.21 push @$reps2, $entry;
629 :     push @new4blast, $entry;
630 : golsen 1.18 $match_ok{ $qid } = 1;
631 :     $group{ $qid } = [];
632 :     $group_id{ $qid } = $qid; # represent self
633 :     $max_bpp{ $qid } = self_bpp( $db, $entry, $protein, $options ) if $sim_meas =~ /^sc/;
634 :     print $logfile "$qid\n" if $logfile;
635 : golsen 1.5 }
636 :     }
637 : golsen 1.1 }
638 :    
639 : golsen 1.17 if ( $protein ) { unlink $db, "$db.psq", "$db.pin", "$db.phr" }
640 :     else { unlink $db, "$db.nsq", "$db.nin", "$db.nhr" }
641 : golsen 1.1
642 : golsen 1.5 # Return the surviving sequence entries, and optionally the hash of
643 :     # ids represented by each survivor:
644 :    
645 :     wantarray ? ( $reps2, \%group ) : $reps2;
646 :     }
647 : golsen 1.1
648 :    
649 : golsen 1.5 #===============================================================================
650 :     # Caluculate sequence similarity according to the requested measure, and return
651 : golsen 1.22 # empty list if lower than max_sim. Otherwise, return the hit and whether
652 : golsen 1.21 # the hit is really strong:
653 : golsen 1.5 #
654 :     # [ $score, $hit, $surething ] = in_group( $hit, $max_sim, $measure, $bpp_max )
655 :     # () = in_group( $hit, $max_sim, $measure, $bpp_max )
656 :     #
657 :     # $hit is a structure with blast information:
658 :     #
659 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
660 :     #
661 :     # The surething is the similarity for which $max_sim is 4 standard deviations
662 :     # lower.
663 :     #===============================================================================
664 : golsen 1.1
665 : golsen 1.5 sub in_group
666 : golsen 1.18 {
667 :     my ( $hit, $max_sim, $measure, $bpp_max ) = @_;
668 : golsen 1.1
669 : golsen 1.5 my $n = $hit->[8]; # aligned positions
670 :     return () if ( $n <= 0 );
671 : golsen 1.1
672 : golsen 1.5 my $m; # matched positions
673 : golsen 1.1
674 : golsen 1.5 if ( $measure =~ /^sc/ ) { $m = $hit->[ 6] / ( $bpp_max || 2 ) } # score/pos
675 :     elsif ( $measure =~ /^po/ ) { $m = $hit->[10] } # positives
676 :     else { $m = $hit->[ 9] } # identities
677 : golsen 1.1
678 : golsen 1.5 return () if $m < ( $max_sim * $n );
679 : golsen 1.1
680 : golsen 1.5 my $u = ( $n > $m ) ? ( $n - $m ) : 0; # differing positions
681 :     my $stddev = sqrt( $m * $u / $n );
682 :     my $conf = 4; # standard deviations for "surething"
683 :     $max_sim = 0.01 if $max_sim < 0.01;
684 :     my $surething = ( $u + $conf * $stddev ) <= ( ( 1 - $max_sim ) * $n ) ? 1 : 0;
685 : golsen 1.1
686 : golsen 1.5 [ $m/$n, $hit, $surething ]
687 : golsen 1.1 }
688 :    
689 :    
690 :     #===============================================================================
691 :     # Build or add to a set of representative sequences.
692 :     #
693 :     # \@reps = rep_seq_2( \@reps, \@new, \%options );
694 :     # \@reps = rep_seq_2( \@new, \%options );
695 :     #
696 :     # or
697 :     #
698 :     # ( \@reps, \%representing ) = rep_seq_2( \@reps, \@new, \%options );
699 :     # ( \@reps, \%representing ) = rep_seq_2( \@new, \%options );
700 :     #
701 : golsen 1.18 # Make the behavior of just one representative per group an option of
702 :     # rep_seq(), unifying the codes.
703 : golsen 1.1 #===============================================================================
704 :    
705 :     sub rep_seq_2
706 :     {
707 : golsen 1.17 # Are there options?
708 : golsen 1.1
709 : golsen 1.17 my $options = ( $_[-1] && ref $_[-1] eq 'HASH' ) ? pop @_ : {};
710 : golsen 1.18 $options->{ rep_seq_2 } = 1;
711 : golsen 1.1
712 : golsen 1.18 rep_seq( @_, $options );
713 : golsen 1.5 }
714 :    
715 :    
716 :     #===============================================================================
717 :     # Construct a representative set of related sequences:
718 :     #
719 :     # \@repseqs = representative_sequences( $ref, \@seqs, $max_sim, \%options );
720 :     #
721 :     # or
722 :     #
723 :     # ( \@repseqs, \%representing, \@low_sim ) = representative_sequences( $ref,
724 :     # \@seqs, $max_sim, \%options );
725 :     #
726 :     #===============================================================================
727 : golsen 1.21 sub representative_sequences
728 :     {
729 : golsen 1.5 my $seqs = ( shift @_ || shift @_ ); # If $ref is undef, shift again
730 :     ref( $seqs ) eq "ARRAY"
731 :     or die "representative_sequences called with bad first argument\n";
732 :    
733 :     my ( $ref, $use_ref );
734 :     if ( ! ref( $seqs->[0] ) ) # First item was sequence entry, not list of entries
735 :     {
736 :     $ref = $seqs;
737 :     $seqs = shift @_;
738 :     ref( $seqs ) eq "ARRAY"
739 :     and ref( $seqs->[0] ) eq "ARRAY"
740 :     or die "representative_sequences called with bad sequences list\n";
741 :     $use_ref = 1;
742 :     }
743 :     else # First item was list of entries, split off first
744 :     {
745 :     ref( $seqs->[0] ) eq "ARRAY"
746 :     or die "representative_sequences called with bad sequences list\n";
747 :     $ref = shift @$seqs;
748 :     $use_ref = 0;
749 :     }
750 :    
751 :     my $max_sim = shift @_;
752 :     my $options;
753 :    
754 :     # Undocumented feature: skip max_sim (D = 0.8)
755 :    
756 :     if ( ref( $max_sim ) eq "HASH" )
757 :     {
758 :     $options = $max_sim;
759 :     $max_sim = undef;
760 :     }
761 :    
762 :     # If the above did not give us options, get them now:
763 :    
764 :     $options ||= ( shift @_ ) || {};
765 :    
766 :     # ---------------------------------------# Default values for options
767 :    
768 :     $max_sim ||= 0.80; # Retain 80% identity of less
769 :     my $logfile = undef; # Log file of sequences processed
770 :     my $max_ref_sim = 0.99; # Get rid of identical sequences
771 :     my $max_e_val = 0.01; # Blast E-value to decrease output
772 :     my $sim_meas = 'identity_fraction'; # Use sequence identity as measure
773 :     my $save_exp = 1.0; # Don't retain near equivalents
774 :     my $stable = 0; # Pick reps input order
775 :    
776 :     # Two questionable decisions:
777 :     # 1. Be painfully flexible on option names.
778 :     # 2. Silently fix bad parameter values.
779 :    
780 :     foreach ( keys %$options )
781 :     {
782 :     my $value = $options->{ $_ };
783 :     if ( m/^log/i ) # logfile
784 :     {
785 :     next if ! $value;
786 :     $logfile = ( ref $value eq "GLOB" ? $value : \*STDOUT );
787 :     select( ( select( $logfile ), $| = 1 )[0] ); # autoflush on
788 :     }
789 :     elsif ( m/ref/i ) # max_ref_sim
790 :     {
791 :     $value += 0;
792 :     $value = 0 if $value < 0;
793 :     $value = 1 if $value > 1;
794 :     $max_ref_sim = $value;
795 :     }
796 :     elsif ( m/max/i && m/sim/i ) # max(imum)_sim(ilarity)
797 :     {
798 :     $value += 0;
799 :     $value = 0 if $value < 0;
800 :     $value = 1 if $value > 1;
801 :     $max_sim = $value;
802 :     }
803 :     elsif ( m/max/i || m/[ep]_?val/i ) # Other "max" tests must come first
804 :     {
805 :     $value += 0;
806 :     $value = 0 if $value < 0;
807 :     $max_e_val = $value;
808 :     }
809 :     elsif ( m/sim/i || m/meas/i ) # sim(ilarity)_meas(ure)
810 :     {
811 :     $sim_meas = standardize_similarity_measure( $value );
812 :     }
813 :     elsif ( m/sav/i || m/exp/i ) # save_exp(onent)
814 :     {
815 :     $value += 0;
816 :     $value = 0 if $value < 0;
817 :     $value = 1 if $value > 1;
818 :     $save_exp = $value;
819 :     }
820 :     elsif ( m/stab/i ) # stable order
821 :     {
822 :     $stable = $value ? 1 : 0;
823 :     }
824 :     else
825 :     {
826 : golsen 1.18 # print STDERR "WARNING: representative_sequences bad option ignored: '$_' => '$value'\n";
827 : golsen 1.5 }
828 :     }
829 :    
830 : golsen 1.22 # Silent sanity check. This should not happen, as it is almost equivalent
831 : golsen 1.5 # to making no reference sequence.
832 :    
833 :     $max_ref_sim = $max_sim if ( $max_ref_sim < $max_sim );
834 :    
835 :     # Do the analysis
836 : overbeek 1.14 # Begin by eliminating indels from the input sequences
837 :     #
838 :     ($ref) = &gjoseqlib::pack_sequences($ref);
839 :     $seqs = &gjoseqlib::pack_sequences($seqs);
840 : golsen 1.5
841 :     my $ref_id = $ref->[0];
842 :    
843 :     # Build a list of the ids (without ref) and an index for the sequence entries:
844 :    
845 :     my @seq_id = map { $_->[0] } @$seqs;
846 :     my $seq_ind = { map { @{$_}[0] => $_ } ( $ref, @$seqs ) };
847 :    
848 :     # Make a lookup table of the sequence number, for use in reording
849 :     # sequences later:
850 :    
851 :     my $n = 0;
852 :     my %ord = ( map { @$_[0] => ++$n } @$seqs );
853 :    
854 :     # Build blast database (it includes the reference):
855 :    
856 :     my $protein = are_protein( $seqs );
857 : golsen 1.17
858 :     my $tmp_dir = &SeedAware::location_of_tmp( $options );
859 : golsen 1.5 $tmp_dir or print STDERR "Unable to locate temporary file directory\n"
860 :     and return;
861 : golsen 1.17
862 :     my $db = SeedAware::new_file_name( "$tmp_dir/tmp_blast_db" );
863 : golsen 1.5 make_blast_db( $db, [ $ref, @$seqs ], $protein );
864 :    
865 :     # Search query against new database
866 :    
867 :     my $max = 3 * @$seqs; # Alignments to keep
868 :     my $self = 1; # Keep self match (for its bit score)
869 :    
870 : golsen 1.18 my $blast_opt = [ -e => $max_e_val,
871 :     -v => $max,
872 :     -b => $max,
873 : golsen 1.20 -F => 'f',
874 : golsen 1.18 -a => 2
875 : golsen 1.17 ];
876 : golsen 1.5
877 : golsen 1.22 # Do the blast analysis. Returned records are of the form:
878 : golsen 1.5 #
879 :     # 0 1 2 3 4 5 6 7 8 9 10 11
880 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
881 :    
882 :     my $prog = $protein ? 'blastp' : 'blastn';
883 : golsen 1.17 push @$blast_opt, qw( -r 1 -q -1 ) if ! $protein;
884 : golsen 1.18 my @ref_hits = top_blast_per_subject( $prog, $db, $ref, $self, $blast_opt, $options );
885 : golsen 1.5
886 :     # First hit is always a perfect match, so we get bits per position:
887 :     # This is only used if the measure is bits per position
888 :    
889 :     my $ref_bpp = $ref_hits[0]->[6] / $ref_hits[0]->[8];
890 :    
891 :     # Remove self match (might not be first if there are identical sequences):
892 :    
893 :     my %hit = ();
894 :     @ref_hits = grep { my $sid = $_->[3]; $hit{ $sid } = 1; ( $sid ne $ref_id ) } @ref_hits;
895 :    
896 :     my %group = ();
897 :     $group{ $ref_id } = [];
898 :     my %veto = ();
899 :     my $n_to_do = @ref_hits;
900 :     my $rebuild_d_n = 40;
901 :     my $last_rebuild = 1.5 * $rebuild_d_n;
902 :     my $rebuild = ( $n_to_do > $last_rebuild ) ? $n_to_do - $rebuild_d_n : 0;
903 :    
904 :     # Sequence-specific maximum similarities:
905 :    
906 :     my %max_sim = map { ( $_ => $max_sim ) } @seq_id;
907 :    
908 :     foreach ( @ref_hits )
909 :     {
910 :     my $id = $_->[3];
911 :     my $sim = seq_similarity( $_, $sim_meas, $ref_bpp );
912 :    
913 :     if ( $sim > ( $use_ref ? $max_ref_sim : $max_sim ) )
914 :     {
915 :     $veto{ $id } = 1;
916 :     push @{ $group{ $ref_id } }, $id; # Log the sequences represented
917 :     $n_to_do--;
918 :     }
919 :     else
920 :     {
921 :     my $max_sim_i = exp( $save_exp * log( $sim ) );
922 :     $max_sim{ $id } = $max_sim_i if ( $max_sim_i > $max_sim );
923 :     }
924 :     }
925 :    
926 :    
927 :     if ( $logfile )
928 :     {
929 :     print $logfile join( "\t", $ref_id, @{ $group{ $ref_id } } ), "\n";
930 :     }
931 :    
932 :     # Search each sequence against the database.
933 :     # If the order is to be stable, reorder hits to match input order.
934 :    
935 :     my ( $id1, $seq1, $max_sim_1, $id2, $max_sim_2, $bpp_max );
936 :     my @ids_to_do = map { $_->[3] } @ref_hits;
937 :     @ids_to_do = sort { $ord{ $a } <=> $ord{ $b } } @ids_to_do if $stable;
938 :    
939 :     while ( $id1 = shift @ids_to_do )
940 :     {
941 :     next if $veto{ $id1 };
942 :    
943 :     # Is it time to rebuild a smaller BLAST database? This helps
944 :     # significantly in the overall performance.
945 :    
946 :     if ( $n_to_do <= $rebuild )
947 :     {
948 :     if ( $protein ) { unlink $db, "$db.psq", "$db.pin", "$db.phr" }
949 :     else { unlink $db, "$db.nsq", "$db.nin", "$db.nhr" }
950 :     make_blast_db( $db, [ map { $seq_ind->{ $_ } } # id to sequence entry
951 :     grep { ! $veto{ $_ } } # id not vetoed
952 :     ( $id1, @ids_to_do ) # remaining ids
953 :     ],
954 :     $protein
955 :     );
956 :     $rebuild = ( $n_to_do > $last_rebuild ) ? $n_to_do - $rebuild_d_n : 0;
957 :     }
958 :    
959 :     $n_to_do--;
960 :     $group{ $id1 } = [];
961 :    
962 :     $max_sim_1 = $max_sim{ $id1 };
963 :     $bpp_max = undef;
964 : golsen 1.18 foreach ( top_blast_per_subject( $prog, $db, $seq_ind->{$id1}, $self, $blast_opt, $options ) )
965 : golsen 1.5 {
966 :     $bpp_max ||= $_->[6] / $_->[8];
967 :     $id2 = $_->[3];
968 :     next if ( $veto{ $id2 } || $group{ $id2 } );
969 :     $max_sim_2 = $max_sim{ $id2 };
970 :     $max_sim_2 = $max_sim_1 if ( $max_sim_1 > $max_sim_2 );
971 :     if ( seq_similarity( $_, $sim_meas, $bpp_max ) > $max_sim_2 )
972 :     {
973 :     $veto{ $id2 } = 1;
974 :     push @{ $group{ $id1 } }, $id2; # Log the sequences represented
975 :     $n_to_do--;
976 :     }
977 :     }
978 :    
979 :     if ( $logfile )
980 :     {
981 :     print $logfile join( "\t", $id1, @{ $group{ $id1 } } ), "\n";
982 :     }
983 :     }
984 :    
985 : golsen 1.17 if ( $protein ) { unlink $db, "$db.psq", "$db.pin", "$db.phr" }
986 :     else { unlink $db, "$db.nsq", "$db.nin", "$db.nhr" }
987 : golsen 1.5
988 :     # Return the surviving sequence entries, and optionally the hash of
989 :     # ids represented by each survivor:
990 :    
991 :     my $kept = [ $ref, grep { $group{ $_->[0] } } @$seqs ];
992 :    
993 :     wantarray ? ( $kept, \%group, [ grep { ! $hit{ $_->[0] } } @$seqs ] ) : $kept;
994 : golsen 1.1 }
995 :    
996 :    
997 :     #===============================================================================
998 :     # Try to figure out the sequence similarity measure that is being requested:
999 :     #
1000 :     # $type = standardize_similarity_measure( $requested_type )
1001 :     #
1002 :     #===============================================================================
1003 :    
1004 :     sub standardize_similarity_measure
1005 :     { my ( $req_meas ) = @_;
1006 :     return ( ! $req_meas ) ? 'identity_fraction'
1007 :     : ( $req_meas =~ /id/i ) ? 'identity_fraction'
1008 :     : ( $req_meas =~ /sc/i ) ? 'score_per_position'
1009 :     : ( $req_meas =~ /spp/i ) ? 'score_per_position'
1010 :     : ( $req_meas =~ /bit/i ) ? 'score_per_position'
1011 :     : ( $req_meas =~ /bpp/i ) ? 'score_per_position'
1012 :     : ( $req_meas =~ /tiv/i ) ? 'positive_fraction'
1013 :     : ( $req_meas =~ /pos_/i ) ? 'positive_fraction'
1014 :     : ( $req_meas =~ /ppp/i ) ? 'positive_fraction'
1015 :     : 'identity_fraction';
1016 :     }
1017 :    
1018 :    
1019 :     #===============================================================================
1020 :     # Caluculate sequence similarity according to the requested measure:
1021 :     #
1022 :     # $similarity = seq_similarity( $hit, $measure, $bpp_max )
1023 :     #
1024 :     # $hit is a structure with blast information:
1025 :     #
1026 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
1027 :     #===============================================================================
1028 :    
1029 :     sub seq_similarity
1030 :     { my ( $hit, $measure, $bpp_max ) = @_;
1031 :     return ( @$hit < 11 ) ? undef
1032 :     : ( $measure =~ /^sc/ ) ? $hit->[ 6] / ( $hit->[8] * ( $bpp_max || 2 ) )
1033 :     : ( $measure =~ /^po/ ) ? $hit->[10] / $hit->[8]
1034 :     : $hit->[ 9] / $hit->[8]
1035 :     }
1036 :    
1037 :    
1038 :     #===============================================================================
1039 :     # Caluculate self similarity of a sequence in bits per position:
1040 :     #
1041 : golsen 1.22 # $max_bpp = self_bpp( $db_name, $entry, $protein, $options )
1042 : golsen 1.1 #
1043 :     #===============================================================================
1044 :    
1045 :     sub self_bpp
1046 :     {
1047 : golsen 1.18 my ( $db, $entry, $protein, $options ) = @_;
1048 : golsen 1.22 return 2.0;
1049 : golsen 1.1
1050 :     # Build blast database:
1051 :    
1052 :     make_blast_db( $db, [ $entry ], $protein );
1053 :    
1054 :     # Search sequence against the database
1055 :    
1056 : golsen 1.18 my $self = 1; # Self match is what we need
1057 : golsen 1.1
1058 :     my $prog = $protein ? 'blastp' : 'blastn';
1059 : golsen 1.18 my $blast_opt = [ -v => 1,
1060 :     -b => 1,
1061 : golsen 1.22 -a => 2,
1062 :     -F => 'f'
1063 : golsen 1.18 ];
1064 :     push @$blast_opt, ( -r => 1, -q => -1 ) if ! $protein;
1065 : golsen 1.1
1066 : golsen 1.22 # Do the blast analysis. Returned records are of the form:
1067 : golsen 1.1 #
1068 :     # 0 1 2 3 4 5 6 7 8 9 10 11
1069 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
1070 :    
1071 : golsen 1.18 my ( $hit ) = top_blast_per_subject( $prog, $db, $entry, $self, $blast_opt, $options );
1072 : golsen 1.1 # print STDERR join( ", ", @$hit ), "\n";
1073 :    
1074 :     # First hit is always a perfect match, so we get bits per position:
1075 :     # This is only used if the measure is bits per position
1076 : golsen 1.5
1077 : golsen 1.1 $hit->[6] / $hit->[8];
1078 :     }
1079 :    
1080 :    
1081 :     #===============================================================================
1082 : golsen 1.21 # Make or update a blast databse from a set of sequence entries.
1083 : golsen 1.1 #
1084 : golsen 1.21 # $status = make_blast_db( $dbname, \@seq_entries, $is_prot, $offset )
1085 :     # ( $status, $offset ) = make_blast_db( $dbname, \@seq_entries, $is_prot, $offset )
1086 : golsen 1.1 #
1087 :     # Sequence entries have the form: [ $id, $def, $seq ]
1088 : golsen 1.21 #
1089 :     # If offset is supplied, the new data are appended starting at $offset. If
1090 :     # is desirable to add some sequences permanently and some temporarily,
1091 :     # see update_blast_db().
1092 : golsen 1.1 #===============================================================================
1093 :    
1094 :     sub make_blast_db
1095 : golsen 1.18 {
1096 : golsen 1.21 my ( $db, $seqs, $is_prot, $offset ) = @_;
1097 : golsen 1.1
1098 : golsen 1.18 my $formatdb = &SeedAware::executable_for( 'formatdb' )
1099 : golsen 1.17 or print STDERR "Could not find exectuable file for 'formatdb'.\n"
1100 : golsen 1.21 and return wantarray ? ( 0, 0 ) : 0;
1101 :    
1102 :     $seqs && ref $seqs eq 'ARRAY' && @$seqs
1103 :     or print STDERR "No sequences.\n"
1104 :     and return wantarray ? ( 0, 0 ) : 0;
1105 :    
1106 :     $db or print STDERR "Bad database file name '$db'.\n"
1107 :     and return wantarray ? ( 0, 0 ) : 0;
1108 :    
1109 :     open( DB, $offset ? '+<' : '>', $db )
1110 :     or print STDERR "Failed to open '$db'.\n"
1111 :     and return wantarray ? ( 0, 0 ) : 0;
1112 :    
1113 :     if ( $offset ) { truncate( DB, $offset ); seek( DB, 0, 2 ) }
1114 :     gjoseqlib::write_fasta( \*DB, $seqs );
1115 :     $offset = tell( DB );
1116 :     close( DB );
1117 :    
1118 :     my @param = ( -i => $db, -p => ( $is_prot ? 'T' : 'F' ) );
1119 :    
1120 :     my $status = ! system( $formatdb, @param );
1121 :    
1122 :     wantarray ? ( $status, $offset ) : $status;
1123 :     }
1124 :    
1125 :    
1126 :     #-------------------------------------------------------------------------------
1127 :     # Make or update a blast databse from two sets of sequence entries, the first
1128 :     # of which are considered permanent additions, and the second of which are
1129 :     # considered temporary additions.
1130 :     #
1131 :     # $status = update_blast_db( $dbname, \@perm_entries, \@temp_entries, $is_prot, $offset )
1132 :     # ( $status, $offset ) = update_blast_db( $dbname, \@perm_entries, \@temp_entries, $is_prot, $offset )
1133 :     #
1134 :     # $offset denotes the end of the permanent sequences.
1135 :     #-------------------------------------------------------------------------------
1136 :    
1137 :     sub update_blast_db
1138 :     {
1139 :     my ( $db, $perm_seqs, $temp_seqs, $is_prot, $offset ) = @_;
1140 :    
1141 :     my $formatdb = &SeedAware::executable_for( 'formatdb' )
1142 :     or print STDERR "Could not find exectuable file for 'formatdb'.\n"
1143 :     and return wantarray ? ( 0, 0 ) : 0;
1144 :    
1145 :     $perm_seqs = [] unless ( $perm_seqs && ref $perm_seqs eq 'ARRAY' );
1146 :     $temp_seqs = [] unless ( $temp_seqs && ref $temp_seqs eq 'ARRAY' );
1147 :    
1148 :     @$perm_seqs || @$temp_seqs
1149 :     or print STDERR "No sequences.\n"
1150 :     and return wantarray ? ( 0, $offset ) : 0;
1151 : golsen 1.17
1152 :     $db or print STDERR "Bad database file name '$db'.\n"
1153 : golsen 1.21 and return wantarray ? ( 0, 0 ) : 0;
1154 :    
1155 :     open( DB, $offset ? '+<' : '>', $db )
1156 :     or print STDERR "Failed to open '$db'.\n"
1157 :     and return wantarray ? ( 0, 0 ) : 0;
1158 : golsen 1.17
1159 : golsen 1.21 if ( $offset ) { truncate( DB, $offset ); seek( DB, 0, 2 ) }
1160 :     gjoseqlib::write_fasta( \*DB, $perm_seqs ) if @$perm_seqs;
1161 :     $offset = tell( DB );
1162 :     gjoseqlib::write_fasta( \*DB, $temp_seqs ) if @$temp_seqs;
1163 :     close( DB );
1164 : golsen 1.17
1165 : golsen 1.21 my @param = ( -i => $db, -p => ( $is_prot ? 'T' : 'F' ) );
1166 : golsen 1.17
1167 : golsen 1.21 my $status = ! system( $formatdb, @param );
1168 : golsen 1.1
1169 : golsen 1.21 wantarray ? ( $status, $offset ) : $status;
1170 : golsen 1.1 }
1171 :    
1172 :    
1173 :     #===============================================================================
1174 :     # The type of data (protein or nucleic acid) is quessed from the sequences.
1175 :     #
1176 :     # are_protein( \@seq_entries )
1177 :     #
1178 :     # Sequence entries have the form: [ $id, $def, $seq ]
1179 :     #===============================================================================
1180 :    
1181 :     sub are_protein
1182 : golsen 1.18 {
1183 :     my ( $seqs ) = @_;
1184 : golsen 1.1 my ( $nt, $aa ) = ( 0, 0 );
1185 :     foreach ( @$seqs )
1186 :     {
1187 : golsen 1.5 my $s = $_->[2];
1188 :     $nt += $s =~ tr/ACGTacgt//d;
1189 :     $aa += $s =~ tr/A-Za-z//d;
1190 : golsen 1.1 }
1191 :     ( $nt < 3 * $aa ) ? 1 : 0;
1192 :     }
1193 :    
1194 :    
1195 :     #===============================================================================
1196 :     # Blast a subject against a datbase, saving only top hit per subject
1197 :     #
1198 :     # Return:
1199 :     #
1200 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
1201 :     #
1202 :     #===============================================================================
1203 :    
1204 : golsen 1.17 sub top_blast_per_subject
1205 : golsen 1.18 {
1206 :     my $opts = $_[-1] && ref $_[-1] eq 'HASH' ? pop : {};
1207 : golsen 1.17
1208 : golsen 1.18 my ( $prog, $db, $query, $self, $blast_opt, $sort, $no_merge ) = @_;
1209 :    
1210 :     my $tmp_dir = &SeedAware::location_of_tmp( $opts );
1211 : golsen 1.17 $tmp_dir
1212 :     or print STDERR "Unable to locate temporary file directory.\n"
1213 :     and return;
1214 : golsen 1.1
1215 : golsen 1.18 my $blastall = &SeedAware::executable_for( 'blastall' )
1216 : golsen 1.17 or print STDERR "Could not find exectuable file for 'blastall'.\n"
1217 :     and return 0;
1218 : golsen 1.1
1219 : golsen 1.18 my $query_file = &SeedAware::new_file_name( "$tmp_dir/tmp_blast_query", '.seq' );
1220 : golsen 1.1
1221 : golsen 1.21 gjoseqlib::write_fasta( $query_file, [ $query ] );
1222 : golsen 1.1
1223 : golsen 1.17 $blast_opt ||= [];
1224 : golsen 1.20 my @blast_cmd = ( $blastall,
1225 :     -p => $prog,
1226 :     -d => $db,
1227 :     -i => $query_file,
1228 :     @$blast_opt
1229 :     );
1230 : golsen 1.5
1231 : golsen 1.17 open( BPIPE, '-|', @blast_cmd ) or die "Could not open blast pipe\n";
1232 : golsen 1.11 my $sims = integrate_blast_segments( \*BPIPE, $sort, $no_merge, $self );
1233 :     close BPIPE;
1234 : golsen 1.17 unlink $query_file;
1235 : golsen 1.1
1236 :     my $pq = ""; # Previous query id
1237 :     my $ps = ""; # Previous subject id
1238 :     my $keep;
1239 :    
1240 :     grep { $keep = ( $pq ne $_->[0] ) || ( $ps ne $_->[3] );
1241 :     $pq = $_->[0];
1242 :     $ps = $_->[3];
1243 :     $keep && ( $self || ( $pq ne $ps ) );
1244 :     } @$sims;
1245 :     }
1246 :    
1247 :    
1248 :     #===============================================================================
1249 : golsen 1.18 # Blast queries against a datbase, saving only top hit per subject
1250 :     #
1251 :     # Return:
1252 :     #
1253 :     # ( [ qid, hits ], ... )
1254 :     #
1255 :     # hits = [ [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ],
1256 :     # ...
1257 :     # ]
1258 :     #
1259 :     #===============================================================================
1260 :    
1261 :     sub top_blast_per_subject_2
1262 :     {
1263 :     my $opts = $_[-1] && ref $_[-1] eq 'HASH' ? pop : {};
1264 :    
1265 :     my ( $prog, $db, $queries, $self, $blast_opt, $sort, $no_merge ) = @_;
1266 :    
1267 :     my $tmp_dir = &SeedAware::location_of_tmp( $opts );
1268 :     $tmp_dir
1269 :     or print STDERR "Unable to locate temporary file directory.\n"
1270 :     and return;
1271 :    
1272 :     my $blastall = &SeedAware::executable_for( 'blastall' )
1273 :     or print STDERR "Could not find exectuable file for 'blastall'.\n"
1274 :     and return 0;
1275 :    
1276 :     my $query_file = &SeedAware::new_file_name( "$tmp_dir/tmp_blast_query", '.seq' );
1277 :    
1278 : golsen 1.21 gjoseqlib::write_fasta( $query_file, $queries );
1279 : golsen 1.18
1280 :     $blast_opt ||= [];
1281 :     my @cmd = ( $blastall,
1282 :     -p => $prog,
1283 :     -d => $db,
1284 :     -i => $query_file,
1285 : golsen 1.20 -F => 'f',
1286 : golsen 1.18 @$blast_opt
1287 :     );
1288 :    
1289 :     my $redirect = { stderr => '/dev/null' };
1290 :     my $pipe = SeedAware::read_from_pipe_with_redirect( @cmd, $redirect )
1291 :     or die "Could not open blast pipe\n";
1292 :     my $sims = integrate_blast_segments( $pipe, $sort, $no_merge, $self );
1293 :     close $pipe;
1294 :    
1295 :     unlink $query_file;
1296 :    
1297 :     my @qids = map { $_->[0] } @$queries;
1298 :     my %qhits = map { $_ => [] } @qids;
1299 :     my %seen;
1300 :     foreach ( @$sims )
1301 :     {
1302 :     my $qid = $_->[0];
1303 :     my $sid = $_->[3];
1304 :     next if $seen{ "$qid\t$sid" }++;
1305 :     next if $qid eq $sid && ! $self;
1306 :     push @{ $qhits{ $qid } }, $_;
1307 :     }
1308 :    
1309 :     map { [ $_, $qhits{ $_ } ] } @qids;
1310 :     }
1311 :    
1312 :    
1313 :     #===============================================================================
1314 : golsen 1.1 # Read output of rationalize blast and assemble minimally overlapping segments
1315 : golsen 1.22 # into a total score for each subject sequence. For each query, sort matches
1316 : golsen 1.1 # into user-chosen order (D = total score):
1317 :     #
1318 : golsen 1.18 # @sims = integrate_blast_segments_0( \*FILEHANDLE, $sort_order, $no_merge )
1319 :     # \@sims = integrate_blast_segments_0( \*FILEHANDLE, $sort_order, $no_merge )
1320 : golsen 1.1 #
1321 :     # Allowed sort orders are 'score', 'score_per_position', 'identity_fraction',
1322 :     # and 'positive_fraction' (matched very flexibly).
1323 :     #
1324 :     # Returned sims (e_val is only for best HSP, not any composite):
1325 :     #
1326 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
1327 :     #
1328 :     # There is a strategic decision to not read the blast output from memory;
1329 : golsen 1.22 # it could be enormous. This cuts the flexibility some.
1330 : golsen 1.1 #===============================================================================
1331 :     #
1332 :     # coverage fields:
1333 :     #
1334 :     # [ scr, e_val, n_mat, n_id, n_pos, n_gap, dir, [ intervals_covered ] ]
1335 :     #
1336 :     #===============================================================================
1337 :    
1338 : golsen 1.18 sub integrate_blast_segments_0
1339 :     {
1340 :     my ( $fh, $order, $no_merge, $self ) = @_;
1341 : golsen 1.1 $fh ||= \*STDIN;
1342 :     ( ref( $fh ) eq "GLOB" ) || die "integrate_blast_segments called without a filehandle\n";
1343 :    
1344 :     $order = ( ! $order ) ? 'score'
1345 :     : ( $order =~ /sc/i ) ? ( $order =~ /p/i ? 'score_per_position' : 'score' )
1346 :     : ( $order =~ /bit/i ) ? ( $order =~ /p/i ? 'score_per_position' : 'score' )
1347 :     : ( $order =~ /spp/i ) ? 'score_per_position'
1348 :     : ( $order =~ /id/i ) ? 'identity_fraction'
1349 :     : ( $order =~ /tiv/i ) ? 'positive_fraction'
1350 :     : 'score';
1351 :    
1352 :     my $max_frac_overlap = 0.2;
1353 :    
1354 :     my ( $qid, $qdef, $qlen, $sid, $sdef, $slen );
1355 :     my ( $scr, $e_val, $n_mat, $n_id, $n_pos, $n_gap );
1356 :     my ( $ttl_scr, $ttl_mat, $ttl_id, $ttl_pos, $ttl_gap );
1357 : golsen 1.18 my @sims = ();
1358 : golsen 1.1 my @qsims = ();
1359 :     my $coverage = undef;
1360 :     my $record;
1361 :    
1362 : golsen 1.19 local $_;
1363 :     while ( defined( $_ = next_blast_record( $fh, $self ) ) )
1364 : golsen 1.1 {
1365 : golsen 1.5 chomp;
1366 :     if ( $_->[0] eq 'Query=' )
1367 :     {
1368 :     if ( $coverage )
1369 :     {
1370 :     push @qsims, [ $sid, $sdef, $slen, @$coverage[ 0 .. 5 ] ];
1371 :     $coverage = undef;
1372 :     }
1373 :     if ( @qsims ) { push @sims, order_query_sims( $qid, $qdef, $qlen, \@qsims, $order ) }
1374 :     ( undef, $qid, $qdef, $qlen ) = @$_;
1375 :     $sid = undef;
1376 :     @qsims = ();
1377 :     }
1378 :     elsif ( $_->[0] eq '>' )
1379 :     {
1380 :     if ( $coverage )
1381 :     {
1382 :     push @qsims, [ $sid, $sdef, $slen, @$coverage[ 0 .. 5 ] ];
1383 :     $coverage = undef;
1384 :     }
1385 :     next if ! $qid;
1386 :     ( undef, $sid, $sdef, $slen ) = @$_;
1387 :     }
1388 :     elsif ( $_->[0] eq 'HSP' && $sid )
1389 :     {
1390 : golsen 1.18 shift @$_; # discard HSP
1391 : golsen 1.5 $coverage = integrate_HSP( $coverage, $_, $max_frac_overlap, $no_merge );
1392 :     }
1393 : golsen 1.1 }
1394 :    
1395 :     if ( $coverage ) { push @qsims, [ $sid, $sdef, $slen, @$coverage[ 0 .. 5 ] ] }
1396 :    
1397 :     if ( @qsims ) { push @sims, order_query_sims( $qid, $qdef, $qlen, \@qsims, $order ) }
1398 :    
1399 :     wantarray ? @sims : \@sims;
1400 :     }
1401 :    
1402 :    
1403 :     #===============================================================================
1404 : golsen 1.18 # Read blast output and assemble minimally overlapping segments into a total
1405 : golsen 1.22 # for each subject sequence. For each query, sort matches into user-chosen
1406 : golsen 1.18 # order (D = total score):
1407 :     #
1408 :     # @sims = integrate_blast_segments( \*FILEHANDLE, $sort_order, $no_merge )
1409 :     # \@sims = integrate_blast_segments( \*FILEHANDLE, $sort_order, $no_merge )
1410 :     #
1411 :     # Allowed sort orders are 'score', 'score_per_position', 'identity_fraction',
1412 :     # and 'positive_fraction' (matched very flexibly).
1413 :     #
1414 :     # Returned sims (e_val is only for best HSP, not any composite):
1415 :     #
1416 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
1417 :     #
1418 :     # There is a strategic decision to not read the blast output from memory;
1419 : golsen 1.22 # it could be enormous. This cuts the flexibility some.
1420 : golsen 1.18 #===============================================================================
1421 :     #
1422 :     # coverage fields:
1423 :     #
1424 :     # [ scr, e_val, n_mat, n_id, n_pos, n_gap, dir, [ intervals_covered ] ]
1425 :     #
1426 :     #===============================================================================
1427 :    
1428 :     sub integrate_blast_segments
1429 :     {
1430 :     my ( $fh, $order, $no_merge, $self ) = @_;
1431 :    
1432 :     $fh ||= \*STDIN;
1433 :     ( ref( $fh ) eq "GLOB" ) || die "integrate_blast_segments called without a filehandle\n";
1434 :    
1435 :     $order = ( ! $order ) ? 'score'
1436 :     : ( $order =~ /sc/i ) ? ( $order =~ /p/i ? 'score_per_position' : 'score' )
1437 :     : ( $order =~ /bit/i ) ? ( $order =~ /p/i ? 'score_per_position' : 'score' )
1438 :     : ( $order =~ /spp/i ) ? 'score_per_position'
1439 :     : ( $order =~ /id/i ) ? 'identity_fraction'
1440 :     : ( $order =~ /tiv/i ) ? 'positive_fraction'
1441 :     : 'score';
1442 :    
1443 :     my $max_frac_overlap = 0.2;
1444 :    
1445 :     my @sims = ();
1446 :     my $qdata;
1447 :     while ( defined( $qdata = next_blast_query( $fh, $self ) ) )
1448 :     {
1449 :     my ( $qid, $qdef, $qlen, $qmatch ) = @$qdata;
1450 :     my @qsims = ();
1451 :     foreach my $sdata ( @$qmatch )
1452 :     {
1453 :     my ( $sid, $sdef, $slen, $smatch ) = @$sdata;
1454 :     my $coverage = undef;
1455 :     foreach my $hsp ( @$smatch )
1456 :     {
1457 :     $coverage = integrate_HSP( $coverage, $hsp, $max_frac_overlap, $no_merge );
1458 :     }
1459 :    
1460 :     push @qsims, [ $sid, $sdef, $slen, @$coverage[ 0 .. 5 ] ];
1461 :     }
1462 :    
1463 :     push @sims, order_query_sims( $qid, $qdef, $qlen, \@qsims, $order ) if @qsims;
1464 :     }
1465 :    
1466 :     wantarray ? @sims : \@sims;
1467 :     }
1468 :    
1469 :    
1470 :     #===============================================================================
1471 : golsen 1.1 #
1472 : golsen 1.22 # Try to integrate non-conflicting HSPs for the same subject sequence. The
1473 : golsen 1.1 # conflicts are only assessed from the standpoint of the query, at least for
1474 : golsen 1.22 # now. We could track the subject sequence coverage as well (to avoid a direct
1475 : golsen 1.1 # repeat in the query from matching the same subject twice).
1476 :     #
1477 :     # $new_coverage = integrate_HSP( $coverage, $hsp, $max_frac_overlap, $no_merge )
1478 :     #
1479 :     # 0 1 2 3 4 5 6 7
1480 :     # $coverage = [ scr, e_val, n_mat, n_id, n_pos, n_gap, dir, [ intervals_covered ] ]
1481 :     #
1482 :     # $coverage should be undefined at the first call; the function intiallizes
1483 : golsen 1.22 # all of the fields from the first HSP. scr, n_mat, n_id, n_pos, and n_gap
1484 :     # are sums over the combined HSPs. e_val is based only of the first HSP.
1485 : golsen 1.1 #
1486 : golsen 1.18 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
1487 :     # $hsp = [ scr, e_val, n_seg, e_val2, n_mat, n_id, n_pos, n_gap, dir, s1, e1, sq1, s2, e2, sq2 ]
1488 : golsen 1.1 #
1489 :     # $max_frac_overlap Amount of the new HSP that is allowed to overlap already
1490 :     # incorporated HSPs
1491 :     #
1492 : golsen 1.22 # $no_merge Disable the merging of multiple HSPs. The structure will
1493 : golsen 1.1 # be filled in from the first HSP and left unchanged though
1494 : golsen 1.22 # subsequence calls. This simplifies the program structure.
1495 : golsen 1.1 #
1496 :     # Fitting a new HSP into covered intervals:
1497 :     #
1498 :     # 1 qlen
1499 :     # |---------------------------------------------------------------| query
1500 :     # ------------ --------------- covered
1501 :     # ------------- new match
1502 :     # l r
1503 :     #
1504 :     #===============================================================================
1505 :    
1506 :     sub integrate_HSP
1507 : golsen 1.18 {
1508 :     my ( $coverage, $hsp, $max_frac_overlap, $no_merge ) = @_;
1509 :    
1510 :     my ( $scr, $e_val, undef, undef, $n_mat, $n_id, $n_pos, $n_gap, $dir, $s1, $e1 ) = @$hsp;
1511 : golsen 1.1
1512 :     # Ignore frame; just use direction of match:
1513 :    
1514 :     $dir = substr( $dir, 0, 1 );
1515 :    
1516 :     # Orient by left and right ends:
1517 :    
1518 :     my ( $l, $r ) = ( $e1 > $s1 ) ? ( $s1, $e1 ) : ( $e1, $s1 );
1519 :    
1520 :     # First HSP for the subject sequence:
1521 :    
1522 :     if ( ! $coverage )
1523 :     {
1524 : golsen 1.5 return [ $scr, $e_val, $n_mat, $n_id, $n_pos, $n_gap, $dir, [ [ $s1, $e1 ] ] ];
1525 : golsen 1.1 }
1526 :    
1527 :     # Not first; must be same direction to combine (also test no_merge here):
1528 :    
1529 :     return $coverage if ( $no_merge || ( $dir ne $coverage->[6] ) );
1530 :    
1531 :     # Not first; must fall in a gap of query sequence coverage:
1532 :    
1533 :     my @intervals = @{ $coverage->[7] };
1534 :     my $max_overlap = $max_frac_overlap * ( $r - $l + 1 );
1535 :     my $prev_end = 0;
1536 :     my $next_beg = $intervals[0]->[0];
1537 :     my @used = ();
1538 :     while ( $next_beg <= $l ) # *** Sequential search could be made binary
1539 :     {
1540 : golsen 1.5 $prev_end = $intervals[0]->[1];
1541 :     push @used, scalar shift @intervals;
1542 :     $next_beg = @intervals ? $intervals[0]->[0] : 1e10;
1543 : golsen 1.1 }
1544 :    
1545 :     my $overlap = ( ( $l <= $prev_end ) ? ( $prev_end - $l + 1 ) : 0 )
1546 :     + ( ( $r >= $next_beg ) ? ( $r - $next_beg + 1 ) : 0 );
1547 :     return $coverage if ( $overlap > $max_overlap );
1548 :    
1549 : golsen 1.22 # Okay, we have passed the overlap test. We need to integrate the
1550 :     # match into the coverage description. Yes, I know that this counts
1551 :     # the overlap region. We could pro rate it, but that is messy too:
1552 : golsen 1.1
1553 :     $coverage->[0] += $scr;
1554 :     $coverage->[2] += $n_mat;
1555 :     $coverage->[3] += $n_id;
1556 :     $coverage->[4] += $n_pos;
1557 :     $coverage->[5] += $n_gap;
1558 :    
1559 :     # Refigure the covered intervals, fusing intervals separated by a
1560 :     # gap of less than 10:
1561 :    
1562 :     my $min_gap = 10;
1563 :     if ( $l <= $prev_end + $min_gap )
1564 :     {
1565 : golsen 1.5 if ( @used ) { $l = $used[-1]->[0]; pop @used }
1566 :     else { $l = 1 }
1567 : golsen 1.1 }
1568 :     if ( $r >= $next_beg - $min_gap )
1569 :     {
1570 : golsen 1.5 if ( @intervals ) { $r = $intervals[0]->[1]; shift @intervals }
1571 :     else { $r = 1e10 }
1572 : golsen 1.1 }
1573 :    
1574 :     $coverage->[7] = [ @used, [ $l, $r ], @intervals ];
1575 :    
1576 :     return $coverage;
1577 :     }
1578 :    
1579 :    
1580 :     #===============================================================================
1581 :     # Sort the blast matches by the desired criterion:
1582 :     #
1583 :     # @sims = order_query_sims( $qid, $qdef, $qlen, \@qsims, $order )
1584 :     #
1585 :     # Allowed sort orders are 'score', 'score_per_position', 'identity_fraction',
1586 :     # and 'positive_fraction'
1587 :     #
1588 :     # @qsims fields:
1589 :     #
1590 :     # 0 1 2 3 4 5 6 7 8
1591 :     # [ sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
1592 :     #
1593 :     #===============================================================================
1594 :    
1595 :     sub order_query_sims
1596 :     { my ( $qid, $qdef, $qlen, $qsims, $order ) = @_;
1597 :    
1598 :     my @sims;
1599 :     if ( $order eq 'score_per_position' )
1600 :     {
1601 : golsen 1.5 @sims = map { [ $_->[5] ? $_->[3]/$_->[5] : 0, $_ ] } @$qsims;
1602 : golsen 1.1 }
1603 :     elsif ( $order eq 'identity_fraction' )
1604 :     {
1605 : golsen 1.5 @sims = map { [ $_->[5] ? $_->[6]/$_->[5] : 0, $_ ] } @$qsims;
1606 : golsen 1.1 }
1607 :     elsif ( $order eq 'positive_fraction' )
1608 :     {
1609 : golsen 1.5 @sims = map { [ $_->[5] ? $_->[7]/$_->[5] : 0, $_ ] } @$qsims;
1610 : golsen 1.1 }
1611 :     else # Default is by 'score'
1612 :     {
1613 : golsen 1.5 @sims = map { [ $_->[3], $_ ] } @$qsims;
1614 : golsen 1.1 }
1615 :    
1616 :     map { [ $qid, $qdef, $qlen, @{$_->[1]} ] } sort { $b->[0] <=> $a->[0] } @sims;
1617 :     }
1618 :    
1619 :    
1620 : overbeek 1.7 ###############################################################################
1621 : golsen 1.5
1622 : golsen 1.17 sub n_rep_seqs
1623 :     {
1624 : overbeek 1.7 my(%args) = (ref($_[0]) eq 'HASH') ? %{$_[0]} : @_;
1625 :    
1626 :     my($seqs) = $args{seqs} || return undef;
1627 :     my($reps) = $args{reps} || undef;
1628 :     my($max_iden) = $args{max_iden} || 0.9; # we don't keep seqs more than 90% identical
1629 :     my($max_rep) = $args{max_rep} || 50; # maximum number of seqs in returned set
1630 :    
1631 : overbeek 1.9 if ($args{by_size}) { $seqs = [sort { length($b->[2]) <=> length($a->[2]) } @$seqs] };
1632 :    
1633 : overbeek 1.7 my($lost) = {};
1634 :     my($repseqs,$representing) = &rep_seq_2($reps ? $reps : (), $seqs, { max_sim => $max_iden });
1635 :     if ($max_rep >= @$repseqs)
1636 :     {
1637 :     return ($repseqs,$representing);
1638 :     }
1639 :     my $n_rep = $reps ? @$reps : 0;
1640 :     my $incr = $max_iden / 2;
1641 :     # print STDERR "max_iden=$max_iden, ", scalar @$repseqs,"\n";
1642 :     my $iterations_left = 7;
1643 :    
1644 :     my @seqs2;
1645 :     while ($iterations_left && ($max_rep != @$repseqs))
1646 :     {
1647 :     if ($max_rep > @$repseqs)
1648 :     {
1649 :     $max_iden += $incr;
1650 :     }
1651 :     else
1652 :     {
1653 :     @seqs2 = @$repseqs[$n_rep..(@$repseqs - 1)];
1654 :     &add_to_lost($lost,$representing);
1655 :     $max_iden -= $incr;
1656 :     }
1657 :     ($repseqs,$representing) = &rep_seq_2($reps ? $reps : (), \@seqs2, { max_sim => $max_iden });
1658 :     # print STDERR "max_iden=$max_iden, ", scalar @$repseqs,"\n";
1659 :     $iterations_left--;
1660 :     $incr = $incr / 2;
1661 :     }
1662 :    
1663 :     foreach my $id (keys(%$lost))
1664 :     {
1665 :     my $rep_by = $lost->{$id};
1666 :     while ($lost->{$rep_by})
1667 :     {
1668 :     $rep_by = $lost->{$rep_by};
1669 :     }
1670 :     push(@{$representing->{$rep_by}},$id);
1671 :     }
1672 :     return ($repseqs,$representing);
1673 :     }
1674 :    
1675 : golsen 1.21 sub add_to_lost
1676 :     {
1677 : overbeek 1.7 my($lost,$representing) = @_;
1678 :    
1679 :     foreach my $id (keys(%$representing))
1680 :     {
1681 : golsen 1.21 my $x = $representing->{$id};
1682 :     foreach my $lost_id ( @$x ) { $lost->{$lost_id} = $id }
1683 : overbeek 1.7 }
1684 :     }
1685 :    
1686 :     ###############################################################################
1687 : golsen 1.5
1688 : golsen 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3