[Bio] / FigKernelPackages / representative_sequences.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/representative_sequences.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : golsen 1.1 package representative_sequences;
2 :    
3 :     use strict;
4 :     use gjoparseblast;
5 : overbeek 1.3 use Data::Dumper;
6 :     use Carp;
7 : golsen 1.1
8 :     # These four variables are used as globals below. They would be changed
9 :     # for the SEED environment. formatdb is run as a system command, and might
10 :     # be encapsulated in a run() construct for the SEED. blastall is in an
11 :     # input pipe.
12 :    
13 :     my $tmp_dir = ".";
14 :     my $formatdb = "formatdb";
15 :     my $blastall = "blastall";
16 :    
17 :     require Exporter;
18 :    
19 :     our @ISA = qw(Exporter);
20 :     our @EXPORT = qw( representative_sequences
21 :     rep_seq_2
22 :     );
23 :    
24 :     #===============================================================================
25 :     # Construct a representative set of related sequences:
26 :     #
27 :     # \@repseqs = representative_sequences( $ref, \@seqs, $max_sim, \%options );
28 :     #
29 :     # or
30 :     #
31 :     # ( \@repseqs, \%representing, \@low_sim ) = representative_sequences( $ref,
32 :     # \@seqs, $max_sim, \%options );
33 :     #
34 : golsen 1.2 # Build or add to a set of representative sequences (if you to not want an
35 :     # enrichment of sequences around a focus sequence (called the reference), this
36 :     # is probably the subroutine that you want).
37 :     #
38 :     # \@reps = rep_seq_2( \@reps, \@new, \%options );
39 :     # \@reps = rep_seq_2( \@new, \%options );
40 :     #
41 :     # or
42 :     #
43 :     # ( \@reps, \%representing ) = rep_seq_2( \@reps, \@new, \%options );
44 :     # ( \@reps, \%representing ) = rep_seq_2( \@new, \%options );
45 : golsen 1.1 #
46 :     # Output:
47 :     #
48 :     # \@repseqs Reference to the list of retained (representative subset)
49 :     # of sequence entries. Sequence entries have the form
50 :     # [ $id, $def, $seq ]
51 :     #
52 :     # \%representing
53 :     # Reference to a hash in which the keys are the ids of the
54 :     # representative sequences, for which the corresponding value
55 :     # is a list of ids of other sequences that are represented by
56 :     # that representive.
57 :     #
58 :     #
59 :     # Arguments (only \@seqs is required):
60 :     #
61 :     # $ref A reference sequence as [ $id, $def, $seq ]. If present, the
62 :     # reference sequence defines a focal point for the analysis. A
63 :     # representative sequence from each lineage in its vicinity will
64 :     # be retained, even though they are more similar than max_sim to
65 :     # the reference, or to each other. The reference will always be
66 :     # included in the representative set. A limit is put on the
67 :     # similarity of lineages retained by the reference sequence with
68 :     # the max_ref_sim option (default = 0.99). The reference sequence
69 : golsen 1.2 # should not be repeated in the set of other sequences. (Only
70 :     # applies to representative_sequences; there is no equivalent for
71 :     # rep_seq_2.)
72 :     #
73 :     # \@reps In rep_seq_2, these sequences will each be placed in their own
74 :     # cluster, regardless of their similarity to one another. Each
75 :     # remaining sequence is added to the cluster to which it is
76 :     # most similar, unless it is less simililar than max_sim, in
77 :     # which case it represents a new cluster.
78 : golsen 1.1 #
79 :     # \@seqs Set of sequences to be pruned. If there is no reference
80 :     # sequence, the fist sequence in this list will be the starting
81 :     # point for the analysis and will be retained, but all sequences
82 :     # more similar than max_sim to it will be removed (in contrast to
83 :     # a reference sequence, which retains a representative of each
84 :     # lineage in its vicinity). Sequences that fail the E-value test
85 :     # relative to the reference (or the fist sequence if there is no
86 :     # reference) are dropped.
87 :     #
88 : golsen 1.2 # $max_sim (representative_sequences only; an option for rep_seq_2)
89 :     # Sequences with a higher similarity than max_sim to an existing
90 :     # representative sequence will not be included in the @reps
91 :     # output. Their ids are associated with the identifier of the
92 :     # sequence representing them in \%representing. The details of
93 :     # the behaviour are modified by other options. (default = 0.80)
94 : golsen 1.1 #
95 :     # \%options Key=>Value pairs that modify the behaviour:
96 :     #
97 :     # logfile Filehandle for a logfile of the progress. As each
98 : golsen 1.2 # sequence is analyzed, its disposition in recorded.
99 :     # In representative_sequences(), the id of each new
100 :     # representative is followed by a tab separated list of the
101 :     # ids that it represents. In rep_seq_2(), as each sequence
102 :     # is analyzed, it is recorded, followed by the id of the
103 :     # sequence representing it, if it is not the first member
104 :     # of a new cluster. Autoflush is set for the logfile.
105 : golsen 1.1 # If the value supplied is not a reference to a GLOB, then
106 :     # the log is sent to STDOUT (which is probably not what you
107 :     # want in most cases). The behavior is intended to aid in
108 :     # following prgress, and in recovery of interupted runs.
109 :     #
110 : golsen 1.2 # max_ref_sim (representative_sequences only)
111 : golsen 1.1 # Maximum similarity of any sequence to the reference. If
112 :     # max_ref_sim is less than max_sim, it is silently reset to
113 :     # max_sim. (default = 0.99, because 1.0 can be annoying)
114 :     #
115 : golsen 1.2 # max_e_val Maximum E-value for blastall. Probably moot, but will help
116 :     # with performance. (default = 0.01)
117 :     #
118 :     # max_sim Sequences with a higher similarity than max_sim to a
119 :     # retained sequence will be deleted. The details of the
120 :     # behaviour is modified by other options. (default = 0.80)
121 :     # (a parameter for representative_sequences, but an option
122 :     # for rep_seq_2).
123 : golsen 1.1 #
124 :     # sim_meas Measure similarity for inclusion or exclusion by
125 :     # 'identity_fraction' (default), 'positive_fraction', or
126 :     # 'score_per_position'
127 :     #
128 : golsen 1.2 # save_exp (representative_sequences only)
129 :     # When there is a reference sequence, lineages more similar
130 : golsen 1.1 # than max_sim will be retained near the reference. The
131 :     # default goal is to save one member of each lineage. If
132 :     # the initial representative of the lineage is seq1, we
133 :     # pose the question, "Are there sufficiently deep divisions
134 :     # within the lineage to seq1 that it they might be viewed
135 :     # as independent? That is, might there be another sequence,
136 :     # seq2 that so different from seq1 that we might want to see
137 :     # it also?
138 :     #
139 :     # +---------------------- ref
140 :     # |
141 :     # ---+ +-------------------- seq1
142 :     # +-+
143 :     # +-------------------- seq2
144 :     #
145 :     # Without any special treatment, if the similarity of seq1
146 :     # to ref ( S(seq1,ref) ) is greater than max_sim, seq1 would
147 :     # be the sole representative of thelineage containing both
148 :     # seq1 and seq2, because the similarity of seq1 to seq2
149 :     # ( S(seq1,seq2) ) is greater than S(seq1,ref). This can
150 :     # be altered by the value of save_exp. In terms of
151 :     # similarity, seq2 will be discarded if:
152 :     #
153 :     # S(seq1,seq2) > S(seq1,ref) ** save_exp, and
154 :     # S(seq1,seq2) > S(seq2,ref) ** save_exp
155 :     #
156 :     # The default behavior described above occurs when save_exp
157 :     # is 1. If save_exp < 1, then greater similarities between
158 :     # seq1 and seq2 are allowed. Reasonable values of save_exp
159 :     # are roughly 0.7 to 1.0. (At save_exp = 0, any similarity
160 :     # would be allowed; yuck.)
161 :     #
162 : golsen 1.2 # stable (representative_sequences only; always true for rep_seq_2)
163 :     # If true (not undef, '', or 0), then the representatives
164 : golsen 1.1 # will be chosen from as early in the list as possible (this
165 : golsen 1.2 # facilitates augmentation of an existing list).
166 : golsen 1.1 #
167 :     #-------------------------------------------------------------------------------
168 :     #
169 : golsen 1.2 # Diagram of the pruning behavior of representative_sequences():
170 : golsen 1.1 #
171 :     # 0.5 0.6 0.7 0.8 0.9 1.0 Similarity
172 :     # |---------|---------|---------|---------|---------|
173 :     # .
174 :     # . + A
175 :     # . +---+
176 :     # . | + B
177 :     # . +---+
178 :     # . | +---- C
179 :     # . +----------+
180 :     # . | +-------- D
181 :     # . |
182 :     # +-----------+ +----------------- E
183 :     # | . +-+
184 :     # | . +----------------- F
185 :     # +----------------+ .
186 :     # | | . +--------------------------- G
187 :     # | +---+
188 :     # | . | +--------------------- H
189 :     # --+ . +-----+
190 :     # | . +--------------------- I
191 :     # | .
192 :     # | +------------------------------- J
193 :     # +----------------+ .
194 :     # | . +--------------------------- K
195 :     # +---+
196 :     # . +--------------------------- L
197 :     # .
198 :     # |---------|---------|---------|---------|---------|
199 :     # 0.5 0.6 0.7 0.8 0.9 1.0 Similarity
200 :     #
201 :     # In the above tree and max_sim = 0.70 and max_ref_sim = 0.99:
202 :     #
203 :     # With no reference sequence, and A first in the list, the representative
204 :     # sequences will be A, G, J and K.
205 :     #
206 :     # With A as the reference sequence and save_exp left at its default, the
207 :     # representative sequences will be A, C, D, E, G, J and K. B is excluded
208 :     # because it is more similar than max_ref_sim to A.
209 :     #
210 :     # With A as the reference sequence and save_exp = 0.8, the representative
211 :     # sequences will be A, C, D, E, F (comparably similar to A and E), G,
212 :     # H (comparably similar to A and G), J and K. The sequence L will be
213 :     # represented by K because L is much closer to K than to A.
214 :     #
215 :     # This oversimplifies the choice of representative of a cluster of related
216 :     # sequences. For example, whether G, H or I would represent the group of
217 :     # three actually depends on relative clock speed (slower is better) and
218 :     # sequence coverage (more complete is better). The actual order is by BLAST
219 :     # bit score (possibly combining independent segments).
220 :     #
221 :     # In addition, this discussion is in terms of a tree, but the calculations
222 :     # are based on a (partial) set of pairwise sequence similarities. Thus, the
223 :     # precise behavior is hard to predict, but should be similar to that described
224 :     # above.
225 :     #
226 :     #-------------------------------------------------------------------------------
227 :     #
228 :     # To construct a representative set of sequences relative to a reference
229 :     # sequence:
230 :     #
231 :     # 1. Prioritize sequences for keeping, from highest to lowest scoring
232 :     # relative to reference, as measured by blast score (bits).
233 :     # When stable is set, priority is from first to last in input file
234 :     # (a reference sequence should not be supplied).
235 :     #
236 :     # 2. Based on the similarity of each sequence to the reference and save_exp,
237 :     # compute sequence-specific values of max_sim:
238 :     #
239 :     # max_sim( seq_i ) = exp( save_exp * ln( seq_i_ref_sim ) )
240 :     #
241 :     # 3. Examine the next prioritized sequence (seq1).
242 :     #
243 :     # 4. If seq1 has been vetoed, go to 7.
244 :     #
245 :     # 5. Mark seq1 to keep.
246 :     #
247 :     # 6. Use blast to find similarities of seq1 to other sequences.
248 :     #
249 :     # 7. For each similar sequence (seq2):
250 :     #
251 :     # 7a. Skip if seq2 is marked to keep, or marked for veto
252 :     #
253 :     # 7b. Compute the maximum simiarity of seq1 and seq2 for retaining seq2:
254 :     #
255 :     # max_sim_1_2 = max( max_sim, max_sim( seq1 ), max_sim( seq2 ) )
256 :     #
257 :     # 7c. If similarity of seq1 and seq2 > max_sim, veto seq2
258 :     #
259 :     # 7d. Next seq2
260 :     #
261 :     # 8. If there are more sequences to examine, go to 3.
262 :     #
263 :     # 9. Collect the sequences marked for keeping.
264 :     #
265 :     #===============================================================================
266 :    
267 :     sub representative_sequences {
268 :     my $seqs = ( shift @_ || shift @_ ); # If $ref is undef, shift again
269 :     ref( $seqs ) eq "ARRAY"
270 :     or die "representative_sequences called with bad first argument\n";
271 :    
272 :     my ( $ref, $use_ref );
273 :     if ( ! ref( $seqs->[0] ) ) # First item was sequence entry, not list of entries
274 :     {
275 :     $ref = $seqs;
276 :     $seqs = shift @_;
277 :     ref( $seqs ) eq "ARRAY"
278 :     and ref( $seqs->[0] ) eq "ARRAY"
279 :     or die "representative_sequences called with bad sequences list\n";
280 :     $use_ref = 1;
281 :     }
282 :     else # First item was list of entries, split off first
283 :     {
284 :     ref( $seqs->[0] ) eq "ARRAY"
285 :     or die "representative_sequences called with bad sequences list\n";
286 :     $ref = shift @$seqs;
287 :     $use_ref = 0;
288 :     }
289 :    
290 :     my $max_sim = shift @_;
291 :     my $options;
292 :    
293 :     # Undocumented feature: skip max_sim (D = 0.8)
294 :    
295 :     if ( ref( $max_sim ) eq "HASH" )
296 :     {
297 :     $options = $max_sim;
298 :     $max_sim = undef;
299 :     }
300 :    
301 :     # If the above did not give us options, get them now:
302 :    
303 :     $options ||= ( shift @_ ) || {};
304 :    
305 :     # ---------------------------------------# Default values for options
306 :    
307 :     $max_sim ||= 0.80; # Retain 80% identity of less
308 :     my $logfile = undef; # Log file of sequences processed
309 :     my $max_ref_sim = 0.99; # Get rid of identical sequences
310 :     my $max_e_val = 0.01; # Blast E-value to decrease output
311 :     my $sim_meas = 'identity_fraction'; # Use sequence identity as measure
312 :     my $save_exp = 1.0; # Don't retain near equivalents
313 :     my $stable = 0; # Pick reps input order
314 :    
315 :     # Two questionable decisions:
316 :     # 1. Be painfully flexible on option names.
317 :     # 2. Silently fix bad parameter values.
318 :    
319 :     foreach ( keys %$options )
320 :     {
321 :     my $value = $options->{ $_ };
322 :     if ( m/^log/i ) # logfile
323 :     {
324 :     next if ! $value;
325 :     $logfile = ( ref $value eq "GLOB" ? $value : \*STDOUT );
326 :     select( ( select( $logfile ), $| = 1 )[0] ); # autoflush on
327 :     }
328 :     elsif ( m/ref/i ) # max_ref_sim
329 :     {
330 :     $value += 0;
331 :     $value = 0 if $value < 0;
332 :     $value = 1 if $value > 1;
333 :     $max_ref_sim = $value;
334 :     }
335 :     elsif ( m/max/i && m/sim/i ) # max(imum)_sim(ilarity)
336 :     {
337 :     $value += 0;
338 :     $value = 0 if $value < 0;
339 :     $value = 1 if $value > 1;
340 :     $max_sim = $value;
341 :     }
342 :     elsif ( m/max/i || m/[ep]_?val/i ) # Other "max" tests must come first
343 :     {
344 :     $value += 0;
345 :     $value = 0 if $value < 0;
346 :     $max_e_val = $value;
347 :     }
348 :     elsif ( m/sim/i || m/meas/i ) # sim(ilarity)_meas(ure)
349 :     {
350 :     $sim_meas = standardize_similarity_measure( $value );
351 :     }
352 :     elsif ( m/sav/i || m/exp/i ) # save_exp(onent)
353 :     {
354 :     $value += 0;
355 :     $value = 0 if $value < 0;
356 :     $value = 1 if $value > 1;
357 :     $save_exp = $value;
358 :     }
359 :     elsif ( m/stab/i ) # stable order
360 :     {
361 :     $stable = $value ? 1 : 0;
362 :     }
363 :     else
364 :     {
365 :     print STDERR "WARNING: representative_sequences bad option ignored: '$_' => '$value'\n";
366 :     }
367 :     }
368 :    
369 :     # Silent sanity check. This should not happen, as it is almost equivalent
370 :     # to making no reference sequence.
371 :    
372 :     $max_ref_sim = $max_sim if ( $max_ref_sim < $max_sim );
373 :    
374 :     # Do the analysis
375 :    
376 :     my $ref_id = $ref->[0];
377 :     my $ref_seq = $ref->[2];
378 :    
379 :     # Build a list of the ids (without ref) and an index for the sequence entries:
380 :    
381 :     my @seq_id = map { $_->[0] } @$seqs;
382 :     my $seq_ind = { map { @{$_}[0] => $_ } ( $ref, @$seqs ) };
383 :    
384 :     # Make a lookup table of the sequence number, for use in reording
385 :     # sequences later:
386 :    
387 :     my $n = 0;
388 :     my %ord = ( map { @$_[0] => ++$n } @$seqs );
389 :    
390 :     # Build blast database (it includes the reference):
391 :    
392 :     my $protein = are_protein( $seqs );
393 :     my $db = ( $tmp_dir ? "$tmp_dir/" : "" ) . "tmp_blast_db_$$";
394 :     make_blast_db( $db, [ $ref, @$seqs ], $protein );
395 :    
396 :     # Search query against new database
397 :    
398 :     my $max = 3 * @$seqs; # Alignments to keep
399 :     my $self = 1; # Keep self match (for its bit score)
400 :    
401 :     my $blast_opt = "-e $max_e_val -v $max -b $max -F F -a 2";
402 :    
403 :     # Do the blast analysis. Returned records are of the form:
404 :     #
405 :     # 0 1 2 3 4 5 6 7 8 9 10 11
406 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
407 :    
408 :     my $prog = $protein ? 'blastp' : 'blastn';
409 :     $blast_opt .= " -r 1 -q -1" if ! $protein;
410 :     my @ref_hits = top_blast_per_subject( $prog, $db, $ref_id, $ref_seq, $self, $blast_opt );
411 :    
412 :     # First hit is always a perfect match, so we get bits per position:
413 :     # This is only used if the measure is bits per position
414 :    
415 :     my $ref_bpp = $ref_hits[0]->[6] / $ref_hits[0]->[8];
416 :    
417 :     # Remove self match (might not be first if there are identical sequences):
418 :    
419 :     my %hit = ();
420 :     @ref_hits = grep { my $sid = $_->[3]; $hit{ $sid } = 1; ( $sid ne $ref_id ) } @ref_hits;
421 :    
422 :     my %keep = ();
423 :     $keep{ $ref_id } = [];
424 :     my %veto = ();
425 :     my $n_to_do = @ref_hits;
426 :     my $rebuild_d_n = 40;
427 :     my $last_rebuild = 1.5 * $rebuild_d_n;
428 :     my $rebuild = ( $n_to_do > $last_rebuild ) ? $n_to_do - $rebuild_d_n : 0;
429 :    
430 :     # Sequence-specific maximum similarities:
431 :    
432 :     my %max_sim = map { ( $_ => $max_sim ) } @seq_id;
433 :    
434 :     foreach ( @ref_hits )
435 :     {
436 :     my $id = $_->[3];
437 :     my $sim = seq_similarity( $_, $sim_meas, $ref_bpp );
438 :    
439 :     if ( $sim > ( $use_ref ? $max_ref_sim : $max_sim ) )
440 :     {
441 :     $veto{ $id } = 1;
442 :     push @{ $keep{ $ref_id } }, $id; # Log the sequences represented
443 :     $n_to_do--;
444 :     }
445 :     else
446 :     {
447 :     my $max_sim_i = exp( $save_exp * log( $sim ) );
448 :     $max_sim{ $id } = $max_sim_i if ( $max_sim_i > $max_sim );
449 :     }
450 :     }
451 :    
452 :    
453 :     if ( $logfile )
454 :     {
455 :     print $logfile join( "\t", $ref_id, @{ $keep{ $ref_id } } ), "\n";
456 :     }
457 :    
458 :     # Search each sequence against the database.
459 :     # If the order is to be stable, reorder hits to match input order.
460 :    
461 :     my ( $id1, $seq1, $max_sim_1, $id2, $max_sim_2, $bpp_max );
462 :     my @ids_to_do = map { $_->[3] } @ref_hits;
463 :     @ids_to_do = sort { $ord{ $a } <=> $ord{ $b } } @ids_to_do if $stable;
464 :    
465 :     while ( $id1 = shift @ids_to_do )
466 :     {
467 :     next if $veto{ $id1 };
468 :    
469 :     # Is it time to rebuild a smaller BLAST database? This helps
470 :     # significantly in the overall performance.
471 :    
472 :     if ( $n_to_do <= $rebuild )
473 :     {
474 :     if ( $protein ) { unlink $db, "$db.psq", "$db.pin", "$db.phr" }
475 :     else { unlink $db, "$db.nsq", "$db.nin", "$db.nhr" }
476 :     make_blast_db( $db, [ map { $seq_ind->{ $_ } } # id to sequence entry
477 :     grep { ! $veto{ $_ } } # id not vetoed
478 :     ( $id1, @ids_to_do ) # remaining ids
479 :     ],
480 :     $protein
481 :     );
482 :     $rebuild = ( $n_to_do > $last_rebuild ) ? $n_to_do - $rebuild_d_n : 0;
483 :     }
484 :    
485 :     $n_to_do--;
486 :     $keep{ $id1 } = [];
487 :    
488 :     $max_sim_1 = $max_sim{ $id1 };
489 :     $bpp_max = undef;
490 :     $seq1 = $seq_ind->{$id1}->[2];
491 :     foreach ( top_blast_per_subject( $prog, $db, $id1, $seq1, $self, $blast_opt ) )
492 :     {
493 :     $bpp_max ||= $_->[6] / $_->[8];
494 :     $id2 = $_->[3];
495 :     next if ( $veto{ $id2 } || $keep{ $id2 } );
496 :     $max_sim_2 = $max_sim{ $id2 };
497 :     $max_sim_2 = $max_sim_1 if ( $max_sim_1 > $max_sim_2 );
498 :     if ( seq_similarity( $_, $sim_meas, $bpp_max ) > $max_sim_2 )
499 :     {
500 :     $veto{ $id2 } = 1;
501 :     push @{ $keep{ $id1 } }, $id2; # Log the sequences represented
502 :     $n_to_do--;
503 :     }
504 :     }
505 :    
506 :     if ( $logfile )
507 :     {
508 :     print $logfile join( "\t", $id1, @{ $keep{ $id1 } } ), "\n";
509 :     }
510 :     }
511 :    
512 :     if ( $protein ) { unlink $db, "$db.psq", "$db.pin", "$db.phr" }
513 :     else { unlink $db, "$db.nsq", "$db.nin", "$db.nhr" }
514 :    
515 :     # Return the surviving sequence entries, and optionally the hash of
516 :     # ids represented by each survivor:
517 :    
518 :     my $kept = [ $ref, grep { $keep{ $_->[0] } } @$seqs ];
519 :    
520 :     wantarray ? ( $kept, \%keep, [ grep { ! $hit{ $_->[0] } } @$seqs ] ) : $kept;
521 :     }
522 :    
523 :    
524 :     #===============================================================================
525 :     # Build or add to a set of representative sequences.
526 :     #
527 :     # \@reps = rep_seq_2( \@reps, \@new, \%options );
528 :     # \@reps = rep_seq_2( \@new, \%options );
529 :     #
530 :     # or
531 :     #
532 :     # ( \@reps, \%representing ) = rep_seq_2( \@reps, \@new, \%options );
533 :     # ( \@reps, \%representing ) = rep_seq_2( \@new, \%options );
534 :     #
535 :     #===============================================================================
536 :    
537 :     sub rep_seq_2
538 :     {
539 :     ref $_[0] eq 'ARRAY'
540 :     or print STDERR "First parameter of rep_seq_2 must be an ARRAY reference\n"
541 :     and return undef;
542 :    
543 :     my ( $reps, $seqs, $options ) = ( [], undef, {} );
544 :    
545 :     if ( @_ == 1 )
546 :     {
547 :     $seqs = shift;
548 :     }
549 :    
550 :     elsif ( @_ == 2 )
551 :     {
552 :     if ( ref $_[1] eq 'ARRAY' ) { ( $reps, $seqs ) = @_ }
553 :     elsif ( ref $_[1] eq 'HASH' ) { ( $seqs, $options ) = @_ }
554 :     else
555 :     {
556 :     print STDERR "Second parameter of rep_seq_2 must be an ARRAY or HASH reference\n";
557 :     return undef;
558 :     }
559 :     }
560 :    
561 :     elsif ( @_ == 3 )
562 :     {
563 :     if ( ref $_[1] ne 'ARRAY' )
564 :     {
565 :     print STDERR "Second parameter of 3 parameter rep_seq_2 must be an ARRAY reference\n";
566 :     return undef;
567 :     }
568 :     if ( ref $_[2] ne 'HASH' )
569 :     {
570 :     print STDERR "Third parameter of 3 parameter rep_seq_2 must be a HASH reference\n";
571 :     return undef;
572 :     }
573 :    
574 :     ( $reps, $seqs, $options ) = @_;
575 :     }
576 :     else
577 :     {
578 :     print STDERR "rep_seq_2 called with @{[scalar @_]} parameters\n";
579 :     return undef;
580 :     }
581 :    
582 :     # ---------------------------------------# Default values for options
583 :    
584 :     my $max_sim = 0.80; # Retain 80% identity of less
585 :     my $logfile = undef; # Log file of sequences processed
586 :     my $max_e_val = 0.01; # Blast E-value to decrease output
587 :     my $sim_meas = 'identity_fraction'; # Use sequence identity as measure
588 :    
589 :     # Two questionable decisions:
590 :     # 1. Be painfully flexible on option names.
591 :     # 2. Silently fix bad parameter values.
592 :    
593 :     foreach ( keys %$options )
594 :     {
595 :     my $value = $options->{ $_ };
596 :     if ( m/^log/i ) # logfile
597 :     {
598 :     next if ! $value;
599 :     $logfile = ( ref $value eq "GLOB" ? $value : \*STDOUT );
600 :     select( ( select( $logfile ), $| = 1 )[0] ); # autoflush on
601 :     }
602 :     elsif ( m/max/i && m/sim/i ) # max(imum)_sim(ilarity)
603 :     {
604 :     $value += 0;
605 :     $value = 0 if $value < 0;
606 :     $value = 1 if $value > 1;
607 :     $max_sim = $value;
608 :     }
609 :     elsif ( m/max/i || m/[ep]_?val/i ) # Other "max" tests must come first
610 :     {
611 :     $value += 0;
612 :     $value = 0 if $value < 0;
613 :     $max_e_val = $value;
614 :     }
615 :     elsif ( m/sim/i || m/meas/i ) # sim(ilarity)_meas(ure)
616 :     {
617 :     $sim_meas = standardize_similarity_measure( $value );
618 :     }
619 :     else
620 :     {
621 :     print STDERR "WARNING: rep_seq_2 bad option ignored: '$_' => '$value'\n";
622 :     }
623 :     }
624 :    
625 :     # Check sequence ids for duplicates:
626 :    
627 :     my $reps2 = [];
628 :     my $seen = {};
629 :    
630 :     my $id;
631 :     foreach ( @$reps )
632 :     {
633 :     $id = $_->[0];
634 :     if ( $seen->{ $id }++ )
635 :     {
636 :     print STDERR "Duplicate sequence id '$id' skipped by rep_seq_2\n";
637 :     }
638 :     else
639 :     {
640 :     push @$reps2, $_;
641 :     }
642 :     }
643 :    
644 :     my $seqs2 = [];
645 :     foreach ( @$seqs )
646 :     {
647 :     $id = $_->[0];
648 :     if ( $seen->{ $id }++ )
649 :     {
650 :     print STDERR "Duplicate sequence id '$_[0]' skipped by rep_seq_2\n";
651 :     }
652 :     else
653 :     {
654 :     push @$seqs2, $_;
655 :     }
656 :     }
657 :    
658 :     # If no preexisting representatives, then take first sequence:
659 :    
660 :     @$reps2 or @$reps2 = ( shift @$seqs2 );
661 :    
662 :     if ( $logfile ) { foreach ( @$reps2 ) { print $logfile "$_->[0]\n" } }
663 :    
664 :     # Search each rep sequence against itself for max_bpp
665 :    
666 :     my $db = ( $tmp_dir ? "$tmp_dir/" : "" ) . "tmp_blast_db_$$";
667 :     my $protein = are_protein( $reps2 );
668 : overbeek 1.3
669 : golsen 1.1 my %max_bpp;
670 :     my $entry;
671 :     foreach $entry ( @$reps2 )
672 :     {
673 :     $max_bpp{ $entry->[0] } = self_bpp( $db, $entry, $protein );
674 :     }
675 :    
676 :     my $naln = 10; # Alignments to make
677 :     my $self = 1; # Self match will never occur
678 :     my $prog = $protein ? 'blastp' : 'blastn';
679 :     my $blast_opt = "-e $max_e_val -v $naln -b $naln -F F -a 2";
680 :     $blast_opt .= " -r 1 -q -1" if ! $protein;
681 :    
682 :     # List of who is represented by a sequence:
683 :    
684 :     my %keep = map { $_->[0] => [] } @$reps2;
685 :    
686 :     # Search each sequence against the database.
687 :    
688 :     my ( $seq, $bpp_max );
689 :     my $newdb = 1;
690 :    
691 :     foreach $entry ( @$seqs2 )
692 :     {
693 :     # Is it time to rebuild a BLAST database?
694 :    
695 :     if ( $newdb )
696 :     {
697 :     make_blast_db( $db, $reps2, $protein );
698 :     $newdb = 0;
699 :     }
700 :     # Do the blast analysis. Returned records are of the form:
701 :     #
702 :     # 0 1 2 3 4 5 6 7 8 9 10 11
703 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
704 :    
705 :     $id = $entry->[0];
706 :     $seq = $entry->[2];
707 :     my ( $tophit ) = sort { $b->[0] <=> $a->[0] }
708 :     grep { $_->[0] >= $max_sim }
709 :     map { [ seq_similarity( $_, $sim_meas, $max_bpp{ $_->[3] } ), $_ ] }
710 :     top_blast_per_subject( $prog, $db, $id, $seq, $self, $blast_opt );
711 :    
712 :    
713 :     # It matches an existing representative
714 :    
715 :     if ( $tophit )
716 :     {
717 :     # print STDERR join(", ", $tophit->[0], @{$tophit->[1]} ), "\n";
718 :     push @{ $keep{ $tophit->[1]->[3] } }, $id;
719 :     print $logfile "$id\t$tophit->[1]->[3]\n" if $logfile;
720 :     }
721 :    
722 :     # It is a new representative
723 :    
724 :     else
725 :     {
726 :     push @$reps2, $entry;
727 :     $keep{ $id } = [];
728 :     $max_bpp{ $id } = self_bpp( $db, $entry, $protein );
729 :     $newdb = 1;
730 :     print $logfile "$id\n" if $logfile;
731 :     }
732 :     }
733 :    
734 :     if ( $protein ) { unlink $db, "$db.psq", "$db.pin", "$db.phr" }
735 :     else { unlink $db, "$db.nsq", "$db.nin", "$db.nhr" }
736 :    
737 :     # Return the surviving sequence entries, and optionally the hash of
738 :     # ids represented by each survivor:
739 :    
740 :     wantarray ? ( $reps2, \%keep ) : $reps2;
741 :     }
742 :    
743 :    
744 :     #===============================================================================
745 :     # Try to figure out the sequence similarity measure that is being requested:
746 :     #
747 :     # $type = standardize_similarity_measure( $requested_type )
748 :     #
749 :     #===============================================================================
750 :    
751 :     sub standardize_similarity_measure
752 :     { my ( $req_meas ) = @_;
753 :     return ( ! $req_meas ) ? 'identity_fraction'
754 :     : ( $req_meas =~ /id/i ) ? 'identity_fraction'
755 :     : ( $req_meas =~ /sc/i ) ? 'score_per_position'
756 :     : ( $req_meas =~ /spp/i ) ? 'score_per_position'
757 :     : ( $req_meas =~ /bit/i ) ? 'score_per_position'
758 :     : ( $req_meas =~ /bpp/i ) ? 'score_per_position'
759 :     : ( $req_meas =~ /tiv/i ) ? 'positive_fraction'
760 :     : ( $req_meas =~ /pos_/i ) ? 'positive_fraction'
761 :     : ( $req_meas =~ /ppp/i ) ? 'positive_fraction'
762 :     : 'identity_fraction';
763 :     }
764 :    
765 :    
766 :     #===============================================================================
767 :     # Caluculate sequence similarity according to the requested measure:
768 :     #
769 :     # $similarity = seq_similarity( $hit, $measure, $bpp_max )
770 :     #
771 :     # $hit is a structure with blast information:
772 :     #
773 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
774 :     #===============================================================================
775 :    
776 :     sub seq_similarity
777 :     { my ( $hit, $measure, $bpp_max ) = @_;
778 :     return ( @$hit < 11 ) ? undef
779 :     : ( $measure =~ /^sc/ ) ? $hit->[ 6] / ( $hit->[8] * ( $bpp_max || 2 ) )
780 :     : ( $measure =~ /^po/ ) ? $hit->[10] / $hit->[8]
781 :     : $hit->[ 9] / $hit->[8]
782 :     }
783 :    
784 :    
785 :     #===============================================================================
786 :     # Caluculate self similarity of a sequence in bits per position:
787 :     #
788 :     # $max_bpp = self_bpp( $db_name, $entry, $protein )
789 :     #
790 :     #===============================================================================
791 :    
792 :     sub self_bpp
793 :     {
794 :     my ( $db, $entry, $protein ) = @_;
795 :    
796 :     # Build blast database:
797 :    
798 :     make_blast_db( $db, [ $entry ], $protein );
799 :    
800 :     # Search sequence against the database
801 :    
802 :     my $id = $entry->[0];
803 :     my $seq = $entry->[2];
804 :     my $self = 1; # Self match will never occur
805 :    
806 :     my $prog = $protein ? 'blastp' : 'blastn';
807 :     my $blast_opt = "-v 1 -b 1 -F F -a 2";
808 :     $blast_opt .= " -r 1 -q -1" if ! $protein;
809 :    
810 :     # Do the blast analysis. Returned records are of the form:
811 :     #
812 :     # 0 1 2 3 4 5 6 7 8 9 10 11
813 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
814 :    
815 :     my ( $hit ) = top_blast_per_subject( $prog, $db, $id, $seq, $self, $blast_opt );
816 :     # print STDERR join( ", ", @$hit ), "\n";
817 :    
818 :     # First hit is always a perfect match, so we get bits per position:
819 :     # This is only used if the measure is bits per position
820 : overbeek 1.3 if (! $hit->[8]) { return 0 }
821 : golsen 1.1 $hit->[6] / $hit->[8];
822 :     }
823 :    
824 :    
825 :     #===============================================================================
826 :     # Make a blast databse from a set of sequence entries. The type of database
827 :     # (protein or nucleic acid) is quessed from the sequence data.
828 :     #
829 :     # make_blast_db( $db_filename, \@seq_entries, $protein )
830 :     #
831 :     # Sequence entries have the form: [ $id, $def, $seq ]
832 :     #===============================================================================
833 :    
834 :     sub make_blast_db
835 :     { my ( $db, $seqs, $protein ) = @_;
836 :    
837 :     open FH, ">$db" or die "Could not create blast database file \"$db\"";
838 :     foreach ( @$seqs )
839 :     {
840 :     print FH ">$_->[0]", ( $_->[1] ? " $_->[1]" : () ), "\n$_->[2]\n";
841 :     }
842 :     close FH;
843 :    
844 :     my $is_prot = $protein ? 'T' : 'F';
845 :     system "$formatdb -p $is_prot -i '$db'";
846 :     }
847 :    
848 :    
849 :     #===============================================================================
850 :     # The type of data (protein or nucleic acid) is quessed from the sequences.
851 :     #
852 :     # are_protein( \@seq_entries )
853 :     #
854 :     # Sequence entries have the form: [ $id, $def, $seq ]
855 :     #===============================================================================
856 :    
857 :     sub are_protein
858 :     { my ( $seqs ) = @_;
859 :     my ( $nt, $aa ) = ( 0, 0 );
860 :     foreach ( @$seqs )
861 :     {
862 :     my $s = $_->[2];
863 :     $nt += $s =~ tr/ACGTacgt//d;
864 :     $aa += $s =~ tr/A-Za-z//d;
865 :     }
866 :     ( $nt < 3 * $aa ) ? 1 : 0;
867 :     }
868 :    
869 :    
870 :     #===============================================================================
871 :     # Blast a subject against a datbase, saving only top hit per subject
872 :     #
873 :     # Return:
874 :     #
875 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
876 :     #
877 :     #===============================================================================
878 :    
879 :     # Use a bare block to compartmentalize a counter:
880 :    
881 :     { my $cnt = 0;
882 :    
883 :     sub top_blast_per_subject
884 :     { my ( $prog, $db, $qid, $qseq, $self, $blast_opt, $sort, $no_merge ) = @_;
885 :    
886 :     $cnt++;
887 :     my $query_file = ( $tmp_dir ? "$tmp_dir/" : "" ) . "tmp_blast_seq_${$}_$cnt";
888 :     my $QFILE;
889 :     open $QFILE, ">$query_file" or die "Could not create sequence file \"$query_file\"";
890 :     print $QFILE ">$qid\n$qseq\n";
891 :     close $QFILE;
892 :    
893 :     my $blast_cmd = "$blastall -p $prog -d '$db' -i '$query_file' $blast_opt";
894 :     my $BPIPE;
895 :     open $BPIPE, "$blast_cmd |" or die "Could open blast pipe\n";
896 :     my $sims = integrate_blast_segments( $BPIPE, $sort, $no_merge, $self );
897 :     close $BPIPE;
898 : overbeek 1.3
899 : golsen 1.1 unlink $query_file;
900 :    
901 :     my $pq = ""; # Previous query id
902 :     my $ps = ""; # Previous subject id
903 :     my $keep;
904 :    
905 :     grep { $keep = ( $pq ne $_->[0] ) || ( $ps ne $_->[3] );
906 :     $pq = $_->[0];
907 :     $ps = $_->[3];
908 :     $keep && ( $self || ( $pq ne $ps ) );
909 :     } @$sims;
910 :     }
911 :     }
912 :    
913 :    
914 :     #===============================================================================
915 :     # Read output of rationalize blast and assemble minimally overlapping segments
916 :     # into a total score for each subject sequence. For each query, sort matches
917 :     # into user-chosen order (D = total score):
918 :     #
919 :     # @sims = integrate_blast_segments( \*FILEHANDLE, $sort_order, $no_merge )
920 :     # \@sims = integrate_blast_segments( \*FILEHANDLE, $sort_order, $no_merge )
921 :     #
922 :     # Allowed sort orders are 'score', 'score_per_position', 'identity_fraction',
923 :     # and 'positive_fraction' (matched very flexibly).
924 :     #
925 :     # Returned sims (e_val is only for best HSP, not any composite):
926 :     #
927 :     # [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
928 :     #
929 :     # There is a strategic decision to not read the blast output from memory;
930 :     # it could be enormous. This cuts the flexibility some.
931 :     #===============================================================================
932 :     #
933 :     # coverage fields:
934 :     #
935 :     # [ scr, e_val, n_mat, n_id, n_pos, n_gap, dir, [ intervals_covered ] ]
936 :     #
937 :     #===============================================================================
938 :    
939 :     sub integrate_blast_segments
940 :     { my ( $fh, $order, $no_merge, $self ) = @_;
941 :     $fh ||= \*STDIN;
942 :     ( ref( $fh ) eq "GLOB" ) || die "integrate_blast_segments called without a filehandle\n";
943 :    
944 :     $order = ( ! $order ) ? 'score'
945 :     : ( $order =~ /sc/i ) ? ( $order =~ /p/i ? 'score_per_position' : 'score' )
946 :     : ( $order =~ /bit/i ) ? ( $order =~ /p/i ? 'score_per_position' : 'score' )
947 :     : ( $order =~ /spp/i ) ? 'score_per_position'
948 :     : ( $order =~ /id/i ) ? 'identity_fraction'
949 :     : ( $order =~ /tiv/i ) ? 'positive_fraction'
950 :     : 'score';
951 :    
952 :     my $max_frac_overlap = 0.2;
953 :    
954 :     my ( $qid, $qdef, $qlen, $sid, $sdef, $slen );
955 :     my ( $scr, $e_val, $n_mat, $n_id, $n_pos, $n_gap );
956 :     my ( $ttl_scr, $ttl_mat, $ttl_id, $ttl_pos, $ttl_gap );
957 :     my @sims = ();
958 :     my @qsims = ();
959 :     my $coverage = undef;
960 :     my $record;
961 :    
962 :     while ( $_ = next_blast_record( $fh, $self ) )
963 :     {
964 :     chomp;
965 :     if ( $_->[0] eq 'Query=' )
966 :     {
967 :     if ( $coverage )
968 :     {
969 :     push @qsims, [ $sid, $sdef, $slen, @$coverage[ 0 .. 5 ] ];
970 :     $coverage = undef;
971 :     }
972 :     if ( @qsims ) { push @sims, order_query_sims( $qid, $qdef, $qlen, \@qsims, $order ) }
973 :     ( undef, $qid, $qdef, $qlen ) = @$_;
974 :     $sid = undef;
975 :     @qsims = ();
976 :     }
977 :     elsif ( $_->[0] eq '>' )
978 :     {
979 :     if ( $coverage )
980 :     {
981 :     push @qsims, [ $sid, $sdef, $slen, @$coverage[ 0 .. 5 ] ];
982 :     $coverage = undef;
983 :     }
984 :     next if ! $qid;
985 :     ( undef, $sid, $sdef, $slen ) = @$_;
986 :     }
987 :     elsif ( $_->[0] eq 'HSP' && $sid )
988 :     {
989 :     $coverage = integrate_HSP( $coverage, $_, $max_frac_overlap, $no_merge );
990 :     }
991 :     }
992 :    
993 :     if ( $coverage ) { push @qsims, [ $sid, $sdef, $slen, @$coverage[ 0 .. 5 ] ] }
994 :    
995 :     if ( @qsims ) { push @sims, order_query_sims( $qid, $qdef, $qlen, \@qsims, $order ) }
996 :    
997 :     wantarray ? @sims : \@sims;
998 :     }
999 :    
1000 :    
1001 :     #===============================================================================
1002 :     #
1003 :     # Try to integrate non-conflicting HSPs for the same subject sequence. The
1004 :     # conflicts are only assessed from the standpoint of the query, at least for
1005 :     # now. We could track the subject sequence coverage as well (to avoid a direct
1006 :     # repeat in the query from matching the same subject twice).
1007 :     #
1008 :     # $new_coverage = integrate_HSP( $coverage, $hsp, $max_frac_overlap, $no_merge )
1009 :     #
1010 :     # 0 1 2 3 4 5 6 7
1011 :     # $coverage = [ scr, e_val, n_mat, n_id, n_pos, n_gap, dir, [ intervals_covered ] ]
1012 :     #
1013 :     # $coverage should be undefined at the first call; the function intiallizes
1014 :     # all of the fields from the first HSP. scr, n_mat, n_id, n_pos, and n_gap
1015 :     # are sums over the combined HSPs. e_val is based only of the first HSP.
1016 :     #
1017 :     # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1018 :     # $hsp = [ 'HSP', scr, e_val, n_seg, e_val2, n_mat, n_id, n_pos, n_gap, dir, s1, e1, sq1, s2, e2, sq2 ]
1019 :     #
1020 :     # $max_frac_overlap Amount of the new HSP that is allowed to overlap already
1021 :     # incorporated HSPs
1022 :     #
1023 :     # $no_merge Disable the merging of multiple HSPs. The structure will
1024 :     # be filled in from the first HSP and left unchanged though
1025 :     # subsequence calls. This simplifies the program structure.
1026 :     #
1027 :     # Fitting a new HSP into covered intervals:
1028 :     #
1029 :     # 1 qlen
1030 :     # |---------------------------------------------------------------| query
1031 :     # ------------ --------------- covered
1032 :     # ------------- new match
1033 :     # l r
1034 :     #
1035 :     #===============================================================================
1036 :    
1037 :     sub integrate_HSP
1038 :     { my ( $coverage, $hsp, $max_frac_overlap, $no_merge ) = @_;
1039 :     my ( undef, $scr, $e_val, undef, undef, $n_mat, $n_id, $n_pos, $n_gap, $dir, $s1, $e1 ) = @$hsp;
1040 :    
1041 :     # Ignore frame; just use direction of match:
1042 :    
1043 :     $dir = substr( $dir, 0, 1 );
1044 :    
1045 :     # Orient by left and right ends:
1046 :    
1047 :     my ( $l, $r ) = ( $e1 > $s1 ) ? ( $s1, $e1 ) : ( $e1, $s1 );
1048 :    
1049 :     # First HSP for the subject sequence:
1050 :    
1051 :     if ( ! $coverage )
1052 :     {
1053 :     return [ $scr, $e_val, $n_mat, $n_id, $n_pos, $n_gap, $dir, [ [ $s1, $e1 ] ] ];
1054 :     }
1055 :    
1056 :     # Not first; must be same direction to combine (also test no_merge here):
1057 :    
1058 :     return $coverage if ( $no_merge || ( $dir ne $coverage->[6] ) );
1059 :    
1060 :     # Not first; must fall in a gap of query sequence coverage:
1061 :    
1062 :     my @intervals = @{ $coverage->[7] };
1063 :     my $max_overlap = $max_frac_overlap * ( $r - $l + 1 );
1064 :     my $prev_end = 0;
1065 :     my $next_beg = $intervals[0]->[0];
1066 :     my @used = ();
1067 :     while ( $next_beg <= $l ) # *** Sequential search could be made binary
1068 :     {
1069 :     $prev_end = $intervals[0]->[1];
1070 :     push @used, scalar shift @intervals;
1071 :     $next_beg = @intervals ? $intervals[0]->[0] : 1e10;
1072 :     }
1073 :    
1074 :     my $overlap = ( ( $l <= $prev_end ) ? ( $prev_end - $l + 1 ) : 0 )
1075 :     + ( ( $r >= $next_beg ) ? ( $r - $next_beg + 1 ) : 0 );
1076 :     return $coverage if ( $overlap > $max_overlap );
1077 :    
1078 :     # Okay, we have passed the overlap test. We need to integrate the
1079 :     # match into the coverage description. Yes, I know that this counts
1080 :     # the overlap region. We could pro rate it, but that is messy too:
1081 :    
1082 :     $coverage->[0] += $scr;
1083 :     $coverage->[2] += $n_mat;
1084 :     $coverage->[3] += $n_id;
1085 :     $coverage->[4] += $n_pos;
1086 :     $coverage->[5] += $n_gap;
1087 :    
1088 :     # Refigure the covered intervals, fusing intervals separated by a
1089 :     # gap of less than 10:
1090 :    
1091 :     my $min_gap = 10;
1092 :     if ( $l <= $prev_end + $min_gap )
1093 :     {
1094 :     if ( @used ) { $l = $used[-1]->[0]; pop @used }
1095 :     else { $l = 1 }
1096 :     }
1097 :     if ( $r >= $next_beg - $min_gap )
1098 :     {
1099 :     if ( @intervals ) { $r = $intervals[0]->[1]; shift @intervals }
1100 :     else { $r = 1e10 }
1101 :     }
1102 :    
1103 :     $coverage->[7] = [ @used, [ $l, $r ], @intervals ];
1104 :    
1105 :     return $coverage;
1106 :     }
1107 :    
1108 :    
1109 :     #===============================================================================
1110 :     # Sort the blast matches by the desired criterion:
1111 :     #
1112 :     # @sims = order_query_sims( $qid, $qdef, $qlen, \@qsims, $order )
1113 :     #
1114 :     # Allowed sort orders are 'score', 'score_per_position', 'identity_fraction',
1115 :     # and 'positive_fraction'
1116 :     #
1117 :     # @qsims fields:
1118 :     #
1119 :     # 0 1 2 3 4 5 6 7 8
1120 :     # [ sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
1121 :     #
1122 :     #===============================================================================
1123 :    
1124 :     sub order_query_sims
1125 :     { my ( $qid, $qdef, $qlen, $qsims, $order ) = @_;
1126 :    
1127 :     my @sims;
1128 :     if ( $order eq 'score_per_position' )
1129 :     {
1130 :     @sims = map { [ $_->[5] ? $_->[3]/$_->[5] : 0, $_ ] } @$qsims;
1131 :     }
1132 :     elsif ( $order eq 'identity_fraction' )
1133 :     {
1134 :     @sims = map { [ $_->[5] ? $_->[6]/$_->[5] : 0, $_ ] } @$qsims;
1135 :     }
1136 :     elsif ( $order eq 'positive_fraction' )
1137 :     {
1138 :     @sims = map { [ $_->[5] ? $_->[7]/$_->[5] : 0, $_ ] } @$qsims;
1139 :     }
1140 :     else # Default is by 'score'
1141 :     {
1142 :     @sims = map { [ $_->[3], $_ ] } @$qsims;
1143 :     }
1144 :    
1145 :     map { [ $qid, $qdef, $qlen, @{$_->[1]} ] } sort { $b->[0] <=> $a->[0] } @sims;
1146 :     }
1147 :    
1148 :    
1149 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3