[Bio] / FigKernelPackages / representative_sequences.pm Repository:
ViewVC logotype

View of /FigKernelPackages/representative_sequences.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Dec 20 20:49:21 2006 UTC (13 years, 5 months ago) by golsen
Branch: MAIN
Adding representative_sequences.pm

package representative_sequences;

use strict;
use gjoparseblast;

#  These four variables are used as globals below.  They would be changed
#  for the SEED environment.  formatdb is run as a system command, and might
#  be encapsulated in a run() construct for the SEED.  blastall is in an
#  input pipe.

my $tmp_dir  = ".";
my $formatdb = "formatdb";
my $blastall = "blastall";

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT = qw( representative_sequences
                  rep_seq_2
                );

#===============================================================================
#  Construct a representative set of related sequences:
#
#    \@repseqs = representative_sequences( $ref, \@seqs, $max_sim, \%options );
#
#  or
#
#    ( \@repseqs, \%representing, \@low_sim ) = representative_sequences( $ref,
#                                                 \@seqs, $max_sim, \%options );
#
#
#  Output:
#
#    \@repseqs  Reference to the list of retained (representative subset)
#               of sequence entries.  Sequence entries have the form
#               [ $id, $def, $seq ]
#
#    \%representing
#               Reference to a hash in which the keys are the ids of the
#               representative sequences, for which the corresponding value
#               is a list of ids of other sequences that are represented by
#               that representive.
#
#
#  Arguments (only \@seqs is required):
#
#    $ref       A reference sequence as [ $id, $def, $seq ].  If present, the
#               reference sequence defines a focal point for the analysis.  A
#               representative sequence from each lineage in its vicinity will
#               be retained, even though they are more similar than max_sim to
#               the reference, or to each other.  The reference will always be
#               included in the representative set.  A limit is put on the
#               similarity of lineages retained by the reference sequence with
#               the max_ref_sim option (default = 0.99).  The reference sequence
#               should not be repeated in the set of other sequences.
#
#    \@seqs     Set of sequences to be pruned.  If there is no reference
#               sequence, the fist sequence in this list will be the starting
#               point for the analysis and will be retained, but all sequences
#               more similar than max_sim to it will be removed (in contrast to
#               a reference sequence, which retains a representative of each
#               lineage in its vicinity).  Sequences that fail the E-value test
#               relative to the reference (or the fist sequence if there is no
#               reference) are dropped.
#
#    $max_sim   Sequences with a higher similarity than max_sim to a retained
#               sequence will be deleted.  The details of the behaviour is
#               modified by other options. (default = 0.80)
#
#    \%options  Key=>Value pairs that modify the behaviour:
#
#        logfile    Filehandle for a logfile of the progress.  As each
#                   representative sequence is identified, its id is written
#                   to the logfile, followed by a tab separated list of the
#                   ids that it represents.  Autoflush is set for the logfile.
#                   If the value supplied is not a reference to a GLOB, then
#                   the log is sent to STDOUT (which is probably not what you
#                   want in most cases).  The behavior is intended to aid in
#                   following prgress, and in recovery of interupted runs.
#
#        max_ref_sim
#                   Maximum similarity of any sequence to the reference.  If
#                   max_ref_sim is less than max_sim, it is silently reset to
#                   max_sim.  (default = 0.99, because 1.0 can be annoying)
#
#        max_e_val  Maximum E-value for BLAST (probably moot, but will help
#                   with performance) (default = 0.01)
#
#        sim_meas   Measure similarity for inclusion or exclusion by
#                  'identity_fraction' (default), 'positive_fraction', or
#                  'score_per_position'
#
#        save_exp   When there is a reference sequence, lineages more similar
#                   than max_sim will be retained near the reference.  The
#                   default goal is to save one member of each lineage.  If
#                   the initial representative of the lineage is seq1, we
#                   pose the question, "Are there sufficiently deep divisions
#                   within the lineage to seq1 that it they might be viewed
#                   as independent?  That is, might there be another sequence,
#                   seq2 that so different from seq1 that we might want to see
#                   it also?
#
#                                +---------------------- ref
#                                |
#                             ---+ +-------------------- seq1
#                                +-+
#                                  +-------------------- seq2
#
#                   Without any special treatment, if the similarity of seq1
#                   to ref ( S(seq1,ref) ) is greater than max_sim, seq1 would
#                   be the sole representative of thelineage containing both
#                   seq1 and seq2, because the similarity of seq1 to seq2
#                   ( S(seq1,seq2) ) is greater than S(seq1,ref).  This can
#                   be altered by the value of save_exp.  In terms of
#                   similarity, seq2 will be discarded if:
#
#                       S(seq1,seq2) > S(seq1,ref) ** save_exp, and
#                       S(seq1,seq2) > S(seq2,ref) ** save_exp
#
#                   The default behavior described above occurs when save_exp
#                   is 1.  If save_exp < 1, then greater similarities between
#                   seq1 and seq2 are allowed.  Reasonable values of save_exp
#                   are roughly 0.7 to 1.0.  (At save_exp = 0, any similarity
#                   would be allowed; yuck.)
#
#        stable     If true (not undef, '', or 0), then the representatives
#                   will be chosen from as early in the list as possible (this
#                   facilitates augmentation of an existing list). 
#
#-------------------------------------------------------------------------------
#
#  Diagram of the pruning behavior:
#
#  0.5       0.6       0.7       0.8       0.9       1.0   Similarity
#   |---------|---------|---------|---------|---------|
#                       .
#                       .                            +  A
#                       .                        +---+
#                       .                        |   +  B
#                       .                    +---+
#                       .                    |   +----  C
#                       .         +----------+
#                       .         |          +--------  D
#                       .         |
#                     +-----------+ +-----------------  E
#                     | .         +-+
#                     | .           +-----------------  F
#    +----------------+ .
#    |                | . +---------------------------  G
#    |                +---+
#    |                  . |     +---------------------  H
#  --+                  . +-----+
#    |                  .       +---------------------  I
#    |                  .
#    |                +-------------------------------  J
#    +----------------+ .
#                     | . +---------------------------  K
#                     +---+
#                       . +---------------------------  L
#                       .
#   |---------|---------|---------|---------|---------|
#  0.5       0.6       0.7       0.8       0.9       1.0   Similarity
#
#  In the above tree and max_sim = 0.70 and max_ref_sim = 0.99:
#
#      With no reference sequence, and A first in the list, the representative
#      sequences will be A, G, J and K.
#
#      With A as the reference sequence and save_exp left at its default, the
#      representative sequences will be A, C, D, E, G, J and K.  B is excluded
#      because it is more similar than max_ref_sim to A.
#
#      With A as the reference sequence and save_exp = 0.8, the representative
#      sequences will be A, C, D, E, F (comparably similar to A and E), G,
#      H (comparably similar to A and G), J and K.  The sequence L will be
#      represented by K because L is much closer to K than to A.
#
#  This oversimplifies the choice of representative of a cluster of related
#  sequences.  For example, whether G, H or I would represent the group of
#  three actually depends on relative clock speed (slower is better) and
#  sequence coverage (more complete is better).  The actual order is by BLAST
#  bit score (possibly combining independent segments).
#
#  In addition, this discussion is in terms of a tree, but the calculations
#  are based on a (partial) set of pairwise sequence similarities.  Thus, the
#  precise behavior is hard to predict, but should be similar to that described
#  above.
#
#-------------------------------------------------------------------------------
#
#  To construct a representative set of sequences relative to a reference
#  sequence:
#
#    1. Prioritize sequences for keeping, from highest to lowest scoring
#             relative to reference, as measured by blast score (bits).
#             When stable is set, priority is from first to last in input file
#             (a reference sequence should not be supplied).
#
#    2. Based on the similarity of each sequence to the reference and save_exp,
#             compute sequence-specific values of max_sim:
#
#         max_sim( seq_i ) = exp( save_exp * ln( seq_i_ref_sim ) )
#
#    3. Examine the next prioritized sequence (seq1).
#
#    4. If seq1 has been vetoed, go to 7.
#
#    5. Mark seq1 to keep.
#
#    6. Use blast to find similarities of seq1 to other sequences.
#
#    7. For each similar sequence (seq2):
#
#        7a. Skip if seq2 is marked to keep, or marked for veto
#
#        7b. Compute the maximum simiarity of seq1 and seq2 for retaining seq2:
#
#            max_sim_1_2 = max( max_sim, max_sim( seq1 ), max_sim( seq2 ) )
#
#        7c. If similarity of seq1 and seq2 > max_sim, veto seq2
#
#        7d. Next seq2
#
#    8. If there are more sequences to examine, go to 3.
#
#    9. Collect the sequences marked for keeping.
#
#===============================================================================

sub representative_sequences {
    my $seqs = ( shift @_ || shift @_ );  #  If $ref is undef, shift again
    ref( $seqs ) eq "ARRAY"
        or die "representative_sequences called with bad first argument\n";

    my ( $ref, $use_ref );
    if ( ! ref( $seqs->[0] ) )  #  First item was sequence entry, not list of entries
    {
	$ref = $seqs;
	$seqs = shift @_;
	ref( $seqs ) eq "ARRAY"
	    and ref( $seqs->[0] ) eq "ARRAY"
	    or die "representative_sequences called with bad sequences list\n";
	$use_ref = 1;
    }
    else                        #  First item was list of entries, split off first
    {
	ref( $seqs->[0] ) eq "ARRAY"
	    or die "representative_sequences called with bad sequences list\n";
	$ref = shift @$seqs;
	$use_ref = 0;
    }

    my $max_sim = shift @_;
    my $options;

    #  Undocumented feature: skip max_sim (D = 0.8)

    if ( ref( $max_sim ) eq "HASH" )
    {
	$options = $max_sim;
	$max_sim = undef;
    }

    #  If the above did not give us options, get them now:

    $options ||= ( shift @_ ) || {};

    # ---------------------------------------# Default values for options

    $max_sim      ||= 0.80;                  # Retain 80% identity of less
    my $logfile     = undef;                 # Log file of sequences processed
    my $max_ref_sim = 0.99;                  # Get rid of identical sequences
    my $max_e_val   = 0.01;                  # Blast E-value to decrease output
    my $sim_meas    = 'identity_fraction';   # Use sequence identity as measure
    my $save_exp    = 1.0;                   # Don't retain near equivalents
    my $stable      = 0;                     # Pick reps input order

    #  Two questionable decisions:
    #     1. Be painfully flexible on option names.
    #     2. Silently fix bad parameter values.

    foreach ( keys %$options )
    {
	my $value = $options->{ $_ };
	if    ( m/^log/i )                  #  logfile
	{
	    next if ! $value;
	    $logfile = ( ref $value eq "GLOB" ? $value : \*STDOUT );
	    select( ( select( $logfile ), $| = 1 )[0] );  #  autoflush on
	}
	elsif ( m/ref/i )                   #  max_ref_sim
	{
	    $value += 0;
	    $value  = 0 if $value < 0;
	    $value  = 1 if $value > 1; 
	    $max_ref_sim = $value;
	}
	elsif ( m/max/i && m/sim/i )        #  max(imum)_sim(ilarity)
	{
	    $value += 0;
	    $value  = 0 if $value < 0;
	    $value  = 1 if $value > 1; 
	    $max_sim = $value;
	}
	elsif ( m/max/i || m/[ep]_?val/i )  #  Other "max" tests must come first
	{
	    $value += 0;
	    $value  = 0 if $value < 0;
	    $max_e_val = $value;
	}
	elsif ( m/sim/i || m/meas/i )       #  sim(ilarity)_meas(ure)
	{
	    $sim_meas = standardize_similarity_measure( $value );
	}
	elsif ( m/sav/i || m/exp/i )        #  save_exp(onent)
	{
	    $value += 0;
	    $value  = 0 if $value < 0;
	    $value  = 1 if $value > 1; 
	    $save_exp = $value;
	}
	elsif ( m/stab/i )                  #  stable order
	{
	    $stable = $value ? 1 : 0;
	}
	else
	{
	    print STDERR "WARNING: representative_sequences bad option ignored: '$_' => '$value'\n";
	}
    }

    #  Silent sanity check.  This should not happen, as it is almost equivalent
    #  to making no reference sequence.

    $max_ref_sim = $max_sim if ( $max_ref_sim < $max_sim );

    #  Do the analysis

    my $ref_id  = $ref->[0];
    my $ref_seq = $ref->[2];

    #  Build a list of the ids (without ref) and an index for the sequence entries:

    my @seq_id = map { $_->[0] } @$seqs;
    my $seq_ind = { map { @{$_}[0] => $_ } ( $ref, @$seqs ) };

    #  Make a lookup table of the sequence number, for use in reording
    #  sequences later:

    my $n = 0;
    my %ord = ( map { @$_[0] => ++$n } @$seqs );

    #  Build blast database (it includes the reference):

    my $protein = are_protein( $seqs );
    my $db = ( $tmp_dir ? "$tmp_dir/" : "" ) . "tmp_blast_db_$$";
    make_blast_db( $db, [ $ref, @$seqs ], $protein );

    #  Search query against new database

    my $max  = 3 * @$seqs;  # Alignments to keep
    my $self = 1;           # Keep self match (for its bit score)

    my $blast_opt = "-e $max_e_val -v $max -b $max -F F -a 2";

    #  Do the blast analysis.  Returned records are of the form:
    #
    #        0    1     2     3    4     5     6     7      8     9     10     11
    #     [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]

    my $prog = $protein ? 'blastp' : 'blastn';
    $blast_opt .= " -r 1 -q -1" if ! $protein;
    my @ref_hits = top_blast_per_subject( $prog, $db, $ref_id, $ref_seq, $self, $blast_opt );

    #  First hit is always a perfect match, so we get bits per position:
    #  This is only used if the measure is bits per position

    my $ref_bpp = $ref_hits[0]->[6] / $ref_hits[0]->[8];

    #  Remove self match (might not be first if there are identical sequences):

    my %hit = ();
    @ref_hits = grep { my $sid = $_->[3]; $hit{ $sid } = 1; ( $sid ne $ref_id ) } @ref_hits;

    my %keep = ();
    $keep{ $ref_id } = [];
    my %veto = ();
    my $n_to_do = @ref_hits;
    my $rebuild_d_n  = 40;
    my $last_rebuild = 1.5 * $rebuild_d_n;
    my $rebuild = ( $n_to_do > $last_rebuild ) ? $n_to_do - $rebuild_d_n : 0;

    #  Sequence-specific maximum similarities:

    my %max_sim = map { ( $_ => $max_sim ) } @seq_id; 

    foreach ( @ref_hits )
    {
	my $id = $_->[3];
	my $sim = seq_similarity( $_, $sim_meas, $ref_bpp );

	if ( $sim > ( $use_ref ? $max_ref_sim : $max_sim ) )
	{
	    $veto{ $id } = 1;
	    push @{ $keep{ $ref_id } }, $id;   #  Log the sequences represented
	    $n_to_do--;
	} 
	else
	{
	    my $max_sim_i = exp( $save_exp * log( $sim ) );
	    $max_sim{ $id } = $max_sim_i if ( $max_sim_i > $max_sim );
	}
    }


    if ( $logfile )
    {
	print $logfile join( "\t", $ref_id, @{ $keep{ $ref_id } } ), "\n";
    }

    #  Search each sequence against the database.
    #  If the order is to be stable, reorder hits to match input order.

    my ( $id1, $seq1, $max_sim_1, $id2, $max_sim_2, $bpp_max );
    my @ids_to_do = map { $_->[3] } @ref_hits;
    @ids_to_do = sort { $ord{ $a } <=> $ord{ $b } } @ids_to_do if $stable;

    while ( $id1 = shift @ids_to_do )
    {
	next if $veto{ $id1 };

	#  Is it time to rebuild a smaller BLAST database?  This helps
	#  significantly in the overall performance.

	if ( $n_to_do <= $rebuild )
	{
	    if ( $protein ) { unlink $db, "$db.psq", "$db.pin", "$db.phr" }
	    else            { unlink $db, "$db.nsq", "$db.nin", "$db.nhr" }
	    make_blast_db( $db, [ map { $seq_ind->{ $_ } } # id to sequence entry
	                          grep { ! $veto{ $_ } }   # id not vetoed
	                          ( $id1, @ids_to_do )     # remaining ids
	                        ],
	                   $protein
	                 );
	    $rebuild = ( $n_to_do > $last_rebuild ) ? $n_to_do - $rebuild_d_n : 0;
	}

	$n_to_do--;
	$keep{ $id1 } = [];

	$max_sim_1 = $max_sim{ $id1 };
	$bpp_max = undef;
	$seq1 = $seq_ind->{$id1}->[2];
	foreach ( top_blast_per_subject( $prog, $db, $id1, $seq1, $self, $blast_opt ) )
	{
	    $bpp_max ||= $_->[6] / $_->[8];
	    $id2 = $_->[3];
	    next if ( $veto{ $id2 } || $keep{ $id2 } );
	    $max_sim_2 = $max_sim{ $id2 };
	    $max_sim_2 = $max_sim_1 if ( $max_sim_1 > $max_sim_2 );
	    if ( seq_similarity( $_, $sim_meas, $bpp_max ) > $max_sim_2 )
	    {
		$veto{ $id2 } = 1;
		push @{ $keep{ $id1 } }, $id2;  #  Log the sequences represented
		$n_to_do--;
	    }
	}

	if ( $logfile )
	{
	    print $logfile join( "\t", $id1, @{ $keep{ $id1 } } ), "\n";
	}
    }

    if ( $protein ) { unlink $db, "$db.psq", "$db.pin", "$db.phr" }
    else            { unlink $db, "$db.nsq", "$db.nin", "$db.nhr" }

    #  Return the surviving sequence entries, and optionally the hash of
    #  ids represented by each survivor:

    my $kept = [ $ref, grep { $keep{ $_->[0] } } @$seqs ];

    wantarray ? ( $kept, \%keep, [ grep { ! $hit{ $_->[0] } } @$seqs ] ) : $kept;
}


#===============================================================================
#  Build or add to a set of representative sequences.
#
#    \@reps = rep_seq_2( \@reps, \@new, \%options );
#    \@reps = rep_seq_2(         \@new, \%options );
#
#  or
#
#    ( \@reps, \%representing ) = rep_seq_2( \@reps, \@new, \%options );
#    ( \@reps, \%representing ) = rep_seq_2(         \@new, \%options );
#
#===============================================================================

sub rep_seq_2
{
    ref $_[0] eq 'ARRAY'
            or print STDERR "First parameter of rep_seq_2 must be an ARRAY reference\n"
               and return undef;

    my ( $reps, $seqs, $options ) = ( [], undef, {} );

    if    ( @_ == 1 )
    {
        $seqs = shift;
    }

    elsif ( @_ == 2 )
    {
        if    ( ref $_[1] eq 'ARRAY' ) { ( $reps, $seqs )    = @_ }
        elsif ( ref $_[1] eq 'HASH'  ) { ( $seqs, $options ) = @_ }
        else
        {
            print STDERR "Second parameter of rep_seq_2 must be an ARRAY or HASH reference\n";
            return undef;
        }
    }

    elsif ( @_ == 3 )
    {
        if ( ref $_[1] ne 'ARRAY' )
        {
            print STDERR "Second parameter of 3 parameter rep_seq_2 must be an ARRAY reference\n";
            return undef;
        }
        if ( ref $_[2] ne 'HASH' )
        {
            print STDERR "Third parameter of 3 parameter rep_seq_2 must be a HASH reference\n";
            return undef;
        }

        ( $reps, $seqs, $options ) = @_;
    }
    else
    {
        print STDERR "rep_seq_2 called with @{[scalar @_]} parameters\n";
        return undef;
    }

    # ---------------------------------------# Default values for options

    my $max_sim   = 0.80;                  # Retain 80% identity of less
    my $logfile   = undef;                 # Log file of sequences processed
    my $max_e_val = 0.01;                  # Blast E-value to decrease output
    my $sim_meas  = 'identity_fraction';   # Use sequence identity as measure

    #  Two questionable decisions:
    #     1. Be painfully flexible on option names.
    #     2. Silently fix bad parameter values.

    foreach ( keys %$options )
    {
	my $value = $options->{ $_ };
	if    ( m/^log/i )                  #  logfile
	{
	    next if ! $value;
	    $logfile = ( ref $value eq "GLOB" ? $value : \*STDOUT );
	    select( ( select( $logfile ), $| = 1 )[0] );  #  autoflush on
	}
	elsif ( m/max/i && m/sim/i )        #  max(imum)_sim(ilarity)
	{
	    $value += 0;
	    $value  = 0 if $value < 0;
	    $value  = 1 if $value > 1; 
	    $max_sim = $value;
	}
	elsif ( m/max/i || m/[ep]_?val/i )  #  Other "max" tests must come first
	{
	    $value += 0;
	    $value  = 0 if $value < 0;
	    $max_e_val = $value;
	}
	elsif ( m/sim/i || m/meas/i )       #  sim(ilarity)_meas(ure)
	{
	    $sim_meas = standardize_similarity_measure( $value );
	}
	else
	{
	    print STDERR "WARNING: rep_seq_2 bad option ignored: '$_' => '$value'\n";
	}
    }

    #  Check sequence ids for duplicates:

    my $reps2 = [];
    my $seen  = {};

    my $id;
    foreach ( @$reps )
    {
        $id = $_->[0];
        if ( $seen->{ $id }++ )
        {
            print STDERR "Duplicate sequence id '$id' skipped by rep_seq_2\n";
        }
        else
        {
            push @$reps2, $_;
        }
    }

    my $seqs2 = [];
    foreach ( @$seqs )
    {
        $id = $_->[0];
        if ( $seen->{ $id }++ )
        {
            print STDERR "Duplicate sequence id '$_[0]' skipped by rep_seq_2\n";
        }
        else
        {
            push @$seqs2, $_;
        }
    }

    #  If no preexisting representatives, then take first sequence:

    @$reps2 or @$reps2 = ( shift @$seqs2 );

    if ( $logfile ) { foreach ( @$reps2 ) { print $logfile "$_->[0]\n" } }

    #  Search each rep sequence against itself for max_bpp

    my $db = ( $tmp_dir ? "$tmp_dir/" : "" ) . "tmp_blast_db_$$";
    my $protein = are_protein( $reps2 );
    my %max_bpp;
    my $entry;
    foreach $entry ( @$reps2 )
    {
        $max_bpp{ $entry->[0] } = self_bpp( $db, $entry, $protein );
    }

    my $naln = 10;  # Alignments to make
    my $self =  1;  # Self match will never occur
    my $prog = $protein ? 'blastp' : 'blastn';
    my $blast_opt = "-e $max_e_val -v $naln -b $naln -F F -a 2";
    $blast_opt   .= " -r 1 -q -1" if ! $protein;

    #  List of who is represented by a sequence:

    my %keep = map { $_->[0] => [] } @$reps2;

    #  Search each sequence against the database.

    my ( $seq, $bpp_max );
    my $newdb = 1;

    foreach $entry ( @$seqs2 )
    {
	#  Is it time to rebuild a BLAST database?

	if ( $newdb )
	{
	    make_blast_db( $db, $reps2, $protein );
	    $newdb = 0;
	}

        #  Do the blast analysis.  Returned records are of the form:
        #
        #        0    1     2     3    4     5     6     7      8     9     10     11
        #     [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]

	$id  = $entry->[0];
	$seq = $entry->[2];
        my ( $tophit ) = sort { $b->[0] <=> $a->[0] }
                         grep { $_->[0] >= $max_sim }
                         map  { [ seq_similarity( $_, $sim_meas, $max_bpp{ $_->[3] } ), $_ ] }
                         top_blast_per_subject( $prog, $db, $id, $seq, $self, $blast_opt );


        # It matches an existing representative

	if ( $tophit )
	{
	    # print STDERR join(", ", $tophit->[0], @{$tophit->[1]} ), "\n";
	    push @{ $keep{ $tophit->[1]->[3] } }, $id;
	    print $logfile "$id\t$tophit->[1]->[3]\n" if $logfile;
	}

	# It is a new representative

	else
	{
	    push @$reps2, $entry;
	    $keep{ $id } = [];
	    $max_bpp{ $id } = self_bpp( $db, $entry, $protein );
	    $newdb = 1;
	    print $logfile "$id\n" if $logfile;
	}
    }

    if ( $protein ) { unlink $db, "$db.psq", "$db.pin", "$db.phr" }
    else            { unlink $db, "$db.nsq", "$db.nin", "$db.nhr" }

    #  Return the surviving sequence entries, and optionally the hash of
    #  ids represented by each survivor:

    wantarray ? ( $reps2, \%keep ) : $reps2;
}


#===============================================================================
#  Try to figure out the sequence similarity measure that is being requested:
#
#     $type = standardize_similarity_measure( $requested_type )
#
#===============================================================================

sub standardize_similarity_measure
{   my ( $req_meas ) = @_;
    return ( ! $req_meas )          ? 'identity_fraction'
         : ( $req_meas =~ /id/i )   ? 'identity_fraction'
         : ( $req_meas =~ /sc/i )   ? 'score_per_position'
         : ( $req_meas =~ /spp/i )  ? 'score_per_position'
         : ( $req_meas =~ /bit/i )  ? 'score_per_position'
         : ( $req_meas =~ /bpp/i )  ? 'score_per_position'
         : ( $req_meas =~ /tiv/i )  ? 'positive_fraction'
         : ( $req_meas =~ /pos_/i ) ? 'positive_fraction'
         : ( $req_meas =~ /ppp/i )  ? 'positive_fraction'
         :                            'identity_fraction';
}


#===============================================================================
#  Caluculate sequence similarity according to the requested measure:
#
#     $similarity = seq_similarity( $hit, $measure, $bpp_max )
#
#  $hit is a structure with blast information:
#
#  [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ] 
#===============================================================================

sub seq_similarity
{   my ( $hit, $measure, $bpp_max ) = @_;
    return  ( @$hit < 11 )        ? undef
          : ( $measure =~ /^sc/ ) ? $hit->[ 6] / ( $hit->[8] * ( $bpp_max || 2 ) )
          : ( $measure =~ /^po/ ) ? $hit->[10] /   $hit->[8]
          :                         $hit->[ 9] /   $hit->[8]
}


#===============================================================================
#  Caluculate self similarity of a sequence in bits per position:
#
#     $max_bpp = self_bpp( $db_name, $entry, $protein )
#
#===============================================================================

sub self_bpp
{
    my ( $db, $entry, $protein ) = @_;

    #  Build blast database:

    make_blast_db( $db, [ $entry ], $protein );

    #  Search sequence against the database

    my $id   = $entry->[0];
    my $seq  = $entry->[2];
    my $self = 1;  # Self match will never occur

    my $prog = $protein ? 'blastp' : 'blastn';
    my $blast_opt = "-v 1 -b 1 -F F -a 2";
    $blast_opt .= " -r 1 -q -1" if ! $protein;

    #  Do the blast analysis.  Returned records are of the form:
    #
    #        0    1     2     3    4     5     6     7      8     9     10     11
    #     [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]

    my ( $hit ) = top_blast_per_subject( $prog, $db, $id, $seq, $self, $blast_opt );
    # print STDERR join( ", ", @$hit ), "\n";

    #  First hit is always a perfect match, so we get bits per position:
    #  This is only used if the measure is bits per position

    $hit->[6] / $hit->[8];
}


#===============================================================================
#  Make a blast databse from a set of sequence entries.  The type of database
#  (protein or nucleic acid) is quessed from the sequence data.
#
#     make_blast_db( $db_filename, \@seq_entries, $protein )
#
#  Sequence entries have the form: [ $id, $def, $seq ]
#===============================================================================

sub make_blast_db
{   my ( $db, $seqs, $protein ) =  @_;

    open FH, ">$db" or die "Could not create blast database file \"$db\"";
    foreach ( @$seqs )
    {
	print FH ">$_->[0]", ( $_->[1] ? " $_->[1]" : () ), "\n$_->[2]\n";
    }
    close FH;

    my $is_prot = $protein ? 'T' : 'F';
    system "$formatdb -p $is_prot -i '$db'";
}


#===============================================================================
#  The type of data (protein or nucleic acid) is quessed from the sequences.
#
#     are_protein( \@seq_entries )
#
#  Sequence entries have the form: [ $id, $def, $seq ]
#===============================================================================

sub are_protein
{   my ( $seqs ) =  @_;
    my  ( $nt, $aa ) = ( 0, 0 );
    foreach ( @$seqs )
    {
	my $s = $_->[2];
	$nt += $s =~ tr/ACGTacgt//d;
	$aa += $s =~ tr/A-Za-z//d;
    }
    ( $nt < 3 * $aa ) ? 1 : 0;
}


#===============================================================================
#  Blast a subject against a datbase, saving only top hit per subject
#
#  Return:
#
#   [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ] 
#
#===============================================================================

# Use a bare block to compartmentalize a counter:

{ my $cnt = 0;

sub top_blast_per_subject
{   my ( $prog, $db, $qid, $qseq, $self, $blast_opt, $sort, $no_merge ) = @_;

    $cnt++;
    my $query_file = ( $tmp_dir ? "$tmp_dir/" : "" ) . "tmp_blast_seq_${$}_$cnt";
    my    $QFILE;
    open  $QFILE, ">$query_file" or die "Could not create sequence file \"$query_file\"";
    print $QFILE  ">$qid\n$qseq\n";
    close $QFILE;

    my $blast_cmd = "$blastall -p $prog -d '$db' -i '$query_file' $blast_opt";

    my   $BPIPE;
    open $BPIPE, "$blast_cmd |" or die "Could open blast pipe\n";
    my $sims = integrate_blast_segments( $BPIPE, $sort, $no_merge, $self );
    close $BPIPE;
    unlink $query_file;

    my $pq = "";  #  Previous query id
    my $ps = "";  #  Previous subject id
    my $keep;

    grep { $keep = ( $pq ne $_->[0] ) || ( $ps ne $_->[3] );
           $pq = $_->[0];
           $ps = $_->[3];
           $keep && ( $self || ( $pq ne $ps ) );
         } @$sims;
}
}


#===============================================================================
#  Read output of rationalize blast and assemble minimally overlapping segments
#  into a total score for each subject sequence.  For each query, sort matches
#  into user-chosen order (D = total score):
#
#      @sims = integrate_blast_segments( \*FILEHANDLE, $sort_order, $no_merge )
#     \@sims = integrate_blast_segments( \*FILEHANDLE, $sort_order, $no_merge )
#
#  Allowed sort orders are 'score', 'score_per_position', 'identity_fraction',
#  and 'positive_fraction' (matched very flexibly).
#
#  Returned sims (e_val is only for best HSP, not any composite):
#
#     [ qid, qdef, qlen, sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ] 
#
#  There is a strategic decision to not read the blast output from memory;
#  it could be enormous.  This cuts the flexibility some.
#===============================================================================
#
#  coverage fields:
#
#  [ scr, e_val, n_mat, n_id, n_pos, n_gap, dir, [ intervals_covered ] ] 
#
#===============================================================================

sub integrate_blast_segments
{   my ( $fh, $order, $no_merge, $self ) = @_;
    $fh ||= \*STDIN;
    ( ref( $fh ) eq "GLOB" ) || die "integrate_blast_segments called without a filehandle\n";

    $order = ( ! $order )         ? 'score'
           : ( $order =~ /sc/i )  ? ( $order =~ /p/i ? 'score_per_position' : 'score' )
           : ( $order =~ /bit/i ) ? ( $order =~ /p/i ? 'score_per_position' : 'score' )
           : ( $order =~ /spp/i ) ? 'score_per_position'
           : ( $order =~ /id/i )  ? 'identity_fraction'
           : ( $order =~ /tiv/i ) ? 'positive_fraction'
           :                        'score';

    my $max_frac_overlap = 0.2;

    my ( $qid, $qdef, $qlen, $sid, $sdef, $slen );
    my ( $scr, $e_val, $n_mat, $n_id, $n_pos, $n_gap );
    my ( $ttl_scr, $ttl_mat, $ttl_id, $ttl_pos, $ttl_gap );
    my @sims = ();
    my @qsims = ();
    my $coverage = undef;
    my $record;

    while ( $_ = next_blast_record( $fh, $self ) )
    {
	chomp;
	if    ( $_->[0] eq 'Query=' )
	{
	    if ( $coverage )
	    {
		push @qsims, [ $sid, $sdef, $slen, @$coverage[ 0 .. 5 ] ];
		$coverage = undef;
	    }
	    if ( @qsims ) { push @sims, order_query_sims( $qid, $qdef, $qlen, \@qsims, $order ) }
	    ( undef, $qid, $qdef, $qlen ) = @$_;
	    $sid = undef;
	    @qsims = ();
	}
	elsif ( $_->[0] eq '>' )
	{
	    if ( $coverage )
	    {
		push @qsims, [ $sid, $sdef, $slen, @$coverage[ 0 .. 5 ] ];
		$coverage = undef;
	    }
	    next if ! $qid;
	    ( undef, $sid, $sdef, $slen ) = @$_;
	}
	elsif ( $_->[0] eq 'HSP' && $sid )
	{
	    $coverage = integrate_HSP( $coverage, $_, $max_frac_overlap, $no_merge );
	}
    }

    if ( $coverage ) { push @qsims, [ $sid, $sdef, $slen, @$coverage[ 0 .. 5 ] ] }

    if ( @qsims ) { push @sims, order_query_sims( $qid, $qdef, $qlen, \@qsims, $order ) }

    wantarray ? @sims : \@sims;
}


#===============================================================================
#
#  Try to integrate non-conflicting HSPs for the same subject sequence.  The
#  conflicts are only assessed from the standpoint of the query, at least for
#  now.  We could track the subject sequence coverage as well (to avoid a direct
#  repeat in the query from matching the same subject twice).
#
#    $new_coverage = integrate_HSP( $coverage, $hsp, $max_frac_overlap, $no_merge )
#
#                 0      1     2      3     4      5     6             7
#  $coverage = [ scr, e_val, n_mat, n_id, n_pos, n_gap, dir, [ intervals_covered ] ]
#
#      $coverage should be undefined at the first call; the function intiallizes
#      all of the fields from the first HSP.  scr, n_mat, n_id, n_pos, and n_gap
#      are sums over the combined HSPs.  e_val is based only of the first HSP.
#
#             0     1     2      3      4       5      6     7      8     9  10   11  12  13   14  15
#  $hsp = [ 'HSP', scr, e_val, n_seg, e_val2, n_mat, n_id, n_pos, n_gap, dir, s1, e1, sq1, s2, e2, sq2 ]
#
#  $max_frac_overlap  Amount of the new HSP that is allowed to overlap already
#                     incorporated HSPs
#
#  $no_merge          Disable the merging of multiple HSPs.  The structure will
#                     be filled in from the first HSP and left unchanged though
#                     subsequence calls.  This simplifies the program structure.
#
#  Fitting a new HSP into covered intervals:
#
#   1                                                              qlen
#   |---------------------------------------------------------------| query
#           ------------                ---------------               covered
#                     -------------                                   new match
#                     l           r
#
#===============================================================================

sub integrate_HSP
{   my ( $coverage, $hsp, $max_frac_overlap, $no_merge ) = @_;
    my ( undef, $scr, $e_val, undef, undef, $n_mat, $n_id, $n_pos, $n_gap, $dir, $s1, $e1 ) = @$hsp;

    #  Ignore frame; just use direction of match:

    $dir = substr( $dir, 0, 1 );

    #  Orient by left and right ends:

    my ( $l, $r ) = ( $e1 > $s1 ) ? ( $s1, $e1 ) : ( $e1, $s1 );

    #  First HSP for the subject sequence:

    if ( ! $coverage )
    {
	return [ $scr, $e_val, $n_mat, $n_id, $n_pos, $n_gap, $dir, [ [ $s1, $e1 ] ] ];
    }

    #  Not first; must be same direction to combine (also test no_merge here):

    return $coverage  if ( $no_merge || ( $dir ne $coverage->[6] ) );

    #  Not first; must fall in a gap of query sequence coverage:

    my @intervals = @{ $coverage->[7] };
    my $max_overlap = $max_frac_overlap * ( $r - $l + 1 );
    my $prev_end = 0;
    my $next_beg = $intervals[0]->[0];
    my @used = ();
    while ( $next_beg <= $l )      # *** Sequential search could be made binary
    {
	$prev_end = $intervals[0]->[1];
	push @used, scalar shift @intervals;
	$next_beg = @intervals ? $intervals[0]->[0] : 1e10;
    }

    my $overlap = ( ( $l <= $prev_end ) ? ( $prev_end - $l + 1 ) : 0 )
                + ( ( $r >= $next_beg ) ? ( $r - $next_beg + 1 ) : 0 );
    return $coverage  if ( $overlap > $max_overlap );

    #  Okay, we have passed the overlap test.  We need to integrate the
    #  match into the coverage description.  Yes, I know that this counts
    #  the overlap region.  We could pro rate it, but that is messy too:

    $coverage->[0] += $scr;
    $coverage->[2] += $n_mat;
    $coverage->[3] += $n_id;
    $coverage->[4] += $n_pos;
    $coverage->[5] += $n_gap;

    #  Refigure the covered intervals, fusing intervals separated by a
    #  gap of less than 10:

    my $min_gap = 10;
    if ( $l <= $prev_end + $min_gap )
    {
	if ( @used ) { $l = $used[-1]->[0]; pop @used }
	else         { $l = 1 }
    }
    if ( $r >= $next_beg - $min_gap )
    {
	if ( @intervals ) { $r = $intervals[0]->[1]; shift @intervals }
	else              { $r = 1e10 }
    }

    $coverage->[7] = [ @used, [ $l, $r ], @intervals ];

    return $coverage;
}


#===============================================================================
#  Sort the blast matches by the desired criterion:
#
#     @sims = order_query_sims(  $qid, $qdef, $qlen, \@qsims, $order )
#
#  Allowed sort orders are 'score', 'score_per_position', 'identity_fraction',
#  and 'positive_fraction'
#
#  @qsims fields:
#
#        0    1     2     3     4      5     6      7      8
#     [ sid, sdef, slen, scr, e_val, n_mat, n_id, n_pos, n_gap ]
#
#===============================================================================

sub order_query_sims
{   my ( $qid, $qdef, $qlen, $qsims, $order ) = @_;

    my @sims;
    if ( $order eq 'score_per_position' )
    {
	@sims = map { [ $_->[5] ? $_->[3]/$_->[5] : 0, $_ ] } @$qsims;
    }
    elsif ( $order eq 'identity_fraction' )
    {
	@sims = map { [ $_->[5] ? $_->[6]/$_->[5] : 0, $_ ] } @$qsims;
    }
    elsif ( $order eq 'positive_fraction' )
    {
	@sims = map { [ $_->[5] ? $_->[7]/$_->[5] : 0, $_ ] } @$qsims;
    }
    else  # Default is by 'score'
    {
	@sims = map { [ $_->[3], $_ ] } @$qsims;
    }

    map { [ $qid, $qdef, $qlen, @{$_->[1]} ] } sort { $b->[0] <=> $a->[0] } @sims;
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3