[Bio] / FigKernelPackages / find_homologs.pm Repository:
ViewVC logotype

View of /FigKernelPackages/find_homologs.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Sun Feb 22 14:38:13 2009 UTC (10 years, 9 months ago) by overbeek
Branch: MAIN
CVS Tags: rast_rel_2009_05_18, rast_rel_2010_0928, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2010_1206, rast_rel_2010_0118, rast_rel_2009_07_09, rast_rel_2010_0827, rast_rel_2009_03_26
Changes since 1.2: +13 -5 lines
update from Gary

package find_homologs;

#
#  Find homologs to a set of nucleotide sequences
#
use strict;
use gjoseqlib;
use gjoparseblast;
# use Data::Dumper;

my $DEBUG = 0;

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT = qw(
        find_nucleotide_homologs
        );

#  The following code makes this FIG-aware, but not FIG dependent:

my $is_fig;
eval { require FIG; require FIG_Config; $is_fig = 1 };

my $ext_bin = $is_fig ? "$FIG_Config::ext_bin" : '';


#
#  \@instances = find_nucleotide_homologs( \@contigs,     \%options )
#  \@instances = find_nucleotide_homologs(  $contig_file, \%options )
#
#  An instance is:
#
#      { location => $loc, definition => $def, sequence => $seq }
#
#  Options:
#
#      blastall    =>  $blastall_path
#      coverage    =>  $min_coverage         # D = 0.70
#      descript    =>  $description          # D = ''
#      description =>  $description          # D = ''
#      expect      =>  $max_expect           # D = 1e-2
#      extrapol    =>  $max_extrapolate      # D = 0
#      extrapolate =>  $max_extrapolate      # D = 0
#      formatdb    =>  $formatdb_path
#      identity    =>  $min_identity         # D = 0.60
#      maxgap      =>  $max_gap              # D = 5000 nt
#      prefix      =>  $contig_id_prefix     # D = ''
#      reffile     =>  $filename
#      refseq      => \@seqs
#      seedexp     =>  $max_seed_expect      # D = 1e-20
#      tmp         =>  $location_for_tmpdir
#      tmpdir      =>  $location_of_tmpdir
#      verbose     =>  boolean               # D = 0
#
#  Analyze one contig at a time:
#
#      $query_results = rationalize_blast_query( $fh )
#
#  Output is clustered heirarchically:
#
#    [ qid, qdef, qlen, [ [ sid, sdef, slen, [ hsp_data, hsp_data, ... ] ],
#                         [ sid, sdef, slen, [ hsp_data, hsp_data, ... ] ],
#                         ...
#                       ]
#    ]
#
#                0    1    2    3     4     5    6     7     8   9   10   11   12  13   14
#  hsp_data = [ scr, exp, p_n, pval, nmat, nid, nsim, ngap, dir, q1, q2, qseq, s1, s2, sseq ]
#

sub find_nucleotide_homologs
{
    my ( $contigs, $options ) = @_;
    -f $contigs && -s $contigs
       or ref $contigs eq 'ARRAY' && ref $contigs->[0] eq 'ARRAY'
       or print STDERR "find_nucleotide_homologs called with bad \\\@contigs\n"
          and return [];
    ref $options eq 'HASH'
       or print STDERR "find_nucleotide_homologs called with bad \\%options\n"
          and return [];

    my $blastall  = $options->{ blastall } ||= ( $ext_bin ? "$ext_bin/blastall" : "blastall" );
    my $min_cover = $options->{ coverage } ||=    0.70;  # Minimum fraction of reference covered
    my $max_exp   = $options->{ expect }   ||=    0.01;  # Maximum e-value for blast
    my $extrapol  = $options->{ extrapol } ||= $options->{ extrapolate } || 0;  # Max to extrapolate ends of subj seq
    my $formatdb  = $options->{ formatdb } ||= ( $ext_bin ? "$ext_bin/formatdb" : "formatdb" );
    my $descr     = $options->{ descript } ||= $options->{ description } || "";
    my $min_ident = $options->{ identity } ||=    0.60;  # Minimum fraction sequence identity
    my $max_gap   = $options->{ maxgap }   ||= 5000;     # Maximum gap in genome match
    my $prefix    = $options->{ prefix }   ||=   "";
    my $ref_file  = $options->{ reffile };
    my $ref_seq   = $options->{ refseq };
    my $seed_exp  = $options->{ seedexp }  ||=    1e-20; # Maximum e-value to nucleate match
    my $verbose   = $options->{ verbose }  ||=    0;

    my ( $tmp_dir, $save_tmp ) = temporary_directory( $options );

    #  Build the blast database of reference sequences:
    #
    #  Four cases:
    #     "$ref_file.nsq" exists
    #         use it
    #     "$ref_file.neq" and $ref_seq do not exist, but "$ref_file" exists
    #         run formatdb -i $ref_file -n "$tmp_dir/$tmp_name"
    #     "$ref_file.nsq" does not exist, but $ref_seq exists
    #         write a temp file in $tmp_dir and run formatdb -i "$tmp_dir/$tmp_name"
    #     nothing exists
    #         bail

    my $db;
    if ( -f "$ref_file.nsq" )
    {
        $db = $ref_file;
    }
    elsif ( ref $ref_seq eq 'ARRAY' && ref $ref_seq->[0] eq 'ARRAY' )
    {
        $db = "$tmp_dir/ref_seqs";
        print_alignment_as_fasta( $db, $ref_seq );
        my @command = ( $formatdb, '-p', 'f', '-i', $db );
        open( FORMATDB, '-|' ) or exec( @command )
                               and die join( ' ', 'Failed', @command );
        close FORMATDB;  #  Wait for completion
    }
    elsif ( -f $ref_file && -s $ref_file )
    {
        my $name = $ref_file;
        $name =~ s/^.*\///;    # Remove leading path
        $db = "$tmp_dir/$name";
        my @command = ( $formatdb, '-p', 'f', '-i', $ref_file, '-n', $db );
        open( FORMATDB, '-|' ) or exec( @command )
                               and die join( ' ', 'Failed', @command );
        close FORMATDB;  #  Wait for completion
    }
    else
    {
        print STDERR "find_nucleotide_homologs cannot locate reference sequence data\n";
        return [];
    }

    $options->{ db } = $db;

    #  There are two ways to go for the contigs:
    #
    #     $contigs is a file of contigs
    #         use it
    #     $contigs is a reference to an array of sequences
    #         write them to a file

    my $qfile;
    if ( ref $contigs eq 'ARRAY' )
    {
        return [] if ! @$contigs;
        return [] if ! ref $contigs->[0] eq 'ARRAY';   #  Could do better diagnosis
        $qfile   = "$tmp_dir/query";
        print_alignment_as_fasta( $qfile, $contigs );  #  Write them all
    }
    else
    {
        -f $contigs
            or print STDERR "Bad contigs file '$contigs'\n"
            and return [];
        $qfile = $contigs;
    }

    my @command = ( $blastall, '-p', 'blastn',
                               '-d', $db,
                               '-i', $qfile,
                               '-r',  1,
                               '-q', -1,
                               '-F', 'f',
                               '-e',  $max_exp,
                               '-v',  5,
                               '-b',  5
                  );

    open( BLAST, '-|' )
        or exec( @command ) and die join( ' ', 'Failed:', @command );

    #  Process contig blast results one contig at a time

    my @out;
    my $contig;  #  Contig sequence data

    while ( $contig = next_contig( $contigs ) )
    {
        my $query_results = next_blast_query( \*BLAST );
        my ( $qid, $qdef, $qlen, $q_matches ) = @$query_results;

        #  Check the framing between contigs and blast queries:

        if ( $qid ne $contig->[0] )
        {
            die "Contig data ($contig->[0]) and blastn output ($qid) are out of phase.\n";
        }

        #  A given query (contig) may hit zero or more reference sequences

        my @matches = ();
        foreach my $subject_results ( @$q_matches )
        {
            my ( $sid, $sdef, $slen, $s_matches ) = @$subject_results;
            push @matches, merge_hsps( $qlen, $slen, $s_matches, $options );
        }

        #  Okay, now we need to remove duplicates:

        my @merged = merge_contig_matches( \@matches );

        #  Report matched sequences, or just locations:

        my $qid   =  $contig->[0];
        my $qseqR = \$contig->[2];
        foreach ( sort { $a->[0]+$a->[1] <=> $b->[0]+$b->[1] } @merged )
        {
            my $loc = $prefix . join( "_", $qid, @$_ );
            my $seq = DNA_subseq( $qseqR, @$_ );
            push @out, { location   => $loc,
                         definition => $descr,
                         sequence   => $seq
                       };
        }
    }

    close BLAST;

    system "/bin/rm -r '$tmp_dir'" if ! $save_tmp;

    \@out;
}


#  Produce the next fasta sequence from an array or a file:

{
    my %index = ();  #  $index{ $a } is current index into array @$a
    sub next_contig
    {
        my ( $contigs ) = @_;
        return $contigs->[ $index{ $contigs }++ || 0 ] if ref $contigs eq 'ARRAY';
        gjoseqlib::read_next_fasta_seq( $contigs );
    }
}


#===============================================================================
#  Merge the hsps for a single reference sequence.  The goal is to cover as
#  much of the reference as possible.
#
#     \@match_regions = merge_hsps( $qlen, $slen, $s_matches, $options )
#
#  Match regions are in contig orientation, not query orientation:
#
#     $match_region = [ $q1, $q2, $dir, $mid ]
#
#  Hsps are:
#
#                 0    1    2    3     4     5    6     7     8   9   10   11   12  13   14
#  $hsp_data = [ scr, exp, p_n, pval, nmat, nid, nsim, ngap, dir, q1, q2, qseq, s1, s2, sseq ]
#
#===============================================================================
sub merge_hsps
{
    my ( $qlen, $slen, $s_matches, $options ) = @_;
    my $min_cover = $options->{ coverage };  # Minimum fraction of reference covered
    my $max_exp   = $options->{ expect };    # Maximum e-value for blast
    my $extrapol  = $options->{ extrapol };  # Amount to extrapolate ends of subject seq
    my $min_ident = $options->{ identity };  # Minimum fraction sequence identity
    my $max_gap   = $options->{ maxgap };    # Maximum gap in genome match
    my $seed_exp  = $options->{ seedexp };   # Maximum e-value to nucleate match
    my $verbose   = $options->{ verbose };
    my @by_score = sort { $b->[0] <=> $a->[0] }
                   grep { $_->[1] <=  $max_exp
                       && gjoseqlib::fraction_nt_diff( @$_[11,14] ) <= 1 - $min_ident
                        }
                   @$s_matches;
    foreach ( @by_score )
    {
        push @$_, $_->[ 9]  +  $_->[10];  # mid
        $_->[8] = $_->[13] <=> $_->[12];  # direction
    }

    my @by_pos = sort { $a->[15] <=> $b->[15] } @by_score;
    my $i = 0;
    my %pos_dex = map { $_ => $i++ } @by_pos;

    my @found;  #  [ $q1, $q2, $s1, $s2, $dir, \@hsps ]
    my %done;
    foreach my $hsp ( @by_score )
    {
        next if $done{ $hsp } || ( $hsp->[1] > $seed_exp );
        $done{ $hsp } = 1;
        my @hsps = ( $hsp );
        my ( $dir, $q1, $q2, $s1, $s2 )= @$hsp[ 8, 9, 10, 12, 13 ];
        my $dex = $pos_dex{ $hsp };

        # extend left on query (maybe):

        my $min_dex = $dex;   #  Smallest index used
        my $skip_scr = 0;     #  Biggest score skipped
        my $d = $dex;         #  Current try
        while ( $d > 0 )
        {
            my $try = $by_pos[ $d - 1 ];
            last if $done{ $try };                    #  Used
            last if ( $q1 - $try->[10] > $max_gap );  #  Too far
            $d--;                                     #  Move current try
            #  Can extend if moves to left in both subject and query
            if ( $try->[8] == $dir                             # Same strand
              && $try->[9] <  $q1                              # Move to left in query
              && ( ( $s1 <=> $s2 ) == ( $try->[12] <=> $s1 ) ) # Move to left in subject
               )
            {
                #  But do not do it if score added is less than score "skipped"
                if ( $try->[0] >= $skip_scr )
                {
                    $q1 = $try->[ 9];
                    $q2 = $try->[10] if $try->[10] > $q2;
                    $s1 = $try->[12];
                    $s2 = $try->[13] if ( $s1 <=> $s2 ) == ( $s2 <=> $try->[13] );
                    push @hsps, $try;
                    while ( $min_dex > $d ) { $done{ $by_pos[ --$min_dex ] } = 1 }
                    $skip_scr = 0;
                }
            }
            else
            {
                $skip_scr = $try->[0] if $try->[0] > $skip_scr;
            }
        }

        # extend right on query (maybe):

        my $max_dex = $dex;   #  Largest index used
        $skip_scr = 0;        #  Biggest score skipped
        $d = $dex;            #  Current try
        while ( $d < @by_pos - 1 )
        {
            my $try = $by_pos[ $d + 1 ];
            last if $done{ $try };                    #  Used
            last if ( $try->[9] - $q2 > $max_gap );   #  Too far
            $d++;                                     #  Move current try
            #  Can extend if moves to right in both subject and query
            if ( $try->[ 8] == $dir                            # Same strand
              && $try->[10] >  $q2                             # Move to right in query
              && ( ( $s1 <=> $s2 ) == ( $s2 <=> $try->[13] ) ) # Move to right in subject
               )
            {
                #  But do not do it if score added is less than score "skipped"
                if ( $try->[0] >= $skip_scr )
                {
                    $q1 = $try->[ 9] if $try->[9] < $q1;
                    $q2 = $try->[10];
                    $s1 = $try->[12] if ( $s1 <=> $s2 ) == ( $try->[12] <=> $s1 );
                    $s2 = $try->[13];
                    push @hsps, $try;
                    while ( $max_dex < $d ) { $done{ $by_pos[ ++$max_dex ] } = 1 }
                    $skip_scr = 0;
                }
            }
            else
            {
                $skip_scr = $try->[0] if $try->[0] > $skip_scr;
            }
        }

        #  Extrapolate ends in query to ends of reference?

        if ( $extrapol )
        {
            #  How far to extrapolate at ends of subject sequence:
            my $d_s1 = $s1   - 1;
            my $d_s2 = $slen - $s2;

            #  Map to proper ends of query sequence:
            my ( $d_q1, $d_q2 ) = ( $s1 < $s2 ) ? ( $d_s1, $d_s2 ) : ( $d_s2, $d_s1 );

            #  Limit by ends of query sequence and adjust if small enough:
            $d_q1 = $q1 - 1 if $q1 - $d_q1 < 1;
            $q1  -= $d_q1   if $d_q1 <= $extrapol;

            $d_q2 = $qlen - $q2 if $q2 + $d_q2 > $qlen;
            $q2  += $d_q2       if $d_q2 <= $extrapol;
        }

        push @found, [ $q1, $q2, $dir, $q1+$q2 ];
    }

    @found;
}


#===============================================================================
#  Combine regions identified by different reference sequences:
#
#     @merged_matches = merge_contig_matches( \@match_regions )
#
#     $match_region = [ $l, $r, $dir, $mid ]
#===============================================================================
sub merge_contig_matches
{
    ref $_[0] eq 'ARRAY' and @{$_[0]} or return ();

    my @regions = sort { $a->[2] <=> $b->[2] || $a->[3] <=> $b->[3] } @{$_[0]};
    my @keep  = ();
    
    #   left, right, dir, mid
    my ( $l1, $r1, $d1, $m1 ) = @{ shift @regions };
    foreach my $match ( @regions )
    {
        my ( $l2, $r2, $d2, $m2 ) = @$match;
        my $overlap = min( $r1, $r2 ) - max( $l1, $l2 ) + 1;
        my $length  = min( $r1 - $l1 + 1, $r2 - $l2 + 1 );
        if ( ( $d2 == $d1 )  && ( $overlap >= 0.5 * $length ) )
        {
            $l1 = $l2 if $l2 < $l1;  # Leftmost match
            $r1 = $r2 if $r2 > $r1;  # Rightmost match
            $m1 = $l1 + $r1;
        }
        else
        {
            push @keep, match_location( $l1, $r1, $d1, $m1 );
            ( $l1, $r1, $d1, $m1 ) = @$match;
        }
    }
    push @keep, match_location( $l1, $r1, $d1, $m1 );

    # This is where we could check things (like coverage):

    @keep
}


#
#  match_location
#
sub match_location { ( $_[2] > 0 ) ? [ @_[0,1] ] : ( $_[2] < 0 ) ? [ @_[1,0] ] : () }


sub min { $_[0] < $_[1] ? $_[0] : $_[1] }
sub max { $_[0] > $_[1] ? $_[0] : $_[1] }


#-------------------------------------------------------------------------------
#  Utility functions:
#-------------------------------------------------------------------------------
#  Allow variations of option keys including uppercase, underscores and
#  terminal 's':
#
#      $key = canonical_key( $key );
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
sub canonical_key
{
    my $key = lc shift;
    $key =~ s/_//g;
    $key =~ s/s$//;  #  This is dangerous if an s is part of a word!
    return $key;
}


#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#  Return a temporary directory for files.
#     If supplied as an option:
#        If it exists, use it and set save_tmp
#        If it does not exist, create it
#     If not supplied
#        If tmp is supplied and exists, create it there
#        If $FIG_Config::temp exists, create it there
#        If /tmp exists, create it there
#        Otherwise create it in .
#
#  ( $tmp_dir, $save_tmp ) = temporary_directory( \%options )
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
sub temporary_directory
{
    my $options = ( shift ) || {};

    my $tmp_dir  = $options->{ tmpdir };
    my $save_tmp = $options->{ savetmp } || '';

    if ( $tmp_dir )
    {
        if ( -d $tmp_dir ) { $options->{ savetmp } = $save_tmp = 1 }
    }
    else
    {
        my $tmp = $options->{ tmp } && -d  $options->{ tmp } ?  $options->{ tmp }
                : $FIG_Config::temp && -d  $FIG_Config::temp ?  $FIG_Config::temp
                :                      -d '/tmp'             ? '/tmp'
                :                                              '.';
	$tmp_dir = sprintf( "$tmp/tmp_dir.%05d.%09d", $$, int(1000000000*rand) );
    }

    if ( $tmp_dir && ! -d $tmp_dir )
    {
        mkdir $tmp_dir;
        if ( ! -d $tmp_dir )
        {
            print STDERR "find_homologs::temporary_directory() could not create '$tmp_dir'\n";
            $options->{ tmpdir } = $tmp_dir = undef;
        }
    }

    return ( $tmp_dir, $save_tmp );
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3