[Bio] / FigKernelPackages / find_homologs.pm Repository:
ViewVC logotype

View of /FigKernelPackages/find_homologs.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Sun Jan 2 22:03:14 2011 UTC (9 years ago) by golsen
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_dev_02212011, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011
Changes since 1.3: +113 -150 lines
use SeedAware
Many changes in details of code.

package find_homologs;

#
#  Find homologs to a set of nucleotide sequences
#
use strict;
use gjoseqlib;
use gjoparseblast;
use SeedAware;
# use Data::Dumper;

my $DEBUG = 0;

require Exporter;

our @ISA    = qw( Exporter );
our @EXPORT = qw( find_nucleotide_homologs );


#===============================================================================
#
#  \@instances = find_nucleotide_homologs( \@contigs,     \%options )
#  \@instances = find_nucleotide_homologs(  $contig_file, \%options )
#
#  An instance is:
#
#      { location   => $loc,    #  SEED location string
#        definition => $def,
#        sequence   => $seq,
#        uncover5   => $uncov5,
#        uncover3   => $uncov3
#      }
#
#  Options:
#
#      blastall    =>  $blastall_path
#      coverage    =>  $min_coverage         # D = 0.70
#      descript    =>  $description          # D = ''
#      description =>  $description          # D = ''
#      expect      =>  $max_expect           # D = 1e-2
#      extrapol    =>  $max_extrapolate      # D = 0
#      extrapolate =>  $max_extrapolate      # D = 0
#      formatdb    =>  $formatdb_path
#      identity    =>  $min_identity         # D = 0.60
#      loc_format  =>  $format               # SEED, Sapling, CBDL [contig,beg,dir,len]
#      maxgap      =>  $max_gap              # D = 5000 nt
#      prefix      =>  $contig_id_prefix     # D = ''
#      reffile     =>  $filename
#      refseq      => \@seqs
#      seedexp     =>  $max_seed_expect      # D = 1e-20
#      tmp         =>  $location_for_tmpdir
#      tmpdir      =>  $location_of_tmpdir
#      verbose     =>  boolean               # D = 0
#
#===============================================================================

sub find_nucleotide_homologs
{
    my ( $contigs, $options ) = @_;
    -f $contigs && -s $contigs
       or ref $contigs eq 'ARRAY' && ref $contigs->[0] eq 'ARRAY'
       or print STDERR "find_nucleotide_homologs called with bad \\\@contigs\n"
          and return [];
    ref $options eq 'HASH'
       or print STDERR "find_nucleotide_homologs called with bad \\%options\n"
          and return [];

    my $blastall  = $options->{ blastall }   ||= SeedAware::executable_for( 'blastall' );
    my $min_cover = $options->{ coverage }   ||=    0.70;  # Minimum fraction of reference covered
    my $max_exp   = $options->{ expect }     ||=    0.01;  # Maximum e-value for blast
    my $extrapol  = $options->{ extrapol }   ||= $options->{ extrapolate } || 0;  # Max to extrapolate ends of subj seq
    my $formatdb  = $options->{ formatdb }   ||= SeedAware::executable_for( 'formatdb' );
    my $descr     = $options->{ descript }   ||= $options->{ description } || "";
    my $loc_form  = $options->{ loc_format } ||= 'seed';
    my $min_ident = $options->{ identity }   ||=    0.60;  # Minimum fraction sequence identity
    my $max_gap   = $options->{ maxgap }     ||= 5000;     # Maximum gap in genome match
    my $prefix    = $options->{ prefix }     ||=   "";
    my $ref_file  = $options->{ reffile };
    my $ref_seq   = $options->{ refseq };
    my $seed_exp  = $options->{ seedexp }    ||=    1e-20; # Maximum e-value to nucleate match
    my $verbose   = $options->{ verbose }    ||=    0;

    my ( $tmp_dir, $save_tmp ) = SeedAware::temporary_directory( $options );

    #  Build the blast database of reference sequences:
    #
    #  Four cases:
    #     "$ref_file.nsq" exists
    #         use it
    #     "$ref_file.nsq" and $ref_seq do not exist, but "$ref_file" exists
    #         run formatdb -i $ref_file -n "$tmp_dir/$tmp_name"
    #     "$ref_file.nsq" does not exist, but $ref_seq exists
    #         write a temp file in $tmp_dir and run formatdb -i "$tmp_dir/$tmp_name"
    #     nothing exists
    #         bail

    my $db;
    if ( $ref_file && -f "$ref_file.nsq" )
    {
        $db = $ref_file;
    }
    elsif ( ref $ref_seq eq 'ARRAY' && ref $ref_seq->[0] eq 'ARRAY' )
    {
        $db = "$tmp_dir/ref_seqs";
        print_alignment_as_fasta( $db, $ref_seq );
        my @cmd = ( $formatdb, -p => 'f', -i => $db );
        system( @cmd ) and die join( ' ', 'Failed', @cmd );
    }
    elsif ( -f $ref_file && -s $ref_file )
    {
        my $name = $ref_file;
        $name =~ s/^.*\///;    # Remove leading path
        $db = "$tmp_dir/$name";
        my @cmd = ( $formatdb, -p => 'f', -i => $ref_file, -n => $db );
        system( @cmd ) and die join( ' ', 'Failed', @cmd );
    }
    else
    {
        print STDERR "find_nucleotide_homologs cannot locate reference sequence data\n";
        return [];
    }

    $options->{ db } = $db;

    #  There are two ways to go for the contigs:
    #
    #     $contigs is a file of contigs
    #         use it
    #     $contigs is a reference to an array of sequences
    #         write them to a file

    my $qfile;
    if ( ref $contigs eq 'ARRAY' )
    {
        return [] if ! @$contigs;
        return [] if ! ref $contigs->[0] eq 'ARRAY';   #  Could do better diagnosis
        $qfile   = "$tmp_dir/query";
        print_alignment_as_fasta( $qfile, $contigs );  #  Write them all
    }
    else
    {
        -f $contigs
            or print STDERR "Bad contigs file '$contigs'\n"
            and return [];
        $qfile = $contigs;
    }

    my @cmd = ( $blastall,
                -p => 'blastn',
                -d => $db,
                -i => $qfile,
                -r =>  1,
                -q => -1,
                -F => 'f',
                -e => $max_exp,
                -v =>  5,
                -b =>  5
              );

    my $redirect = { stderr => '/dev/null' };
    my $blastFH = SeedAware::read_from_pipe_with_redirect( @cmd, $redirect )
        and die join( ' ', 'Failed:', @cmd );

    #  Process contig blast results one contig at a time

    my @out;
    my $contig;  #  Contig sequence data
    my $n = 0;

    while ( $contig = next_contig( $contigs, $n++ ) )
    {
        my $query_results = next_blast_query( $blastFH );
        my ( $qid, $qdef, $qlen, $q_matches ) = @$query_results;

        #  Check the framing between contigs and blast queries:

        if ( $qid ne $contig->[0] )
        {
            die "Contig data ($contig->[0]) and blastn output ($qid) are out of phase.\n";
        }

        #  A given query (contig) may hit zero or more reference sequences

        my @matches = ();
        foreach my $subject_results ( @$q_matches )
        {
            my ( $sid, $sdef, $slen, $s_matches ) = @$subject_results;
            push @matches, merge_hsps( $qlen, $slen, $s_matches, $options );
        }

        #  Okay, now we need to remove duplicates:

        my @merged = merge_contig_matches( \@matches );

        #  Report matched sequences, or just locations:

        $qid      =  $contig->[0];
        my $qseqR = \$contig->[2];
        foreach ( sort { $a->[3] <=> $b->[3] } @merged )
        {
            # [ $l, $r, $d, $m, $uc5, $uc3 ]
            my ( $l, $r, $dir ) = @$_[0,1,2];
            my ( $beg, $end ) = ( $dir > 0 ) ? ( $l, $r ) : ( $r, $l );
            my $loc = format_match_location( $qid, $beg, $end, $dir, $r-$l+1, $loc_form );
            my $seq = DNA_subseq( $qseqR, $beg, $end );
            push @out, { location   => $loc,
                         definition => $descr,
                         sequence   => $seq,
                         uncover5   => $_->[4],
                         uncover3   => $_->[5]
                       };
        }
    }

    close( $blastFH );
    system( '/bin/rm', '-r', $tmp_dir ) if ! $save_tmp;

    \@out;
}


sub format_match_location
{
    my ( $contig, $beg, $end, $dir, $len, $format ) = @_;

    $format ||= 'seed';
    return ( $format =~ m/sapling/i ) ? "${contig}_$beg$dir$len"      :
           ( $format =~ m/cbdl/i )    ? [ $contig, $beg, $dir, $len ] :
                                        "${contig}_${beg}_$end" 
}


#  Produce the next fasta sequence from an array or a file:

sub next_contig
{
    my ( $contigs, $n ) = @_;
    return $contigs->[ $n || 0 ] if ref $contigs eq 'ARRAY';
    gjoseqlib::read_next_fasta_seq( $contigs );
}


#===============================================================================
#  Merge the hsps for a single reference sequence.  The goal is to cover as
#  much of the reference as possible.
#
#     \@match_regions = merge_hsps( $qlen, $slen, $s_matches, $options )
#
#  Match regions are in contig orientation, not query orientation:
#
#     $match_region = [ $q1, $q2, $dir, $mid, $uncov5, $uncov3 ]
#
#  Hsps are:
#
#                 0    1    2    3     4     5    6     7     8   9   10   11   12  13   14
#  $hsp_data = [ scr, exp, p_n, pval, nmat, nid, nsim, ngap, dir, q1, q2, qseq, s1, s2, sseq ]
#
#===============================================================================
sub merge_hsps
{
    my ( $qlen, $slen, $s_matches, $options ) = @_;
    my $min_cover = $options->{ coverage };  # Minimum fraction of reference covered
    my $max_exp   = $options->{ expect };    # Maximum e-value for blast
    my $extrapol  = $options->{ extrapol };  # Amount to extrapolate ends of subject seq
    my $min_ident = $options->{ identity };  # Minimum fraction sequence identity
    my $max_gap   = $options->{ maxgap };    # Maximum gap in genome match
    my $seed_exp  = $options->{ seedexp };   # Maximum e-value to nucleate match
    my $verbose   = $options->{ verbose };
    my @by_score = sort { $b->[0] <=> $a->[0] }
                   grep { $_->[1] <=  $max_exp
                       && gjoseqlib::fraction_nt_diff( @$_[11,14] ) <= 1 - $min_ident
                        }
                   @$s_matches;
    foreach ( @by_score )
    {
        push @$_, $_->[ 9]  +  $_->[10];  # mid
        $_->[8] = $_->[13] <=> $_->[12];  # direction
    }

    my @by_pos = sort { $a->[15] <=> $b->[15] } @by_score;
    my $i = 0;
    my %pos_dex = map { $_ => $i++ } @by_pos;

    my @found;  #  [ $q1, $q2, $s1, $s2, $dir, \@hsps ]
    my %done;
    foreach my $hsp ( @by_score )
    {
        next if $done{ $hsp } || ( $hsp->[1] > $seed_exp );
        $done{ $hsp } = 1;
        my @hsps = ( $hsp );
        my ( $dir, $q1, $q2, $s1, $s2 )= @$hsp[ 8, 9, 10, 12, 13 ];
        my $dex = $pos_dex{ $hsp };

        # extend left on query (maybe):

        my $min_dex = $dex;   #  Smallest index used
        my $skip_scr = 0;     #  Biggest score skipped
        my $d = $dex;         #  Current try
        while ( $d > 0 )
        {
            my $try = $by_pos[ $d - 1 ];
            last if $done{ $try };                    #  Used
            last if ( $q1 - $try->[10] > $max_gap );  #  Too far
            $d--;                                     #  Move current try
            #  Can extend if moves to left in both subject and query
            if ( $try->[8] == $dir                             # Same strand
              && $try->[9] <  $q1                              # Move to left in query
              && ( ( $s1 <=> $s2 ) == ( $try->[12] <=> $s1 ) ) # Move to left in subject
               )
            {
                #  But do not do it if score added is less than score "skipped"
                if ( $try->[0] >= $skip_scr )
                {
                    $q1 = $try->[ 9];
                    $q2 = $try->[10] if $try->[10] > $q2;
                    $s1 = $try->[12];
                    $s2 = $try->[13] if ( $s1 <=> $s2 ) == ( $s2 <=> $try->[13] );
                    push @hsps, $try;
                    while ( $min_dex > $d ) { $done{ $by_pos[ --$min_dex ] } = 1 }
                    $skip_scr = 0;
                }
            }
            else
            {
                $skip_scr = $try->[0] if $try->[0] > $skip_scr;
            }
        }

        # extend right on query (maybe):

        my $max_dex = $dex;   #  Largest index used
        $skip_scr = 0;        #  Biggest score skipped
        $d = $dex;            #  Current try
        while ( $d < @by_pos - 1 )
        {
            my $try = $by_pos[ $d + 1 ];
            last if $done{ $try };                    #  Used
            last if ( $try->[9] - $q2 > $max_gap );   #  Too far
            $d++;                                     #  Move current try
            #  Can extend if moves to right in both subject and query
            if ( $try->[ 8] == $dir                            # Same strand
              && $try->[10] >  $q2                             # Move to right in query
              && ( ( $s1 <=> $s2 ) == ( $s2 <=> $try->[13] ) ) # Move to right in subject
               )
            {
                #  But do not do it if score added is less than score "skipped"
                if ( $try->[0] >= $skip_scr )
                {
                    $q1 = $try->[ 9] if $try->[9] < $q1;
                    $q2 = $try->[10];
                    $s1 = $try->[12] if ( $s1 <=> $s2 ) == ( $try->[12] <=> $s1 );
                    $s2 = $try->[13];
                    push @hsps, $try;
                    while ( $max_dex < $d ) { $done{ $by_pos[ ++$max_dex ] } = 1 }
                    $skip_scr = 0;
                }
            }
            else
            {
                $skip_scr = $try->[0] if $try->[0] > $skip_scr;
            }
        }

        #  Amount of uncovered reference sequence:

        my $d_s5 = ( ( $dir > 0 ) ? $s1 : $s2 ) - 1;
        my $d_s3 = $slen - ( ( $dir > 0 ) ? $s2 : $s1 );

        #  Extrapolate ends in query to ends of reference?

        if ( $extrapol )
        {
            #  Map desired adjust to proper ends of query sequence:

            my ( $d_q1, $d_q2 ) = ( $dir > 0 ) ? ( $d_s5, $d_s3 ) : ( $d_s3, $d_s5 );

            #  Limit by ends of query sequence and adjust if small enough:

            $d_q1 = $q1 - 1 if $q1 - $d_q1 < 1;
            if ( $d_q1 <= $extrapol )
            {
                $q1 -= $d_q1;
                if ( $dir > 0 ) { $d_s5 -= $d_q1 } else { $d_s3 -= $d_q1 }
            }

            $d_q2 = $qlen - $q2 if $q2 + $d_q2 > $qlen;
            if ( $d_q2 <= $extrapol )
            {
                $q2 += $d_q2;
                if ( $dir > 0 ) { $d_s3 -= $d_q2 } else { $d_s5 -= $d_q2 }
            }
        }

        push @found, [ $q1, $q2, $dir, $q1+$q2, $d_s5, $d_s3 ];
    }

    @found;
}


#===============================================================================
#  Combine regions identified by different reference sequences:
#
#     @merged_matches = merge_contig_matches( \@match_regions )
#
#     $match_region = [ $l, $r, $dir, $mid, $uncov5, $uncov3 ]
#===============================================================================
sub merge_contig_matches
{
    ref $_[0] eq 'ARRAY' and @{$_[0]} or return ();

    my @regions = sort { $a->[2] <=> $b->[2] || $a->[3] <=> $b->[3] } @{$_[0]};
    my @keep  = ();
    
    #   left, right, dir, mid, uncov5, uncov3
    my ( $l1, $r1, $d1, $m1, $ub1, $ue1 ) = @{ shift @regions };
    foreach my $match ( @regions )
    {
        my ( $l2, $r2, $d2, $m2, $ub2, $ue2 ) = @$match;
        my $overlap = min( $r1, $r2 ) - max( $l1, $l2 ) + 1;
        my $length  = min( $r1 - $l1 + 1, $r2 - $l2 + 1 );
        if ( ( $d2 == $d1 )  && ( $overlap >= 0.5 * $length ) )
        {
            if ( $l2 < $l1 )  # Leftmost match
            {
                $l1 = $l2;
                if ( $d1 > 0 ) { $ub1 = $ub2 } else { $ue1 = $ue2 }
            }
            if ( $r2 > $r1 )  # Rightmost match
            {
                $r1 = $r2;
                if ( $d1 > 0 ) { $ue1 = $ue2 } else { $ub1 = $ub2 }
            }
            $m1 = $l1 + $r1;
        }
        else
        {
            push @keep, [ $l1, $r1, $d1, $m1, $ub1, $ue1 ];
            ( $l1, $r1, $d1, $m1, $ub1, $ue1 ) = @$match;
        }
    }
    push @keep, [ $l1, $r1, $d1, $m1, $ub1, $ue1 ];

    # This is where we could check things (like coverage):

    @keep
}


sub min { $_[0] < $_[1] ? $_[0] : $_[1] }
sub max { $_[0] > $_[1] ? $_[0] : $_[1] }


#-------------------------------------------------------------------------------
#  Utility functions:
#-------------------------------------------------------------------------------
#  Allow variations of option keys including uppercase, underscores and
#  terminal 's':
#
#      $key = canonical_key( $key );
#- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
sub canonical_key
{
    my $key = lc shift;
    $key =~ s/_//g;
    $key =~ s/s$//;  #  This is dangerous if an s is part of a word!
    return $key;
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3