[Bio] / FigKernelPackages / AlignTree.pm Repository:
ViewVC logotype

View of /FigKernelPackages/AlignTree.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.19 - (download) (as text) (annotate)
Mon Nov 29 20:01:30 2010 UTC (9 years ago) by fangfang
Branch: MAIN
CVS Tags: rast_rel_2010_1206
Changes since 1.18: +2 -2 lines
update comments

# This is a SAS component
#
# Copyright (c) 2003-2010 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
#
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License.
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package AlignTree;

use strict;

use Carp;     
use Data::Dumper;

use SeedAware;
use SeedUtils;

use ffxtree;
use gjoseqlib;
use gjoalignment;
use gjoalignandtree;
use gjoparseblast;
use representative_sequences;

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT = qw( align_sequences
                  trim_alignment
                  psiblast_search
                  make_tree );


#-------------------------------------------------------------------------------
#
#   @align = align_sequences( \@seqs, \%opts )
#  \@align = align_sequences( \@seqs, \%opts )
#   @align = align_sequences( \%opts )            # \@seqs = $opts{seqs}
#  \@align = align_sequences( \%opts )            # \@seqs = $opts{seqs}
#
#  Options:
#
#     tool => program  # Clustal (d), MAFFT or Muscle
#
#     Other options supported in align_with_clustal, align_with_muscle or align_with_mafft
#
#-------------------------------------------------------------------------------

sub align_sequences {
    my ($seqs, $opts);

    if (@_ == 1 && ref $_[0] eq 'HASH') {
        $opts = $_[0];
        $seqs = $opts->{seqs};
    } else {
        ($seqs, $opts) = @_;
    }
    $opts->{version} or $seqs && ref $seqs eq 'ARRAY' && @$seqs
        or die "align_sequences called with invalid sequences.\n";

    my $program;

    $opts->{tool} ||= 'clustal';

    if    ($opts->{tool} =~ /muscle/i)  { $program = \&gjoalignment::align_with_muscle;  $opts->{muscle} = "/home/fangfang/bin/muscle" }
    elsif ($opts->{tool} =~ /mafft/i)   { $program = \&gjoalignment::align_with_mafft;   $opts->{mafft} = "/home/fangfang/bin/mafft" }
    elsif ($opts->{tool} =~ /clustal/i) { $program = \&gjoalignment::align_with_clustal; }

    if ($opts->{version}) {
        if ($opts->{tool} =~ /clustal/i) { # version option not supported in gjoalignment::align_with_clustal
            my $clustal = SeedAware::executable_for($opts->{clustalw} || $opts->{program} || 'clustalw');
            my $tmpdir  = SeedAware::location_of_tmp($opts);
            my $tmpF    = SeedAware::new_file_name("$tmpdir/version", '');

            SeedAware::system_with_redirect($clustal, "--version", {stdout => $tmpF});

            open(F, $tmpF) or die "Could not open $tmpF";
            my @info = <F>;
            close(F);
            unlink($tmpF);
            my $version = $info[3]; # fourth line of Clustal usage info
            chomp($version);
            return $version;
        }
        return $program->($seqs, $opts);
    }

    my $ali = $program->($seqs, $opts);
    wantarray ? @$ali : $ali;
}


#-------------------------------------------------------------------------------
#
#   @align = trim_alignment( \@align, \%opts )
#  \@align = trim_alignment( \@align, \%opts )
#   @align = trim_alignment( \%opts )            # input \@align = $opts{ali}
#  \@align = trim_alignment( \%opts )            # input \@align = $opts{ali}
#
#  Options:
#
#     align_opts    => HASH    #  options for all alignment operations in trimming. Use Clustal by default.
#     fract_cov     => fract   #  fraction of sequences to be covered in initial trimming of ends (D: 0.75)
#     fract_ends    => fract   #  minimum fraction of ends to be considered significant for uncov cutoff (D = 0.1)
#     keep_def      => bool    #  do not append trimming coordinates to description fields in seqs
#     log_dir       => dir     #  directory for log files 
#     log_prefix    => string  #  prefix for log file names
#     max_reps_sim  => thresh  #  threshold used to collapse seqs into representatives (D = 0.9)
#     single_round  => bool    #  if set to false, additional rounds of psiblast are attempted to incorporate seqs with multiple hsps.
#     skip_psiblast => bool    #  trim to median ends only
#     use_reps      => bool    #  first collapse seqs into representatives
#     win_size      => size    #  size of sliding window used in calculating uncov cutoff (D = 10)
#
#-------------------------------------------------------------------------------

sub trim_alignment {
    my ($ali, $opts);

    if (@_ == 1 && ref $_[0] eq 'HASH') {
        $opts = $_[0];
        $ali  = $opts->{ali} || $opts->{seqs};
    } else {
        ($ali, $opts) = @_;
    }
    $ali && ref $ali eq 'ARRAY' && @$ali
        or die "trim_alignment called with invalid alignment.\n";

    my $skip_psiblast = $opts->{skip_psiblast} ? 1 : 0;
    my $single_round  = $opts->{single_round}  ? 1 : 0;
    my $log_dir       = $opts->{log_dir}      || SeedAware::location_of_tmp();
    my $keep_log      = $opts->{keep_log}     || $opts->{log_dir} ? 1 : 0;
    my $log_prefix    = $opts->{log_prefix};
    my $use_reps      = $opts->{use_reps}     || $opts->{max_reps_sim} ? 1 : 0;
    my $max_reps_sim  = $opts->{max_reps_sim} || 0.90;
    my $fract_cov     = $opts->{fract_cov}    || 0.75;
    my $fract_ends    = $opts->{fract_ends}   || 0.10;
    my $win_size      = $opts->{win_size}     || 10;
    my $align_opts    = $opts->{align_opts};

    my $trim0 = gjoalignandtree::trim_align_to_median_ends($ali, {begin => 1, end => 1, fract_cov => $fract_cov});
    
    return wantarray ? @$trim0 : $trim0 if $skip_psiblast;

    SeedUtils::verify_dir($log_dir);
    my $log_stderr = $keep_log ? SeedAware::new_file_name("$log_dir/$log_prefix" . "psiblast.stderr") : "/dev/null";
    my $log_report = $keep_log ? SeedAware::new_file_name("$log_dir/$log_prefix" . "psiblast.report") : "/dev/null";

    my $reps               = $use_reps ? representative_sequences::rep_seq_2($trim0, {max_sim => $max_reps_sim}) : [@$trim0];
    my $db                 = gjoseqlib::pack_sequences($ali);
    my $profile            = align_sequences($reps, $align_opts);
    my $blast              = gjoalignandtree::blastpgp($db, $profile, {stderr => $log_stderr});
    my ($trimmed, $report) = process_psiblast_v2($blast, $opts);
    
    my $report_string      = join("\n", map { join "\t", @$_ } values %$report) . "\n";

    print_string($log_report, $report_string) if $keep_log;

    my $new_ali = align_sequences($trimmed, $align_opts);
    return wantarray ? @$new_ali : $new_ali if $single_round;

    my @to_trim = grep { $report->{$_->[0]} =~ /multiple hsps/ } @$trim0;

    my $round = 2;
    while (@to_trim >= max(2, @$ali/10)) {
        my $log_stderr  = $keep_log ? SeedAware::new_file_name("$log_dir/$log_prefix" . "psiblast.stderr.$round") : "/dev/null";
        my $log_report  = $keep_log ? SeedAware::new_file_name("$log_dir/$log_prefix" . "psiblast.report.$round") : "/dev/null";
        my $blast       = gjoalignandtree::blastpgp($db, \@to_trim, {stderr => $log_stderr});
        my ($trm, $rpt) = process_psiblast_v2($blast, $opts);
        my @new_hits    = grep { $report->{$_->[0]} =~ /multiple hsps/ } @$trm;
        @to_trim        = grep { $rpt->{$_->[0]}    =~ /multiple hsps/ } @$trm; 

        push @$trimmed, $_ for @new_hits;
    }
    
    $new_ali = align_sequences($trimmed, $align_opts);
    wantarray ? @$new_ali : $new_ali;
}


#-------------------------------------------------------------------------------
#
#    \@seq             = psiblast_search( $db, $profile, \%opts )
#  ( \@seq, \%report ) = psiblast_search( $db, $profile, \%opts )
#    \@seq             = psiblast_search( $profile, \%opts )
#  ( \@seq, \%report ) = psiblast_search( $profile, \%opts )
#    \@seq             = psiblast_search( \%opts )
#  ( \@seq, \%report ) = psiblast_search( \%opts )
#
#     $profile can be $profile_file_name or \@profile_seqs
#     $db      can be $db_file_name      or \@db_seqs, or
#              'SEED' or 'PSEED', for protein seqs in complete genomes in annotator's SEED or PSEED
#
#     If supplied as file, profile is pseudoclustal
#
#     Report records:
#
#         [ $sid, $slen, $status, $frac_id, $frac_pos,
#                 $q_uncov_n_term, $q_uncov_c_term,
#                 $s_uncov_n_term, $s_uncov_c_term ]
#
#  Options:
#
#     e_value       =>  $max_e_value    #  maximum blastpgp E-value (D = 0.01)
#     incremental   =>  bool            #  expand an initial set with multiple rounds of psiblast
#     max_e_val     =>  $max_e_value    #  maximum blastpgp E-value (D = 0.01)
#     max_q_uncov   =>  $aa_per_end     #  maximum unmatched query (D = min{20, qlen/3})
#     max_q_uncov_c =>  $aa_per_end     #  maximum unmatched query, c-term (D = min{20, qlen/3})
#     max_q_uncov_n =>  $aa_per_end     #  maximum unmatched query, n-term (D = min{20, qlen/3})
#     min_ident     =>  $frac_ident     #  minimum fraction identity (D = 0.15)
#     min_positive  =>  $frac_positive  #  minimum fraction positive scoring (D = 0.20)
#     max_reps_sim  =>  $thresh         #  threshold used to collapse seqs into representatives (D = 0.8)
#     nresult       =>  $max_seq        #  maximim matching sequences (D = 5000)
#     query         =>  $q_file_name    #  query sequence file (D = most complete)
#     query         => \@q_seq_entry    #  query sequence (D = most complete)
#     stderr        =>  $file           #  blastpgp stderr (D = /dev/stderr)
#
#  Options for incremental search:
#
#     max_reps_sim      =>  $thresh     #  threshold used to collapse seqs into representatives (D = 0.9)
#     min_seqs_for_reps => $n           #  only use representative seqs if number of seqs exceeds this threshold (D = 100)
#     stop_round        =>  $n          #  for incremental search, stop after a specified number of psiblast rounds (D = unlimited)
#     use_reps          =>  bool        #  collapse profile seqs into representatives before submitting to psiblast
#
#  If supplied, query must be identical to a profile sequence, but no gaps.
#
#-------------------------------------------------------------------------------

sub psiblast_search {
    my ($db, $profile, $opts);

    if    (@_ >= 2 && ref $_[1] eq 'ARRAY') { ($db, $profile, $opts) = @_ }
    elsif (@_ >= 1 && ref $_[0] eq 'ARRAY') { ($profile, $opts)      = @_ }
    elsif (@_ == 1 && ref $_[0] eq 'HASH')  { ($opts)                = @_ }

    $opts->{stderr} ||= '/dev/null';

    $profile ||= $opts->{profile};
    $db      ||= $opts->{db} || 'SEED';

    my $org_dir = "/home/fangfang/FIGdisk/FIG/Data/Organisms";
    my $psi_dir = "/home/fangfang/WB/PsiblastDB/";
    # my $org_dir = ${FIG_Config::data};  # does not work on bio-big
    
    if (ref $db ne 'ARRAY') {
        if ($db =~ /^(\d+\.\d+)$/) { $db = "$org_dir/$1/Features/peg/fasta" }
        elsif (uc $db eq 'PSEED')  { $db = "$psi_dir/PSEED.complete.fasta" }
        elsif (uc $db eq 'SEED')   { $db = "$psi_dir/SEED.complete.fasta" }
    }

    my $inc = $opts->{incremental} || $opts->{inc};

    my ($hits, $report, $history) = $inc ? incremental_psiblast_search($db, $profile, $opts)
                                         : gjoalignandtree::extract_with_psiblast($db, $profile, $opts);
    
    wantarray ? ($hits, $report, $history) : $hits;
}

#-------------------------------------------------------------------------------
#
#  Incremental psiblast search from a small set of initial sequences,
#  which may be unaligned.
#  
#    \@seq                        = incremental_psiblast_search( $db, $profile, \%opts )
#  ( \@seq, \%report, \@history ) = incremental_psiblast_search( $db, $profile, \%opts )
#
#  Options
#
#     max_reps_sim      => $thresh   # threshold used to collapse seqs into representatives (D = 0.9)
#     min_seqs_for_reps => $n        # only use representative seqs if number of seqs exceeds this threshold (D = 100)
#     stop_round        => $n        # stop after a specified number of psiblast rounds (D = unlimited)
#     use_reps          => bool      # always collapse profile seqs into representatives before submitting to psiblast
#
#  Other options only affect the final round of psiblast. 
#
#  Report records are documented in psiblast_search().
#
#  History consists a list 4-tuples, corresponding to search status
#  at each psiblast round:
#    
#     [ profile_length, num_starting_seqs, num_trimmed_reps, num_psiblast_hits ]
#  
#  The psiblast hits at the end of each round are aligned, trimmed,
#  and sorted.  The top hits are then selected to form the set of
#  profile sequences for the next round.  The algorithm tries to
#  expand the set cautiously. Unless the psiblast hits share high
#  identity (~75%) with the profile, the set grows by no more than a
#  factor of 2 each round.  If stop_round is not specified, the
#  process runs to convergence or until the number of profile
#  sequences reaches 500.
#
#-------------------------------------------------------------------------------

sub incremental_psiblast_search {
    my ($db, $profile, $opts) = @_;

    my $stop_round        = $opts->{stop_round};
    my $max_reps_sim      = $opts->{max_reps_sim}      || 0.9;
    my $min_reps_seqs     = $opts->{min_seqs_for_reps} || 100;
    my $use_reps          = $opts->{use_reps}          || $opts->{max_reps_sim} ? 1 : 0;

    my $max_nseq_clustal  = 100;
    my $initial_set_keep  = 0.8;
    my $max_query_nseq    = 500;
    my $significant_score = 150;  # 150 = ~ (+ 75% id + %80 pos - 25 uncov)
    my $expansion_factor  = 2;    # query set doubles each round
    my $frac_id_weight    = 100;
    my $frac_pos_weight   = 100;
    my $uncov_penalty     = 0.2;

    my @history;
    my @prof = @$profile;

    # in case profile seqs are unaligned
    my @lens = sort { $a <=> $b } map { length $_->[2] } @$profile;
    if ($lens[0] != $lens[-1]) {  
        my $reps  = $use_reps && @prof >= $min_reps_seqs ? representative_sequences::rep_seq_2(\@prof, {max_sim => $max_reps_sim}) : [@prof];
        my $opts2 = @$reps < $max_nseq_clustal ? { tool => 'clustal' } : { tool => 'mafft', auto => 1 };
        my $ali   = align_sequences($reps, $opts2);
        my $trim  = trim_alignment($ali, { align_opts => $opts2 });
        my $redo  = (@$trim / @$ali) < $initial_set_keep;
        $trim     = trim_alignment($ali, { skip_psiblast => 1 }) if $redo;
        @prof     = @$trim;

        my $record = [ length $ali->[0]->[2], scalar @$profile, scalar @$trim ];
        push @history, $record;
        print STDERR join("\t", @$record). "\n";
    } 

    my $opts3 = { e_value => 0.01, max_q_uncov => 1000, nresult => 1000, stderr => '/dev/null' };

    $max_query_nseq = min($max_query_nseq, $opts->{nresult}) if $opts->{nresult} > 0;
    my @seqs = @$profile;
    my $i = 0;
    while (@seqs < $max_query_nseq) {
        my ($hits, $report) = gjoalignandtree::extract_with_psiblast($db, \@prof, $opts3);
        my $record = [ length $prof[0]->[2], scalar @seqs, scalar @prof, scalar keys %$report ];
        push @history, $record;
        print STDERR join("\t", @$record)."\n";
        my %score;
        for (keys %$report) {
            my ($status, $frac_id, $frac_pos, $uncov_q1, $uncov_q2) = @{$report->{$_}}[2, 3, 4, 5, 6];
            next unless $status =~ /included/i;
            my $uncov = $uncov_q1 + $uncov_q2;
            $score{$_} = $frac_id * $frac_id_weight + $frac_pos * $frac_pos_weight - $uncov * $uncov_penalty; 
        }
        my @keep;
        for (sort { $score{$b} <=> $score{$a} } keys %score) {
            last if @keep / @prof >= $expansion_factor && $score{$_} < $significant_score;
            push @keep, $_;
        }
        my %seen;
        $seen{$_} = 1 for @keep;
        @seqs     = grep { $seen{$_->[0]} } @$hits;
        my $reps  = $use_reps && @seqs >= $min_reps_seqs ? representative_sequences::rep_seq_2(\@seqs, {max_sim => $max_reps_sim}) : [@seqs];
        my $opts2 = @$reps < $max_nseq_clustal ? { tool => 'clustal' } : { tool => 'mafft', auto => 1 };
        my $ali   = align_sequences($reps, $opts2);
        my $trim  = trim_alignment($ali, { align_opts => $opts2 });

        last if @$trim <= @prof || ($stop_round && ++$i >= $stop_round);
        @prof = @$trim;
    }

    my ($hits, $report) = gjoalignandtree::extract_with_psiblast($db, \@prof, $opts);
    my $record = [ length $prof[0]->[2], scalar @seqs, scalar @prof, scalar @$hits ];
    push @history, $record;
    print STDERR join("\t", @$record). "\n";

    wantarray ? ($hits, $report, \@history) : $hits;
}


#-------------------------------------------------------------------------------
#
#  Options:
#
#     keep_def      =>  bool            #  do not append trimming coordinates to description fields in seqs
#     fract_ends    =>  $fract_ends     #  minimum fraction of ends to be considered significant for uncov cutoff (D = 0.1)
#     max_q_uncov   =>  $aa_per_end     #  maximum unmatched query (D = min{20, qlen/3})
#     max_q_uncov_c =>  $aa_per_end     #  maximum unmatched query, c-term (D = min{20, qlen/3})
#     max_q_uncov_n =>  $aa_per_end     #  maximum unmatched query, n-term (D = min{20, qlen/3})
#     min_ident     =>  $frac_ident     #  minimum fraction identity (D = 0.15)
#     min_positive  =>  $frac_positive  #  minimum fraction positive scoring (D = 0.20)
#     min_q_cov     =>  $aa_covered     #  minimum number of query characters covered (D = max{10, qlen/5})
#     win_size      =>  size            #  size of sliding window used in calculating uncov cutoff (D = 10)
#
#-------------------------------------------------------------------------------

sub process_psiblast_v2 {
    my ( $blast, $opts ) = @_;

    $blast && ref $blast eq 'ARRAY' or return ();
    $opts  && ref $opts  eq 'HASH'  or $opts = {};

    my( $qid, $qdef, $qlen, $qhits ) = @{ $blast->[0] };

    my $keep_def      = $opts->{ keep_def }      || $opts->{ keep_seq_def } ? 1 : 0;  
    my $fract_ends    = $opts->{ fract_ends }    || $opts->{ fraction_ends } || 0.1;               # fraction of sequences ending in window
    my $max_q_uncov_c = $opts->{ max_q_uncov_c } || $opts->{ max_q_uncov }   || min(20, $qlen/3);
    my $max_q_uncov_n = $opts->{ max_q_uncov_n } || $opts->{ max_q_uncov }   || min(20, $qlen/3);
    my $min_q_cov     = $opts->{ min_q_cov }     || $opts->{ min_nongaps }   || max(10, $qlen/5);
    my $min_ident     = $opts->{ min_ident }     ||  0.15;
    my $min_pos       = $opts->{ min_positive }  ||  0.20;
    my $win_size      = $opts->{ win_size }      || $opts->{ window_size }   || 10;

    my (@uncov1, @uncov2);

    foreach my $sdata ( @$qhits ) {
        my( $sid, $sdef, $slen, $hsps ) = @$sdata;
        if ( one_real_hsp($hsps) ) {
#            [ scr, exp, p_n, pval, nmat, nid, nsim, ngap, dir, q1, q2, qseq, s1, s2, sseq ]
#               0    1    2    3     4     5    6     7     8   9   10   11   12  13   14
            my( $q1, $q2, $s1, $s2 ) = ( @{ $hsps->[0] } )[9, 10, 12, 13];
            push @uncov1, [ $q1-1,     $s1-1     ];
            push @uncov2, [ $qlen-$q2, $slen-$s2 ];
        }
    }

    my $q1_cut = calc_uncov_cut(\@uncov1, $fract_ends, $win_size) + 1;
    my $q2_cut = $qlen - calc_uncov_cut(\@uncov2, $fract_ends, $win_size);

    my (@trimmed, %report, $status);

    foreach my $sdata ( @$qhits ) {
        my( $sid, $sdef, $slen, $hsps ) = @$sdata;

        if ( one_real_hsp($hsps) ) {

            my $hsp0 = $hsps->[0];
            my ($scr, $nmat, $nid, $npos, $q1, $q2, $qseq, $s1, $s2, $sseq) = (@$hsp0)[ 0, 4, 5, 6, 9, 10, 11, 12, 13, 14 ];

            my $uncov1 = $q1 <= $q1_cut ? 0 : $q1 - $q1_cut;
            my $uncov2 = $q2 >= $q2_cut ? 0 : $q2_cut - $q2;
      
            if    ( $q1-1     > $max_q_uncov_n ) { $status = 'missing start' }
            elsif ( $qlen-$q2 > $max_q_uncov_c ) { $status = 'missing end' }
            elsif ( $nid  / $nmat < $min_ident ) { $status = 'low identity' }
            elsif ( $npos / $nmat < $min_pos )   { $status = 'low positives' }
            else {
                my ($t1, $t2, $s1t, $s2t);
                
                ($sseq, $t1) = trim_5( $q1_cut - $q1, $qseq, $sseq );
                ($sseq, $t2) = trim_3( $q2 - $q2_cut, $qseq, $sseq );

                $s1t = $s1 + $t1;
                $s2t = $s2 - $t2;
                    
                $sseq =~ s/-+//g;

                if ( length $sseq < $min_q_cov )   {
                    $status = 'tiny coverage'
                } else {
                    $sdef .= " ($s1t-$s2t/$slen)" if !$keep_def && ($s1t > 1 || $s2t < $slen );
                    push @trimmed, [ $sid, $sdef, $sseq ];
                    $status = 'included';
                }
            }
            
            $report{ $sid } = [ $sid, $slen, $status, $nid/$nmat, $npos/$nmat, $q1-1, $qlen-$q2, $s1-1, $slen-$s2 ];

        } else {    $status = 'multiple hsps' }
    }

    wantarray ? ( \@trimmed, \%report ) : \@trimmed;
}

sub trim_5 {
    my ( $ntrim, $qseq, $sseq ) = @_;
    return $sseq if $ntrim <= 0;
    my ( $to_trim ) = $qseq =~ /^(([^-]-*){$ntrim})/;
    my $trimmed = substr( $sseq, length($to_trim) );
    wantarray ? ( $trimmed, length($to_trim) ) : $trimmed;
}


sub trim_3 {
    my ( $ntrim, $qseq, $sseq ) = @_;
    return $sseq if $ntrim <= 0;
    my ( $to_trim ) = $qseq =~ /((-*[^-]){$ntrim})$/;
    my $trimmed = substr( $sseq, 0, length($sseq) - length($to_trim) );
    wantarray ? ( $trimmed, length($to_trim) ) : $trimmed;
}

#-------------------------------------------------------------------------------
#
#  Allow fragmentary matches inside of the highest-scoring hsp:
#
#-------------------------------------------------------------------------------

sub one_real_hsp {
    my ( $hsps ) = @_;
    return 0 if ! ( $hsps && ( ref( $hsps ) eq 'ARRAY' ) && @$hsps );
    return 1 if  @$hsps == 1;

    my ( $q1_0, $q2_0 ) = ( @{ $hsps->[0] } )[9, 10];
    for ( my $i = 1; $i < @$hsps; $i++ )
    {
        my ($q1, $q2) = ( @{ $hsps->[$i] } )[9, 10];
        return 0 if $q1 < $q1_0 || $q2 > $q2_0;
    }

    return 1;
}

#-------------------------------------------------------------------------------
#
#  Calculate the best cutoff from an array of uncov numbers:
#     
#    1. Sort the uncov numbers
#    2. Use a sliding window to count the instances of uncovs in a specified range
#    3. Look for the first significant peak of uncovs from the highest uncov
#
#-------------------------------------------------------------------------------

sub calc_uncov_cut {
    my ($uncovs, $thresh, $winsize) = @_;
    return undef unless $uncovs && ref $uncovs eq 'ARRAY' && @$uncovs;

    my @uncov = sort { $a->[0] <=> $b->[0] } @$uncovs;
    return $uncov[0]->[0] if $uncov[0]->[0] == $uncov[-1]->[0];

    $thresh  ||= 0.10;
    $winsize ||= 10;

    my $min_count = int($thresh * @uncov) + 1;
    my $imax = $uncov[-1]->[0];

    my $j1 = @uncov - 1;
    my $j2 = @uncov - 1;
    my $cnt = 0;
    my $max_cnt = 0;      # maximum seqs in uncov window range
    my $i_of_max_cnt;     # center of moving window

    for (my $i = $imax; $j2 >= 0 && $i >= 0; $i--) {
        while ($j1 >= 0 && $uncov[$j1]->[0] >= $i - $winsize) {
            $cnt++;
            $j1--;
        }
        while ($j2 >= 0 && $uncov[$j2]->[0] > $i + $winsize) {
            $cnt--;
            $j2--;
        }
        if ($cnt > $max_cnt) {
            $max_cnt = $cnt;
            $i_of_max_cnt = $i;
        }
        last if ($cnt < $max_cnt && $max_cnt >= $min_count);   # just past peak
    }
    if ($max_cnt >= $min_count) {
        return ($i_of_max_cnt > $winsize) ? $i_of_max_cnt - $winsize : 0;
    }    

    return calc_uncov_cut($uncovs, $thresh, max($winsize+1, int($winsize*1.43))); 
}

sub print_string {
    my ( $fh, $close, $unused ) = gjoseqlib::output_filehandle( shift ); 
    ( unshift @_, $unused ) if $unused;                       # modeled after gjoseqlib::print_alignment_as_fasta

    my $str = shift;
    return unless $str;
    
    print $fh $str;
    close $fh if $close;
}



#-------------------------------------------------------------------------------
#
#     $tree            = make_tree( \@ali, \%opts )
#   ( $tree, \%stats ) = make_tree( \@ali, \%opts )
#     $tree            = make_tree( \%opts )           # \@ali = $opts{ali}
#   ( $tree, \%stats ) = make_tree( \%opts )           # \@ali = $opts{ali}
#
#  Options:
#
#     bootstrap  => n                     # bootstrap samples (D = 1) 
#     tool       => program               # fasttree (d), phyml, raxml
#
#     params     => parameter string for tree tool
#                   (can be superseded by the following common options)
#
#     search     => topolog_search_method # NNI (d), SPR
#     model      => substitution_model    # nt: HKY85, JC69, K80, F81, F84, TN93
#                                         # aa: LG, JTT, WAG, MtREV, Dayhoff, DCMut
#     rate       => rate_distribution     # Gamma (d), Uniform 
#     nclasses   => num_subst_categories  # 4 (d)
#     optimize   => all (d, topology && brlen && parameters), eval (optimize model parameters only)
#     input      => input tree            # tree file name
#     
#     Option default values depend the tool used. See details in:
#       ffxtree:: tree_with_fasttree, tree_with_phyml, tree_with_raxml
#
#-------------------------------------------------------------------------------

sub make_tree {
    my ($ali, $opts) = ffxtree::process_input_args_w_ali(@_);

    my $program;

    $opts->{tool} ||= 'fasttree';

    if    ($opts->{tool} =~ /fasttree/i) { $program = \&ffxtree::tree_with_fasttree;  $opts->{fasttree} = "/home/fangfang/bin/fasttree" }
    elsif ($opts->{tool} =~ /phyml/i)    { $program = \&ffxtree::tree_with_phyml;     $opts->{phyml}    = "/home/fangfang/bin/phyml" }
    elsif ($opts->{tool} =~ /raxml/i)    { $program = \&ffxtree::tree_with_raxml;     $opts->{raxml}    = "/home/fangfang/bin/raxmlHPC" }

    my ($tree, $stats) = $program->($ali, $opts);
    
    if ((my $n = $opts->{bootstrap}) > 1) {
        my @samples;
        for (my $i = 0; $i < $n; $i++) {
            my $a = gjoalignment::bootstrap_sample($ali);
            my $t = $program->($a, $opts);
            push @samples, $t;
        }
        $tree = gjonewicklib::reroot_newick_to_midpoint_w($tree);
        $tree = gjophylip::bootstrap_label_nodes($tree, \@samples);
    }

    wantarray() ? ($tree, $stats) : $tree;
}


#-------------------------------------------------------------------------------
#
#  Obsolete routines
#
#-------------------------------------------------------------------------------

sub psiblast_search_old {
    my ($profile, $opts) = @_;

    my $min_ident     = $opts->{ min_ident }    ||  0.15;
    $opts->{ e_value } ||= $opts->{ max_e_val } ||= 0.01;

    # print STDERR Dumper($opts);

    my $dir = "/home/fangfang/WB/tmpsvr";
    my $logdir = "$dir/logs";

    my $complete = "/home/fangfang/WB/svr_complete.fasta";       # db      (fasta or gjo seqs)

    my %trimmedH;
    my %rejectedH;

    my $i      = 1;
    my $suffix = -s "$dir/trim2-$i" ? "-$i" : "";
    my $fprof  = "$dir/trim2$suffix";


    # print "Running psiblast, querying trim2$suffix against complete genomes...\n";

    my $blast = gjoalignandtree::blastpgp($complete, $profile, { max_exp => $opts->{e_value}, stderr => "$logdir/psiblast2$suffix.stderr" });
    my($qid, $qdef, $qlen, $qhits) = @$blast;
    my @trimmed;

    open  PSI, ">$logdir/psiblast2$suffix.dump" or die "could not open psiblast2$suffix.dump\n";
    print PSI Dumper($blast);
    close PSI;

    my $rej = "$dir/psiblast2$suffix.rejects";
    open REJ, ">$rej" or die "Could not open $rej";

    my @scores;

    my $max_uncov = 20;
    my $max_s_uncov = min($qlen*5, 500);
    foreach my $sdata (@$qhits) {
        my($sid, $sdef, $slen, $hsps) = @$sdata;
        my $explanation;

        # if (@$hsps == 1) 
        if ( one_real_hsp($hsps) ) {
            # [ scr, exp, p_n, pval, nmat, nid, nsim, ngap, dir, q1, q2, qseq, s1, s2, sseq ]
            #     0    1    2    3     4     5    6     7     8   9   10   11   12  13   14
            my($scr, $nmat, $nid, $q1, $q2, $qseq, $s1, $s2, $sseq) = (@{$hsps->[0]})[0, 4, 5, 9, 10, 11, 12, 13, 14];

            if (($q1-1) + ($qlen-$q2) > 2 * $max_uncov) {
                $explanation = [ $sid, "short coverage", $q1, $qlen-$q2, $s1, $slen-$s2 ];
            } elsif ($s1-1 + $slen-$s2 > $max_s_uncov) {
                $explanation = [ $sid, "long subject", $q1, $qlen-$q2, $s1, $slen-$s2 ];
            } elsif ($nid / $nmat < $min_ident) {
                $explanation = [ $sid, "low identity", $nid / $nmat ];
            } else {
                $sseq =~ s/-+//g;
                my $entry = [$sid, $sdef, $sseq];
                push(@trimmed, $entry);
                $trimmedH{$sid} = [$entry, $scr] unless $trimmedH{$sid} && $trimmedH{$sid}->[1] >= $scr;
            }
        } else {
            $explanation = [ $sid, "multiple hsps" ];
        }
        if ($explanation) {
            print REJ join("\t", @$explanation)."\n";
            $rejectedH{$sid} ||= $explanation;
        }
    }
    close REJ;
    # &gjoseqlib::print_alignment_as_fasta("$dir/trim2$suffix-complete", \@trimmed);    
    # printf "trim2$suffix-complete contains %d sequences\n", &num_seqs_in_fasta("$dir/trim2$suffix-complete");

    return \@trimmed;

}

sub trim_alignment_old {
    my ($ali, $opts) = @_;
    my $rv;

    my $profile;

    # my $dir = SeedAware::location_of_tmp( $opts );
    my $dir = "/home/fangfang/WB/tmpsvr";

    my $logdir = "$dir/logs";
    &SeedUtils::verify_dir($logdir);

    my $trim1 = gjoalignandtree::trim_align_to_median_ends($ali, { begin => 1, end => 1 });
    my $reps = [@$trim1];

    my (@trims, %rejectH, %successH);

    my $i    = 0;
    my $nseq = @$trim1;

    while (@$trim1 >= 0.1 * $nseq && ($i == 0 || @$trim1 >= 2)) {
        $i++;
        my $suffix = $i > 1 ? "-$i" : "";

        # print "Running psiblast, querying trim1 [$i] against reps-0.8...\n";

        my $blast = gjoalignandtree::blastpgp($reps, $trim1, { stderr => "$logdir/psiblast1.stderr$suffix" });
        open  PSI, ">$logdir/psiblast1.dump$suffix" or die "could not open psiblast1.dump$suffix\n";
        print PSI Dumper($blast);
        close PSI;

        my $opts = {};
        my ( $trimmed, $reject, $scoreH ) = process_psiblast1( $blast, $opts );
        
        push @trims, [ $_->[0], $_, $scoreH->{$_->[0]}, $i ] for @$trimmed;
        $successH{$_->[0]} = 1 for @$trimmed;
        push @{$rejectH{$_->[0]}}, $_ for @$reject;
               
        my $rej = "$dir/psiblast1.rejects$suffix";
        open REJ, ">", $rej or die "Could not open $rej";
        foreach ( @$reject ) {
            print REJ join( "\t", @$_ ), "\n";
            # print "REJ\t", join( "\t", @$_ ), "\n";
        }
        close REJ;

        # foreach (@$trimmed) {
            # print "GOOD\t", join("\t", $_->[0], $_->[2])."\n";
        # }
        # print "\n";

        &gjoseqlib::print_alignment_as_fasta("$dir/trim1-reps-0.8$suffix", $trimmed);    
        # printf "trim1-reps-0.8$suffix contains %d sequences\n", scalar@$trimmed;
        $profile = [@$trimmed];

        my $ntrim1 = @$trim1;

        # take only the rejected seqs marked as multiple hsps
        my %multiple = map { $_->[0] => 1 } grep { $_->[1] =~ /multiple/i } @$reject;
        @$trim1 = grep { $multiple{$_->[0]} } @$trim1;

        # previous (wrong): take all the rejected seqs
        # @$trim1 = grep { !$successH{$_->[0]} } @$trim1;

        # take all the rejected seqs except tiny ones
        # my %nontiny = map { $_->[0] => 1 } grep { $_->[1] !~ /tiny/i } @$reject;
        # @$trim1 = grep { $nontiny{$_->[0]} } @$trim1;

        last if $ntrim1 == @$trim1;
    }

    if ($i > 1) {
        run("mv $dir/trim1-reps-0.8 $dir/trim1-reps-0.8-1");
        run("mv $dir/psiblast1.rejects $dir/psiblast1.rejects-1");

        my %seen;
        @trims = grep { !$seen{$_->[0]}++ } sort { $b->[2] <=> $a->[2] } @trims;

        my @fhs;
        foreach my $j (1 .. $i) {
            open($fhs[$j], ">$dir/trim1-reps-0.8-$j-ids") or die "Could not open $dir/trim1-reps-0.8-$j-ids";
        }
        foreach my $trm (@trims) {
            my $j = $trm->[3];
            my $fh = $fhs[$j];
            print $fh join("\t", $trm->[0], $trm->[2])."\n";
        }
        close $_ for @fhs[1..$i];

        @trims = map { $_->[1] } @trims;
        &gjoseqlib::print_alignment_as_fasta("$dir/trim1-reps-0.8", \@trims);    
        $profile = [@trims];
        
        printf "After merging, trim1-reps-0.8 contains %d sequences\n", &num_seqs_in_fasta("$dir/trim1-reps-0.8");
        
        my $rej = "$dir/psiblast1.rejects";
        open REJ, ">", $rej or die "Could not open $rej";
        for (grep { @{$rejectH{$_}} == $i } keys %rejectH) {
            print REJ map { join("\t", @$_), "\n" } @{$rejectH{$_}};
        }
        close REJ;
    }

    # my $profile1 = &gjoseqlib::read_fasta("$dir/trim1-reps-0.8");
    my $profile1 = $profile;
    my $align2 = &gjoalignment::align_with_mafft($profile1);
    my $trim2 = &gjoalignandtree::trim_align_to_median_ends($align2, { begin  => 1, end => 1 } );

    return $trim2;
}


sub process_psiblast1 {
    my ( $blast, $opts ) = @_;

    $blast && ref $blast eq 'ARRAY' or return ();
    $opts  && ref $opts  eq 'HASH'  or $opts = {};

    my( $qid, $qdef, $qlen, $qhits ) = @$blast;

    my $fract_ends = $opts->{ fract_ends } || $opts->{ fraction_ends } || 0.1;  # fraction of sequences ending in window
    my $max_uncov  = $opts->{ max_uncov }  || $opts->{ max_uncovered } || 20 > $qlen/3 ? ($qlen/3 + 1) : 20;
    my $min_cov    = $opts->{ min_cov }    || $opts->{ min_nongaps }   || 10 < $qlen/5 ? ($qlen/5 + 1) : 10;
    my $win_size   = $opts->{ win_size }   || $opts->{ window_size }   || 10;

    # print "max_uncovered = $max_uncov\n";

    my @uncov1;
    my @uncov2;
    foreach my $sdata ( @$qhits )
    {
        my( $sid, $sdef, $slen, $hsps ) = @$sdata;
        # if ( @$hsps == 1 )
        if ( one_real_hsp($hsps) )
        {
#            [ scr, exp, p_n, pval, nmat, nid, nsim, ngap, dir, q1, q2, qseq, s1, s2, sseq ]
#               0    1    2    3     4     5    6     7     8   9   10   11   12  13   14
            my( $q1, $q2, $s1, $s2 ) = ( @{ $hsps->[0] } )[9, 10, 12, 13];
            push @uncov1, [ $q1-1,     $s1-1     ];
            push @uncov2, [ $qlen-$q2, $slen-$s2 ];
        }
    }

    my $q1_cut = calc_uncov_cut(\@uncov1, $fract_ends, $win_size) + 1;
    my $q2_cut = $qlen - calc_uncov_cut(\@uncov2, $fract_ends, $win_size);

    # print $qlen - $q2_cut, "\n";
    # print "q1_cut  = $q1_cut   q2_cut = $q2_cut   qlen = $qlen\n";

    my @trimmed;
    my @rejected;
    my %scoreH;
    foreach my $sdata ( @$qhits ) {
        my( $sid, $sdef, $slen, $hsps ) = @$sdata;
        # if ( @$hsps == 1 )
        if ( one_real_hsp($hsps) ) {
            my( $scr, $q1, $q2, $qseq, $s1, $s2, $sseq ) = ( @{ $hsps->[0] } )[ 0, 9 .. 14 ];
            my $uncov1 = $q1 <= $q1_cut ? 0 : $q1 - $q1_cut;
            my $uncov2 = $q2 >= $q2_cut ? 0 : $q2_cut - $q2;
            if ( $uncov1 <= $max_uncov  && $uncov2 <= $max_uncov ) {
                $sseq = trim_5( $q1_cut - $q1, $qseq, $sseq );
                $sseq = trim_3( $q2 - $q2_cut, $qseq, $sseq );
                $sseq =~ s/-+//g;
                if ( length($sseq) >= $min_cov ) {
                    push @trimmed, [ $sid, $sdef, $sseq ];
                    $scoreH{$sid} = $scr;
                } else {
                    push @rejected, [ $sid, 'tiny coverage', $uncov1, $uncov2, $s1-1, $slen-$s2, length($sseq) ];
                }
            } else {
                push @rejected, [ $sid, 'short coverage', $uncov1, $uncov2, $s1-1, $slen-$s2 ];
            }
        } else {
            push @rejected, [ $sid, 'multiple hsps' ];
        }
    }
    return wantarray ? ( \@trimmed, \@rejected, \%scoreH ) : \@trimmed;
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3