[Bio] / FigKernelPackages / gjoalignandtree.pm Repository:
ViewVC logotype

View of /FigKernelPackages/gjoalignandtree.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Mon Apr 12 20:27:23 2010 UTC (9 years, 8 months ago) by fangfang
Branch: MAIN
CVS Tags: rast_rel_2010_0526
Changes since 1.1: +1 -0 lines
fixed a small bug in representative_for_profile()

package gjoalignandtree;

#

use strict;
use gjoseqlib;

#-------------------------------------------------------------------------------
#  Order a set of sequences:
#
#    @seqs = order_sequences_by_length( \@seqs, \%opts )
#   \@seqs = order_sequences_by_length( \@seqs, \%opts )
#
#  Options:
#
#     order  => key  # increasing (D), decreasing, median_outward,
#                    #       closest_to_median, closest_to_length
#     length => prefered_length
#
#-------------------------------------------------------------------------------

sub order_sequences_by_length
{
    my ( $seqs, $opts ) = @_;
    $seqs && ref $seqs eq 'ARRAY' && @$seqs
        or print STDERR "order_sequences_by_length called with invalid sequence list.\n"
           and return wantarray ? () : [];

    $opts = {} if ! ( $opts && ref $opts eq 'HASH' );
    my $order = $opts->{ order } || 'increasing';

    my @seqs = ();
    if    ( $order =~ /^dec/i )
    {
        $opts->{ order } = 'decreaseing';
        @seqs = sort { length( $b->[2] ) <=> length( $a->[2] ) } @$seqs;
    }
    elsif ( $order =~ /^med/i )
    {
        $opts->{ order } = 'median_outward';
        my @seqs0 = sort { length( $a->[2] ) <=> length( $b->[2] ) } @$seqs;
        while ( $_ = shift @seqs0 )
        {
            push @seqs, $_;
            push @seqs, pop @seqs0 if @seqs0;
        }
        @seqs = reverse @seqs;
    }
    elsif ( $order =~ /^close/i )
    {
        my $pref_len;
        if ( defined $opts->{ length } )   #  caller supplies preferred length?
        {
            $opts->{ order } = 'closest_to_length';
            $pref_len = $opts->{ length } + 0;
        }
        else                               #  preferred length is median
        {
            $opts->{ order } = 'closest_to_median';
            my @seqs0 = sort { length( $a->[2] ) <=> length( $b->[2] ) } @$seqs;
            my $i = int( @seqs0 / 2 );
            $pref_len = ( @seqs0 % 2 ) ? length( $seqs0[$i]->[2] )
                                       : ( length( $seqs0[$i-1]->[2] ) + length( $seqs0[$i]->[2] ) ) / 2;
        }
 
        @seqs = map  { $_->[0] }
                sort { $a->[1] <=> $b->[1] }  #  closest to preferred first
                map  { [ $_, abs( length( $_->[2] ) - $pref_len ) ] }  # dist from pref?
                @$seqs;
    }
    else
    {
        $opts->{ order } = 'increasing';
        @seqs = sort { length( $a->[2] ) <=> length( $b->[2] ) } @$seqs;
    }

    wantarray ? @seqs : \@seqs;
}


#-------------------------------------------------------------------------------
#  Write representative seqeunce groups to a file
#
#      write_representative_groups( \%rep, $file )
#
#-------------------------------------------------------------------------------

sub  write_representative_groups
{
    my ( $repH, $file ) = @_;
    $repH && (ref $repH eq 'HASH') && keys %$repH
        or print STDERR "write_representative_groups called with invalid rep hash.\n"
           and return;

    my ( $fh, $close ) = &output_file_handle( $file );

    #  Order keys from largest to smallest set, then alphabetical

    my @keys = sort { @{ $repH->{$b} } <=> @{ $repH->{$a} } || lc $a cmp lc $b } keys %$repH;

    foreach ( @keys ) { print $fh join( "\t", ( $_, @{ $repH->{ $_ } } ) ), "\n" }

    close $fh if $close;
}


#-------------------------------------------------------------------------------
#
#   @align = trim_align_to_median_ends( \@align, \%opts )
#  \@align = trim_align_to_median_ends( \@align, \%opts )
#
#  Options:
#
#     begin => bool   #  Trim start (specifically)
#     end   => bool   #  Trim end (specifically)
#
#-------------------------------------------------------------------------------
sub trim_align_to_median_ends
{
    my ( $align, $opts ) = @_;
    $align && ref $align eq 'ARRAY' && @$align
        or print STDERR "trim_align_to_median_ends called with invalid sequence list.\n"
           and return wantarray ? () : [];

    $opts = {} if ! ( $opts && ref $opts eq 'HASH' );
    my $tr_beg = $opts->{ begin } || $opts->{ beg } || $opts->{ start } ? 1 : 0;
    my $tr_end = $opts->{ end } ? 1 : 0;
    $tr_beg = $tr_end = 1 if ! ( $tr_beg || $tr_end );

    my( @pos1, @pos2, $i );
    foreach my $seq ( @$align )
    {
        my( $b, $e ) = $seq->[2] =~ /^(-*).*[^-](-*)$/;
        push @pos1, length( $b || '' );
        push @pos2, length( $e || '' );
    }

    @pos1 = sort { $a <=> $b } @pos1;
    @pos2 = sort { $a <=> $b } @pos2;

    my $pos1 = $tr_beg ? $pos1[ int( @pos1/2 ) ] : 0;
    my $pos2 = $tr_end ? $pos2[ int( @pos2/2 ) ] : 0;

    my $ori_len = length( $align->[0]->[2] );
    my $new_len = $ori_len - ( $pos1 + $pos2 );
    my @align2 = map { [ @$_[0,1], substr( $_->[2], $pos1, $new_len ) ] }
                 @$align;

    wantarray ? @align2 : \@align2;
}


#-------------------------------------------------------------------------------
#
#   Options:
#
#        file  =>  $filename  #  supply a file name to open and write
#        file  => \*FH        #  supply an open file handle (D = STDOUT)
#        line  =>  $linelen   #  residues per line (D = 60)
#        lower =>  $bool      #  all lower case sequence
#        upper =>  $bool      #  all upper case sequence
#
#-------------------------------------------------------------------------------

sub print_alignment_as_pseudoclustal
{
    my ( $align, $opts ) = @_;
    $align && ref $align eq 'ARRAY' && @$align
        or print STDERR "print_alignment_as_pseudoclustal called with invalid sequence list.\n"
           and return wantarray ? () : [];

    $opts = {} if ! ( $opts && ref $opts eq 'HASH' );
    my $line_len = $opts->{ line } || 60;
    my $case = $opts->{ upper } ?  1 : $opts->{ lower } ? -1 : 0;

    my ( $fh, $close ) = &output_file_handle( $opts->{ file } );

    my $namelen = 0;
    foreach ( @$align ) { $namelen = length $_->[0] if $namelen < length $_->[0] }
    my $fmt = "%-${namelen}s  %s\n";

    my $id;
    my @lines = map { $id = $_->[0]; [ map { sprintf $fmt, $id, $_ }
                                       map { $case < 0 ? lc $_ : $case > 0 ? up $_ : $_ }  # map sequence only
                                       $_->[2] =~ m/.{1,$line_len}/g
                                     ] }
                
                @$align;

    my $ngroup = @{ $lines[0] };
    for ( my $i = 0; $i < $ngroup; $i++ )
    {
        foreach ( @lines ) { print $fh $_->[$i] if $_->[$i] }
        print $fh "\n";
    }

    close $fh if $close;
}


#-------------------------------------------------------------------------------
#  The profile 'query' sequence:
#     1. No terminal gaps, if possible
#        1.1 Otherwise, no gaps at start
#     2. Longest sequence passing above
#
#    $prof_rep = representative_for_profile( $align )
#-------------------------------------------------------------------------------
sub representative_for_profile
{
    my ( $align ) = @_;
    $align && ref $align eq 'ARRAY' && @$align
        or die "representative_for_profile called with invalid sequence list.\n";

    my @cand;
    @cand = grep { $_->[2] =~ /^[^-].*[^-]$/ } @$align;            # No end gaps
    @cand = grep { $_->[2] =~ /^[^-]/ }        @$align if ! @cand; # No start gaps
    @cand = @$align if ! @cand; 

    my ( $rep ) = map  { $->[0] }               # sequence entry
                  sort { $b->[1] <=> $a->[1] }  # max nongaps
                  map  { [ $_, tr/-//c ] }      # count nongaps
                  @cand;

    $rep = [ @$rep ];       # Make a copy
    $rep->[2] =~ s/-+//g;   # Compress sequence

    return $rep;
}


#-------------------------------------------------------------------------------
#
#   $structured_blast = blastpgp( $db, $profile, $options )
#
#  Required:
#  db_file or db_seq
#  profile_file or profile_seq
#
#  Optional:
#  query => q_file
#  query => q_seq
#
#-------------------------------------------------------------------------------

sub blastpgp
{
    my ( $db, $profile, $opts ) = @_;
    $opts ||= {};

    my ( $dbfile, $rm_db );
    if ( defined $db && ref $db )
    {
        ref $db eq 'ARRAY'
            && @$db
            || print STDERR "blastpgp requires one or more database sequences.\n"
               && return undef;
        $dbfile = "blastpgp_db_$$";
        gjoseqlib::print_alignment_as_fasta( $dbfile, $db );
        $rm_db = 1;
    }
    elsif ( defined $db && -f $db )
    {
        $dbfile = $db;
    }
    else
    {
        die "blastpgp requires database.";
    }
    SEED_utils::verify_db( $db, 'T' );  # protein

    my ( $proffile, $rm_profile );
    if ( defined $profile && ref $profile )
    {
        ref $profile eq 'ARRAY'
            && @$profile
            || print STDERR "blastpgp requires one or more profile sequences.\n"
               && return undef;
        $proffile = "blastpgp_profile_$$";
        &print_alignment_as_pseudoclustal( $profile,  { file => $proffile, upper => 1 } );
        $rm_profile = 1;
    }
    elsif ( defined $profile && -f $profile )
    {
        $proffile = $profile;
    }
    else
    {
        die "blastpgp requires profile.";
    }

    my ( $qfile, $rm_query );
    my $query = $opts->{ query };
    if ( defined $query && ref $query )
    {
        ref $query eq 'ARRAY'
            && @$query
            || print STDERR "blastpgp invalid query sequence.\n"
               && return undef;
        $qfile = "blastpgp_query_$$";
        gjoseqlib::print_alignment_as_fasta( $qfile, [$query] );
        $rm_query = 1;
    }
    elsif ( defined $query && -f $query )
    {
        $qfile = $query;
    }
    elsif ( $profile )    #  Build it from profile
    {
        $query = &representative_for_profile( $profile );
        $qfile = "blastpgp_query_$$";
        gjoseqlib::print_alignment_as_fasta( $qfile, [$query] );
        $rm_query = 1;
    }
    else
    {;
        die "blastpgp requires database.";
    }

    my $nkeep = $opts->{ nresult } || 5000;
    my $cmd = "blastpgp -B '$proffile' -j 1 -d '$dbfile -b $nkeep -v $nkeep -i '$qfile'";

    open BLAST, "$cmd |" or print STDERR "Failed to open: '$cmd'.\n"
                            and return undef;
    my $out = &gjoparseblast::structured_blast_output( \*BLAST, 1 );  # selfmatches okay
    close BLAST;

    if ( $rm_db )
    {
        my @files = grep { -f $_ } map { ( $_, "$_.psq", "$_.pin", "$_.phr" ) } $dbfile;
        unlink @files if @files;
    }
    unlink $proffile if $rm_profile;
    unlink $qfile    if $rm_query;

    return $out;
}


#-------------------------------------------------------------------------------
#  Get an output file handle, and boolean on whether to close or not:
#
#  ( \*FH, $close ) = output_file_handle(  $filename );
#  ( \*FH, $close ) = output_file_handle( \*FH );
#  ( \*FH, $close ) = output_file_handle( );                   # D = STDOUT
#
#-------------------------------------------------------------------------------

sub output_file_handle
{
    my ( $file, $umask ) = @_;

    my ( $fh, $close );
    if ( defined $file )
    {
        if ( ref $file eq 'GLOB' )
        {
            $fh = $file;
            $close = 0;
        }
        else
        {
            open( $fh, ">$file") || die "output_file_handle could not open '$file'.\n";
            chmod 0664, $file;  #  Seems to work on open file!
            $close = 1;
        }
    }
    else
    {
        $fh = \*STDOUT;
        $close = 0;
    }

    return ( $fh, $close );
}


#-------------------------------------------------------------------------------
#
#   print_blast_as_records(  $file, \@queries )
#   print_blast_as_records( \*FH,   \@queries )
#   print_blast_as_records(         \@queries )     # STDOUT
#
#   \@queries = read_blast_from_records(  $file )
#   \@queries = read_blast_from_records( \*FH )
#   \@queries = read_blast_from_records( )          # STDIN
#
#  Three output record types:
#
#   Query \t qid \t qdef \t dlen
#   >     \t sid \t sdef \t slen
#   HSP   \t ...
#
#-------------------------------------------------------------------------------
sub print_blast_as_records
{
    my $file = ( $_[0] && ! ( ref $_[0] eq 'ARRAY' ) ? shift : undef );
    my ( $fh, $close ) = output_file_handle( $file );

    my $queries = shift;
    $queries && ref $queries eq 'ARRAY'
        or print STDERR "Bad blast data supplied to print_blast_as_records.\n"
           and return;

    foreach my $qdata ( @$queries )
    {
        $qdata && ref $qdata eq 'ARRAY' && @$qdata == 4
            or print STDERR "Bad blast data supplied to print_blast_as_records.\n"
               and return;

        my $subjcts = pop @$qdata;
        $subjcts && ref $subjcts eq 'ARRAY'
            or print STDERR "Bad blast data supplied to print_blast_as_records.\n"
               and return;

        print $fh join( "\t", 'Query', @$qdata ), "\n";
        foreach my $sdata ( @$subjcts )
        {
            $sdata && ref $sdata eq 'ARRAY' && @$sdata == 4
                or print STDERR "Bad blast data supplied to print_blast_as_records.\n"
                   and return;

            my $hsps = pop @$sdata;
            $hsps && ref $hsps eq 'ARRAY' && @$hsps
                or print STDERR "Bad blast data supplied to print_blast_as_records.\n"
                   and return;

            print $fh join( "\t", '>', @$sdata ), "\n";

            foreach my $hsp ( @$hsps )
            {
                $hsp && ref $hsp eq 'ARRAY' && @$hsp > 4
                    or print STDERR "Bad blast data supplied to print_blast_as_records.\n"
                       and return;
                print $fh join( "\t", 'HSP', @$hsp ), "\n";
            }
        }
    }

    close $fh if $close;
}


sub read_blast_from_records
{
    my ( $file ) = @_;
    my ( $fh, $close );
    if ( $file && ref $file eq 'GLOB' )
    {
        $fh = $file;
    }
    elsif ( defined $file )
    {
        -f $file && open( $fh, "<$file" )
            or print STDERR "read_blast_from_records could not open '$file'.\n"
                and return wantarray ? () : [];
        $close = 1;
    }
    else
    {
        $fh = \@STDIN;
    }
    my @queries = ();
    my @qdata;
    my @subjects = ();
    my @sdata;
    my @hsps = ();

    local $_;
    while ( defined( $_ = <$fh> ) )
    {
        chomp;
        my ( $type, @datum ) = split /\t/;
        if ( $type eq 'Query' )
        {
            push @subjects, [ @sdata, [ @hsps     ] ] if @hsps;
            push @queries,  [ @qdata, [ @subjects ] ] if @qdata;
            @qdata = @datum;
            @sdata = ();
            @subjects = ();
            @hsps = ();
        }
        elsif ( $type eq '>' )
        {
            push @subjects, [ @sdata, [ @hsps ] ] if @hsps;
            @sdata = @datum;
            @hsps = ();
        }
        elsif ( $type eq 'HSP' )
        {
            push @hsps, \@datum;
        }
    }

    close $fh if $close;

    push @subjects, [ @sdata, [ @hsps     ] ] if @hsps;
    push @queries,  [ @qdata, [ @subjects ] ] if @qdata;

    wantarray ? @queries : \@queries;
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3