[Bio] / FigKernelPackages / gjoalignment.pm Repository:
ViewVC logotype

View of /FigKernelPackages/gjoalignment.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.15 - (download) (as text) (annotate)
Thu Aug 14 17:21:53 2008 UTC (11 years, 6 months ago) by golsen
Branch: MAIN
CVS Tags: rast_2008_0924, rast_rel_2008_09_30, mgrast_rel_2008_0924, mgrast_rel_2008_0625, rast_rel_2008_10_09, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29
Changes since 1.14: +15 -12 lines
Fix FIG dependencies so that it is FIG-aware, but not FIG-dependent.

#
# Copyright (c) 2003-2008 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package gjoalignment;

#  A package of functions for alignments (to be expanded)
#
#  @alignment = align_with_clustal( \@sequences );
#  $tree      = tree_with_clustal( \@alignment );
#  @alignment = add_to_alignment( $seqentry, \@alignment );
#  @seqs      = read_fasta_file( $file );
#  @seqs      = read_fasta_file();
#  @seqs      = read_clustal_file( $file );
#  @seqs      = read_clustal_file();
#               write_fasta_file( \@$seqs, $compress, $file );
#               write_fasta_file( \@$seqs, $compress );
#               write_fasta_file( \@$seqs );
#  $seq       = expand_seq( $seq, \@$to_add );

use strict;
use Carp;                       # Used for diagnostics when run() fails
eval { require Data::Dumper };  # Not present on all systems

#  The following code makes this FIG-aware, but not FIG dependent:

my $is_fig;
eval { require FIG; require FIG_Config; $is_fig = 1 };

my ( $ext_bin, $tmp_dir );
if ( $is_fig )
{
    $ext_bin = "$FIG_Config::ext_bin";
    $tmp_dir =  $FIG_Config::temp;
}
else
{
    $ext_bin = '';
    $tmp_dir = -d '/tmp' ? '/tmp' : '.';
}

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT = qw(
        align_with_clustal
        add_to_alignment
        );

our @EXPORT_OK = qw(
        read_fasta_file
        write_fasta_file
        read_clustal_file
        expand_seq
        );


#===============================================================================
#  Align sequence with clustalw and return the alignment
#
#    @align = align_with_clustal(  @sequences )
#    @align = align_with_clustal( \@sequences )
#   \@align = align_with_clustal(  @sequences )
#   \@align = align_with_clustal( \@sequences )
#===============================================================================
sub align_with_clustal
{
    @_ and ref( $_[0] ) eq 'ARRAY' or return undef;
    my @seqs = ref( $_[0]->[0] ) eq 'ARRAY' ? @{ $_[0] } : @_;

    #  Remap the id to be clustal friendly, saving the originals in a hash:

    my ( $id, $def, $seq, $id2, %desc, @seqs2 );

    $id2 = "seq00000";
    @seqs2 = map { ( $id, $def, $seq ) = @$_;
                   $desc{ ++$id2 } = [ $id, $def ];
                   [ $id2, "", $seq ]
                 } @seqs;

    #  Do the alignment:

    my $seqfile = "$tmp_dir/align_fasta_tmp_${$}.fasta";
    my $outfile = "$tmp_dir/align_fasta_tmp_${$}.aln";
    my $dndfile = "$tmp_dir/align_fasta_tmp_${$}.dnd";

    write_fasta_file( \@seqs2, 1, $seqfile );
    my $clustalw = $ext_bin ? "$ext_bin/clustalw" : 'clustalw';
    &run( "$clustalw -infile=$seqfile -outfile=$outfile -newtree=$dndfile -outorder=aligned -maxdiv=0 -align > /dev/null" );
    my @aligned = read_clustal_file( $outfile );
    &run( "/bin/rm -f $seqfile $outfile $dndfile" );

    #  Restore the id and definition and write the entries:

    wantarray ?   map { [ @{ $desc{$_->[0]} }, $_->[2] ] } @aligned
              : [ map { [ @{ $desc{$_->[0]} }, $_->[2] ] } @aligned ];
}



#===============================================================================
#  Insert a new sequence into an alignment without altering the relative
#  alignment of the existing sequences.  The alignment is based on a profile
#  of those sequences that are not significantly less similar than the most
#  similar sequence.
#===============================================================================

sub add_to_alignment {
    my ( $seq, $ali, $trim, $silent ) = @_;

    my $std_dev = 1.5;  #  The definition of "not significantly less similar"

    #  Don't add a sequence with a duplicate id.  This used to be fatal.

    my $id = $seq->[0];
    foreach ( @$ali )
    {
        next if $_->[0] ne $id;
        if (! $silent)
        {
            print STDERR "Warning: add_to_alignment not adding sequence with duplicate id:\n$id\n";
        }
        return wantarray ? @$ali : $ali;
    }

    #  Put sequences in a clean canonical form:
    
    my $clnseq =       [ "seq000000", "", clean_for_clustal( $seq->[2] ) ];
    my @clnali = map { [  $_->[0],    "", clean_for_clustal(   $_->[2] ) ] } @$ali;

    my($trimmed_start,$trimmed_len);
    if ($trim)    #### if we are trimming sequences before inserting into the alignment
    {
	($clnseq,$trimmed_start,$trimmed_len)    = &trim($clnseq,\@clnali);
	if (! defined($clnseq)) 
	{ 
	    print STDERR "Warning: attempting to add a sequence with no recognizable similarity: $id\n";
	    return $ali;
	}
    }

    #  Tag alignment sequences with similarity to new sequence and sort:

    my @evaluated = sort { $b->[0] <=> $a->[0] }
                    map  { [ fract_identity( $_, $clnseq ), $_ ] }
                    @clnali;

    #  Compute identity threshold from the highest similarity:

    my $threshold = identity_threshold( $evaluated[0]->[0],
                                        length( $evaluated[0]->[1]->[2] ),
                                        $std_dev
                                      );
    my $top_hit = $evaluated[0]->[1]->[0];

    #  Filter sequences for those that pass similarity threshold.
    #  Give them clustal-friendly names.

    my $s;
    $id = "seq000001";
    my @relevant = map  { [ $id++, "", $_->[1]->[2] ] }
                   grep { ( $_->[0] >= $threshold ) }
                   @evaluated;

    #  Do the profile alignment:

    my $profile = "$tmp_dir/add_to_alignment_tmp_${$}_1.fasta";
    my $seqfile = "$tmp_dir/add_to_alignment_tmp_${$}_2.fasta";
    my $outfile = "$tmp_dir/add_to_alignment_tmp_${$}_1.aln";
    my $dndfile = "$tmp_dir/add_to_alignment_tmp_${$}_1.dnd";

    write_fasta_file(  \@relevant, 0, $profile );
    write_fasta_file( [ $clnseq ], 0, $seqfile );
    #
    #  I would have thought that the profile tree file should be -newtree1, but
    #  that fails.  -newtree works fine at putting the file where we want it.
    #  Perhaps it would have made more sense to do a cd to the desired directory
    #  first.
    #
    my $clustalw = $ext_bin ? "$ext_bin/clustalw" : 'clustalw';
    &run( "$clustalw -profile1=$profile -profile2=$seqfile -outfile=$outfile -newtree=$dndfile -profile -maxdiv=0 -outorder=input > /dev/null" );
    my @relevant_aligned = map { $_->[2] } read_clustal_file( $outfile );
    &run( "/bin/rm -f $profile $seqfile $outfile $dndfile" );

    my $ali_seq = pop @relevant_aligned;

    #  Figure out where the gaps were added to the existing alignment:

    my ( $i, $j, $c );
    my $jmax = length( $relevant_aligned[0] ) - 1;
    my @rel_seqs = map { $_->[2] } @relevant; # Save a level of referencing;
    my @to_add = ();

    for ( $i = $j = 0; $j <= $jmax; $j++ ) {
	$c = same_col( \@rel_seqs, $i, \@relevant_aligned, $j ) ? "x" : "-";
	push @to_add, $c;
	if ( $c ne "-" ) { $i++ }
    }
    my $mask = join( "", @to_add );

    if (0)
    {
	print STDERR "max identity = $evaluated[0]->[0]\n";
	print STDERR "threshold    = $threshold identity\n";
	# foreach ( @evaluated ) { print STDERR "$_->[1]->[0] has $_->[0] identity score\n" }
	foreach ( @relevant_aligned ) { print STDERR "$_\n" }
	print STDERR "$ali_seq\n";
	print STDERR "$mask\n\n";
    }

    #  Time to expand the sequences; we will respect case and non-standard
    #  gap characters.  We will add new sequence immediately following the
    #  top_hit.

    my $def;
    my @new_align = ();

    foreach my $entry ( @$ali )
    {
	( $id, $def, $s ) = @$entry;
	push @new_align, [ $id, $def, expand_seq( $s, \@to_add ) ];
	if ( $id eq $top_hit )
	{
	    my( $new_id, $new_def, $new_s ) = @$seq;
	    if ($trim) { $new_s = substr($new_s,$trimmed_start,$trimmed_len); }
	    push @new_align, [ $new_id, $new_def, expand_seq( $new_s, [split //, $ali_seq] ) ];
	}
    }

    if ($trim)
    {
	@new_align = &final_trim($seq->[0],\@new_align);
    }
    wantarray ? @new_align : \@new_align;
}


sub final_trim {
    my($id,$ali) = @_;  # remove dangling ends for $id

    my($first,$last);
    for ($first=0; ($first < @$ali) && &bad_col($ali,$first,$id); $first++) {}
    for ($last=length($ali->[0]->[2]) - 1; ($last >= 0) && &bad_col($ali,$last,$id); $last--) {}

    my(@trimmed) = ();
    my($seq);
    foreach $seq (@$ali)
    {
	my($id1,$desc1,$seq1) = @$seq;
	push(@trimmed,[$id1,$desc1,substr($seq1,$first,($last+1-$first))]);
    }
    return @trimmed;
}


sub bad_col {
    my($ali,$col,$id) = @_;

    my($i);
    for ($i=0; ($i < @$ali) && 
	       (($ali->[$i]->[0] eq $id) || 
		(substr($ali->[$i]->[2],$col,1) eq "-")); 
         $i++) {}
    return ($i == @$ali);
}


sub read_fasta_file {
    my ( $file ) = @_;
    my ( $id, $rest, @lines, $line );
    my @seqs = ();

    my $fh;
    if ( ref( $file ) eq 'GLOB' ) { $fh = $file }
    elsif ( $file ) { open( $fh, "<$file" ) || die "could not open $file" }
    else         { $fh = \*STDIN }

    $line = <$fh>;
    while ( defined( $line ) && ( ( $id, $rest ) = $line =~ /^>(\S+)\s*(.*)$/ ) )
    {
	$id   =~ s/\;$//;
	$rest =~ s/\s+$//;

	@lines = ();
	while ( defined( $line = <$fh> ) && ( $line !~ /^>/ ) )
	{
	    $line =~ s/\s+//g;
	    push @lines, uc $line;
	}
	push @seqs, [ $id, $rest, join( "", @lines ) ];
    }
    if ( $file && ref( $file ) ne 'GLOB' ) { close $fh }

    wantarray ? @seqs : \@seqs;
}


sub write_fasta_file {
    my ( $seqs, $compress, $file ) = @_;
    my ( $outfil, $entry, $id, $def, $seq );
 
    if ( ref( $file ) eq 'GLOB' )
    {
        $outfil = $file;
    }
    elsif ( $file )
    {
	open( $outfil, ">$file" ) || die "could not open $file for writing";
    }
    else
    {
	$outfil = \*STDOUT;
    }

    foreach $entry ( @$seqs )
    {
	defined( $id  = $entry->[0] ) || confess "missing seq id";
	$def = $entry->[1];
	defined( $seq = $entry->[2] ) || confess "missing seq data";
	if ( $compress ) { $seq =~ s/[-~.]+//g }
	print $outfil ">$id", $def ? " $def" : "", "\n$seq\n";
    }
    if ( $file && ref( $file ) ne 'GLOB' ) { close $outfil }
}


sub read_clustal_file {
    my ( $file ) = @_;

    my ( %seq, @ids, $line, $fh );

    if ( ref( $file ) eq 'GLOB' ) { $fh = $file }
    elsif ( $file ) { open( $fh, "<$file" ) || die "could not open $file" }
    else         { $fh = \*STDIN }

    while ( defined( $line = <$fh> ) )
    {
	( $line =~ /^[A-Za-z0-9]/ ) or next;
	chomp $line;
	my @flds = split /\s+/, $line;
	if ( @flds == 2 )
	{
	    $seq{ $flds[0] } or push @ids, $flds[0];
	    push @{ $seq{ $flds[0] } }, $flds[1];
	}
    }
    if ( $file && ref( $file ) ne 'GLOB' ) { close $fh }

    my @seqs = map { [ $_, "", join( "", @{$seq{$_}} ) ] } @ids;
    wantarray ? @seqs : \@seqs;
}


sub clean_for_clustal {
    my $seq = uc shift;
    $seq =~ s/U/C/g;          # Sec -> Cys (well, for proteins)
    $seq =~ s/\*/X/g;         # Clustal chokes on *
    $seq =~ s/[^A-Z]/-/g;     # Nonstandard gaps
    $seq
}


sub fract_identity {
    my ( $seq1, $seq2 ) = @_;
    my ( $s1, $s2, $i, $same );

    my $infile  = "$tmp_dir/add_to_alignment_tmp_${$}.fasta";
    my $outfile = "$tmp_dir/add_to_alignment_tmp_${$}.aln";
    my $dndfile = "$tmp_dir/add_to_alignment_tmp_${$}.dnd";

    write_fasta_file( [ [ "seq1", "", $seq1->[2] ], [ "seq2", "", $seq2->[2] ] ], 1, $infile );
    my $clustalw = $ext_bin ? "$ext_bin/clustalw" : 'clustalw';
    &run( "$clustalw -infile=$infile -outfile=$outfile -newtree=$dndfile -align -maxdiv=0 > /dev/null;" );
    ( $s1, $s2 ) = map { $_->[2] } read_clustal_file( $outfile );  # just seqs
    &run( "/bin/rm -f $infile $outfile $dndfile" );

    for ( $i = length( $s1 ) - 1, $same = 0; $i >= 0; $i-- )
    {
	if ( substr($s1, $i, 1) eq substr($s2, $i, 1) ) { $same++ }
    }
    return ( $same / length( $s1 ) );
}
	

sub identity_threshold {
    my ( $maxsim, $seqlen, $z ) = @_;
    my ( $p, $sigma, $step );

    $p = $maxsim / 2;
    $step = $p / 2;
    while ( $step > 0.0005 ) {
	$sigma = sqrt( $p * (1 - $p) / $seqlen );
	$p += ( $p + ( $z * $sigma ) < $maxsim ) ? $step : (- $step);
	$step /= 2;
    }
    return $p - $z * $sigma;
}


#===============================================================================
#  The logic used here to optimize identification of "same" column depends
#  on the fact that only the second alignment ($y) has new columns, and they
#  are all gaps.  Therefore any non-gap character in alignment $y indicates
#  that it is not a column of added gaps (it must match).  After learning
#  that alignment $y has a gap, then we only need test $x for a gap.
#===============================================================================

sub same_col {
    my ( $x, $colx, $y, $coly ) = @_;
    my ( $seq, $seqmax, $cy );

    $seqmax = @$x - 1;
    for ( $seq = 0; $seq <= $seqmax; $seq++ )
    {
	if ( substr($y->[$seq], $coly, 1) ne "-" ) { return 1 } # Non-gap in aligned
	if ( substr($x->[$seq], $colx, 1) ne "-" ) { return 0 } # Unmatched gap
    }
    return 1;
}


#===============================================================================
#  Sequence expansion respects the local gap character.  $c0 and $c1 are the
#  previous and next characters.  (($c0,$c1)=($c1,shift @s1))[0] updates the
#  values and evaluates to what was the next character, and becomes the new
#  previous character.
#
#  The following really does print w, x, y, and z, one per line: 
#
#  ( $a, $b, @c ) = ("", split //, "wxyz");
#  while ( $b ) { print( (($a,$b)=($b,shift @c))[0], "\n" ) }
#===============================================================================

sub expand_seq {
    my ( $seq, $to_add ) = @_;

    my ( $c0, $c1, @s1 ) = ( "", split( //, $seq ), "" );
    my @s2 = map { $_ ne "-"                ? (($c0,$c1)=($c1,shift @s1))[0]
                 : $c0 eq "~" || $c1 eq "~" ? "~"
                 : $c0 eq "." || $c1 eq "." ? "."
                 :                            "-"
                 } @$to_add;

    join "", @s2;
}


sub trim {
    my($clnseq,$clnali) = @_;

    my $blastfile = "$tmp_dir/blastfile.$$";
    my $seqfile   = "$tmp_dir/seqfile.$$";
    &write_fasta_file($clnali,1,$blastfile);
    &write_fasta_file([$clnseq],1,$seqfile);

    my $tmp_seq = $clnali->[0]->[2];
    $tmp_seq =~ s/-+//g;
    my $nt_cnt = $tmp_seq =~ tr/ACGTUacgtu//;
    my ($is_prot,$prog) = ( $nt_cnt < ( 0.5 * length( $tmp_seq ) ) ) ? ("T",'blastp') : ("F",'blastn -r 1 -q -1');
    my $formatdb = $ext_bin ? "$ext_bin/formatdb" : 'formatdb';
    my $blastall = $ext_bin ? "$ext_bin/blastall" : 'blastall';

    &run( "$formatdb -i $blastfile -p $is_prot" );

    open( BLASTOUT, "$blastall -b 5 -v 5 -FF -m 8 -i $seqfile -e 0.001 -d $blastfile -p $prog |" )
	|| die "could not handle the blast";

    my $x;
    my @out;
    while (defined($x = <BLASTOUT>))
    {
	my @tmp = split(/\s+/,$x);
	push(@out,[@tmp[1,6,7,8,9]]);
    }
    close(BLASTOUT);
    if (@out < 1) { return undef }

    my %lenH;
    foreach my $tuple (@$clnali)
    {
	$lenH{$tuple->[0]} = $tuple->[2] =~ tr/a-zA-Z//;
    }

    @out = sort { ($a->[1] <=> $b->[1]) or ($a->[3] <=> $b->[3]) } @out;
    my @to_removeS = sort { $a <=> $b } map { &remove($_->[1],$_->[3])} @out;

    my $lenQ = length($clnseq->[2]);
    my @ends = map { [$lenQ - $_->[2], $lenH{$_->[0]} - $_->[4]] } @out;
    my @to_removeE = sort { $a <=> $b } map { &remove($_->[0]+1,$_->[1]+1) } @ends;
    my $trimmed_start = $to_removeS[0];
    my $trimmed_len = $lenQ - ($to_removeS[0] + $to_removeE[0]);
    my $seqT = substr($clnseq->[2],$trimmed_start,$trimmed_len);

    return ([$clnseq->[0],$clnseq->[1],$seqT],$trimmed_start,$trimmed_len);
}


sub remove {
    my( $b1, $b2 ) = @_;
    return ($b2 <= 5) ? &max( $b1 - $b2, 0 ) : $b1 - 1;
}


sub max { $_[0] > $_[1] ? $_[0] : $_[1] }

sub run { system( $_[0] ) == 0 || Confess( "FAILED: $_[0]" ) }


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3