[Bio] / FigKernelPackages / gjoalignment.pm Repository:
ViewVC logotype

View of /FigKernelPackages/gjoalignment.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Thu Dec 29 19:28:11 2005 UTC (14 years, 3 months ago) by golsen
Branch: MAIN
CVS Tags: caBIG-05Apr06-00, caBIG-13Feb06-00
Changes since 1.2: +16 -8 lines
Make attempt to add a duplicate id nonfatal.  Having the program die
in the middle of a large number of adds is annoying.
Increase flexibility on return data type (array or reference to array)
in the function library.
Make the calls by reference in the program.

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

package gjoalignment;

#  A package of functions for alignments (to be expanded)
#
#  @alignment = add_to_alignment( $seqentry, \@alignment );
#  @seqs      = read_fasta_file( $file );
#  @seqs      = read_fasta_file();
#  @seqs      = read_clustal_file( $file );
#  @seqs      = read_clustal_file();
#               write_fasta_file( \@$seqs, $compress, $file );
#               write_fasta_file( \@$seqs, $compress );
#               write_fasta_file( \@$seqs );
#  $seq       = expand_seq( $seq, \@$to_add );

use Carp;
use strict;

use FIG_Config;                      #  These are the only SEED dependencies:
my $ext_bin = $FIG_Config::ext_bin;  #     location of clustalw
my $tmp_dir = $FIG_Config::temp;     #     location for temp files

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT = qw(
        add_to_alignment
        );

our @EXPORT_OK = qw(
        read_fasta_file
        write_fasta_file
        read_clustal_file
        expand_seq
        );


#  Insert a new sequence into an alignment without altering the relative
#  alignment of the existing sequences.  The alignment is based on a profile
#  of those sequences that are not significantly less similar than the most
#  similar sequence.

sub add_to_alignment {
    my ( $seq, $ali ) = @_;

    my $std_dev = 1.5;  #  The definition of "not significantly less similar"

    #  Don't add a sequence with a duplicate id.  This used to be fatal.

    my $id = $seq->[0];
    foreach ( @$ali )
    {
        next if $_->[0] ne $id;
        print STDERR "Warning: add_to_alignment not adding sequence with duplicate id:\n$id\n";
        return wantarray ? @$ali : $ali;
    }

    #  Put sequences in a clean canonical form:
    
    my $clnseq =       [ "seq000000", "", clean_for_clustal( $seq->[2] ) ];
    my @clnali = map { [  $_->[0],    "", clean_for_clustal(   $_->[2] ) ] } @$ali;

    #  Tag alignment sequences with similarity to new sequence and sort:

    my @evaluated = sort { $b->[0] <=> $a->[0] }
                    map  { [ fract_identity( $_, $clnseq ), $_ ] }
                    @clnali;

    #  Compute identity threshold from the highest similarity:

    my $threshold = identity_threshold( $evaluated[0]->[0],
                                        length( $evaluated[0]->[1]->[2] ),
                                        $std_dev
                                      );
    my $top_hit = $evaluated[0]->[1]->[0];

    #  Filter sequences for those that pass similarity threshold.
    #  Give them clustal-friendly names.

    my $s;
    $id = "seq000001";
    my @relevant = map  { [ $id++, "", $_->[1]->[2] ] }
                   grep { ( $_->[0] >= $threshold ) }
                   @evaluated;

    #  Do the profile alignment:

    my $profile = "$tmp_dir/add_to_alignment_tmp_${$}_1.fasta";
    my $seqfile = "$tmp_dir/add_to_alignment_tmp_${$}_2.fasta";
    my $outfile = "$tmp_dir/add_to_alignment_tmp_${$}_1.aln";
    my $dndfile = "$tmp_dir/add_to_alignment_tmp_${$}_1.dnd";

    write_fasta_file(  \@relevant, 0, $profile );
    write_fasta_file( [ $clnseq ], 0, $seqfile );
    #
    #  I would have thought that the profile tree file should be -newtree1, but
    #  that fails.  -newtree works fine at putting the file where we want it.
    #  Perhaps it would have made more sense to do a cd to the desired directory
    #  first.
    #
    system "$ext_bin/clustalw -profile1=$profile -profile2=$seqfile -outfile=$outfile -newtree=$dndfile -profile -maxdiv=0 -outorder=input > /dev/null";
    my @relevant_aligned = map { $_->[2] } read_clustal_file( $outfile );
    system "/bin/rm -f $profile $seqfile $outfile $dndfile";

    my $ali_seq = pop @relevant_aligned;

    #  Figure out where the gaps were added to the existing alignment:

    my ( $i, $j, $c );
    my $jmax = length( $relevant_aligned[0] ) - 1;
    my @rel_seqs = map { $_->[2] } @relevant; # Save a level of referencing;
    my @to_add = ();

    for ( $i = $j = 0; $j <= $jmax; $j++ ) {
	$c = same_col( \@rel_seqs, $i, \@relevant_aligned, $j ) ? "x" : "-";
	push @to_add, $c;
	if ( $c ne "-" ) { $i++ }
    }
    my $mask = join( "", @to_add );

    if (0)
    {
	print STDERR "max identity = $evaluated[0]->[0]\n";
	print STDERR "threshold    = $threshold identity\n";
	# foreach ( @evaluated ) { print STDERR "$_->[1]->[0] has $_->[0] identity score\n" }
	foreach ( @relevant_aligned ) { print STDERR "$_\n" }
	print STDERR "$ali_seq\n";
	print STDERR "$mask\n\n";
    }

    #  Time to expand the sequences; we will respect case and non-standard
    #  gap characters.  We will add new sequence immediately following the
    #  top_hit.

    my $def;
    my @new_align = ();

    foreach my $entry ( @$ali )
    {
	( $id, $def, $s ) = @$entry;
	push @new_align, [ $id, $def, expand_seq( $s, \@to_add ) ];
	if ( $id eq $top_hit )
	{
	    ( $id, $def, $s ) = @$seq;
	    push @new_align, [ $id, $def, expand_seq( $s, [split //, $ali_seq] ) ];
	}
    }

    wantarray ? @new_align : \@new_align;
}


sub read_fasta_file {
    my ( $file ) = @_;
    my ( $id, $rest, @lines, $line );
    my @seqs = ();

    my $fh;
    if ( $file ) { open( $fh, "<$file" ) || die "could not open $file" }
    else         { $fh = *STDIN }

    $line = <$fh>;
    while ( defined( $line ) && ( ( $id, $rest ) = $line =~ /^>(\S+)\s*(.*)$/ ) )
    {
	$id   =~ s/\;$//;
	$rest =~ s/\s+$//;

	@lines = ();
	while ( defined( $line = <$fh> ) && ( $line !~ /^>/ ) )
	{
	    $line =~ s/\s+//g;
	    push @lines, uc $line;
	}
	push @seqs, [ $id, $rest, join( "", @lines ) ];
    }
    if ( $file ) { close $fh }

    wantarray ? @seqs : \@seqs;
}


sub write_fasta_file {
    my ( $seqs, $compress, $file ) = @_;
    my ( $outfil, $entry, $id, $def, $seq );
 
    if ( $file )
    {
	open( $outfil, ">$file" ) || die "could not open $file for writing";
    }
    else
    {
	$outfil = *STDOUT;
    }

    foreach $entry ( @$seqs )
    {
	defined( $id  = $entry->[0] ) || confess "missing seq id";
	$def = $entry->[1];
	defined( $seq = $entry->[2] ) || confess "missing seq data";
	if ( $compress ) { $seq =~ s/[-~.]+//g }
	print $outfil ">$id", $def ? " $def" : "", "\n$seq\n";
    }
    if ( $file ) { close $outfil }
}


sub read_clustal_file {
    my ( $file ) = @_;

    my ( %seq, @ids, $line, $fh );

    if ( $file ) { open( $fh, "<$file" ) || die "could not open $file" }
    else         { $fh = *STDIN }

    while ( defined( $line = <$fh> ) )
    {
	( $line =~ /^[A-Za-z0-9]/ ) or next;
	chomp $line;
	my @flds = split /\s+/, $line;
	if ( @flds == 2 )
	{
	    $seq{ $flds[0] } or push @ids, $flds[0];
	    push @{ $seq{ $flds[0] } }, $flds[1];
	}
    }
    if ( $file ) { close $fh }

    my @seqs = map { [ $_, "", join( "", @{$seq{$_}} ) ] } @ids;
    wantarray ? @seqs : \@seqs;
}


sub clean_for_clustal {
    my $seq = uc shift;
    $seq =~ s/U/C/g;          # Sec -> Cys (well, for proteins)
    $seq =~ s/\*/X/g;         # Clustal chokes on *
    $seq =~ s/[^A-Z]/-/g;     # Nonstandard gaps
    $seq
}


sub fract_identity {
    my ( $seq1, $seq2 ) = @_;
    my ( $s1, $s2, $i, $same );

    my $infile  = "add_to_alignment_tmp_${$}.fasta";
    my $outfile = "add_to_alignment_tmp_${$}.aln";
    my $dndfile = "add_to_alignment_tmp_${$}.dnd";

    write_fasta_file( [ [ "seq1", "", $seq1->[2] ], [ "seq2", "", $seq2->[2] ] ], 1, $infile );
    system "$ext_bin/clustalw -infile=$infile -outfile=$outfile -newtree=$dndfile -align -maxdiv=0 > /dev/null;";
    ( $s1, $s2 ) = map { $_->[2] } read_clustal_file( $outfile );  # just seqs
    system "/bin/rm -f $infile $outfile $dndfile";

    for ( $i = length( $s1 ) - 1, $same = 0; $i >= 0; $i-- )
    {
	if ( substr($s1, $i, 1) eq substr($s2, $i, 1) ) { $same++ }
    }
    return ( $same / length( $s1 ) );
}
	

sub identity_threshold {
    my ( $maxsim, $seqlen, $z ) = @_;
    my ( $p, $sigma, $step );

    $p = $maxsim / 2;
    $step = $p / 2;
    while ( $step > 0.0005 ) {
	$sigma = sqrt( $p * (1 - $p) / $seqlen );
	$p += ( $p + ( $z * $sigma ) < $maxsim ) ? $step : (- $step);
	$step /= 2;
    }
    return $p - $z * $sigma;
}


#  The logic used here to optimize identification of "same" column depends
#  on the fact that only the second alignment ($y) has new columns, and they
#  are all gaps.  Therefore any non-gap character in alignment $y indicates
#  that it is not a column of added gaps (it must match).  After learning
#  that alignment $y has a gap, then we only need test $x for a gap.

sub same_col {
    my ( $x, $colx, $y, $coly ) = @_;
    my ( $seq, $seqmax, $cy );

    $seqmax = @$x - 1;
    for ( $seq = 0; $seq <= $seqmax; $seq++ )
    {
	if ( substr($y->[$seq], $coly, 1) ne "-" ) { return 1 } # Non-gap in aligned
	if ( substr($x->[$seq], $colx, 1) ne "-" ) { return 0 } # Unmatched gap
    }
    return 1;
}


#  Sequence expansion respects the local gap character.  $c0 and $c1 are the
#  previous and next characters.  (($c0,$c1)=($c1,shift @s1))[0] updates the
#  values and evaluates to what was the next character, and becomes the new
#  previous character.
#
#  The following really does print w, x, y, and z, one per line: 
#
#  ( $a, $b, @c ) = ("", split //, "wxyz");
#  while ( $b ) { print( (($a,$b)=($b,shift @c))[0], "\n" ) }

sub expand_seq {
    my ( $seq, $to_add ) = @_;

    my ( $c0, $c1, @s1 ) = ( "", split( //, $seq ), "" );
    my @s2 = map { $_ ne "-"                ? (($c0,$c1)=($c1,shift @s1))[0]
                 : $c0 eq "~" || $c1 eq "~" ? "~"
                 : $c0 eq "." || $c1 eq "." ? "."
                 :                            "-"
                 } @$to_add;

    join "", @s2;
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3