[Bio] / FigKernelPackages / FS_RAST.pm Repository:
ViewVC logotype

View of /FigKernelPackages/FS_RAST.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.18 - (download) (as text) (annotate)
Fri Sep 12 01:47:15 2008 UTC (11 years, 5 months ago) by overbeek
Branch: MAIN
CVS Tags: rast_2008_0924, rast_rel_2008_09_30, mgrast_rel_2008_0924, mgrast_rel_2008_0625, rast_rel_2008_10_09, rast_release_2008_09_29, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29
Changes since 1.17: +2 -2 lines
fixes to FIGV.pm, CompareMR.pm and FIG.pm to support comparison of metabolic reconstructions

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

package FS_RAST;
use FIG;

use gjoseqlib;
use strict;
use Tracer;
&ETracing;    ### initializes Tracing
#
# export TRACING=RossO     <<< to turn on tracing, do this from command line
# trace 3

use find_special_proteins;

use Data::Dumper;

#===============================================================================
#  Use a FIGfam (set of representatives and set of family members) as the basis
#  for searching for an instance in a given set of contigs.  The thing that makes
#  it special is that it is tolerant of frameshifts.
#
#      @instances  = find_and_correct( \%params )
#
#  Required parameters:
#
#      contigs    => \@contigs
#      family     => \@family_members
#
#  Optional paramaters:
#
#      reps       => \@representatives
#      is_init    => \@initiator_triplets                ( D = [ATG,GTG] )
#      is_alt     => \@second_choice_initiator_triplets  ( D = [TTG] )
#      is_term    => \@terminator_triplets               ( D = [TAA,TAG,TGA] )
#      tmp        =>  $directory           directory for tmp_dir
#      tmp_dir    =>  $directory           directory for temporary files
#
#  RETURNS:  a list of hashes.  Each hash is an "instance" with
#         $instance->{location}    => location
#         $instance->{translation} => translation
#         $instance->{annotation}  => annotation describing alterations
#         $instance->{genomes}     => pointer to a list of the closest genomes [unsorted, up to 10]
#==================================================================================================

sub find_and_correct
{
    my ( $params ) = @_;
    my @instances = ();

    my ( $tmp_dir, $save_tmp ) = &find_special_proteins::temporary_directory( $params );

    my $contigs = $params->{ contigs }
        or return ();

    ( ref( $contigs ) eq 'ARRAY' ) and ( @$contigs )
        or return ();

    #  Make a hash to find contig data by id:
    my %contig = map { $_->[0] => $_ } @$contigs;

    my $family = $params->{ family }
        or return ();

    ( ref( $family ) eq 'ARRAY' ) and ( @$family )
        or return ();

    my $reps = $params->{ reps };
    if (! $reps) { $reps = $family }

    ( ref( $reps ) eq 'ARRAY' ) and ( @$reps )
        or return ();

    my $suffix = $$ . "_" . sprintf( "%09d", int( 1e9 * rand() ) );
    my $contig_file = "$tmp_dir/contigs_$suffix";
    gjoseqlib::print_alignment_as_fasta( $contig_file, $contigs );
    -f $contig_file or return ();

    my $family_file = "$tmp_dir/family_$suffix";
    gjoseqlib::print_alignment_as_fasta( $family_file, $family );
    -f $family_file or return ();
    $params->{family_file} = $family_file;

    my $reps_file = "$tmp_dir/reps_$suffix";
    gjoseqlib::print_alignment_as_fasta( $reps_file, $reps );
    -f $reps_file or return ();

    my $ext_bin = $FIG_Config::ext_bin;
    my($formatdb,$blastall);
    if (! ($formatdb = $params->{formatdb}))
    {
	$formatdb   = "$ext_bin/formatdb";
	$params->{formatdb} = $formatdb;
    }

    if (! ($blastall = $params->{blastall}))
    {
	$blastall   = "$ext_bin/blastall";
	$params->{blastall}  = $blastall;
    }

    system "cd '$tmp_dir'; $formatdb -p f -i 'contigs_$suffix'; $formatdb -p t -i 'family_$suffix'";

    my @search_out = map { $_ =~ /^(\S+)\t(\S+)\t(\S+\t){4}(\d+)\t(\d+)\t(\d+)\t(\d+)/; [$1,$2,$4,$5,$6,$7] }
                     `$blastall -i $reps_file -d $contig_file -FF -p tblastn -m 8 -e 1.0e-5`;

#   print STDERR &Dumper(\@search_out);
    my @groups = &cluster_blast(\@search_out);

    my $group;
    my %seen;
    foreach $group (@groups)
    {
	my($loc,$translation,$genomes,$annotation,$complete_contig) = 
	    &process_group($params,$group,$family_file,$contigs);
	if (defined($loc) && (! $seen{$loc}))
	{
	    $seen{$loc} = 1;

	    my $instance = {};
	    $instance->{location} = $loc;
	    $instance->{translation} = $translation;
	    $instance->{genomes} = $genomes;
	    $instance->{annotation} = $annotation ? $annotation : "";
	    push(@instances,$instance);
	}
    }
    system "/bin/rm -r $tmp_dir" if ! $save_tmp;
    return @instances;
}

sub process_group {
    my($params,$group,$familyDB,$contigs) = @_;
    my($i);

    my @search_out = @$group;
    my $contig = $search_out[0]->[1];
    my $first_piece = shift @search_out;
    my @pieces = ($first_piece);

    my $piece;
    my $p1;
    for ($p1=0; ($p1 < @search_out) && (! &close_to_picked(\@pieces,$search_out[$p1])); $p1++) {}
    while ($p1 < @search_out)
    {
	push(@pieces,splice(@search_out,$p1,1));
	for ($p1=0; ($p1 < @search_out) && (! &close_to_picked(\@pieces,$search_out[$p1])); $p1++) {}
    }
    @pieces = &remove_embedded(\@pieces);

###
### You have now taken the best hit piece and found others to tack on the ends.
###
    my $minD = 1000000000;
    my $minP = 100000;
    my $maxD = 0;
    my $maxP = 0;
    foreach $piece (@pieces)
    {
	$minD = &FIG::min($minD,$piece->[4],$piece->[5]);
	$maxD = &FIG::max($maxD,$piece->[4],$piece->[5]);

	$minP = &FIG::min($minP,$piece->[2],$piece->[3]);
	$maxP = &FIG::max($maxP,$piece->[2],$piece->[3]);
    }

###
### We need to now extract the boundaries for the region to be blasted back against all of the members of the family
###
    my $reps = $params->{ reps };
    my $start_adj = 3 * ($minP - 1) + 1000;
    for ($i=0; ($i < @$reps) && ($reps->[$i]->[0] ne $pieces[0]->[0]); $i++) {}
    my $end_adj = ((length($reps->[$i]->[2]) - $maxP) * 3) + 1000;
    Trace &Dumper([$contig,"minD=$minD","maxD=$maxD","minP=$minP","maxP=$maxP","start_adjust=$start_adj","end_adjust=$end_adj"]) if T(3);

    for ($i=0; (($i < @$contigs) && $contigs->[$i]->[0] ne $contig); $i++) {}

    if ($i < @$contigs)
    {
	my $complete_contig = $contigs->[$i]->[2];
	my $start = &FIG::max(1,($minD-$start_adj));
	my $end   = &FIG::min(length($contigs->[$i]->[2]),$maxD+$end_adj);
	my $dna = substr($contigs->[$i]->[2],$start-1,($end+1-$start));
        my $frag = [$contig,$start,$end,$dna];     			      
        Trace &Dumper(['fragment from tblastn: ',$frag]) if T(3);
###
### $frag is a 4-tuple that describes the piece of DNA from the contig that we will use to blast against
### members of the family
###
	return (&best_match_in_family($params,$frag),$complete_contig);
    }
    return undef;
}

sub make_ali {
    my($untranslated,$translation) = @_;

    my $tmpD = "$FIG_Config::temp/dna$$";
    my $tmpP = "$FIG_Config::temp/prot$$";
    my $tmpA = "$FIG_Config::temp/ali$$";
    
    my $alignment = "";
    open(TMP,">$tmpD") || die "could not open $tmpD";
    print TMP ">dna\n$untranslated\n";
    close(TMP);

    open(TMP,">$tmpP") || die "could not open $tmpP";
    print TMP ">prot\n$translation\n";
    close(TMP);
    system "nap $tmpD $tmpP 9 $FIG_Config::global/nap-matrix 10 1 > /dev/null 2> $tmpA";
    my @tmp = `cat $tmpA`;
    splice(@tmp,0,4);
    $alignment .= join("",@tmp);
    unlink($tmpD,$tmpP,$tmpA);
    return $alignment;
}

sub best_match_in_family {
    my($params,$frag) = @_;

#   print STDERR &Dumper($frag);
    my $tmp_dir = $params->{tmpdir};
    my $save_tmp = 1;

    if (! $tmp_dir)
    {
	( $tmp_dir, $save_tmp ) = &find_special_proteins::temporary_directory( $params );
    }

    if (! $params->{ family_file })
    {
	my $family = $params->{ family }
           or return undef;

	( ref( $family ) eq 'ARRAY' ) and ( @$family )
	    or return undef;
	my $suffix = $$ . "_" . sprintf( "%09d", int( 1e9 * rand() ) );
	my $family_file = "$tmp_dir/family_$suffix";
	gjoseqlib::print_alignment_as_fasta( $family_file, $family );
	-f $family_file or return undef;
	$params->{family_file} = $family_file;

	my $ext_bin = $FIG_Config::ext_bin;
	my($formatdb,$blastall);
	if (! ($formatdb = $params->{formatdb}))
	{
	    $formatdb   = "$ext_bin/formatdb";
	    $params->{formatdb} = $formatdb;
	}
	system "cd '$tmp_dir'; $formatdb -p t -i 'family_$suffix'";

	if (! ($blastall = $params->{blastall}))
	{
	    $blastall   = "$ext_bin/blastall";
	    $params->{blastall}  = $blastall;
	}
    }

    my $tmpF = "$tmp_dir/fragment.fasta";
    my $dna = $frag->[3];
    open(DNA,">$tmpF") || die "could not open $tmpF";
    print DNA ">frag\n$dna\n";
    close(DNA);

    my $family_file = $params->{family_file};
    my $blastall    = $params->{blastall};

#    my @tmp = `$blastall -i $tmpF -d $family_file -FF -p blastx -m 8 -g F -e 1.0e-10`;
#    print STDERR join("",@tmp),"\n";

    my @search_out = map { $_ =~ /^(\S+)\t(\S+)\t(\S+\t){4}(\d+)\t(\d+)\t(\d+)\t(\d+)/; [$2,$1,$6,$7,$4,$5] }
                    `$blastall -i $tmpF -d $family_file -FF -p blastx -m 8 -g F -e 1.0e-10`;
###
### We just used blastx to try to find the most similar members from the family
###
    my $genomes = {};
    my $sim;
    for (my $simI = 0; ($simI < @search_out) && ($simI < 10); $simI++)
    {
	$sim = $search_out[$simI];
	$genomes->{&FIG::genome_of($sim->[0])}++;   ### this just records the most similar genomes
    }

    my @groups = &cluster_blast(\@search_out);

###
### @search_out now has the hits against the best member of the family
###

    my($groupI,$loc,$pieces_of_untrans,$untranslated,$translation,$annotation);
    $groupI = 0;
    while (($groupI < @groups) && (! defined($loc)))
    {
	my $group = $groups[$groupI];

	my @search_out = @$group;
	my $first_piece = shift @search_out;
	my @pieces = ($first_piece);
	my $p1;
	for ($p1=0; ($p1 < @search_out) && (! &close_to_picked(\@pieces,$search_out[$p1])); $p1++) {}
	while ($p1 < @search_out)
	{
	    push(@pieces,splice(@search_out,$p1,1));
	    for ($p1=0; ($p1 < @search_out) && (! &close_to_picked(\@pieces,$search_out[$p1])); $p1++) {}
	}
	@pieces = &remove_embedded(\@pieces);
	my @sorted = sort { $a->[2] <=> $b->[2] } @pieces;
###
### again we piece together the set of (perhaps) frameshifted pieces (sorted by the beginning protein position)
###
# 	print STDERR &Dumper(\@sorted); 
#       die "aborted";

	if (&ok_seq(\@sorted) && &extend_to_start_stop(\@sorted,$frag,$params))
	{
	    ($pieces_of_untrans,$untranslated,$translation) = &make_translation(\@sorted,$frag);
	    $loc         = &make_loc(\@sorted,$frag,&stops($params));
#
#           The untranslated is now chunks corresponding to the translation pieces.
#           I am now adding just the DNA of the region, since that is what should be
#           aligned
#
	    my $begD = $sorted[0]->[4];
	    my $endD = $sorted[$#sorted]->[5];
	    my $untranslated_region = ($begD < $endD) ? 
                                      substr($frag->[3],$begD-1,($endD-$begD)+1) :
		                      &FIG::reverse_comp(substr($frag->[3],$endD-1,($begD-$endD)+1));
#
	    $annotation = "";

	    if (@sorted > 1)
	    {
		my $n = @sorted - 1;
		$annotation = "We believe that there may have been frameshifts and embedded stop codons, " .
			      " and we attempted to correct the translation.\n  Hence, the translation does not agree perfectly with the corresponding region of DNA.\n\n";
		my $alignment = &make_ali($untranslated_region,$translation);
		$annotation .= $alignment;
		my $guide_peg = $sorted[0]->[0];
		$annotation .= "\n==========\nNow we show an alignment against a similar protein ($guide_peg):\n\n";
		my $fig = new FIG;
		my $trans = $fig->get_translation($guide_peg);
		if (abs(length($trans) - length($translation)) > 5) 
		{ 
		    undef $loc ;
		}
		else
		{
		    $alignment = &make_ali($untranslated_region,$trans);
		    $annotation .= $alignment;
		}
	    }
	}
	$groupI++;
    }
    system "/bin/rm -r $tmp_dir" if ! $save_tmp;
    return $loc ? ($loc,$translation,[keys(%$genomes)],$annotation) : undef;
}

sub stops {
    my($param) = @_;
    my($code);

    if     ($param->{is_term}) { return $param->{is_term} }
    elsif  (($code = $param->{code}) && ($code == 4)) { return ['taa','tag'] }
    return ['taa','tga','tag'];
}

sub ok_seq {
    my($pieces) = @_;
    my($i);

    if (@$pieces < 1) { return 0 }
    if ($pieces->[0]->[2] > 10) { return 0 }
    my $strand = ($pieces->[0]->[4] < $pieces->[0]->[5]) ? "+" : "-";
    for ($i=0; ($i < (@$pieces-1)) && &ok_seq1($strand,$pieces->[$i]->[5],$pieces->[$i+1]->[4],$pieces->[$i+1]->[5]); $i++) {}
    return $i == (@$pieces - 1);
}

sub ok_seq1 {
    my($strand,$e1,$b2,$e2) = @_;

    if (($strand eq "+") && ($b2 > ($e1 + 9))) { return 0 }
    if (($strand eq "-") && ($b2 < ($e1 - 9))) { return 0 }
    return ($strand eq "+") ? ($b2 < $e2) : ($e2 < $b2);
}

### This routine attemts to construct the appropriate location, even in the presence
### of frameshifts.  $pieces describes the regions of similarity that got patched together.
### $pieces_of_untrans give the pieces of DNA that go into the untranslated string.  $frag
### allows you to map everything back to the full contig and the BEG/END region on it.

sub make_loc {
    my($pieces,$frag,$stopsL) = @_;

    my @stop_codons = $stopsL ? @$stopsL : ('taa','tga','tag');
    my $stops = {};
    foreach $_ (@stop_codons) { $stops->{lc $_} = $stops->{uc $_} = 1; }

    my($contig,$beg,$end,$dna) = @$frag;
    my @locs1 = &locs_from_pieces($pieces);
    my @locs2 = &remove_embedded_stops(\@locs1,$dna,$stops);
    my @locs3 = &shift_coords(\@locs2,$beg,$end);

    return join(",",map { join("_",($contig,@$_)) } @locs3);
}

sub locs_from_pieces {
    my($pieces) = @_;

    return ($pieces->[0]->[4] < $pieces->[0]->[5]) ? 
	   &locs_from_piecesP($pieces) :
	   &locs_from_piecesM($pieces);
}

sub locs_from_piecesP {
    my($pieces) = @_;

    my @locs = ();

    my @coords = map { [$_->[4],$_->[5]] } @$pieces;
    my $i = 0;
    while ($i < @coords)
    {
	if ($i == (@coords - 1))
	{
	    push(@locs,$coords[$i]);
	    $i++;
	}
	else
	{
	    while ($coords[$i+1]->[0] <= $coords[$i]->[1])
	    {
		$coords[$i+1]->[0] += 3;
	    }

	    my $shift = ($coords[$i+1]->[0] - ($coords[$i]->[1]+1)) % 3;

	    if ($shift == 0)
	    {
		$coords[$i+1]->[0] = $coords[$i]->[0];
		$i++;
	    }
	    elsif ($shift == 1)
	    {
		push(@locs,[$coords[$i]->[0],$coords[$i+1]->[0] - 2]);
		$i++;
	    }
	    else  # $shift == 2
	    {
		push(@locs,[$coords[$i]->[0],$coords[$i+1]->[0]]);
		$i++;
	    }
	}
    }
    return @locs;
}

sub locs_from_piecesM {
    my($pieces) = @_;

    my @locs = ();

    my @coords = map { [$_->[4],$_->[5]] } @$pieces;
    my $i = 0;
    while ($i < @coords)
    {
	if ($i == (@coords - 1))
	{
	    push(@locs,$coords[$i]);
	    $i++;
	}
	else
	{
	    while ($coords[$i+1]->[0] >= $coords[$i]->[1])
	    {
		$coords[$i+1]->[0] -= 3;
	    }

	    my $shift = (($coords[$i]->[1] - 1) - $coords[$i+1]->[0]) % 3;

	    if ($shift == 0)
	    {
		$coords[$i+1]->[0] = $coords[$i]->[0];
		$i++;
	    }
	    elsif ($shift == 1)
	    {
		push(@locs,[$coords[$i]->[0],$coords[$i+1]->[0] + 2]);
		$i++;
	    }
	    else  # $shift == 2
	    {
		push(@locs,[$coords[$i]->[0],$coords[$i+1]->[0]]);
		$i++;
	    }
	}
    }
    return @locs;
}

sub remove_embedded_stops {
    my($locs,$dna,$stops) = @_;
    my($i);

    my @cleaned = ();
    for ($i=0; ($i < @$locs); $i++)
    {
	push(@cleaned,&clean_span($locs->[$i],$dna,$stops,($i == (@$locs - 1))));
    }
    return @cleaned;
}

##
sub clean_span {
    my($loc,$dna,$stops,$last_piece) = @_;

    my($b,$e) = @$loc;
    return ($b < $e) ? &clean_spanP($b,$e,$dna,$stops,$last_piece) :
	               &clean_spanM($b,$e,$dna,$stops,$last_piece);
}

sub clean_spanP {
    my($b,$e,$dna,$stops,$last_piece) = @_;

    my @span = ();
    my $i = $b;
    my $t = $last_piece ? $e-3 : $e;
    while ($i < $t)
    {
	my $codon = substr($dna,$i-1,3);
	if ($stops->{$codon})
	{
	    if ($b == $i)
	    {
		$b += 3;
	    }
	    else
	    {
		push(@span,[$b,$i-1]);
		$b = $i + 3;
	    }
	}
	$i += 3;
    }
    if ($b < $e)
    {
	push(@span,[$b,$e]);
    }
    return @span;
}

sub clean_spanM {
    my($b,$e,$dna,$stops,$last_piece) = @_;

    my @span = ();
    my $i = $b;
    my $t = $last_piece ? $e+3 : $e;
    while ($i > $t)
    {
	my $codon = &FIG::reverse_comp(substr($dna,$i-3,3));
	if ($stops->{$codon})
	{
	    if ($b == $i)
	    {
		$b -= 3;
	    }
	    else
	    {
		push(@span,[$b,$i+1]);
		$b = $i - 3;
	    }
	}
	$i -= 3;
    }
    if ($b > $e)
    {
	push(@span,[$b,$e]);
    }
    return @span;
}

sub shift_coords {
    my($locs,$beg,$end) = @_;
    my $loc;

    my $dir = ($beg < $end) ? 1 : -1;
    my @shifted = ();
    foreach $loc (@$locs)
    {
	my($b,$e) = @$loc;
	push(@shifted,[$beg + ($dir * ($b-1)), $beg + ($dir * ($e-1))]);
    }
    return @shifted;
}

sub extend_to_start_stop {
    my($sorted,$frag,$params) = @_;

    my $opt = {};
    $opt->{is_init} = $params->{is_init} ? $params->{is_init} : ['ATG','GTG'];
    $opt->{is_alt}  = $params->{is_alt}  ? $params->{is_alt}  : ['TTG'];

    if ($params->{code} == 4) 
    {
	$opt->{'is_term'} = [ qw( TAA TAG ) ];
    }
    else
    {
	$opt->{'is_term'} = $params->{is_term} ? $params->{is_term} : [ qw( TAA TAG TGA ) ];
    }

    my($contig,$beg,undef,$dna) = @$frag;
    my($start,$rcS) = &find_special_proteins::find_orf_start(\$dna,$sorted->[0]->[4],$sorted->[0]->[5],$opt);

    my $pieceL = $sorted->[@$sorted - 1];
    my($end,$rcE) = &find_special_proteins::find_orf_end(\$dna,$pieceL->[4],$pieceL->[5],$opt);

    if (defined($start) && defined($end) && (! $rcS) && (! $rcE))
    {
	$sorted->[0]->[4] = $start;
	$pieceL->[5] = $end;
	return 1;
    }
    return 0;
}

sub start {
    my($codon) = @_;

    return ($codon =~ /^(atg|gtg|ttg)$/i);
}

sub startC {
    my($codon) = @_;

    return ($codon =~ /^(cat|cac|caa)$/i);
}

sub stop {
    my($codon) = @_;
    return ($codon =~ /^(taa|tag|tga)$/i);
}

sub stopC {
    my($codon) = @_;
    return ($codon =~ /^(tta|cta|tca)$/i);
}

sub make_translation {
    my($sorted,$frag) = @_;

    my $i;
    for ($i=0; ($i < (@$sorted - 1)); $i++)
    {
	my $db1 = $sorted->[$i]->[4];
	my $de1 = $sorted->[$i]->[5];

	my $db2 = $sorted->[$i+1]->[4];
	my $de2 = $sorted->[$i+1]->[5];

	if ($db1 < $de1)
	{
	    if ($de1 >= $db2)
	    {
		my $de1_new = $de1 - 3;
		while ($de1_new >= $db2) { $de1_new -= 3 }
		my $db2_new = $db2 + 3;
		while ($db2_new <= $de1) { $db2_new += 3 }
		$sorted->[$i]->[5]   = $de1_new;
		$sorted->[$i+1]->[4] = $db2_new;
	    }
	}
	else
	{
	    if ($de1 <= $db2)
	    {
		my $de1_new = $de1 + 3;
		while ($de1_new <= $db2) { $de1_new += 3 }
		my $db2_new = $db2 - 3;
		while ($db2_new >= $de1) { $db2_new -= 3 }
		$sorted->[$i]->[5]   = $de1_new;
		$sorted->[$i+1]->[4] = $db2_new;
	    }
	}
    }
    my @pieces_of_trans = ();
    my @pieces_of_untrans = ();
    my $dna = $frag->[3];
#   print STDERR &Dumper(['sorted best hit',$sorted,$dna]);
    Trace &Dumper(['sorted best hit',$sorted,$dna]) if T(3);

    for ($i=0; ($i < (@$sorted - 1)); $i++)
    {
	&add_trans(\@pieces_of_trans,\@pieces_of_untrans,$sorted->[$i],$dna,($i == 0));
	&add_gap(\@pieces_of_trans,\@pieces_of_untrans,$sorted->[$i]->[5],$sorted->[$i+1]->[4],$dna);
    }
    &add_trans(\@pieces_of_trans,\@pieces_of_untrans,$sorted->[$i],$dna,($i == 0));
#   print STDERR &Dumper(['pieces',\@pieces_of_trans,\@pieces_of_untrans]);

    my $tran    = join("",@pieces_of_trans);
    my $untran  = join("",@pieces_of_untrans);
    while ($tran =~ /\*[^\n]/)    ### This nonsense is needed to keep *s at the end of a sequence
    {
	$tran =~ s/\*/X/;
    }
    $tran =~ s/\*$//;
    return (\@pieces_of_untrans,$untran,$tran);
}

sub add_gap {
    my($pieces_of_trans,$pieces_of_untrans,$end_of_last,$beg_of_next,$dna) = @_;
    my $gap_dna;
    if ($end_of_last < $beg_of_next)
    {
	$gap_dna = substr($dna,$end_of_last,($beg_of_next - $end_of_last) - 1);
    }
    else
    {
	$gap_dna = &FIG::reverse_comp(substr($dna,$beg_of_next,($end_of_last - $beg_of_next) - 1));
    }
    my $gap = abs($beg_of_next - $end_of_last) - 1;
    push(@$pieces_of_trans,"x" x int(($gap+1) / 3));
    push(@$pieces_of_untrans,$gap_dna);
}
    

sub add_trans {
    my($pieces_of_trans,$pieces_of_untrans,$piece,$dna,$fix_start) = @_;

#   print STDERR &Dumper(['piece in add_trans',$piece,$dna]);

    Trace &Dumper(['piece in add_trans',$piece,$dna]) if T(3);
    my $db = $piece->[4];
    my $de = $piece->[5];

    my $dna_to_tran;
    if ($db < $de)
    {
	$dna_to_tran = substr($dna,$db-1,($de+1-$db));
	if ($fix_start && ($dna_to_tran =~ /[gt]tg/i))
	{
	    substr($dna_to_tran,0,1) = 'A';
	}
    }
    else
    {
	$dna_to_tran = &FIG::reverse_comp(substr($dna,$de-1,($db+1-$de)));
	if ($fix_start && ($dna_to_tran =~ /[gt]tg/i))
	{
	    substr($dna_to_tran,0,1) = 'A';
	}
    }
    push(@$pieces_of_trans,&FIG::translate($dna_to_tran));
    push(@$pieces_of_untrans,$dna_to_tran);
    Trace &Dumper(["trans: ",$dna_to_tran,$pieces_of_trans->[@$pieces_of_trans - 1]]) if T(3);
}

sub remove_embedded {
    my($pieces) = @_;

    my @clean = ();
    my @pieces = sort { $a->[2] <=> $b->[2] } @$pieces;
    foreach my $piece (@pieces)
    {
	if ((@clean == 0) || ($clean[$#clean]->[3] < $piece->[3]))
	{
	    push(@clean,$piece);
	}
    }
    return @clean;
}

sub close_to_picked {
    my($pieces,$piece) = @_;

    my $b1 = $piece->[4];
    my $e1 = $piece->[5];
    my $i;
    for ($i=0; ($i < @$pieces) && (! &ok($b1,$e1,$pieces->[$i]->[4],$pieces->[$i]->[5])); $i++) {}
    return ($i < @$pieces);
}

sub ok {
    my($b1,$e1,$b2,$e2) = @_;

    if ($b1 < $e1) 
    {
	return (($b2 < $e2) && (&FIG::min(abs($b2-$e1),abs($b1-$e2)) < 15))
    }
    else
    {
	return (($b2 > $e2) && (&FIG::min(abs($b2-$e1),abs($b1-$e2)) < 15));
    }
}

sub cluster_blast {
    my($blast_out) = @_;

    my @sets = ();
    my $i = 0;
    while ($i < @$blast_out)
    {
	my $set = [$blast_out->[$i]];
	$i++;
	while (($i < @$blast_out) && ($blast_out->[$i]->[0] eq $set->[0]->[0]) && 
	       ($blast_out->[$i]->[1] eq $set->[0]->[1]) &&
	       (abs((($blast_out->[$i]->[4] + $blast_out->[$i]->[5]) / 2) - (($set->[0]->[4] + $set->[0]->[5]) / 2)) < 8000))
	{
	    push(@$set,$blast_out->[$i]);
	    $i++;
	}
	push(@sets,$set);
    }
    return &condense(\@sets);
}

sub condense {
    my($sets) = @_;
    my($i);

    my @keep = ();
    foreach my $set (@$sets)
    {
	for ($i=0; ($i < @keep) && (! &same_region($set,$keep[$i])); $i++) {}
	if ($i == @keep)
	{
	    push(@keep,$set);
	}
    }
    return @keep;
}

sub same_region {
    my($x,$y) = @_;

    my $min1 = &min_hit($x);
    my $max1 = &max_hit($x);
    my $min2 = &min_hit($y);
    my $max2 = &max_hit($y);
    return &FIG::between($min1,$min2,$max1) || &FIG::between($min2,$min1,$max2);
}

sub min_hit {
    my($x) = @_;

    my $min = 100000000;
    foreach my $x1 (@$x)
    {
	$min = &FIG::min($min,$x1->[4],$x1->[5]);
    }
    return $min;
}

sub max_hit {
    my($x) = @_;

    my $max = 0;
    foreach my $x1 (@$x)
    {
	$max = &FIG::max($max,$x1->[4],$x1->[5]);
    }
    return $max;
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3