[Bio] / FigKernelScripts / insert.pl Repository:
ViewVC logotype

View of /FigKernelScripts/insert.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Mon Dec 5 18:56:37 2005 UTC (13 years, 11 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, caBIG-05Apr06-00, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, caBIG-13Feb06-00, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.1: +17 -0 lines
Add license words.

#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#

# usage: insert Ali Tree Id [Weights] > new_tree

use tree_utilities;
use Carp;
use Data::Dumper;
use fastDNAml;

$usage = "usage: insert Ali Tree Id Weights > new_tree";

$size_rep = 50;

($ali = shift @ARGV) 
    || die "bad alignment: $usage";

if ((! -s "$ali.index") || ((-M $ali) < (-M "$ali.index")))
{
    (system("index_fasta $ali > $ali.index") == 0)
	|| die "could not make $ali.index";
}
open(INDEX,"<$ali.index")
    || die "could not open $ali.index";
while (defined($_ = <INDEX>))
{
    chop;
    ($id,$seek,$ln) = split(/\t/,$_);
    $seek_of{$id} = [$seek,$ln];
}
close(INDEX);
open(ALI,"<$ali") || die "could not open $ali";

(($tree_file = shift @ARGV) && (-s $tree_file))
    || die "missing tree: $usage";

($id_to_ins = shift @ARGV)
    || die "missing id to insert: $usage";

if (!($weights = shift @ARGV)) { $weights = ""; }

($tree = &uproot(&parse_newick_tree(join("",`cat \"$tree_file\"`))))
    || die "could not parse the tree";
$big_indexes = &tree_index_tables($tree);

##############  get the tree ##########
$tips    = &tips_of_tree($tree);
@in_tree = ();
foreach $x (@$tips)
{
    $in_tree{$x} = 1;
}


if ($in_tree{$id_to_ins})
{
    die "$id_to_ins is already in the tree";
}

$bad = 0;
foreach $id (@$tips)
{
    if ((! &seq_of($id,\%seek_of,\*ALI)) && (! &seq_of("\'$id\'",\%seek_of,\*ALI)))
    {
	$bad = 1;
	warn "The tree contains $id, but it is not in the alignment\n";
    }
}
(! $bad) || die "Could not insert - alignment is missing sequences";

print STDERR "insert: best_hits $id_to_ins $ali\n";
@closest = `best_hits $id_to_ins $ali`;
chop @closest;
foreach $x (@closest)
{
    $x = &printable_to_label($x);
    $in{$x} = 1;
}

$rep_tree = &representative_by_size($tree,$size_rep);
$rep_tips = &tips_of_tree($rep_tree);
foreach $x (@$rep_tips)
{
    $in{$x} = 1;
}

$subtree = &subtree($tree,\%in);
$x       = &display_tree($subtree,{});
print STDERR "Initial Representative Tree:\n\n$x\n";

##################################################
@seen = ();

while (! &seen($subtree))
{
    ($where,$len_to_add) = &sequence_ins_point($subtree,$id_to_ins,\%seek_of,\*ALI);
#    print STDERR "node1=@{$where->[0]} node2=@{$where->[1]} frac=$where->[2] len_to_add=$len_to_add\n";
    $subtree = &neighborhood_of_tree_point($big_indexes,$where,$size_rep);
    $x       = &display_tree($subtree,{});
#   print STDERR "new neighborhood\n";
#   print STDERR $x;
}
$new_tree = &to_newick(&add_branch($big_indexes,$where,$len_to_add,$id_to_ins));
print "$new_tree\n";
print STDERR "successfully inserted $id_to_ins\n";

sub seen {
    my($tree) = @_;
    my($tips,@sorted,$key);

    $tips = &tips_of_tree($tree);
    @sorted = sort @$tips;
    $key = join(" ",@sorted);

    if (! $seen{$key})
    {
	$seen{$key} = 1;
	return 0;
    }
    return 1;
}

sub add_branch {
    my($tabP,$where,$len_to_add,$id) = @_;
    my($nd1,$nd2,$node1,$node2,$fract,$node,$desc);

    ($node1,$node2,$fract) = @$where;
    $nd1 = &locate_node($node1,$tabP);
    $nd2 = &locate_node($node2,$tabP);
    $tree = &root_tree_between_nodes($nd1,$nd2,$fract,$tabP);
    $node = [$id,$len_to_add,[$tree]];
    $desc = $tree->[2];
    push(@$desc,$node);
    return $tree;
}

sub sequence_ins_point {
    my($tree,$id,$seek_of,$ali_fh) = @_;
    my($id_seqs,$tips,$seq1,@weights_and_rates,@weights,$column_weights,$ncats);
    my(@cats,$cats,@lines,$new_trees,$x,$id1);
    my($indexes,$at_node,$sz1,$sz2,$len_to_add,$where);
    my($tips1,$tips2,$node1,$node2,$d1,$d2,$frac);

    $id_seqs = [];
    $tips = &tips_of_tree($tree);
    foreach $id1 (@$tips)
    {
	$seq1 = &seq_of($id1,$seek_of,$ali_fh);
	$seq1 || confess "could not locate sequence for $id1";
	push(@$id_seqs,[$id1,$seq1]);
    }
    $seq1 = &seq_of($id,\%seek_of,\*ALI);
    $seq1 || confess "could not locate sequence for $id";
    push(@$id_seqs,[$id,$seq1]);

    if ($weights)
    {
	@weights_and_rates = `cat $weights`;
	@weights = ();
	while (($_ = shift @weights_and_rates) && ($_ !~ /^C/))
	{
	    push(@weights,$_);
	}
	$column_weights = join("",@weights);
	$column_weights =~ s/^Weights//;
	$column_weights =~ s/\s//g;
    
	if ($_ =~ /^C\s+(\d+)\s+(\S.*\S)\s*$/)
	{
	    $ncats = $1;
	    @cats = ($2);
	    while (($_ = shift @weights_and_rates) && ($_ !~ /^Categories/))
	    {
		push(@cats,$_);
	    }
	
	    $cats = join("",@cats);
	    $cats =~ s/\s+$//;
	    $cats =~ s/^\s+//;
	    @cats = split(/\s+/,$cats);
	    (@cats == $ncats) || die "category mismatch - BUG @cats";
	
	    $_ =~ s/^Categories\s+//;
	    $_ =~ s/\s//g;
	    @lines = ($_);
	    while ($_ = shift @weights_and_rates)
	    {
		$_ =~ s/\s//g;
		push(@lines,$_);
	    }
	    $column_cats = join("",@lines);
	}
	else
	{
	    die "bad weights and rates file: missing category data";
	}
    
	$new_trees = &fastDNAml($id_seqs,
				[
				 ["categories",\@cats,$column_cats],
				 ["weights",$column_weights],
				 ["fast_add"],
				 ["restart",$subtree],
				 ["global",0,0]
				 ]);
    }
    else
    {
	$new_trees = &fastDNAml($id_seqs,[["fast_add"],["restart",$subtree],["global",0,0]]);
    }
    
    (@$new_trees == 1) || die "Bad value back from fastDNAml";
    $new_tree = $new_trees->[0]->[1];
    $x       = &display_tree($new_tree,{});
    print STDERR "Tree returned by fastDNAml\n";
    print STDERR $x;

    $indexes = &tree_index_tables($new_tree);
    $at_node = &root_tree_at_node($indexes,$id);

#   &print_tree($at_node);

    $sz1 = &tree_length($tree);
    $sz2 = &tree_length($at_node->[2]->[1]) - $at_node->[2]->[1]->[1];
    if (($sz1 >= 0) && ($sz2 > 0))
    {
	$len_to_add = $at_node->[2]->[1]->[1] * ($sz1 / $sz2);
    }
    else
    {
	$len_to_add = 0;
    }
#   print STDERR "sz1=$sz1 sz2=$sz2 len_to_add=$len_to_add\n";
    $descendant1 = $at_node->[2]->[1]->[2]->[1];
    $descendant2 = $at_node->[2]->[1]->[2]->[2];
    $node1 = &rooted_identify($descendant1);
    $node2 = &rooted_identify($descendant2);
    if (@$node1 == 2) { push(@$node1,$node2->[0]); }
    if (@$node2 == 2) { push(@$node2,$node1->[0]); }

    $d1 = $descendant1->[1];
    $d2 = $descendant2->[1];
    if (($d1  + $d2 ) > 0)
    {
	$frac = ($d1 / ($d1+$d2));
    }
    else
    {
	$frac = 0.5;
    }
#    print STDERR "node1=@$node1 node2=@$node2 frac=$frac\n";
    $where = [$node1,$node2,$frac];
#
# Set $where
#
    return ($where,$len_to_add);
}

sub tree_length {
    my($tree) = @_;
    my($desc,$ln,$i);

    $desc = $tree->[2];
    for ($ln=0,$i=1; ($i <= $#{$desc}); $i++)
    {
	$ln += &tree_length($desc->[$i]) + $desc->[$i]->[1];
    }
    return $ln;
}

sub rooted_identify {
    my($tree) = @_;

    $cc = $tree->[2];
    if (@$cc == 1) { return [$tree->[0]]; }
    if (@$cc == 2) { return ""; }
    return [&node_of_tree($cc->[1]),&node_of_tree($cc->[2])];
}

sub node_of_tree {
    my($tree) = @_;

    while ($#{$tree->[2]} > 0)
    {
	$tree = $tree->[2]->[1];
    }
    return $tree->[0];
}

sub seq_of {
    my($id,$seek_of,$ali_fh) = @_;
    my($x,$seek,$len,$seq);

    if ($x = $seek_of->{$id})
    {
	($seek,$len) = @$x;
	seek($ali_fh,$seek,0);
	read($ali_fh,$seq,$len);
	$seq =~ s/\s//g;
	$seq =~ tr/a-z/A-Z/;
	$seq =~ tr/U/T/;
	$seq =~ s/[~.]/\?/g;
	return $seq;
    }
    die "invalid seq id=$id";
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3