1 |
# |
# |
2 |
# Copyright (c) 2003-2006 University of Chicago and Fellowship |
# Copyright (c) 2003-2007 University of Chicago and Fellowship |
3 |
# for Interpretations of Genomes. All Rights Reserved. |
# for Interpretations of Genomes. All Rights Reserved. |
4 |
# |
# |
5 |
# This file is part of the SEED Toolkit. |
# This file is part of the SEED Toolkit. |
109 |
# set_newick_desc_list( $noderef, @desclist ) |
# set_newick_desc_list( $noderef, @desclist ) |
110 |
# set_newick_desc_i( $noderef1, $i, $noderef2) |
# set_newick_desc_i( $noderef1, $i, $noderef2) |
111 |
# |
# |
112 |
|
# $bool = newick_is_valid( $noderef ) # verify that tree is valid |
113 |
|
# |
114 |
# $bool = newick_is_rooted( $noderef ) # 2 branches from root |
# $bool = newick_is_rooted( $noderef ) # 2 branches from root |
115 |
# $bool = newick_is_unrooted( $noderef ) # 3 or more branches from root |
# $bool = newick_is_unrooted( $noderef ) # 3 or more branches from root |
116 |
# $bool = newick_is_tip_rooted( $noderef ) # 1 branch from root |
# $bool = newick_is_tip_rooted( $noderef ) # 1 branch from root |
130 |
# ( $tipref, $xmax ) = newick_most_distant_tip_ref( $noderef ) |
# ( $tipref, $xmax ) = newick_most_distant_tip_ref( $noderef ) |
131 |
# ( $tipname, $xmax ) = newick_most_distant_tip( $noderef ) |
# ( $tipname, $xmax ) = newick_most_distant_tip( $noderef ) |
132 |
# |
# |
133 |
|
# Tree tip insertion point (tip is on branch of length x that |
134 |
|
# is inserted into branch connecting node1 and node2, a distance |
135 |
|
# x1 from node1 and x2 from node2): |
136 |
|
# |
137 |
|
# [ $node1, $x1, $node2, $x2, $x ] |
138 |
|
# = newick_tip_insertion_point( $tree, $tip ) |
139 |
|
# |
140 |
|
# Standardized label for a node in terms of intersection of 3 lowest sorting |
141 |
|
# tips (sort is lower case): |
142 |
|
# |
143 |
|
# @TipOrTips = std_node_name( $Tree, $Node ) |
144 |
|
# |
145 |
#------------------------------------------------------------------------------- |
#------------------------------------------------------------------------------- |
146 |
# Paths from root of tree: |
# Paths from root of tree: |
147 |
#------------------------------------------------------------------------------- |
#------------------------------------------------------------------------------- |
171 |
# $treecopy = copy_newick_tree( $tree ) |
# $treecopy = copy_newick_tree( $tree ) |
172 |
# |
# |
173 |
#------------------------------------------------------------------------------- |
#------------------------------------------------------------------------------- |
174 |
# The following modify the existing tree, and passibly any components of that |
# The following modify the existing tree, and possibly any components of that |
175 |
# tree that are reached by reference. If the old version is still needed, copy |
# tree that are reached by reference. If the old version is still needed, copy |
176 |
# before modifying. |
# before modifying. |
177 |
#------------------------------------------------------------------------------- |
#------------------------------------------------------------------------------- |
190 |
# $n_changed = newick_fix_negative_branches( $tree ) |
# $n_changed = newick_fix_negative_branches( $tree ) |
191 |
# $node = newick_rescale_branches( $node, $factor ) |
# $node = newick_rescale_branches( $node, $factor ) |
192 |
# |
# |
193 |
|
# Modify comments: |
194 |
|
# |
195 |
|
# $node = newick_strip_comments( $node ) |
196 |
|
# |
197 |
# Modify rooting and/or order: |
# Modify rooting and/or order: |
198 |
# |
# |
199 |
# $nrmtree = normalize_newick_tree( $tree ) |
# $nrmtree = normalize_newick_tree( $tree ) |
218 |
# |
# |
219 |
# $newtree = collapse_zero_length_branches( $tree ) |
# $newtree = collapse_zero_length_branches( $tree ) |
220 |
# |
# |
221 |
|
# $node = newick_insert_at_node( $node, $subtree ) |
222 |
|
# $tree = newick_insert_between_nodes( $tree, $subtree, $node1, $node2, $fraction ) |
223 |
|
# |
224 |
#=============================================================================== |
#=============================================================================== |
225 |
# Tree reading and writing: |
# Tree reading and writing: |
226 |
#=============================================================================== |
#=============================================================================== |
241 |
#=============================================================================== |
#=============================================================================== |
242 |
|
|
243 |
|
|
244 |
|
use Carp; |
245 |
|
use Data::Dumper; |
246 |
|
|
247 |
require Exporter; |
require Exporter; |
248 |
|
|
249 |
our @ISA = qw(Exporter); |
our @ISA = qw(Exporter); |
251 |
overbeek_to_gjonewick |
overbeek_to_gjonewick |
252 |
gjonewick_to_overbeek |
gjonewick_to_overbeek |
253 |
|
|
254 |
|
newick_is_valid |
255 |
newick_is_rooted |
newick_is_rooted |
256 |
newick_is_unrooted |
newick_is_unrooted |
257 |
tree_rooted_on_tip |
tree_rooted_on_tip |
291 |
newick_fix_negative_branches |
newick_fix_negative_branches |
292 |
newick_rescale_branches |
newick_rescale_branches |
293 |
|
|
294 |
|
newick_strip_comments |
295 |
|
|
296 |
normalize_newick_tree |
normalize_newick_tree |
297 |
reverse_newick_tree |
reverse_newick_tree |
298 |
std_unrooted_newick |
std_unrooted_newick |
314 |
newick_subtree |
newick_subtree |
315 |
collapse_zero_length_branches |
collapse_zero_length_branches |
316 |
|
|
317 |
|
newick_insert_at_node |
318 |
|
newick_insert_between_nodes |
319 |
|
|
320 |
writeNewickTree |
writeNewickTree |
321 |
fwriteNewickTree |
fwriteNewickTree |
322 |
strNewickTree |
strNewickTree |
429 |
#------------------------------------------------------------------------------- |
#------------------------------------------------------------------------------- |
430 |
|
|
431 |
sub newick_desc_ref { $_[0]->[0] } # = ${$_[0]}[0] |
sub newick_desc_ref { $_[0]->[0] } # = ${$_[0]}[0] |
432 |
sub newick_lbl { $_[0]->[1] } |
sub newick_lbl { ref($_[0]) ? $_[0]->[1] : Carp::confess() } |
433 |
sub newick_x { $_[0]->[2] } |
sub newick_x { $_[0]->[2] } |
434 |
sub newick_c1 { $_[0]->[3] } |
sub newick_c1 { $_[0]->[3] } |
435 |
sub newick_c2 { $_[0]->[4] } |
sub newick_c2 { $_[0]->[4] } |
504 |
#=============================================================================== |
#=============================================================================== |
505 |
# Some tree property tests: |
# Some tree property tests: |
506 |
#=============================================================================== |
#=============================================================================== |
507 |
|
# Tree is valid? |
508 |
|
# |
509 |
|
# $bool = newick_is_valid( $node, $verbose ) |
510 |
|
#------------------------------------------------------------------------------- |
511 |
|
sub newick_is_valid |
512 |
|
{ |
513 |
|
my $node = shift; |
514 |
|
|
515 |
|
if ( ! array_ref( $node ) ) |
516 |
|
{ |
517 |
|
print STDERR "Node is not array reference\n" if $_[0]; |
518 |
|
return 0; |
519 |
|
} |
520 |
|
|
521 |
|
my @node = @$node; |
522 |
|
if ( ! @node ) |
523 |
|
{ |
524 |
|
print STDERR "Node is empty array reference\n" if $_[0]; |
525 |
|
return 0; |
526 |
|
} |
527 |
|
|
528 |
|
# Must have descendant or label: |
529 |
|
|
530 |
|
if ( ! ( array_ref( $node[0] ) && @{ $node[0] } ) && ! $node[2] ) |
531 |
|
{ |
532 |
|
print STDERR "Node has neither descendant nor label\n" if $_[0]; |
533 |
|
return 0; |
534 |
|
} |
535 |
|
|
536 |
|
# If comments are present, they must be array references |
537 |
|
|
538 |
|
foreach ( ( @node > 3 ) ? @node[ 3 .. $#node ] : () ) |
539 |
|
{ |
540 |
|
if ( defined( $_ ) && ! array_ref( $_ ) ) |
541 |
|
{ |
542 |
|
print STDERR "Node has neither descendant or label\n" if $_[0]; |
543 |
|
return 0; |
544 |
|
} |
545 |
|
} |
546 |
|
|
547 |
|
# Inspect the descendants: |
548 |
|
|
549 |
|
foreach ( array_ref( $node[0] ) ? @{ $node[0] } : () ) |
550 |
|
{ |
551 |
|
newick_is_valid( $_, @_ ) || return 0 |
552 |
|
} |
553 |
|
|
554 |
|
return 1; |
555 |
|
} |
556 |
|
|
557 |
|
|
558 |
|
#------------------------------------------------------------------------------- |
559 |
# Tree is rooted (2 branches at root node)? |
# Tree is rooted (2 branches at root node)? |
560 |
# |
# |
561 |
# $bool = newick_is_rooted( $node ) |
# $bool = newick_is_rooted( $node ) |
841 |
|
|
842 |
|
|
843 |
#------------------------------------------------------------------------------- |
#------------------------------------------------------------------------------- |
844 |
|
# Tree tip insertion point (with standard node labels): |
845 |
|
# |
846 |
|
# [ $node1, $x1, $node2, $x2, $x ] |
847 |
|
# = newick_tip_insertion_point( $tree, $tip ) |
848 |
|
# |
849 |
|
# Which means: tip is on a branch of length x that is inserted into the branch |
850 |
|
# connecting node1 and node2, at distance x1 from node1 and x2 from node2. |
851 |
|
# |
852 |
|
# x1 +------ n1a (lowest sorting tip of this subtree) |
853 |
|
# +--------n1 |
854 |
|
# | +------n1b (lowest sorting tip of this subtree) |
855 |
|
# tip-------n |
856 |
|
# x | +------------- n2a (lowest sorting tip of this subtree) |
857 |
|
# +------n2 |
858 |
|
# x2 +-------- n2b (lowest sorting tip of this subtree) |
859 |
|
# |
860 |
|
# The designations of 1 vs 2, and a vs b are chosen such that: |
861 |
|
# n1a < n1b, and n2a < n2b, and n1a < n2a |
862 |
|
# |
863 |
|
# Then the statandard description becomes: |
864 |
|
# |
865 |
|
# [ [ $n1a, min(n1b,n2a), max(n1b,n2a) ], x1, |
866 |
|
# [ $n2a, min(n2b,n1a), max(n2b,n1a) ], x2, |
867 |
|
# x |
868 |
|
# ] |
869 |
|
# |
870 |
|
#------------------------------------------------------------------------------- |
871 |
|
sub newick_tip_insertion_point |
872 |
|
{ |
873 |
|
my ( $tree, $tip ) = @_; |
874 |
|
$tree && $tip && ref( $tree ) eq 'ARRAY' or return undef; |
875 |
|
$tree = copy_newick_tree( $tree ) or return undef; |
876 |
|
$tree = reroot_newick_to_tip( $tree, $tip ) or return undef; |
877 |
|
my $node = $tree; |
878 |
|
|
879 |
|
my $x = 0; # Distance to node |
880 |
|
my $dl = newick_desc_ref( $node ); # Descendent list of tip node; |
881 |
|
$node = $dl->[0]; # Node adjacent to tip |
882 |
|
$dl = newick_desc_ref( $node ); |
883 |
|
while ( $dl && ( @$dl == 1 ) ) # Traverse unbranched nodes |
884 |
|
{ |
885 |
|
$node = $dl->[0]; |
886 |
|
$x += newick_x( $node ); |
887 |
|
$dl = newick_desc_ref( $node ); |
888 |
|
} |
889 |
|
$x += newick_x( $node ); |
890 |
|
|
891 |
|
# We are now at the node that is the insertion point. |
892 |
|
# Is it a tip? |
893 |
|
|
894 |
|
my @description; |
895 |
|
|
896 |
|
if ( ( ! $dl ) || @$dl == 0 ) |
897 |
|
{ |
898 |
|
@description = ( [ newick_lbl( $node ) ], 0, undef, 0, $x ); |
899 |
|
} |
900 |
|
|
901 |
|
# Is it a trifurcation or greater, in which case it does not go |
902 |
|
# away with tip deletion? |
903 |
|
|
904 |
|
elsif ( @$dl > 2 ) |
905 |
|
{ |
906 |
|
@description = ( [ std_node_name( $node, $node ) ], 0, undef, 0, $x ); |
907 |
|
} |
908 |
|
|
909 |
|
# The node is bifurcating. We need to describe it. |
910 |
|
|
911 |
|
else |
912 |
|
{ |
913 |
|
my ( $n1, $x1 ) = describe_desc( $dl->[0] ); |
914 |
|
my ( $n2, $x2 ) = describe_desc( $dl->[1] ); |
915 |
|
|
916 |
|
if ( @$n1 == 2 ) { push @$n1, $n2->[0] } |
917 |
|
if ( @$n2 == 2 ) |
918 |
|
{ |
919 |
|
@$n2 = sort { lc $a cmp lc $b } ( @$n2, $n1->[0] ); |
920 |
|
} |
921 |
|
if ( @$n1 == 3 ) { @$n2 = sort { lc $a cmp lc $b } @$n2 } |
922 |
|
@description = ( $n1, $x1, $n2, $x2, $x ); |
923 |
|
} |
924 |
|
|
925 |
|
return wantarray ? @description : \@description; |
926 |
|
} |
927 |
|
|
928 |
|
|
929 |
|
sub describe_desc |
930 |
|
{ |
931 |
|
my $node = shift; |
932 |
|
|
933 |
|
my $x = 0; # Distance to node |
934 |
|
my $dl = newick_desc_ref( $node ); # Descendent list of tip node; |
935 |
|
while ( $dl && ( @$dl == 1 ) ) # Traverse unbranched nodes |
936 |
|
{ |
937 |
|
$node = $dl->[0]; |
938 |
|
$x += newick_x( $node ); |
939 |
|
$dl = newick_desc_ref( $node ); |
940 |
|
} |
941 |
|
$x += newick_x( $node ); |
942 |
|
|
943 |
|
# Is it a tip? Return list of one tip; |
944 |
|
|
945 |
|
if ( ( ! $dl ) || @$dl == 0 ) |
946 |
|
{ |
947 |
|
return ( [ newick_lbl( $node ) ], $x ); |
948 |
|
} |
949 |
|
|
950 |
|
# Get tips of each descendent, keeping lowest sorting from each. |
951 |
|
# Return the two lowest of those (the third will come from the |
952 |
|
# other side of the original node). |
953 |
|
|
954 |
|
else |
955 |
|
{ |
956 |
|
my @rep_tips = sort { lc $a cmp lc $b } |
957 |
|
map { ( sort { lc $a cmp lc $b } newick_tip_list( $_ ) )[0] } |
958 |
|
@$dl; |
959 |
|
return ( [ @rep_tips[0,1] ], $x ); |
960 |
|
} |
961 |
|
} |
962 |
|
|
963 |
|
|
964 |
|
#------------------------------------------------------------------------------- |
965 |
# Standard node name: |
# Standard node name: |
966 |
# Tip label if at a tip |
# Tip label if at a tip |
967 |
# Three sorted tip labels intersecting at node, each being smallest |
# Three sorted tip labels intersecting at node, each being smallest |
984 |
# Work through lists of tips in descendant subtrees, removing them from |
# Work through lists of tips in descendant subtrees, removing them from |
985 |
# @rest, and keeping the best tip for each subtree. |
# @rest, and keeping the best tip for each subtree. |
986 |
|
|
987 |
my @rest = tips_in_newick( $tree ); |
my @rest = newick_tip_list( $tree ); |
988 |
my @best = map { |
my @best = map { |
989 |
my @tips = sort { lc $a cmp lc $b } tips_in_newick( $_ ); |
my @tips = sort { lc $a cmp lc $b } newick_tip_list( $_ ); |
990 |
@rest = gjolists::set_difference( \@rest, \@tips ); |
@rest = gjolists::set_difference( \@rest, \@tips ); |
991 |
$tips[0]; |
$tips[0]; |
992 |
} newick_desc_list( $noderef ); |
} newick_desc_list( $noderef ); |
1180 |
|
|
1181 |
array_ref( $node ) && defined( $node1 ) |
array_ref( $node ) && defined( $node1 ) |
1182 |
&& defined( $node2 ) || return undef; |
&& defined( $node2 ) || return undef; |
1183 |
my @p1 = path_to_node( $node, $node1 ); |
my @p1 = path_to_node( $node, $node1 ) or return undef; |
1184 |
my @p2 = path_to_node( $node, $node2 ); |
my @p2 = path_to_node( $node, $node2 ) or return undef; |
|
@p1 && @p2 || return undef; # Were they found? |
|
1185 |
|
|
1186 |
# Find the unique suffixes of the two paths |
# Find the unique suffixes of the two paths |
1187 |
my ( $suf1, $suf2 ) = gjolists::unique_suffixes( \@p1, \@p2 ); # Common node is lost |
my ( $suf1, $suf2 ) = gjolists::unique_suffixes( \@p1, \@p2 ); # Common node is lost |
1460 |
|
|
1461 |
|
|
1462 |
#------------------------------------------------------------------------------- |
#------------------------------------------------------------------------------- |
1463 |
|
# Remove comments from a newick tree (e.g., before writing for phylip). |
1464 |
|
# |
1465 |
|
# $node = newick_strip_comments( $node ) |
1466 |
|
#------------------------------------------------------------------------------- |
1467 |
|
sub newick_strip_comments { |
1468 |
|
my ( $node ) = @_; |
1469 |
|
|
1470 |
|
@$node = @$node[ 0 .. 2 ]; |
1471 |
|
foreach ( newick_desc_list( $node ) ) { newick_strip_comments( $_ ) } |
1472 |
|
$node; |
1473 |
|
} |
1474 |
|
|
1475 |
|
|
1476 |
|
#------------------------------------------------------------------------------- |
1477 |
# Normalize tree order (in place). |
# Normalize tree order (in place). |
1478 |
# |
# |
1479 |
# ( $tree, $label1 ) = normalize_newick_tree( $tree ) |
# ( $tree, $label1 ) = normalize_newick_tree( $tree ) |
1717 |
# |
# |
1718 |
# splice( @$dl1, $path1-1, 1 ); |
# splice( @$dl1, $path1-1, 1 ); |
1719 |
# |
# |
1720 |
# But this maintains the cyclic order of the nodes: |
# But the following maintains the cyclic order of the nodes: |
1721 |
|
|
1722 |
my $dl1 = newick_desc_ref( $node1 ); |
my $dl1 = newick_desc_ref( $node1 ); |
1723 |
my $nd1 = @$dl1; |
my $nd1 = @$dl1; |
1732 |
|
|
1733 |
my $dl2 = newick_desc_ref( $node2 ); |
my $dl2 = newick_desc_ref( $node2 ); |
1734 |
if ( array_ref( $dl2 ) ) { push @$dl2, $node1 } |
if ( array_ref( $dl2 ) ) { push @$dl2, $node1 } |
1735 |
else { set_newick_desc_list( $node2, [ $node1 ] ) } |
else { set_newick_desc_list( $node2, $node1 ) } |
1736 |
|
|
1737 |
# Move c1 comments from node 1 to node 2: |
# Move c1 comments from node 1 to node 2: |
1738 |
|
|
2144 |
( $tree, ( $collapse ? @new_desc : () ) ); |
( $tree, ( $collapse ? @new_desc : () ) ); |
2145 |
} |
} |
2146 |
|
|
2147 |
|
#------------------------------------------------------------------------------- |
2148 |
|
# Add a subtree to a newick tree node: |
2149 |
|
# |
2150 |
|
# $node = newick_insert_at_node( $node, $subtree ) |
2151 |
|
#------------------------------------------------------------------------------- |
2152 |
|
sub newick_insert_at_node |
2153 |
|
{ |
2154 |
|
my ( $node, $subtree ) = @_; |
2155 |
|
array_ref( $node ) && array_ref( $subtree ) or return undef; |
2156 |
|
|
2157 |
|
# We could check validity of trees, but .... |
2158 |
|
|
2159 |
|
my $dl = newick_desc_ref( $node ); |
2160 |
|
if ( array_ref( $dl ) ) |
2161 |
|
{ |
2162 |
|
push @$dl, $subtree; |
2163 |
|
} |
2164 |
|
else |
2165 |
|
{ |
2166 |
|
set_newick_desc_ref( $node, [ $subtree ] ); |
2167 |
|
} |
2168 |
|
return $node; |
2169 |
|
} |
2170 |
|
|
2171 |
|
|
2172 |
|
#------------------------------------------------------------------------------- |
2173 |
|
# Insert a subtree into a newick tree along the path between 2 nodes: |
2174 |
|
# |
2175 |
|
# $tree = newick_insert_between_nodes( $tree, $subtree, $node1, $node2, $fraction ) |
2176 |
|
#------------------------------------------------------------------------------- |
2177 |
|
sub newick_insert_between_nodes |
2178 |
|
{ |
2179 |
|
my ( $tree, $subtree, $node1, $node2, $fraction ) = @_; |
2180 |
|
array_ref( $tree ) && array_ref( $subtree ) or return undef; |
2181 |
|
$fraction >= 0 && $fraction <= 1 or return undef; |
2182 |
|
|
2183 |
|
# Find the paths to the nodes: |
2184 |
|
|
2185 |
|
my @path1 = path_to_node( $tree, $node1 ) or return undef; |
2186 |
|
my @path2 = path_to_node( $tree, $node2 ) or return undef; |
2187 |
|
|
2188 |
|
# Trim the common prefix: |
2189 |
|
|
2190 |
|
while ( $path1[1] == $path2[1] ) |
2191 |
|
{ |
2192 |
|
splice( @path1, 0, 2 ); |
2193 |
|
splice( @path2, 0, 2 ); |
2194 |
|
} |
2195 |
|
|
2196 |
|
my ( @path, $dist ); |
2197 |
|
if ( @path1 < 3 ) |
2198 |
|
{ |
2199 |
|
@path2 >= 3 or return undef; # node1 = node2 |
2200 |
|
$dist = $fraction* newick_path_length( @path2 ); |
2201 |
|
@path = @path2; |
2202 |
|
} |
2203 |
|
elsif ( @path2 < 3 ) |
2204 |
|
{ |
2205 |
|
$dist = ( 1 - $fraction ) * newick_path_length( @path1 ); |
2206 |
|
@path = @path1; |
2207 |
|
} |
2208 |
|
else |
2209 |
|
{ |
2210 |
|
my $dist1 = newick_path_length( @path1 ); |
2211 |
|
my $dist2 = newick_path_length( @path2 ); |
2212 |
|
$dist = $fraction * ( $dist1 + $dist2 ) - $dist1; |
2213 |
|
@path = ( $dist <= 0 ) ? @path1 : @path2; |
2214 |
|
$dist = abs( $dist ); |
2215 |
|
} |
2216 |
|
|
2217 |
|
# Descend tree until we reach the insertion branch: |
2218 |
|
|
2219 |
|
my $x; |
2220 |
|
while ( ( $dist > ( $x = newick_x( $path[2] ) ) ) && ( @path > 3 ) ) |
2221 |
|
{ |
2222 |
|
$dist -= $x; |
2223 |
|
splice( @path, 0, 2 ); |
2224 |
|
} |
2225 |
|
|
2226 |
|
# Insert the new node: |
2227 |
|
|
2228 |
|
set_newick_desc_i( $path[0], $path[1], [ [ $path[2], $subtree ], undef, $dist ] ); |
2229 |
|
set_newick_x( $path[2], ( ( $x > $dist ) ? ( $x - $dist ) : 0 ) ); |
2230 |
|
|
2231 |
|
return $tree; |
2232 |
|
} |
2233 |
|
|
2234 |
|
|
2235 |
#------------------------------------------------------------------------------- |
#------------------------------------------------------------------------------- |
2236 |
# Prune one or more tips from a tree: |
# Prune one or more tips from a tree: |