88 |
# \@packed_seqs = pack_alignment( @seqs ) |
# \@packed_seqs = pack_alignment( @seqs ) |
89 |
# \@packed_seqs = pack_alignment( \@seqs ) |
# \@packed_seqs = pack_alignment( \@seqs ) |
90 |
# |
# |
91 |
|
# Pack mask for an alignment (gap = 0x00, others are 0xFF) |
92 |
|
# |
93 |
|
# $mask = alignment_gap_mask( @seqs ) |
94 |
|
# $mask = alignment_gap_mask( \@seqs ) |
95 |
|
# |
96 |
|
# Pack a sequence alignment according to a mask: |
97 |
|
# |
98 |
|
# @packed = pack_alignment_by_mask( $mask, @align ) |
99 |
|
# @packed = pack_alignment_by_mask( $mask, \@align ) |
100 |
|
# \@packed = pack_alignment_by_mask( $mask, @align ) |
101 |
|
# \@packed = pack_alignment_by_mask( $mask, \@align ) |
102 |
|
# |
103 |
|
# Expand sequence by a mask, adding indel at "\000" or '-' in mask: |
104 |
|
# |
105 |
|
# $expanded = expand_sequence_by_mask( $seq, $mask ) |
106 |
|
# |
107 |
# Remove all alignment gaps from sequences: |
# Remove all alignment gaps from sequences: |
108 |
# |
# |
109 |
# @packed_seqs = pack_sequences( @seqs ) |
# @packed_seqs = pack_sequences( @seqs ) # Works for one sequence, too |
110 |
# @packed_seqs = pack_sequences( \@seqs ) |
# @packed_seqs = pack_sequences( \@seqs ) |
111 |
# \@packed_seqs = pack_sequences( @seqs ) |
# \@packed_seqs = pack_sequences( @seqs ) |
112 |
# \@packed_seqs = pack_sequences( \@seqs ) |
# \@packed_seqs = pack_sequences( \@seqs ) |
219 |
seq_data_by_id |
seq_data_by_id |
220 |
|
|
221 |
pack_alignment |
pack_alignment |
222 |
|
alignment_gap_mask |
223 |
|
pack_alignment_by_mask |
224 |
|
expand_sequence_by_mask |
225 |
pack_sequences |
pack_sequences |
226 |
|
|
227 |
subseq_DNA_entry |
subseq_DNA_entry |
959 |
return ${ $ind_ref->{$id} }[2]; |
return ${ $ind_ref->{$id} }[2]; |
960 |
} |
} |
961 |
|
|
962 |
|
|
963 |
#----------------------------------------------------------------------------- |
#----------------------------------------------------------------------------- |
964 |
# Remove columns of alignment gaps from sequences: |
# Remove columns of alignment gaps from sequences: |
965 |
# |
# |
968 |
# \@packed_seqs = pack_alignment( @seqs ) |
# \@packed_seqs = pack_alignment( @seqs ) |
969 |
# \@packed_seqs = pack_alignment( \@seqs ) |
# \@packed_seqs = pack_alignment( \@seqs ) |
970 |
# |
# |
971 |
|
# Gap characters are defined below as '-', '~', '.', and ' '. |
972 |
#----------------------------------------------------------------------------- |
#----------------------------------------------------------------------------- |
|
|
|
973 |
sub pack_alignment |
sub pack_alignment |
974 |
{ |
{ |
975 |
my @seqs = ( ref( $_[0] ) eq 'ARRAY' and ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_; |
$_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] ) |
976 |
|
or return (); |
977 |
|
|
978 |
|
my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_; |
979 |
@seqs or return wantarray ? () : []; |
@seqs or return wantarray ? () : []; |
980 |
|
|
981 |
my $mask = pack_mask( $seqs[0]->[2] ); |
my $mask = gap_mask( $seqs[0]->[2] ); |
982 |
foreach ( @seqs[ 1 .. (@seqs-1) ] ) |
foreach ( @seqs[ 1 .. (@seqs-1) ] ) |
983 |
{ |
{ |
984 |
$mask |= pack_mask( $_->[2] ); |
$mask |= gap_mask( $_->[2] ); |
985 |
} |
} |
986 |
|
|
987 |
my $seq; |
my $seq; |
991 |
} |
} |
992 |
@seqs; |
@seqs; |
993 |
|
|
994 |
return wantarray ? @seqs2 : \@seqs2; |
wantarray ? @seqs2 : \@seqs2; |
995 |
|
} |
996 |
|
|
997 |
|
|
998 |
|
#------------------------------------------------------------------------------- |
999 |
|
# Produce a packing mask for columns of alignment gaps in sequences. Gap |
1000 |
|
# columns are 0x00 characters, and all others are 0xFF. |
1001 |
|
# |
1002 |
|
# $mask = alignment_gap_mask( @seqs ) |
1003 |
|
# $mask = alignment_gap_mask( \@seqs ) |
1004 |
|
# |
1005 |
|
#------------------------------------------------------------------------------- |
1006 |
|
sub alignment_gap_mask |
1007 |
|
{ |
1008 |
|
$_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] ) |
1009 |
|
or return undef; |
1010 |
|
|
1011 |
|
my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_; |
1012 |
|
@seqs or return undef; |
1013 |
|
|
1014 |
|
my $mask = gap_mask( $seqs[0]->[2] ); |
1015 |
|
foreach ( @seqs[ 1 .. (@seqs-1) ] ) { $mask |= gap_mask( $_->[2] ) } |
1016 |
|
|
1017 |
|
$mask; |
1018 |
|
} |
1019 |
|
|
1020 |
|
|
1021 |
|
#------------------------------------------------------------------------------- |
1022 |
|
# Pack a sequence alignment according to a mask, removing positions where |
1023 |
|
# mask has 0x00 (or '-') characters |
1024 |
|
# |
1025 |
|
# @packed = pack_alignment_by_mask( $mask, @align ) |
1026 |
|
# @packed = pack_alignment_by_mask( $mask, \@align ) |
1027 |
|
# \@packed = pack_alignment_by_mask( $mask, @align ) |
1028 |
|
# \@packed = pack_alignment_by_mask( $mask, \@align ) |
1029 |
|
# |
1030 |
|
#------------------------------------------------------------------------------- |
1031 |
|
sub pack_alignment_by_mask |
1032 |
|
{ |
1033 |
|
my $mask = shift; |
1034 |
|
defined $mask && ! ref( $mask ) && length( $mask ) |
1035 |
|
or return (); |
1036 |
|
$mask =~ tr/-/\000/; # Allow '-' as a column to be removed |
1037 |
|
$mask =~ tr/\000/\377/c; # Make sure that everything not null is 0xFF |
1038 |
|
|
1039 |
|
$_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] ) |
1040 |
|
or return (); |
1041 |
|
my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_; |
1042 |
|
|
1043 |
|
my $seq; |
1044 |
|
my @seqs2 = map { $seq = $_->[2] & $mask; # Apply mask to sequence |
1045 |
|
$seq =~ tr/\000//d; # Delete null characters |
1046 |
|
[ $_->[0], $_->[1], $seq ] # Rebuild sequence entries |
1047 |
|
} |
1048 |
|
@seqs; |
1049 |
|
|
1050 |
|
wantarray ? @seqs2 : \@seqs2; |
1051 |
|
} |
1052 |
|
|
1053 |
|
|
1054 |
|
#------------------------------------------------------------------------------- |
1055 |
|
# Weight a sequence alignment according to a mask of digits, 0-9. |
1056 |
|
# |
1057 |
|
# @packed = weight_alignment_by_mask( $mask, @align ) |
1058 |
|
# @packed = weight_alignment_by_mask( $mask, \@align ) |
1059 |
|
# \@packed = weight_alignment_by_mask( $mask, @align ) |
1060 |
|
# \@packed = weight_alignment_by_mask( $mask, \@align ) |
1061 |
|
# |
1062 |
|
#------------------------------------------------------------------------------- |
1063 |
|
sub weight_alignment_by_mask |
1064 |
|
{ |
1065 |
|
my $mask = shift; |
1066 |
|
defined $mask && ! ref( $mask ) && length( $mask ) |
1067 |
|
or return (); |
1068 |
|
|
1069 |
|
$_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] ) |
1070 |
|
or return (); |
1071 |
|
my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_; |
1072 |
|
|
1073 |
|
my @seqs2 = map { [ $_->[0], $_->[1], weight_seq_by_mask_0( $mask, $_->[2] ) ] } @seqs; |
1074 |
|
|
1075 |
|
wantarray ? @seqs2 : \@seqs2; |
1076 |
} |
} |
1077 |
|
|
1078 |
sub pack_mask |
|
1079 |
|
# |
1080 |
|
# Assume data are valid |
1081 |
|
# |
1082 |
|
sub weight_seq_by_mask_0 |
1083 |
|
{ |
1084 |
|
my ( $mask, $seq ) = @_; |
1085 |
|
|
1086 |
|
# Remove 0 weight columns, which is fast and easy: |
1087 |
|
my $m0 = $mask; |
1088 |
|
$m0 =~ tr/123456789/\377/; |
1089 |
|
$m0 =~ tr/\377/\000/c; |
1090 |
|
( $mask &= $m0 ) =~ tr/\000//d; |
1091 |
|
( $seq &= $m0 ) =~ tr/\000//d; |
1092 |
|
|
1093 |
|
# If all remaining cols are weight 1, we are done: |
1094 |
|
return $seq if $mask =~ /^1*$/; |
1095 |
|
|
1096 |
|
my @seq; |
1097 |
|
for ( my $i = 0; $i < length( $mask ); $i++ ) |
1098 |
|
{ |
1099 |
|
push @seq, substr( $seq, $i, 1 ) x substr( $mask, $i, 1 ); |
1100 |
|
} |
1101 |
|
|
1102 |
|
join( '', @seq ); |
1103 |
|
} |
1104 |
|
|
1105 |
|
|
1106 |
|
#----------------------------------------------------------------------------- |
1107 |
|
# Make a mask in which gap characters ('-', '~', '.', and ' ') are converted |
1108 |
|
# to 0x00, and all other characters to 0xFF. |
1109 |
|
# |
1110 |
|
# $mask = gap_mask( $seq ) |
1111 |
|
# |
1112 |
|
#----------------------------------------------------------------------------- |
1113 |
|
sub gap_mask |
1114 |
{ |
{ |
1115 |
my $mask = shift; |
my $mask = shift; |
1116 |
$mask =~ tr/-/\000/; |
defined $mask or return ''; |
1117 |
$mask =~ tr/\000/\377/c; |
|
1118 |
return $mask; |
$mask =~ tr/-~. /\000/; # Gap characters (might be extended) |
1119 |
|
$mask =~ tr/\000/\377/c; # Non-gap characters |
1120 |
|
$mask; |
1121 |
|
} |
1122 |
|
|
1123 |
|
|
1124 |
|
#=============================================================================== |
1125 |
|
# Expand sequences with the local gap character in a manner that reverses the |
1126 |
|
# pack by mask function. |
1127 |
|
# |
1128 |
|
# $expanded = expand_sequence_by_mask( $seq, $mask ) |
1129 |
|
# |
1130 |
|
# The columns to be added can be marked by '-' or "\000" in the mask. |
1131 |
|
# |
1132 |
|
# Code note: |
1133 |
|
# |
1134 |
|
# The function attempts to match the local gap character in the sequence. |
1135 |
|
# $c0 and $c1 are the previous and next characters in the sequence being |
1136 |
|
# expanded. (($c0,$c1)=($c1,shift @s1))[0] updates the values and evaluates |
1137 |
|
# to what was the next character, and becomes the new previous character. |
1138 |
|
# The following really does print w, x, y, and z, one per line: |
1139 |
|
# |
1140 |
|
# ( $a, $b, @c ) = ("", split //, "wxyz"); |
1141 |
|
# while ( defined $b ) { print( (($a,$b)=($b,shift @c))[0], "\n" ) } |
1142 |
|
#=============================================================================== |
1143 |
|
|
1144 |
|
sub expand_sequence_by_mask |
1145 |
|
{ |
1146 |
|
my ( $seq, $mask ) = @_; |
1147 |
|
|
1148 |
|
$mask =~ tr/-/\000/; # Allow hyphen or null in mask at added positions. |
1149 |
|
my ( $c0, $c1, @s1 ) = ( '', split( //, $seq ), '' ); |
1150 |
|
|
1151 |
|
join '', map { $_ ne "\000" ? (($c0,$c1)=($c1,shift @s1))[0] |
1152 |
|
: $c0 eq '~' || $c1 eq '~' ? '~' |
1153 |
|
: $c0 eq '.' || $c1 eq '.' ? '.' |
1154 |
|
: '-' |
1155 |
|
} |
1156 |
|
split //, $mask; |
1157 |
} |
} |
1158 |
|
|
1159 |
|
|
1160 |
#----------------------------------------------------------------------------- |
#----------------------------------------------------------------------------- |
1161 |
# Remove all alignment gaps from sequences: |
# Remove all alignment gaps from sequences: |
1162 |
# |
# |
1163 |
# @packed_seqs = pack_sequences( @seqs ) |
# @packed_seqs = pack_sequences( @seqs ) # Also handles single sequence |
1164 |
# @packed_seqs = pack_sequences( \@seqs ) |
# @packed_seqs = pack_sequences( \@seqs ) |
1165 |
# \@packed_seqs = pack_sequences( @seqs ) |
# \@packed_seqs = pack_sequences( @seqs ) |
1166 |
# \@packed_seqs = pack_sequences( \@seqs ) |
# \@packed_seqs = pack_sequences( \@seqs ) |
1167 |
# |
# |
1168 |
#----------------------------------------------------------------------------- |
#----------------------------------------------------------------------------- |
|
|
|
1169 |
sub pack_sequences |
sub pack_sequences |
1170 |
{ |
{ |
1171 |
my @seqs = ( ref( $_[0] ) eq 'ARRAY' and ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0]} : @_; |
$_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] ) |
1172 |
@seqs or return wantarray ? () : []; |
or return (); |
1173 |
|
|
1174 |
|
my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_; |
1175 |
|
|
1176 |
my @seqs2 = map { [ $_->[0], $_->[1], pack_seq( $_->[2] ) ] } @seqs; |
my @seqs2 = map { [ $_->[0], $_->[1], pack_seq( $_->[2] ) ] } @seqs; |
1177 |
|
|
1178 |
return wantarray ? @seqs2 : \@seqs2; |
wantarray ? @seqs2 : \@seqs2; |
1179 |
} |
} |
1180 |
|
|
1181 |
|
|
1182 |
#----------------------------------------------------------------------------- |
#----------------------------------------------------------------------------- |
1183 |
# Some simple sequence manipulations: |
# Some simple sequence manipulations: |
1184 |
# |
# |
1391 |
|
|
1392 |
|
|
1393 |
sub clean_ae_sequence { |
sub clean_ae_sequence { |
1394 |
$_ = shift; |
local $_ = shift; |
1395 |
$_ = to7bit($_); |
$_ = to7bit($_); |
1396 |
s/[+]/1/g; |
s/\+/1/g; |
1397 |
s/[^0-9A-IK-NP-Za-ik-np-z~.-]/-/g; |
s/[^0-9A-IK-NP-Za-ik-np-z~.-]/-/g; |
1398 |
return $_; |
return $_; |
1399 |
} |
} |
1400 |
|
|
1401 |
|
|
1402 |
sub to7bit { |
sub to7bit { |
1403 |
$_ = shift; |
local $_ = shift; |
1404 |
my ($o, $c); |
my ($o, $c); |
1405 |
while (/\\([0-3][0-7][0-7])/) { |
while (/\\([0-3][0-7][0-7])/) { |
1406 |
$o = oct($1) % 128; |
$o = oct($1) % 128; |
1412 |
|
|
1413 |
|
|
1414 |
sub to8bit { |
sub to8bit { |
1415 |
$_ = shift; |
local $_ = shift; |
1416 |
my ($o, $c); |
my ($o, $c); |
1417 |
while (/\\([0-3][0-7][0-7])/) { |
while (/\\([0-3][0-7][0-7])/) { |
1418 |
$o = oct($1); |
$o = oct($1); |