[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.19, Sat Aug 21 17:29:14 2010 UTC revision 1.20, Sun Aug 29 22:13:28 2010 UTC
# Line 88  Line 88 
88  #  \@packed_seqs = pack_alignment(  @seqs )  #  \@packed_seqs = pack_alignment(  @seqs )
89  #  \@packed_seqs = pack_alignment( \@seqs )  #  \@packed_seqs = pack_alignment( \@seqs )
90  #  #
91    #  Pack mask for an alignment (gap = 0x00, others are 0xFF)
92    #
93    #   $mask = alignment_gap_mask(  @seqs )
94    #   $mask = alignment_gap_mask( \@seqs )
95    #
96    #  Pack a sequence alignment according to a mask:
97    #
98    #   @packed = pack_alignment_by_mask( $mask,  @align )
99    #   @packed = pack_alignment_by_mask( $mask, \@align )
100    #  \@packed = pack_alignment_by_mask( $mask,  @align )
101    #  \@packed = pack_alignment_by_mask( $mask, \@align )
102    #
103    #  Expand sequence by a mask, adding indel at "\000" or '-' in mask:
104    #
105    #   $expanded = expand_sequence_by_mask( $seq, $mask )
106    #
107  #  Remove all alignment gaps from sequences:  #  Remove all alignment gaps from sequences:
108  #  #
109  #   @packed_seqs = pack_sequences(  @seqs )  #   @packed_seqs = pack_sequences(  @seqs )  #  Works for one sequence, too
110  #   @packed_seqs = pack_sequences( \@seqs )  #   @packed_seqs = pack_sequences( \@seqs )
111  #  \@packed_seqs = pack_sequences(  @seqs )  #  \@packed_seqs = pack_sequences(  @seqs )
112  #  \@packed_seqs = pack_sequences( \@seqs )  #  \@packed_seqs = pack_sequences( \@seqs )
# Line 203  Line 219 
219          seq_data_by_id          seq_data_by_id
220    
221          pack_alignment          pack_alignment
222            alignment_gap_mask
223            pack_alignment_by_mask
224            expand_sequence_by_mask
225          pack_sequences          pack_sequences
226    
227          subseq_DNA_entry          subseq_DNA_entry
# Line 940  Line 959 
959      return ${ $ind_ref->{$id} }[2];      return ${ $ind_ref->{$id} }[2];
960  }  }
961    
962    
963  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
964  #  Remove columns of alignment gaps from sequences:  #  Remove columns of alignment gaps from sequences:
965  #  #
# Line 948  Line 968 
968  #  \@packed_seqs = pack_alignment(  @seqs )  #  \@packed_seqs = pack_alignment(  @seqs )
969  #  \@packed_seqs = pack_alignment( \@seqs )  #  \@packed_seqs = pack_alignment( \@seqs )
970  #  #
971    #  Gap characters are defined below as '-', '~', '.', and ' '.
972  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
   
973  sub pack_alignment  sub pack_alignment
974  {  {
975      my @seqs = ( ref( $_[0] ) eq 'ARRAY' and ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;      $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] )
976            or return ();
977    
978        my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
979      @seqs or return wantarray ? () : [];      @seqs or return wantarray ? () : [];
980    
981      my $mask  = pack_mask( $seqs[0]->[2] );      my $mask  = gap_mask( $seqs[0]->[2] );
982      foreach ( @seqs[ 1 .. (@seqs-1) ] )      foreach ( @seqs[ 1 .. (@seqs-1) ] )
983      {      {
984          $mask |= pack_mask( $_->[2] );          $mask |= gap_mask( $_->[2] );
985      }      }
986    
987      my $seq;      my $seq;
# Line 968  Line 991 
991                      }                      }
992                  @seqs;                  @seqs;
993    
994      return wantarray ? @seqs2 : \@seqs2;      wantarray ? @seqs2 : \@seqs2;
995    }
996    
997    
998    #-------------------------------------------------------------------------------
999    #  Produce a packing mask for columns of alignment gaps in sequences.  Gap
1000    #  columns are 0x00 characters, and all others are 0xFF.
1001    #
1002    #   $mask = alignment_gap_mask(  @seqs )
1003    #   $mask = alignment_gap_mask( \@seqs )
1004    #
1005    #-------------------------------------------------------------------------------
1006    sub alignment_gap_mask
1007    {
1008        $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] )
1009            or return undef;
1010    
1011        my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
1012        @seqs or return undef;
1013    
1014        my $mask = gap_mask( $seqs[0]->[2] );
1015        foreach ( @seqs[ 1 .. (@seqs-1) ] ) { $mask |= gap_mask( $_->[2] ) }
1016    
1017        $mask;
1018    }
1019    
1020    
1021    #-------------------------------------------------------------------------------
1022    #  Pack a sequence alignment according to a mask, removing positions where
1023    #  mask has 0x00 (or '-') characters
1024    #
1025    #   @packed = pack_alignment_by_mask( $mask,  @align )
1026    #   @packed = pack_alignment_by_mask( $mask, \@align )
1027    #  \@packed = pack_alignment_by_mask( $mask,  @align )
1028    #  \@packed = pack_alignment_by_mask( $mask, \@align )
1029    #
1030    #-------------------------------------------------------------------------------
1031    sub pack_alignment_by_mask
1032    {
1033        my $mask = shift;
1034        defined $mask && ! ref( $mask ) && length( $mask )
1035            or return ();
1036        $mask =~ tr/-/\000/;      # Allow '-' as a column to be removed
1037        $mask =~ tr/\000/\377/c;  # Make sure that everything not null is 0xFF
1038    
1039        $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] )
1040            or return ();
1041        my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
1042    
1043        my $seq;
1044        my @seqs2 = map { $seq = $_->[2] & $mask;     # Apply mask to sequence
1045                          $seq =~ tr/\000//d;         # Delete null characters
1046                          [ $_->[0], $_->[1], $seq ]  # Rebuild sequence entries
1047                        }
1048                    @seqs;
1049    
1050        wantarray ? @seqs2 : \@seqs2;
1051    }
1052    
1053    
1054    #-------------------------------------------------------------------------------
1055    #  Weight a sequence alignment according to a mask of digits, 0-9.
1056    #
1057    #   @packed = weight_alignment_by_mask( $mask,  @align )
1058    #   @packed = weight_alignment_by_mask( $mask, \@align )
1059    #  \@packed = weight_alignment_by_mask( $mask,  @align )
1060    #  \@packed = weight_alignment_by_mask( $mask, \@align )
1061    #
1062    #-------------------------------------------------------------------------------
1063    sub weight_alignment_by_mask
1064    {
1065        my $mask = shift;
1066        defined $mask && ! ref( $mask ) && length( $mask )
1067            or return ();
1068    
1069        $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] )
1070            or return ();
1071        my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
1072    
1073        my @seqs2 = map { [ $_->[0], $_->[1], weight_seq_by_mask_0( $mask, $_->[2] ) ] } @seqs;
1074    
1075        wantarray ? @seqs2 : \@seqs2;
1076  }  }
1077    
1078  sub pack_mask  
1079    #
1080    #  Assume data are valid
1081    #
1082    sub weight_seq_by_mask_0
1083    {
1084        my ( $mask, $seq ) = @_;
1085    
1086        #  Remove 0 weight columns, which is fast and easy:
1087        my $m0 = $mask;
1088        $m0 =~ tr/123456789/\377/;
1089        $m0 =~ tr/\377/\000/c;
1090        ( $mask &= $m0 ) =~ tr/\000//d;
1091        ( $seq  &= $m0 ) =~ tr/\000//d;
1092    
1093        #  If all remaining cols are weight 1, we are done:
1094        return $seq if $mask =~ /^1*$/;
1095    
1096        my @seq;
1097        for ( my $i = 0; $i < length( $mask ); $i++ )
1098        {
1099            push @seq, substr( $seq, $i, 1 ) x substr( $mask, $i, 1 );
1100        }
1101    
1102        join( '', @seq );
1103    }
1104    
1105    
1106    #-----------------------------------------------------------------------------
1107    #  Make a mask in which gap characters ('-', '~', '.', and ' ') are converted
1108    #  to 0x00, and all other characters to 0xFF.
1109    #
1110    #      $mask = gap_mask( $seq )
1111    #
1112    #-----------------------------------------------------------------------------
1113    sub gap_mask
1114  {  {
1115      my $mask = shift;      my $mask = shift;
1116      $mask =~ tr/-/\000/;      defined $mask or return '';
1117      $mask =~ tr/\000/\377/c;  
1118      return $mask;      $mask =~ tr/-~. /\000/;    #  Gap characters (might be extended)
1119        $mask =~ tr/\000/\377/c;   #  Non-gap characters
1120        $mask;
1121    }
1122    
1123    
1124    #===============================================================================
1125    #  Expand sequences with the local gap character in a manner that reverses the
1126    #  pack by mask function.
1127    #
1128    #      $expanded = expand_sequence_by_mask( $seq, $mask )
1129    #
1130    #  The columns to be added can be marked by '-' or "\000" in the mask.
1131    #
1132    #  Code note:
1133    #
1134    #  The function attempts to match the local gap character in the sequence.
1135    #  $c0 and $c1 are the previous and next characters in the sequence being
1136    #  expanded.  (($c0,$c1)=($c1,shift @s1))[0] updates the values and evaluates
1137    #  to what was the next character, and becomes the new previous character.
1138    #  The following really does print w, x, y, and z, one per line:
1139    #
1140    #     ( $a, $b, @c ) = ("", split //, "wxyz");
1141    #     while ( defined $b ) { print( (($a,$b)=($b,shift @c))[0], "\n" ) }
1142    #===============================================================================
1143    
1144    sub expand_sequence_by_mask
1145    {
1146        my ( $seq, $mask ) = @_;
1147    
1148        $mask =~ tr/-/\000/;  #  Allow hyphen or null in mask at added positions.
1149        my ( $c0, $c1, @s1 ) = ( '', split( //, $seq ), '' );
1150    
1151        join '', map { $_  ne "\000"            ? (($c0,$c1)=($c1,shift @s1))[0]
1152                     : $c0 eq '~' || $c1 eq '~' ? '~'
1153                     : $c0 eq '.' || $c1 eq '.' ? '.'
1154                     :                            '-'
1155                     }
1156                 split //, $mask;
1157  }  }
1158    
1159    
1160  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1161  #  Remove all alignment gaps from sequences:  #  Remove all alignment gaps from sequences:
1162  #  #
1163  #   @packed_seqs = pack_sequences(  @seqs )  #   @packed_seqs = pack_sequences(  @seqs )  # Also handles single sequence
1164  #   @packed_seqs = pack_sequences( \@seqs )  #   @packed_seqs = pack_sequences( \@seqs )
1165  #  \@packed_seqs = pack_sequences(  @seqs )  #  \@packed_seqs = pack_sequences(  @seqs )
1166  #  \@packed_seqs = pack_sequences( \@seqs )  #  \@packed_seqs = pack_sequences( \@seqs )
1167  #  #
1168  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
   
1169  sub pack_sequences  sub pack_sequences
1170  {  {
1171      my @seqs = ( ref( $_[0] ) eq 'ARRAY' and ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0]} : @_;      $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] )
1172      @seqs or return wantarray ? () : [];          or return ();
1173    
1174        my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
1175    
1176      my @seqs2 = map { [ $_->[0], $_->[1], pack_seq( $_->[2] ) ] } @seqs;      my @seqs2 = map { [ $_->[0], $_->[1], pack_seq( $_->[2] ) ] } @seqs;
1177    
1178      return wantarray ? @seqs2 : \@seqs2;      wantarray ? @seqs2 : \@seqs2;
1179  }  }
1180    
1181    
1182  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1183  #  Some simple sequence manipulations:  #  Some simple sequence manipulations:
1184  #  #
# Line 1211  Line 1391 
1391    
1392    
1393  sub clean_ae_sequence {  sub clean_ae_sequence {
1394      $_ = shift;      local $_ = shift;
1395      $_ = to7bit($_);      $_ = to7bit($_);
1396      s/[+]/1/g;      s/\+/1/g;
1397      s/[^0-9A-IK-NP-Za-ik-np-z~.-]/-/g;      s/[^0-9A-IK-NP-Za-ik-np-z~.-]/-/g;
1398      return $_;      return $_;
1399  }  }
1400    
1401    
1402  sub to7bit {  sub to7bit {
1403      $_ = shift;      local $_ = shift;
1404      my ($o, $c);      my ($o, $c);
1405      while (/\\([0-3][0-7][0-7])/) {      while (/\\([0-3][0-7][0-7])/) {
1406          $o = oct($1) % 128;          $o = oct($1) % 128;
# Line 1232  Line 1412 
1412    
1413    
1414  sub to8bit {  sub to8bit {
1415      $_ = shift;      local $_ = shift;
1416      my ($o, $c);      my ($o, $c);
1417      while (/\\([0-3][0-7][0-7])/) {      while (/\\([0-3][0-7][0-7])/) {
1418          $o = oct($1);          $o = oct($1);

Legend:
Removed from v.1.19  
changed lines
  Added in v.1.20

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3