[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.5, Sun Jun 10 17:17:40 2007 UTC revision 1.11, Thu Jan 3 21:36:41 2008 UTC
# Line 44  Line 44 
44  #  print_seq_as_fasta( \*FILEHANDLE, @seq_entry );  #  print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
45  #  print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq );  #  print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq );
46  #  #
47  #  @seqs = pack_alignment( @seqs )  #   @packed_seqs = pack_alignment(  @seqs )
48    #   @packed_seqs = pack_alignment( \@seqs )
49    #  \@packed_seqs = pack_alignment(  @seqs )
50    #  \@packed_seqs = pack_alignment( \@seqs )
51  #  #
52  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
53  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
54    #  $DNAseq = DNA_subseq(  $seq, $from, $to );
55    #  $DNAseq = DNA_subseq( \$seq, $from, $to );
56    #  $RNAseq = RNA_subseq(  $seq, $from, $to );
57    #  $RNAseq = RNA_subseq( \$seq, $from, $to );
58  #  @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );  #  @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );
59  #  @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );  #  @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );
60  #  $DNAseq = complement_DNA_seq( $NA_seq );  #  $DNAseq = complement_DNA_seq( $NA_seq );
# Line 57  Line 64 
64  #  $seq    = pack_seq( $sequence )  #  $seq    = pack_seq( $sequence )
65  #  $seq    = clean_ae_sequence( $seq )  #  $seq    = clean_ae_sequence( $seq )
66  #  #
67  #  $seq = translate_seq( $seq [, $met_start] )  #  $aa = translate_seq( $nt, $met_start )
68    #  $aa = translate_seq( $nt )
69  #  $aa  = translate_codon( $triplet );  #  $aa  = translate_codon( $triplet );
 #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );  
70  #  #
71  #  User-supplied genetic code must be upper case index and match the  #  User-supplied genetic code.  The supplied code needs to be complete in
72  #  DNA versus RNA type of sequence  #  RNA and/or DNA, and upper and/or lower case.  The program guesses based
73    #  on lysine and phenylalanine codons.
74    #
75    #  $aa = translate_seq_with_user_code( $nt, $gen_code_hash, $met_start )
76    #  $aa = translate_seq_with_user_code( $nt, $gen_code_hash )
77  #  #
78  #  Locations (= oriented intervals) are ( id, start, end )  #  Locations (= oriented intervals) are ( id, start, end )
79  #  Intervals are ( id, left, right )  #  Intervals are ( id, left, right )
# Line 78  Line 89 
89  #  Convert GenBank locations to SEED locations  #  Convert GenBank locations to SEED locations
90  #  #
91  #  @seed_locs = gb_location_2_seed( $contig, @gb_locs )  #  @seed_locs = gb_location_2_seed( $contig, @gb_locs )
92    #
93    #  Read quality scores from a fasta-like file:
94    #
95    #  @seq_entries = read_qual( )               #  STDIN
96    # \@seq_entries = read_qual( )               #  STDIN
97    #  @seq_entries = read_qual( \*FILEHANDLE )
98    # \@seq_entries = read_qual( \*FILEHANDLE )
99    #  @seq_entries = read_qual(  $filename )
100    # \@seq_entries = read_qual(  $filename )
101    #
102    #  Evaluate nucleotide alignments:
103    #
104    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2, \%options )
105    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2 )
106    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_weight )
107    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_open, $gap_extend )
108    #
109    #  ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( $seq1, $seq2 )
110    #
111    
112  use strict;  use strict;
113    use Carp;
 use gjolib qw( wrap_text );  
114    
115  #  Exported global variables:  #  Exported global variables:
116    
# Line 128  Line 156 
156    
157          subseq_DNA_entry          subseq_DNA_entry
158          subseq_RNA_entry          subseq_RNA_entry
159            DNA_subseq
160            RNA_subseq
161          complement_DNA_entry          complement_DNA_entry
162          complement_RNA_entry          complement_RNA_entry
163          complement_DNA_seq          complement_DNA_seq
# Line 149  Line 179 
179          reverse_intervals          reverse_intervals
180    
181          gb_location_2_seed          gb_location_2_seed
182    
183            read_qual
184    
185            fraction_nt_diff
186            interpret_nt_align
187          );          );
188    
189  our @EXPORT_OK = qw(  our @EXPORT_OK = qw(
# Line 427  Line 462 
462      my ( $fh, undef, $close, $unused ) = output_filehandle( shift );      my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
463      ( unshift @_, $unused ) if $unused;      ( unshift @_, $unused ) if $unused;
464    
465      ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_fasta\n";      ( ref( $_[0] ) eq "ARRAY" ) or confess "Bad sequence entry passed to print_alignment_as_fasta\n";
466    
467      #  Expand the sequence entry list if necessary:      #  Expand the sequence entry list if necessary:
468    
# Line 630  Line 665 
665    
666    
667  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
668    #  Return a string with text wrapped to defined line lengths:
669    #
670    #     $wrapped_text = wrap_text( $str )                  # default len   =  80
671    #     $wrapped_text = wrap_text( $str, $len )            # default ind   =   0
672    #     $wrapped_text = wrap_text( $str, $len, $indent )   # default ind_n = ind
673    #     $wrapped_text = wrap_text( $str, $len, $indent_1, $indent_n )
674    #-----------------------------------------------------------------------------
675    sub wrap_text {
676        my ($str, $len, $ind, $indn) = @_;
677    
678        defined($str)  || die "wrap_text called without a string\n";
679        defined($len)  || ($len  =   80);
680        defined($ind)  || ($ind  =    0);
681        ($ind  < $len) || die "wrap error: indent greater than line length\n";
682        defined($indn) || ($indn = $ind);
683        ($indn < $len) || die "wrap error: indent_n greater than line length\n";
684    
685        $str =~ s/\s+$//;
686        $str =~ s/^\s+//;
687        my ($maxchr, $maxchr1);
688        my (@lines) = ();
689    
690        while ($str) {
691            $maxchr1 = ($maxchr = $len - $ind) - 1;
692            if ($maxchr >= length($str)) {
693                push @lines, (" " x $ind) . $str;
694                last;
695            }
696            elsif ($str =~ /^(.{0,$maxchr1}\S)\s+(\S.*)$/) { # no expr in {}
697                push @lines, (" " x $ind) . $1;
698                $str = $2;
699            }
700            elsif ($str =~ /^(.{0,$maxchr1}-)(.*)$/) {
701                push @lines, (" " x $ind) . $1;
702                $str = $2;
703            }
704            else {
705                push @lines, (" " x $ind) . substr($str, 0, $maxchr);
706                $str = substr($str, $maxchr);
707            }
708            $ind = $indn;
709        }
710    
711        return join("\n", @lines);
712    }
713    
714    
715    #-----------------------------------------------------------------------------
716  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
717  #  #
718  #     my \%seq_ind  = index_seq_list(  @seq_list );  #     my \%seq_ind  = index_seq_list(  @seq_list );
# Line 682  Line 765 
765  #  Remove columns of alignment gaps from sequences:  #  Remove columns of alignment gaps from sequences:
766  #  #
767  #  @packed_seqs = pack_alignment( @seqs )  #  @packed_seqs = pack_alignment( @seqs )
768    #   @packed_seqs = pack_alignment( \@seqs )
769    #  \@packed_seqs = pack_alignment(  @seqs )
770    #  \@packed_seqs = pack_alignment( \@seqs )
771  #  #
772  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
773    
# Line 788  Line 874 
874  }  }
875    
876    
877    sub DNA_subseq
878    {
879        my ( $seq, $from, $to ) = @_;
880    
881        my $len = ref( $seq ) eq 'SCALAR' ? length( $$seq )
882                                          : length(  $seq );
883        if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len }
884        if ( ( $to   eq '$' ) || ( ! $to       ) ) { $to   = $len }
885    
886        my $left  = ( $from < $to ) ? $from : $to;
887        my $right = ( $from < $to ) ? $to   : $from;
888        if ( ( $right < 1 ) || ( $left > $len ) ) { return "" }
889        if ( $right > $len ) { $right = $len }
890        if ( $left  < 1    ) { $left  =    1 }
891    
892        my $subseq = ref( $seq ) eq 'SCALAR' ? substr( $$seq, $left-1, $right-$left+1 )
893                                             : substr(  $seq, $left-1, $right-$left+1 );
894    
895        if ( $from > $to )
896        {
897            $subseq = reverse $subseq;
898            $subseq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
899                         [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
900        }
901    
902        $subseq
903    }
904    
905    
906    sub RNA_subseq
907    {
908        my ( $seq, $from, $to ) = @_;
909    
910        my $len = ref( $seq ) eq 'SCALAR' ? length( $$seq )
911                                          : length(  $seq );
912        if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len }
913        if ( ( $to   eq '$' ) || ( ! $to       ) ) { $to   = $len }
914    
915        my $left  = ( $from < $to ) ? $from : $to;
916        my $right = ( $from < $to ) ? $to   : $from;
917        if ( ( $right < 1 ) || ( $left > $len ) ) { return "" }
918        if ( $right > $len ) { $right = $len }
919        if ( $left  < 1    ) { $left  =    1 }
920    
921        my $subseq = ref( $seq ) eq 'SCALAR' ? substr( $$seq, $left-1, $right-$left+1 )
922                                             : substr(  $seq, $left-1, $right-$left+1 );
923    
924        if ( $from > $to )
925        {
926            $subseq = reverse $subseq;
927            $subseq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
928                         [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
929        }
930    
931        $subseq
932    }
933    
934    
935  sub complement_DNA_entry {  sub complement_DNA_entry {
936      my ($id, $desc, $seq, $fix_id) = @_;      my ($id, $desc, $seq, $fix_id) = @_;
937      $fix_id ||= 0;     #  fix undef values      $fix_id ||= 0;     #  fix undef values
# Line 905  Line 1049 
1049  #  Translate nucleotides to one letter protein:  #  Translate nucleotides to one letter protein:
1050  #  #
1051  #  $seq = translate_seq( $seq [, $met_start] )  #  $seq = translate_seq( $seq [, $met_start] )
1052  #  $aa  = translate_codon( $triplet );  #     $aa  = translate_codon( $triplet )
1053  #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );  #     $aa  = translate_DNA_codon( $triplet )     # Does not rely on DNA
1054    #     $aa  = translate_uc_DNA_codon( $triplet )  # Does not rely on uc or DNA
1055  #  #
1056  #  User-supplied genetic code must be upper case index and match the  #  User-supplied genetic code must be upper case index and match the
1057  #  DNA versus RNA type of sequence  #  DNA versus RNA type of sequence
# Line 923  Line 1068 
1068    
1069      # DNA version      # DNA version
1070    
1071      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",      TTT => 'F',  TCT => 'S',  TAT => 'Y',  TGT => 'C',
1072      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",      TTC => 'F',  TCC => 'S',  TAC => 'Y',  TGC => 'C',
1073      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",      TTA => 'L',  TCA => 'S',  TAA => '*',  TGA => '*',
1074      TTG => "L",  TCG => "S",  TAG => "*",  TGG => "W",      TTG => 'L',  TCG => 'S',  TAG => '*',  TGG => 'W',
1075      CTT => "L",  CCT => "P",  CAT => "H",  CGT => "R",      CTT => 'L',  CCT => 'P',  CAT => 'H',  CGT => 'R',
1076      CTC => "L",  CCC => "P",  CAC => "H",  CGC => "R",      CTC => 'L',  CCC => 'P',  CAC => 'H',  CGC => 'R',
1077      CTA => "L",  CCA => "P",  CAA => "Q",  CGA => "R",      CTA => 'L',  CCA => 'P',  CAA => 'Q',  CGA => 'R',
1078      CTG => "L",  CCG => "P",  CAG => "Q",  CGG => "R",      CTG => 'L',  CCG => 'P',  CAG => 'Q',  CGG => 'R',
1079      ATT => "I",  ACT => "T",  AAT => "N",  AGT => "S",      ATT => 'I',  ACT => 'T',  AAT => 'N',  AGT => 'S',
1080      ATC => "I",  ACC => "T",  AAC => "N",  AGC => "S",      ATC => 'I',  ACC => 'T',  AAC => 'N',  AGC => 'S',
1081      ATA => "I",  ACA => "T",  AAA => "K",  AGA => "R",      ATA => 'I',  ACA => 'T',  AAA => 'K',  AGA => 'R',
1082      ATG => "M",  ACG => "T",  AAG => "K",  AGG => "R",      ATG => 'M',  ACG => 'T',  AAG => 'K',  AGG => 'R',
1083      GTT => "V",  GCT => "A",  GAT => "D",  GGT => "G",      GTT => 'V',  GCT => 'A',  GAT => 'D',  GGT => 'G',
1084      GTC => "V",  GCC => "A",  GAC => "D",  GGC => "G",      GTC => 'V',  GCC => 'A',  GAC => 'D',  GGC => 'G',
1085      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",      GTA => 'V',  GCA => 'A',  GAA => 'E',  GGA => 'G',
1086      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",      GTG => 'V',  GCG => 'A',  GAG => 'E',  GGG => 'G',
   
     # RNA suppliment  
   
     UUU => "F",  UCU => "S",  UAU => "Y",  UGU => "C",  
     UUC => "F",  UCC => "S",  UAC => "Y",  UGC => "C",  
     UUA => "L",  UCA => "S",  UAA => "*",  UGA => "*",  
     UUG => "L",  UCG => "S",  UAG => "*",  UGG => "W",  
     CUU => "L",  CCU => "P",  CAU => "H",  CGU => "R",  
     CUC => "L",  
     CUA => "L",  
     CUG => "L",  
     AUU => "I",  ACU => "T",  AAU => "N",  AGU => "S",  
     AUC => "I",  
     AUA => "I",  
     AUG => "M",  
     GUU => "V",  GCU => "A",  GAU => "D",  GGU => "G",  
     GUC => "V",  
     GUA => "V",  
     GUG => "V",  
1087    
1088      #  The following ambiguous encodings are not necessary,  but      #  The following ambiguous encodings are not necessary,  but
1089      #  speed up the processing of some ambiguous triplets:      #  speed up the processing of some ambiguous triplets:
1090    
1091      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",      TTY => 'F',  TCY => 'S',  TAY => 'Y',  TGY => 'C',
1092      TTR => "L",  TCR => "S",  TAR => "*",      TTR => 'L',  TCR => 'S',  TAR => '*',
1093                   TCN => "S",                   TCN => 'S',
1094      CTY => "L",  CCY => "P",  CAY => "H",  CGY => "R",      CTY => 'L',  CCY => 'P',  CAY => 'H',  CGY => 'R',
1095      CTR => "L",  CCR => "P",  CAR => "Q",  CGR => "R",      CTR => 'L',  CCR => 'P',  CAR => 'Q',  CGR => 'R',
1096      CTN => "L",  CCN => "P",               CGN => "R",      CTN => 'L',  CCN => 'P',               CGN => 'R',
1097      ATY => "I",  ACY => "T",  AAY => "N",  AGY => "S",      ATY => 'I',  ACY => 'T',  AAY => 'N',  AGY => 'S',
1098                   ACR => "T",  AAR => "K",  AGR => "R",                   ACR => 'T',  AAR => 'K',  AGR => 'R',
1099                   ACN => "T",                   ACN => 'T',
1100      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",      GTY => 'V',  GCY => 'A',  GAY => 'D',  GGY => 'G',
1101      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",      GTR => 'V',  GCR => 'A',  GAR => 'E',  GGR => 'G',
1102      GTN => "V",  GCN => "A",               GGN => "G",      GTN => 'V',  GCN => 'A',               GGN => 'G'
   
     UUY => "F",  UCY => "S",  UAY => "Y",  UGY => "C",  
     UUR => "L",  UCR => "S",  UAR => "*",  
                  UCN => "S",  
     CUY => "L",  
     CUR => "L",  
     CUN => "L",  
     AUY => "I",  
     GUY => "V",  
     GUR => "V",  
     GUN => "V"  
1103  );  );
1104    
1105    #  Add RNA by construction:
1106    
1107    foreach ( grep { /T/ } keys %genetic_code )
1108    {
1109        my $codon = $_;
1110        $codon =~ s/T/U/g;
1111        $genetic_code{ $codon } = lc $genetic_code{ $_ }
1112    }
1113    
1114  #  Add lower case by construction:  #  Add lower case by construction:
1115    
1116  foreach ( keys %genetic_code ) {  foreach ( keys %genetic_code )
1117    {
1118      $genetic_code{ lc $_ } = lc $genetic_code{ $_ }      $genetic_code{ lc $_ } = lc $genetic_code{ $_ }
1119  }  }
1120    
1121    
1122  #  Construct the genetic code with selanocysteine by difference:  #  Construct the genetic code with selenocysteine by difference:
1123    
1124  %genetic_code_with_U = map { $_ => $genetic_code{ $_ } } keys %genetic_code;  %genetic_code_with_U = %genetic_code;
1125  $genetic_code_with_U{ TGA } = "U";  $genetic_code_with_U{ TGA } = 'U';
1126  $genetic_code_with_U{ tga } = "u";  $genetic_code_with_U{ tga } = 'u';
1127  $genetic_code_with_U{ UGA } = "U";  $genetic_code_with_U{ UGA } = 'U';
1128  $genetic_code_with_U{ uga } = "u";  $genetic_code_with_U{ uga } = 'u';
1129    
1130    
1131  %amino_acid_codons_DNA = (  %amino_acid_codons_DNA = (
# Line 1265  Line 1389 
1389    
1390    
1391  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1392  #  Translate nucleotides to one letter protein:  #  Translate nucleotides to one letter protein.  Respects case of the
1393    #  nucleotide sequence.
1394  #  #
1395  #      $seq = translate_seq( $seq [, $met_start] )  #      $aa = translate_seq( $nt, $met_start )
1396    #      $aa = translate_seq( $nt )
1397  #  #
1398  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1399    
1400  sub translate_seq {  sub translate_seq
1401      my $seq = uc shift;  {
1402      $seq =~ tr/UX/TN/;      #  make it DNA, and allow X      my $seq = shift;
1403      $seq =~ tr/-//d;        #  remove gaps      $seq =~ tr/-//d;        #  remove gaps
1404    
1405      my $met = shift || 0;   #  a second argument that is true      my @codons = $seq =~ m/(...?)/g;  #  Will try to translate last 2 nt
1406                              #  forces first amino acid to be Met  
1407                              #  (note: undef is false)      #  A second argument that is true forces first amino acid to be Met
1408    
1409      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      my @met;
1410      my $pep = ( ($met && ($imax >= 0)) ? "M" : "" );      if ( ( shift @_ ) && ( my $codon1 = shift @codons ) )
1411      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      {
1412          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );          push @met, ( $codon1 =~ /[a-z]/ ? 'm' : 'M' );
1413      }      }
1414    
1415      return $pep;      join( '', @met, map { translate_codon( $_ ) } @codons )
1416  }  }
1417    
1418    
1419  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1420  #  Translate a single triplet with "universal" genetic code  #  Translate a single triplet with "universal" genetic code.
 #  Uppercase and DNA are performed, then translate_uc_DNA_codon  
 #  is called.  
1421  #  #
1422  #      $aa = translate_codon( $triplet )  #      $aa = translate_codon( $triplet )
1423    #      $aa = translate_DNA_codon( $triplet )
1424    #      $aa = translate_uc_DNA_codon( $triplet )
1425  #  #
1426  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1427    
1428  sub translate_codon {  sub translate_DNA_codon { translate_codon( @_ ) }
     my $codon = uc shift;  #  Make it uppercase  
     $codon =~ tr/UX/TN/;   #  Make it DNA, and allow X  
     return translate_uc_DNA_codon($codon);  
 }  
1429    
1430    sub translate_uc_DNA_codon { translate_codon( uc $_[0] ) }
1431    
1432  #-----------------------------------------------------------------------------  sub translate_codon
1433  #  Translate a single triplet with "universal" genetic code  {
 #  Uppercase and DNA assumed  
 #  Intended for private use by translate_codon and translate_seq  
 #  
 #      $aa = translate_uc_DNA_codon( $triplet )  
 #  
 #-----------------------------------------------------------------------------  
   
 sub translate_uc_DNA_codon {  
1434      my $codon = shift;      my $codon = shift;
1435      my $aa;      $codon =~ tr/Uu/Tt/;     #  Make it DNA
1436    
1437      #  Try a simple lookup:      #  Try a simple lookup:
1438    
1439        my $aa;
1440      if ( $aa = $genetic_code{ $codon } ) { return $aa }      if ( $aa = $genetic_code{ $codon } ) { return $aa }
1441    
1442      #  With the expanded code defined above, this catches simple N, R      #  Attempt to recover from mixed-case codons:
     #  and Y ambiguities in the third position.  Other codes like  
     #  GG[KMSWBDHV], or even GG, might be unambiguously translated by  
     #  converting the last position to N and seeing if this is in the  
     #  (expanded) code table:  
   
     if ( $aa = $genetic_code{ substr($codon,0,2) . "N" } ) { return $aa }  
   
     #  Test that codon is valid and might have unambiguous aa:  
   
     if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/ ) { return "X" }  
     #                     ^^  
     #                     |+- for leucine YTR  
     #                     +-- for arginine MGR  
1443    
1444      #  Expand all ambiguous nucleotides to see if they all yield same aa.      $codon = ( $codon =~ /[a-z]/ ) ? lc $codon : uc $codon;
1445      #  Loop order tries to fail quickly with first position change.      if ( $aa = $genetic_code{ $codon } ) { return $aa }
1446    
1447      $aa = "";      #  The code defined above catches simple N, R and Y ambiguities in the
1448      for my $n2 ( @{ $DNA_letter_can_be{ substr($codon,1,1) } } ) {      #  third position.  Other codons (e.g., GG[KMSWBDHV], or even GG) might
1449          for my $n3 ( @{ $DNA_letter_can_be{ substr($codon,2,1) } } ) {      #  be unambiguously translated by converting the last position to N and
1450              for my $n1 ( @{ $DNA_letter_can_be{ substr($codon,0,1) } } ) {      #  seeing if this is in the code table:
1451                  #  set the first value of $aa  
1452                  if ($aa eq "") { $aa = $genetic_code{ $n1 . $n2 . $n3 } }      my $N = ( $codon =~ /[a-z]/ ) ? 'n' : 'N';
1453                  #  or break out if any other amino acid is detected      if ( $aa = $genetic_code{ substr($codon,0,2) . $N } ) { return $aa }
1454                  elsif ($aa ne $genetic_code{ $n1 . $n2 . $n3 } ) { return "X" }  
1455              }      #  Test that codon is valid for an unambiguous aa:
1456          }  
1457        my $X = ( $codon =~ /[a-z]/ ) ? 'x' : 'X';
1458        if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/i
1459          && $codon !~ m/^YT[AGR]$/i     #  Leu YTR
1460          && $codon !~ m/^MG[AGR]$/i     #  Arg MGR
1461           )
1462        {
1463            return $X;
1464      }      }
1465    
1466      return $aa || "X";      #  Expand all ambiguous nucleotides to see if they all yield same aa.
1467    
1468        my @n1 = @{ $DNA_letter_can_be{ substr( $codon, 0, 1 ) } };
1469        my $n2 =                        substr( $codon, 1, 1 );
1470        my @n3 = @{ $DNA_letter_can_be{ substr( $codon, 2, 1 ) } };
1471        my @triples = map { my $n12 = $_ . $n2; map { $n12 . $_ } @n3 } @n1;
1472    
1473        my $triple = shift @triples;
1474        $aa = $genetic_code{ $triple };
1475        $aa or return $X;
1476    
1477        foreach $triple ( @triples ) { return $X if $aa ne $genetic_code{$triple} }
1478    
1479        return $aa;
1480  }  }
1481    
1482    
# Line 1362  Line 1485 
1485  #  Diagnose the use of upper versus lower, and T versus U in the supplied  #  Diagnose the use of upper versus lower, and T versus U in the supplied
1486  #  code, and transform the supplied nucleotide sequence to match.  #  code, and transform the supplied nucleotide sequence to match.
1487  #  #
1488  #  translate_seq_with_user_code($seq, \%gen_code [, $start_with_met] )  #     $aa = translate_seq_with_user_code( $nt, \%gen_code )
1489    #     $aa = translate_seq_with_user_code( $nt, \%gen_code, $start_with_met )
1490  #  #
1491    #  Modified 2007-11-22 to be less intrusive in these diagnoses by sensing
1492    #  the presence of both versions in the user code.
1493  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1494    
1495  sub translate_seq_with_user_code {  sub translate_seq_with_user_code
1496    {
1497      my $seq = shift;      my $seq = shift;
1498      $seq =~ tr/-//d;     #  remove gaps  ***  Why?      $seq =~ tr/-//d;     #  remove gaps  ***  Why?
     $seq =~ tr/Xx/Nn/;   #  allow X  
1499    
1500      my $gc = shift;      #  Reference to hash of DNA alphabet code      my $gc = shift;      #  Reference to hash of code
1501      if (! $gc || ref($gc) ne "HASH") {      if (! $gc || ref($gc) ne "HASH")
1502          die "translate_seq_with_user_code needs genetic code hash as secondargument.";      {
1503            print STDERR "translate_seq_with_user_code needs genetic code hash as second argument.";
1504            return undef;
1505      }      }
1506    
1507      #  Test the type of code supplied: uppercase versus lowercase      #  Test code support for upper vs lower case:
   
     my ($RNA_F, $DNA_F, $M, $N, $X);  
1508    
1509      if ($gc->{ "AAA" }) {     #  Looks like uppercase code table      my ( $TTT, $UUU );
1510        if    ( $gc->{AAA} && ! $gc->{aaa} )   #  Uppercase only code table
1511        {
1512          $seq   = uc $seq;     #  Uppercase sequence          $seq   = uc $seq;     #  Uppercase sequence
1513          $RNA_F = "UUU";       #  Uppercase RNA Phe codon          ( $TTT, $UUU ) = ( 'TTT', 'UUU' );
         $DNA_F = "TTT";       #  Uppercase DNA Phe codon  
         $M     = "M";         #  Uppercase initiator  
         $N     = "N";         #  Uppercase ambiguous nuc  
         $X     = "X";         #  Uppercase ambiguous aa  
1514      }      }
1515      elsif ($gc->{ "aaa" }) {  #  Looks like lowercase code table      elsif ( $gc->{aaa} && ! $gc->{AAA} )   #  Lowercase only code table
1516        {
1517          $seq   = lc $seq;     #  Lowercase sequence          $seq   = lc $seq;     #  Lowercase sequence
1518          $RNA_F = "uuu";       #  Lowercase RNA Phe codon          ( $TTT, $UUU ) = ( 'ttt', 'uuu' );
         $DNA_F = "ttt";       #  Lowercase DNA Phe codon  
         $M     = "m";         #  Lowercase initiator  
         $N     = "n";         #  Lowercase ambiguous nuc  
         $X     = "x";         #  Lowercase ambiguous aa  
1519      }      }
1520      else {      elsif ( $gc->{aaa} )
1521          die "User-supplied genetic code does not have aaa or AAA\n";      {
1522            ( $TTT, $UUU ) = ( 'ttt', 'uuu' );
1523        }
1524        else
1525        {
1526            print STDERR "User-supplied genetic code does not have aaa or AAA\n";
1527            return undef;
1528      }      }
1529    
1530      #  Test the type of code supplied:  UUU versus TTT      #  Test code support for U vs T:
   
     my ($ambigs);  
1531    
1532      if ($gc->{ $RNA_F }) {     #  Looks like RNA code table      my $ambigs;
1533          $seq =~ tr/Tt/Uu/;      if    ( $gc->{$UUU} && ! $gc->{$TTT} )  # RNA only code table
1534        {
1535            $seq = tr/Tt/Uu/;
1536          $ambigs = \%RNA_letter_can_be;          $ambigs = \%RNA_letter_can_be;
1537      }      }
1538      elsif ($gc->{ $DNA_F }) {  #  Looks like DNA code table      elsif ( $gc->{$TTT} && ! $gc->{$UUU} )  # DNA only code table
1539          $seq =~ tr/Uu/Tt/;      {
1540            $seq = tr/Uu/Tt/;
1541          $ambigs = \%DNA_letter_can_be;          $ambigs = \%DNA_letter_can_be;
1542      }      }
1543      else {      else
1544          die "User-supplied genetic code does not have $RNA_F or $DNA_F\n";      {
1545            my $t = $seq =~ tr/Tt//;
1546            my $u = $seq =~ tr/Uu//;
1547            $ambigs = ( $t > $u ) ? \%DNA_letter_can_be : \%RNA_letter_can_be;
1548      }      }
1549    
1550      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      #  We can now do the codon-by-codon translation:
1551    
1552      my $met = shift;     #  a third argument that is true      my @codons = $seq =~ m/(...?)/g;  #  will try to translate last 2 nt
1553                           #  forces first amino acid to be Met  
1554                           #  (note: undef is false)      #  A third argument that is true forces first amino acid to be Met
     my $pep  = ($met && ($imax >= 0)) ? $M : "";  
     my $aa;  
1555    
1556      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      my @met;
1557          $pep .= translate_codon_with_user_code( substr($seq,$i,3), $gc, $N, $X, $ambigs );      if ( ( shift @_ ) && ( my $codon1 = shift @codons ) )
1558        {
1559            push @met, ( $codon1 =~ /[a-z]/ ? 'm' : 'M' );
1560      }      }
1561    
1562      return $pep;      join( '', @met, map { translate_codon_with_user_code( $_, $gc, $ambigs ) } @codons )
1563  }  }
1564    
1565    
1566  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1567  #  Translate with user-supplied genetic code hash.  For speed, no error  #  Translate with user-supplied genetic code hash.  No error check on the code.
1568  #  check on the hash.  Calling programs should check for the hash at a  #  Should only be called through translate_seq_with_user_code.
 #  higher level.  
1569  #  #
1570  #  Should only be called through translate_seq_with_user_code  #     $aa = translate_codon_with_user_code( $triplet, \%code, $ambig_table )
 #  
 #   translate_codon_with_user_code( $triplet, \%code, $N, $X, $ambig_table )  
1571  #  #
1572  #  $triplet      speaks for itself  #  $triplet      speaks for itself
1573  #  $code         ref to the hash with the codon translations  #  $code         ref to the hash with the codon translations
 #  $N            character to use for ambiguous nucleotide  
 #  $X            character to use for ambiguous amino acid  
1574  #  $ambig_table  ref to hash with lists of nucleotides for each ambig code  #  $ambig_table  ref to hash with lists of nucleotides for each ambig code
1575  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1576    
1577    sub translate_codon_with_user_code
1578  sub translate_codon_with_user_code {  {
1579      my $codon = shift;      my ( $codon, $gc, $N, $X, $ambigs ) = @_;
     my $gc    = shift;  
     my $aa;  
1580    
1581      #  Try a simple lookup:      #  Try a simple lookup:
1582    
1583        my $aa;
1584      if ( $aa = $gc->{ $codon } ) { return $aa }      if ( $aa = $gc->{ $codon } ) { return $aa }
1585    
1586      #  Test that codon is valid and might have unambiguous aa:      #  Attempt to recover from mixed-case codons:
1587    
1588      my ($N, $X, $ambigs) = @_;      $codon = ( $codon =~ /[a-z]/ ) ? lc $codon : uc $codon;
1589      if ( $codon =~ m/^[ACGTUMY][ACGTU]$/i ) { $codon .= $N }      if ( $aa = $genetic_code{ $codon } ) { return $aa }
     if ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) { return $X }  
     #                          ^^  
     #                          |+- for leucine YTR  
     #                          +-- for arginine MGR  
1590    
1591      #  Expand all ambiguous nucleotides to see if they all yield same aa.      #  Test that codon is valid for an unambiguous aa:
     #  Loop order tries to fail quickly with first position change.  
1592    
1593      $aa = "";      my $X = ( $codon =~ /[a-z]/ ) ? 'x' : 'X';
1594      for my $n2 ( @{ $ambigs->{ substr($codon,1,1) } } ) {  
1595          for my $n3 ( @{ $ambigs->{ substr($codon,2,1) } } ) {      if ( $codon =~ m/^[ACGTU][ACGTU]$/i )  # Add N?
1596              for my $n1 ( @{ $ambigs->{ substr($codon,0,1) } } ) {      {
1597                  #  set the first value of $aa          $codon .= ( $codon =~ /[a-z]/ ) ? 'n' : 'N';
                 if ($aa eq "") { $aa = $gc->{ $n1 . $n2 . $n3 } }  
                 #  break out if any other amino acid is detected  
                 elsif ($aa ne $gc->{ $n1 . $n2 . $n3 } ) { return "X" }  
1598              }              }
1599        elsif ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) # Valid?
1600        {
1601            return $X;
1602          }          }
1603    
1604        #  Expand all ambiguous nucleotides to see if they all yield same aa.
1605    
1606        my @n1 = @{ $ambigs->{ substr( $codon, 0, 1 ) } };
1607        my $n2 =               substr( $codon, 1, 1 );
1608        my @n3 = @{ $ambigs->{ substr( $codon, 2, 1 ) } };
1609        my @triples = map { my $n12 = $_ . $n2; map { $n12 . $_ } @n3 } @n1;
1610    
1611        my $triple = shift @triples;
1612        $aa = $gc->{ $triple } || $gc->{ lc $triple } || $gc->{ uc $triple };
1613        $aa or return $X;
1614    
1615        foreach $triple ( @triples )
1616        {
1617            return $X if $aa ne ( $gc->{$triple} || $gc->{lc $triple} || $gc->{uc $triple} );
1618      }      }
1619    
1620      return $aa || $X;      return $aa;
1621  }  }
1622    
1623    
# Line 1670  Line 1805 
1805  }  }
1806    
1807    
1808    #-----------------------------------------------------------------------------
1809    #  Read qual.
1810    #
1811    #  Save the contents in a list of refs to arrays: [ $id, $descript, \@qual ]
1812    #
1813    #     @seq_entries = read_qual( )               #  STDIN
1814    #    \@seq_entries = read_qual( )               #  STDIN
1815    #     @seq_entries = read_qual( \*FILEHANDLE )
1816    #    \@seq_entries = read_qual( \*FILEHANDLE )
1817    #     @seq_entries = read_qual(  $filename )
1818    #    \@seq_entries = read_qual(  $filename )
1819    #-----------------------------------------------------------------------------
1820    sub read_qual {
1821        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
1822        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_qual\n";
1823    
1824        my @quals = ();
1825        my ($id, $desc, $qual) = ("", "", []);
1826    
1827        while ( <$fh> ) {
1828            chomp;
1829            if (/^>\s*(\S+)(\s+(.*))?$/) {        #  new id
1830                if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1831                ($id, $desc, $qual) = ($1, $3 ? $3 : "", []);
1832            }
1833            else {
1834                push @$qual, split;
1835            }
1836        }
1837        close( $fh ) if $close;
1838    
1839        if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1840        return wantarray ? @quals : \@quals;
1841    }
1842    
1843    
1844    #-------------------------------------------------------------------------------
1845    #  Fraction difference for an alignment of two nucleotide sequences in terms of
1846    #  number of differing residues, number of gaps, and number of gap opennings.
1847    #
1848    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, \%options )
1849    #
1850    #  or
1851    #
1852    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2 )
1853    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_wgt )
1854    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, $open_wgt, $extend_wgt )
1855    #
1856    #  Options:
1857    #
1858    #      gap      => $gap_wgt          # Gap open and extend weight (D = 0.5)
1859    #      open     => $open_wgt         # Gap openning weight (D = gap_wgt)
1860    #      extend   => $extend_wgt       # Gap extension weight (D = open_wgt)
1861    #      t_gap    => $term_gap_wgt     # Terminal open and extend weight
1862    #      t_open   => $term_open_wgt    # Terminal gap open weight (D = open_wgt)
1863    #      t_extend => $term_extend_wgt  # Terminal gap extend weight (D = extend_wgt)
1864    #
1865    #  Default gap open and gap extend weights are 1/2.  Beware that
1866    #
1867    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, 1 )
1868    #
1869    #  and
1870    #
1871    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, 1, 0 )
1872    #
1873    #  are different.  The first has equal openning and extension weights, whereas
1874    #  the second has an openning weight of 1, and and extension weight of 0 (it
1875    #  only penalizes the number of runs of gaps).
1876    #-------------------------------------------------------------------------------
1877    sub fraction_nt_diff
1878    {
1879        my ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( @_[0,1] );
1880    
1881        my $diff_scr;
1882        if ( ref( $_[2] ) eq 'HASH' )
1883        {
1884            my $opts = $_[2];
1885            my $gap_open    = defined $opts->{ open }     ? $opts->{ open }
1886                            : defined $opts->{ gap }      ? $opts->{ gap }
1887                            : 0.5;
1888            my $gap_extend  = defined $opts->{ extend }   ? $opts->{ extend }
1889                            : $gap_open;
1890            my $term_open   = defined $opts->{ t_open }   ? $opts->{ t_open }
1891                            : defined $opts->{ t_gap }    ? $opts->{ t_gap }
1892                            : $gap_open;
1893            my $term_extend = defined $opts->{ t_extend } ? $opts->{ t_extend }
1894                            : defined $opts->{ t_gap }    ? $opts->{ t_gap }
1895                            : $gap_extend;
1896    
1897            $nopen -= $topen;
1898            $ngap  -= $tgap;
1899            $diff_scr = $ndif + $gap_open  * $nopen + $gap_extend  * ($ngap-$nopen)
1900                              + $term_open * $topen + $term_extend * ($tgap-$topen);
1901        }
1902        else
1903        {
1904            my $gap_open   = defined( $_[2] ) ? $_[2] : 0.5;
1905            my $gap_extend = defined( $_[3] ) ? $_[3] : $gap_open;
1906            $diff_scr = $ndif + $gap_open * $nopen + $gap_extend * ($ngap-$nopen);
1907        }
1908        my $ttl_scr = $nid + $diff_scr;
1909    
1910        $ttl_scr ? $diff_scr / $ttl_scr : undef
1911    }
1912    
1913    
1914    #-------------------------------------------------------------------------------
1915    #  Interpret an alignment of two nucleotide sequences in terms of: useful
1916    #  aligned positions (unambiguous, and not a common gap), number of identical
1917    #  residues, number of differing residues, number of gaps, and number of gap
1918    #  opennings.
1919    #
1920    #     ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( $seq1, $seq2 )
1921    #
1922    #  $npos  = total aligned positons (= $nid + $ndif + $ngap)
1923    #  $nid   = number of positions with identical nucleotides (ignoring case)
1924    #  $ndif  = number of positions with differing nucleotides
1925    #  $ngap  = number of positions with gap in one sequence but not the other
1926    #  $nopen = number of runs of gaps
1927    #  $tgap  = number of gaps in runs adjacent to a terminus
1928    #  $topen = number of alignment ends with gaps
1929    #
1930    #  Some of the methods might seem overly complex, but are necessary for cases
1931    #  in which the gaps switch strands in the alignment:
1932    #
1933    #     seq1  ---ACGTGAC--TTGCAGAG
1934    #     seq2  TTT---TGACGG--GCAGGG
1935    #     mask  00000011110000111111
1936    #
1937    #     npos  = 20
1938    #     nid   =  9
1939    #     ndif  =  1
1940    #     ngap  = 10
1941    #     nopen =  4
1942    #     tgap  =  3
1943    #     topen =  1
1944    #
1945    #  Although there are 4 gap opennings, there are only 2 runs in the mask,
1946    #  and the terminal run is length 6, not 3.  (Why handle these?  Because
1947    #  pairs of sequences from a multiple sequence alignment can look like this.)
1948    #-------------------------------------------------------------------------------
1949    sub interpret_nt_align
1950    {
1951        #  Remove alignment columns that are not informative:
1952        my ( $s1, $s2 ) = useful_nt_align( @_[0,1] );
1953        my $nmat = length( $s1 );          # Useful alignment length
1954    
1955        my $m1 = $s1;
1956        $m1 =~ tr/ACGT/\377/;              # Nucleotides to all 1 bits
1957        $m1 =~ tr/\377/\000/c;             # Others (gaps) to null byte
1958        my $m2 = $s2;
1959        $m2 =~ tr/ACGT/\377/;              # Nucleotides to all 1 bits
1960        $m2 =~ tr/\377/\000/c;             # Others (gaps) to null byte
1961        $m1 &= $m2;                        # Gap in either sequence becomes null
1962        $s1 &= $m1;                        # Apply mask to sequence 1
1963        $s2 &= $m1;                        # Apply mask to sequence 2
1964        my $nopen = @{[ $s1 =~ /\000+/g ]}   # Gap opens in sequence 1
1965                  + @{[ $s2 =~ /\000+/g ]};  # Gap opens in sequence 2
1966        my ( $tgap, $topen ) = ( 0, 0 );
1967        if ( $s1 =~ /^(\000+)/ || $s2 =~ /^(\000+)/ ) { $tgap += length( $1 ); $topen++ }
1968        if ( $s1 =~ /(\000+)$/ || $s2 =~ /(\000+)$/ ) { $tgap += length( $1 ); $topen++ }
1969        $s1 =~ tr/\000//d;                 # Remove nulls (former gaps)
1970        $s2 =~ tr/\000//d;                 # Remove nulls (former gaps)
1971        my $ngap = $nmat - length( $s1 );  # Total gaps
1972    
1973        my $xor = $s1 ^ $s2;               # xor of identical residues is null byte
1974        my $nid = ( $xor =~ tr/\000//d );  # Count the nulls (identical residues)
1975        my $ndif = $nmat - $nid - $ngap;
1976    
1977        ( $nmat, $nid, $ndif, $ngap, $nopen, $tgap, $topen )
1978    }
1979    
1980    
1981    sub useful_nt_align
1982    {
1983        my ( $s1, $s2 ) = map { uc $_ } @_;
1984        $s1 =~ tr/U/T/;         # Convert U to T
1985        my $m1 = $s1;
1986        $m1 =~ tr/ACGT-/\377/;  # Allowed symbols to hex FF byte
1987        $m1 =~ tr/\377/\000/c;  # All else to null byte
1988        $s2 =~ tr/U/T/;         # Convert U to T
1989        my $m2 = $s2;
1990        $m2 =~ tr/ACGT-/\377/;  # Allowed symbols to hex FF byte
1991        $m2 =~ tr/\377/\000/c;  # All else to null byte
1992        $m1 &= $m2;             # Invalid in either sequence becomes null
1993        $s1 &= $m1;             # Apply mask to sequence 1
1994        $s1 =~ tr/\000//d;      # Delete nulls in sequence 1
1995        $s2 &= $m1;             # Apply mask to sequence 2
1996        $s2 =~ tr/\000//d;      # Delete nulls in sequence 2
1997        ( $s1, $s2 )
1998    }
1999    
2000    
2001  1;  1;

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.11

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3