[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.9, Thu Nov 1 20:07:42 2007 UTC revision 1.10, Fri Nov 23 00:33:31 2007 UTC
# Line 64  Line 64 
64  #  $seq    = pack_seq( $sequence )  #  $seq    = pack_seq( $sequence )
65  #  $seq    = clean_ae_sequence( $seq )  #  $seq    = clean_ae_sequence( $seq )
66  #  #
67  #  $seq = translate_seq( $seq [, $met_start] )  #  $aa = translate_seq( $nt, $met_start )
68    #  $aa = translate_seq( $nt )
69  #  $aa  = translate_codon( $triplet );  #  $aa  = translate_codon( $triplet );
 #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );  
70  #  #
71  #     User-supplied genetic code must be upper case index and match the  #  User-supplied genetic code.  The supplied code needs to be complete in
72  #     DNA versus RNA type of sequence  #  RNA and/or DNA, and upper and/or lower case.  The program guesses based
73    #  on lysine and phenylalanine codons.
74  #  #
75  #  $seq = translate_seq_with_user_code( $seq, $gen_code_hash [, $met_start] )  #  $aa = translate_seq_with_user_code( $nt, $gen_code_hash, $met_start )
76    #  $aa = translate_seq_with_user_code( $nt, $gen_code_hash )
77  #  #
78  #     Locations (= oriented intervals) are ( id, start, end )  #     Locations (= oriented intervals) are ( id, start, end )
79  #     Intervals are ( id, left, right )  #     Intervals are ( id, left, right )
# Line 1035  Line 1037 
1037  #  Translate nucleotides to one letter protein:  #  Translate nucleotides to one letter protein:
1038  #  #
1039  #  $seq = translate_seq( $seq [, $met_start] )  #  $seq = translate_seq( $seq [, $met_start] )
1040  #  $aa  = translate_codon( $triplet );  #     $aa  = translate_codon( $triplet )
1041  #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );  #     $aa  = translate_DNA_codon( $triplet )     # Does not rely on DNA
1042    #     $aa  = translate_uc_DNA_codon( $triplet )  # Does not rely on uc or DNA
1043  #  #
1044  #  User-supplied genetic code must be upper case index and match the  #  User-supplied genetic code must be upper case index and match the
1045  #  DNA versus RNA type of sequence  #  DNA versus RNA type of sequence
# Line 1053  Line 1056 
1056    
1057      # DNA version      # DNA version
1058    
1059      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",      TTT => 'F',  TCT => 'S',  TAT => 'Y',  TGT => 'C',
1060      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",      TTC => 'F',  TCC => 'S',  TAC => 'Y',  TGC => 'C',
1061      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",      TTA => 'L',  TCA => 'S',  TAA => '*',  TGA => '*',
1062      TTG => "L",  TCG => "S",  TAG => "*",  TGG => "W",      TTG => 'L',  TCG => 'S',  TAG => '*',  TGG => 'W',
1063      CTT => "L",  CCT => "P",  CAT => "H",  CGT => "R",      CTT => 'L',  CCT => 'P',  CAT => 'H',  CGT => 'R',
1064      CTC => "L",  CCC => "P",  CAC => "H",  CGC => "R",      CTC => 'L',  CCC => 'P',  CAC => 'H',  CGC => 'R',
1065      CTA => "L",  CCA => "P",  CAA => "Q",  CGA => "R",      CTA => 'L',  CCA => 'P',  CAA => 'Q',  CGA => 'R',
1066      CTG => "L",  CCG => "P",  CAG => "Q",  CGG => "R",      CTG => 'L',  CCG => 'P',  CAG => 'Q',  CGG => 'R',
1067      ATT => "I",  ACT => "T",  AAT => "N",  AGT => "S",      ATT => 'I',  ACT => 'T',  AAT => 'N',  AGT => 'S',
1068      ATC => "I",  ACC => "T",  AAC => "N",  AGC => "S",      ATC => 'I',  ACC => 'T',  AAC => 'N',  AGC => 'S',
1069      ATA => "I",  ACA => "T",  AAA => "K",  AGA => "R",      ATA => 'I',  ACA => 'T',  AAA => 'K',  AGA => 'R',
1070      ATG => "M",  ACG => "T",  AAG => "K",  AGG => "R",      ATG => 'M',  ACG => 'T',  AAG => 'K',  AGG => 'R',
1071      GTT => "V",  GCT => "A",  GAT => "D",  GGT => "G",      GTT => 'V',  GCT => 'A',  GAT => 'D',  GGT => 'G',
1072      GTC => "V",  GCC => "A",  GAC => "D",  GGC => "G",      GTC => 'V',  GCC => 'A',  GAC => 'D',  GGC => 'G',
1073      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",      GTA => 'V',  GCA => 'A',  GAA => 'E',  GGA => 'G',
1074      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",      GTG => 'V',  GCG => 'A',  GAG => 'E',  GGG => 'G',
   
     # RNA suppliment  
   
     UUU => "F",  UCU => "S",  UAU => "Y",  UGU => "C",  
     UUC => "F",  UCC => "S",  UAC => "Y",  UGC => "C",  
     UUA => "L",  UCA => "S",  UAA => "*",  UGA => "*",  
     UUG => "L",  UCG => "S",  UAG => "*",  UGG => "W",  
     CUU => "L",  CCU => "P",  CAU => "H",  CGU => "R",  
     CUC => "L",  
     CUA => "L",  
     CUG => "L",  
     AUU => "I",  ACU => "T",  AAU => "N",  AGU => "S",  
     AUC => "I",  
     AUA => "I",  
     AUG => "M",  
     GUU => "V",  GCU => "A",  GAU => "D",  GGU => "G",  
     GUC => "V",  
     GUA => "V",  
     GUG => "V",  
1075    
1076      #  The following ambiguous encodings are not necessary,  but      #  The following ambiguous encodings are not necessary,  but
1077      #  speed up the processing of some ambiguous triplets:      #  speed up the processing of some ambiguous triplets:
1078    
1079      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",      TTY => 'F',  TCY => 'S',  TAY => 'Y',  TGY => 'C',
1080      TTR => "L",  TCR => "S",  TAR => "*",      TTR => 'L',  TCR => 'S',  TAR => '*',
1081                   TCN => "S",                   TCN => 'S',
1082      CTY => "L",  CCY => "P",  CAY => "H",  CGY => "R",      CTY => 'L',  CCY => 'P',  CAY => 'H',  CGY => 'R',
1083      CTR => "L",  CCR => "P",  CAR => "Q",  CGR => "R",      CTR => 'L',  CCR => 'P',  CAR => 'Q',  CGR => 'R',
1084      CTN => "L",  CCN => "P",               CGN => "R",      CTN => 'L',  CCN => 'P',               CGN => 'R',
1085      ATY => "I",  ACY => "T",  AAY => "N",  AGY => "S",      ATY => 'I',  ACY => 'T',  AAY => 'N',  AGY => 'S',
1086                   ACR => "T",  AAR => "K",  AGR => "R",                   ACR => 'T',  AAR => 'K',  AGR => 'R',
1087                   ACN => "T",                   ACN => 'T',
1088      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",      GTY => 'V',  GCY => 'A',  GAY => 'D',  GGY => 'G',
1089      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",      GTR => 'V',  GCR => 'A',  GAR => 'E',  GGR => 'G',
1090      GTN => "V",  GCN => "A",               GGN => "G",      GTN => 'V',  GCN => 'A',               GGN => 'G'
   
     UUY => "F",  UCY => "S",  UAY => "Y",  UGY => "C",  
     UUR => "L",  UCR => "S",  UAR => "*",  
                  UCN => "S",  
     CUY => "L",  
     CUR => "L",  
     CUN => "L",  
     AUY => "I",  
     GUY => "V",  
     GUR => "V",  
     GUN => "V"  
1091  );  );
1092    
1093    #  Add RNA by construction:
1094    
1095    foreach ( grep { /T/ } keys %genetic_code )
1096    {
1097        my $codon = $_;
1098        $codon =~ s/T/U/g;
1099        $genetic_code{ $codon } = lc $genetic_code{ $_ }
1100    }
1101    
1102  #  Add lower case by construction:  #  Add lower case by construction:
1103    
1104  foreach ( keys %genetic_code ) {  foreach ( keys %genetic_code )
1105    {
1106      $genetic_code{ lc $_ } = lc $genetic_code{ $_ }      $genetic_code{ lc $_ } = lc $genetic_code{ $_ }
1107  }  }
1108    
1109    
1110  #  Construct the genetic code with selenocysteine by difference:  #  Construct the genetic code with selenocysteine by difference:
1111    
1112  %genetic_code_with_U = map { $_ => $genetic_code{ $_ } } keys %genetic_code;  %genetic_code_with_U = %genetic_code;
1113  $genetic_code_with_U{ TGA } = "U";  $genetic_code_with_U{ TGA } = 'U';
1114  $genetic_code_with_U{ tga } = "u";  $genetic_code_with_U{ tga } = 'u';
1115  $genetic_code_with_U{ UGA } = "U";  $genetic_code_with_U{ UGA } = 'U';
1116  $genetic_code_with_U{ uga } = "u";  $genetic_code_with_U{ uga } = 'u';
1117    
1118    
1119  %amino_acid_codons_DNA = (  %amino_acid_codons_DNA = (
# Line 1395  Line 1377 
1377    
1378    
1379  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1380  #  Translate nucleotides to one letter protein:  #  Translate nucleotides to one letter protein.  Respects case of the
1381    #  nucleotide sequence.
1382  #  #
1383  #      $seq = translate_seq( $seq [, $met_start] )  #      $aa = translate_seq( $nt, $met_start )
1384    #      $aa = translate_seq( $nt )
1385  #  #
1386  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1387    
1388  sub translate_seq {  sub translate_seq
1389      my $seq = uc shift;  {
1390      $seq =~ tr/UX/TN/;      #  make it DNA, and allow X      my $seq = shift;
1391      $seq =~ tr/-//d;        #  remove gaps      $seq =~ tr/-//d;        #  remove gaps
1392    
1393      my $met = shift || 0;   #  a second argument that is true      my @codons = $seq =~ m/(...?)/g;  #  Will try to translate last 2 nt
1394                              #  forces first amino acid to be Met  
1395                              #  (note: undef is false)      #  A second argument that is true forces first amino acid to be Met
1396    
1397      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      my @met;
1398      my $pep = ( ($met && ($imax >= 0)) ? "M" : "" );      if ( ( shift @_ ) && ( my $codon1 = shift @codons ) )
1399      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      {
1400          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );          push @met, ( $codon1 =~ /[a-z]/ ? 'm' : 'M' );
1401      }      }
1402    
1403      return $pep;      join( '', @met, map { translate_codon( $_ ) } @codons )
1404  }  }
1405    
1406    
1407  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1408  #  Translate a single triplet with "universal" genetic code  #  Translate a single triplet with "universal" genetic code.
 #  Uppercase and DNA are performed, then translate_uc_DNA_codon  
 #  is called.  
1409  #  #
1410  #      $aa = translate_codon( $triplet )  #      $aa = translate_codon( $triplet )
1411    #      $aa = translate_DNA_codon( $triplet )
1412    #      $aa = translate_uc_DNA_codon( $triplet )
1413  #  #
1414  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1415    
1416  sub translate_codon {  sub translate_DNA_codon { translate_codon( @_ ) }
     my $codon = uc shift;  #  Make it uppercase  
     $codon =~ tr/UX/TN/;   #  Make it DNA, and allow X  
     return translate_uc_DNA_codon($codon);  
 }  
   
1417    
1418  #-----------------------------------------------------------------------------  sub translate_uc_DNA_codon { translate_codon( uc $_[0] ) }
 #  Translate a single triplet with "universal" genetic code  
 #  Uppercase and DNA assumed  
 #  Intended for private use by translate_codon and translate_seq  
 #  
 #      $aa = translate_uc_DNA_codon( $triplet )  
 #  
 #-----------------------------------------------------------------------------  
1419    
1420  sub translate_uc_DNA_codon {  sub translate_codon
1421    {
1422      my $codon = shift;      my $codon = shift;
1423      my $aa;      $codon =~ tr/Uu/Tt/;     #  Make it DNA
1424    
1425      #  Try a simple lookup:      #  Try a simple lookup:
1426    
1427        my $aa;
1428      if ( $aa = $genetic_code{ $codon } ) { return $aa }      if ( $aa = $genetic_code{ $codon } ) { return $aa }
1429    
1430      #  With the expanded code defined above, this catches simple N, R      #  Attempt to recover from mixed-case codons:
     #  and Y ambiguities in the third position.  Other codes like  
     #  GG[KMSWBDHV], or even GG, might be unambiguously translated by  
     #  converting the last position to N and seeing if this is in the  
     #  (expanded) code table:  
   
     if ( $aa = $genetic_code{ substr($codon,0,2) . "N" } ) { return $aa }  
   
     #  Test that codon is valid and might have unambiguous aa:  
   
     if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/ ) { return "X" }  
     #                     ^^  
     #                     |+- for leucine YTR  
     #                     +-- for arginine MGR  
1431    
1432      #  Expand all ambiguous nucleotides to see if they all yield same aa.      $codon = ( $codon =~ /[a-z]/ ) ? lc $codon : uc $codon;
1433      #  Loop order tries to fail quickly with first position change.      if ( $aa = $genetic_code{ $codon } ) { return $aa }
1434    
1435      $aa = "";      #  The code defined above catches simple N, R and Y ambiguities in the
1436      for my $n2 ( @{ $DNA_letter_can_be{ substr($codon,1,1) } } ) {      #  third position.  Other codons (e.g., GG[KMSWBDHV], or even GG) might
1437          for my $n3 ( @{ $DNA_letter_can_be{ substr($codon,2,1) } } ) {      #  be unambiguously translated by converting the last position to N and
1438              for my $n1 ( @{ $DNA_letter_can_be{ substr($codon,0,1) } } ) {      #  seeing if this is in the code table:
1439                  #  set the first value of $aa  
1440                  if ($aa eq "") { $aa = $genetic_code{ $n1 . $n2 . $n3 } }      my $N = ( $codon =~ /[a-z]/ ) ? 'n' : 'N';
1441                  #  or break out if any other amino acid is detected      if ( $aa = $genetic_code{ substr($codon,0,2) . $N } ) { return $aa }
1442                  elsif ($aa ne $genetic_code{ $n1 . $n2 . $n3 } ) { return "X" }  
1443              }      #  Test that codon is valid for an unambiguous aa:
1444          }  
1445        my $X = ( $codon =~ /[a-z]/ ) ? 'x' : 'X';
1446        if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/i
1447          && $codon !~ m/^YT[AGR]$/i     #  Leu YTR
1448          && $codon !~ m/^MG[AGR]$/i     #  Arg MGR
1449           )
1450        {
1451            return $X;
1452      }      }
1453    
1454      return $aa || "X";      #  Expand all ambiguous nucleotides to see if they all yield same aa.
1455    
1456        my @n1 = @{ $DNA_letter_can_be{ substr( $codon, 0, 1 ) } };
1457        my $n2 =                        substr( $codon, 1, 1 );
1458        my @n3 = @{ $DNA_letter_can_be{ substr( $codon, 2, 1 ) } };
1459        my @triples = map { my $n12 = $_ . $n2; map { $n12 . $_ } @n3 } @n1;
1460    
1461        my $triple = shift @triples;
1462        $aa = $genetic_code{ $triple };
1463        $aa or return $X;
1464    
1465        foreach $triple ( @triples ) { return $X if $aa ne $genetic_code{$triple} }
1466    
1467        return $aa;
1468  }  }
1469    
1470    
# Line 1492  Line 1473 
1473  #  Diagnose the use of upper versus lower, and T versus U in the supplied  #  Diagnose the use of upper versus lower, and T versus U in the supplied
1474  #  code, and transform the supplied nucleotide sequence to match.  #  code, and transform the supplied nucleotide sequence to match.
1475  #  #
1476  #  translate_seq_with_user_code($seq, \%gen_code [, $start_with_met] )  #     $aa = translate_seq_with_user_code( $nt, \%gen_code )
1477    #     $aa = translate_seq_with_user_code( $nt, \%gen_code, $start_with_met )
1478  #  #
1479    #  Modified 2007-11-22 to be less intrusive in these diagnoses by sensing
1480    #  the presence of both versions in the user code.
1481  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1482    
1483  sub translate_seq_with_user_code {  sub translate_seq_with_user_code
1484    {
1485      my $seq = shift;      my $seq = shift;
1486      $seq =~ tr/-//d;     #  remove gaps  ***  Why?      $seq =~ tr/-//d;     #  remove gaps  ***  Why?
     $seq =~ tr/Xx/Nn/;   #  allow X  
1487    
1488      my $gc = shift;      #  Reference to hash of DNA alphabet code      my $gc = shift;      #  Reference to hash of code
1489      if (! $gc || ref($gc) ne "HASH") {      if (! $gc || ref($gc) ne "HASH")
1490          die "translate_seq_with_user_code needs genetic code hash as secondargument.";      {
1491            print STDERR "translate_seq_with_user_code needs genetic code hash as second argument.";
1492            return undef;
1493      }      }
1494    
1495      #  Test the type of code supplied: uppercase versus lowercase      #  Test code support for upper vs lower case:
   
     my ($RNA_F, $DNA_F, $M, $N, $X);  
1496    
1497      if ($gc->{ "AAA" }) {     #  Looks like uppercase code table      my ( $TTT, $UUU );
1498        if    ( $gc->{AAA} && ! $gc->{aaa} )   #  Uppercase only code table
1499        {
1500          $seq   = uc $seq;     #  Uppercase sequence          $seq   = uc $seq;     #  Uppercase sequence
1501          $RNA_F = "UUU";       #  Uppercase RNA Phe codon          ( $TTT, $UUU ) = ( 'TTT', 'UUU' );
         $DNA_F = "TTT";       #  Uppercase DNA Phe codon  
         $M     = "M";         #  Uppercase initiator  
         $N     = "N";         #  Uppercase ambiguous nuc  
         $X     = "X";         #  Uppercase ambiguous aa  
1502      }      }
1503      elsif ($gc->{ "aaa" }) {  #  Looks like lowercase code table      elsif ( $gc->{aaa} && ! $gc->{AAA} )   #  Lowercase only code table
1504        {
1505          $seq   = lc $seq;     #  Lowercase sequence          $seq   = lc $seq;     #  Lowercase sequence
1506          $RNA_F = "uuu";       #  Lowercase RNA Phe codon          ( $TTT, $UUU ) = ( 'ttt', 'uuu' );
         $DNA_F = "ttt";       #  Lowercase DNA Phe codon  
         $M     = "m";         #  Lowercase initiator  
         $N     = "n";         #  Lowercase ambiguous nuc  
         $X     = "x";         #  Lowercase ambiguous aa  
1507      }      }
1508      else {      elsif ( $gc->{aaa} )
1509          die "User-supplied genetic code does not have aaa or AAA\n";      {
1510            ( $TTT, $UUU ) = ( 'ttt', 'uuu' );
1511        }
1512        else
1513        {
1514            print STDERR "User-supplied genetic code does not have aaa or AAA\n";
1515            return undef;
1516      }      }
1517    
1518      #  Test the type of code supplied:  UUU versus TTT      #  Test code support for U vs T:
   
     my ($ambigs);  
1519    
1520      if ($gc->{ $RNA_F }) {     #  Looks like RNA code table      my $ambigs;
1521          $seq =~ tr/Tt/Uu/;      if    ( $gc->{$UUU} && ! $gc->{$TTT} )  # RNA only code table
1522        {
1523            $seq = tr/Tt/Uu/;
1524          $ambigs = \%RNA_letter_can_be;          $ambigs = \%RNA_letter_can_be;
1525      }      }
1526      elsif ($gc->{ $DNA_F }) {  #  Looks like DNA code table      elsif ( $gc->{$TTT} && ! $gc->{$UUU} )  # DNA only code table
1527          $seq =~ tr/Uu/Tt/;      {
1528            $seq = tr/Uu/Tt/;
1529          $ambigs = \%DNA_letter_can_be;          $ambigs = \%DNA_letter_can_be;
1530      }      }
1531      else {      else
1532          die "User-supplied genetic code does not have $RNA_F or $DNA_F\n";      {
1533            my $t = $seq =~ tr/Tt//;
1534            my $u = $seq =~ tr/Uu//;
1535            $ambigs = ( $t > $u ) ? \%DNA_letter_can_be : \%RNA_letter_can_be;
1536      }      }
1537    
1538      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      #  We can now do the codon-by-codon translation:
1539    
1540      my $met = shift;     #  a third argument that is true      my @codons = $seq =~ m/(...?)/g;  #  will try to translate last 2 nt
1541                           #  forces first amino acid to be Met  
1542                           #  (note: undef is false)      #  A third argument that is true forces first amino acid to be Met
     my $pep  = ($met && ($imax >= 0)) ? $M : "";  
     my $aa;  
1543    
1544      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      my @met;
1545          $pep .= translate_codon_with_user_code( substr($seq,$i,3), $gc, $N, $X, $ambigs );      if ( ( shift @_ ) && ( my $codon1 = shift @codons ) )
1546        {
1547            push @met, ( $codon1 =~ /[a-z]/ ? 'm' : 'M' );
1548      }      }
1549    
1550      return $pep;      join( '', @met, map { translate_codon_with_user_code( $_, $gc, $ambigs ) } @codons )
1551  }  }
1552    
1553    
1554  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1555  #  Translate with user-supplied genetic code hash.  For speed, no error  #  Translate with user-supplied genetic code hash.  No error check on the code.
1556  #  check on the hash.  Calling programs should check for the hash at a  #  Should only be called through translate_seq_with_user_code.
 #  higher level.  
 #  
 #  Should only be called through translate_seq_with_user_code  
1557  #  #
1558  #   translate_codon_with_user_code( $triplet, \%code, $N, $X, $ambig_table )  #     $aa = translate_codon_with_user_code( $triplet, \%code, $ambig_table )
1559  #  #
1560  #  $triplet      speaks for itself  #  $triplet      speaks for itself
1561  #  $code         ref to the hash with the codon translations  #  $code         ref to the hash with the codon translations
 #  $N            character to use for ambiguous nucleotide  
 #  $X            character to use for ambiguous amino acid  
1562  #  $ambig_table  ref to hash with lists of nucleotides for each ambig code  #  $ambig_table  ref to hash with lists of nucleotides for each ambig code
1563  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1564    
1565    sub translate_codon_with_user_code
1566  sub translate_codon_with_user_code {  {
1567      my $codon = shift;      my ( $codon, $gc, $N, $X, $ambigs ) = @_;
     my $gc    = shift;  
     my $aa;  
1568    
1569      #  Try a simple lookup:      #  Try a simple lookup:
1570    
1571        my $aa;
1572      if ( $aa = $gc->{ $codon } ) { return $aa }      if ( $aa = $gc->{ $codon } ) { return $aa }
1573    
1574      #  Test that codon is valid and might have unambiguous aa:      #  Attempt to recover from mixed-case codons:
1575    
1576      my ($N, $X, $ambigs) = @_;      $codon = ( $codon =~ /[a-z]/ ) ? lc $codon : uc $codon;
1577      if ( $codon =~ m/^[ACGTUMY][ACGTU]$/i ) { $codon .= $N }      if ( $aa = $genetic_code{ $codon } ) { return $aa }
     if ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) { return $X }  
     #                          ^^  
     #                          |+- for leucine YTR  
     #                          +-- for arginine MGR  
1578    
1579      #  Expand all ambiguous nucleotides to see if they all yield same aa.      #  Test that codon is valid for an unambiguous aa:
     #  Loop order tries to fail quickly with first position change.  
1580    
1581      $aa = "";      my $X = ( $codon =~ /[a-z]/ ) ? 'x' : 'X';
1582      for my $n2 ( @{ $ambigs->{ substr($codon,1,1) } } ) {  
1583          for my $n3 ( @{ $ambigs->{ substr($codon,2,1) } } ) {      if ( $codon =~ m/^[ACGTU][ACGTU]$/i )  # Add N?
1584              for my $n1 ( @{ $ambigs->{ substr($codon,0,1) } } ) {      {
1585                  #  set the first value of $aa          $codon .= ( $codon =~ /[a-z]/ ) ? 'n' : 'N';
                 if ($aa eq "") { $aa = $gc->{ $n1 . $n2 . $n3 } }  
                 #  break out if any other amino acid is detected  
                 elsif ($aa ne $gc->{ $n1 . $n2 . $n3 } ) { return "X" }  
1586              }              }
1587        elsif ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) # Valid?
1588        {
1589            return $X;
1590          }          }
1591    
1592        #  Expand all ambiguous nucleotides to see if they all yield same aa.
1593    
1594        my @n1 = @{ $ambigs->{ substr( $codon, 0, 1 ) } };
1595        my $n2 =               substr( $codon, 1, 1 );
1596        my @n3 = @{ $ambigs->{ substr( $codon, 2, 1 ) } };
1597        my @triples = map { my $n12 = $_ . $n2; map { $n12 . $_ } @n3 } @n1;
1598    
1599        my $triple = shift @triples;
1600        $aa = $gc->{ $triple } || $gc->{ lc $triple } || $gc->{ uc $triple };
1601        $aa or return $X;
1602    
1603        foreach $triple ( @triples )
1604        {
1605            return $X if $aa ne ( $gc->{$triple} || $gc->{lc $triple} || $gc->{uc $triple} );
1606      }      }
1607    
1608      return $aa || $X;      return $aa;
1609  }  }
1610    
1611    

Legend:
Removed from v.1.9  
changed lines
  Added in v.1.10

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3