[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Mon Dec 1 16:54:26 2003 UTC revision 1.2, Wed Nov 30 18:33:41 2005 UTC
# Line 1  Line 1 
1  package gjoseqlib;  package gjoseqlib;
2    
3  #  read_fasta_seqs( *FILEHANDLE )  #  A sequence entry is ( $id, $def, $seq )
4  #  read_next_fasta_seq( *FILEHANDLE )  #  A list of entries is a list of references
5  #  parse_fasta_title( $title )  #
6  #  split_fasta_title( $title )  #  @seq_entry = read_next_fasta_seq( *FILEHANDLE )
7  #  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list )  #  @seq_entries = read_fasta_seqs( *FILEHANDLE )
8  #  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq )  #  $seq_ind = index_seq_list( @seq_entries ); # hash from ids to entry refs
9    #
10    #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
11    #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
12    #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );
13    #
14    #  ( $id, $def ) = parse_fasta_title( $title )
15    #  ( $id, $def ) = split_fasta_title( $title )
16    #
17    #  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list );
18    #  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq) ;
19    #  print_seq_as_fasta( *FILEHANDLE, @seq_entry );
20  #  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )
21    #
22    #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
23    #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
24    #  @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );
25    #  @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );
26    #  $DNAseq = complement_DNA_seq( $NA_seq );
27    #  $RNAseq = complement_RNA_seq( $NA_seq );
28    #  $DNAseq = to_DNA_seq( $NA_seq );
29    #  $RNAseq = to_RNA_seq( $NA_seq );
30    #  $seq    = pack_seq( $sequence )
31    #  $seq    = clean_ae_sequence( $seq )
32    #
33    #  $seq = translate_seq( $seq [, $met_start] )
34    #  $aa  = translate_codon( $triplet );
35    #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );
36    #
37    #  User-supplied genetic code must be upper case index and match the
38    #  DNA versus RNA type of sequence
39    #
40    #  Locations (= oriented intervals) are ( id, start, end )
41    #  Intervals are ( id, left, right )
42    #
43    #  @intervals = read_intervals( *FILEHANDLE )
44    #  @intervals = read_oriented_intervals( *FILEHANDLE )
45    #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)
46    #  @joined    = join_intervals( @interval_refs )
47    #  @intervals = locations_2_intervals( @locations )
48    #  $interval  = locations_2_intervals( $location  )
49    #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)
50    
 #  index_seq_list( @seq_entry_list )  
 #  seq_entry_by_id( \%seq_index, $seq_id )  
 #  seq_desc_by_id( \%seq_index, $seq_id )  
 #  seq_data_by_id( \%seq_index, $seq_id )  
   
 #  subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] )  
 #  subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] )  
 #  complement_DNA_entry( @seq_entry [, $fix_id] )  
 #  complement_RNA_entry( @seq_entry [, $fix_id] )  
 #  complement_DNA_seq( $NA_seq )  
 #  complement_RNA_seq( $NA_seq )  
 #  to_DNA_seq( $NA_seq )  
 #  to_RNA_seq( $NA_seq )  
 #  clean_ae_sequence( $seq )  
   
 #  translate_seq( $seq [, $met_start] )  
 #  translate_codon( $triplet )  
 #  translate_seq_with_user_code( $seq, \%gen_code [, $met_start] )  
51    
52  #  read_intervals( *FILEHANDLE )  use strict;
 #  join_intervals( @interval_refs )  
53    
54  use gjolib qw(wrap_text);  use gjolib qw(wrap_text);
55    
56    #  Exported global variables:
57    
58    our @aa_1_letter_order;  # Alpha by 1 letter
59    our @aa_3_letter_order;  # PAM matrix order
60    our @aa_n_codon_order;
61    our %genetic_code;
62    our %genetic_code_with_U;
63    our %amino_acid_codons_DNA;
64    our %amino_acid_codons_RNA;
65    our %n_codon_for_aa;
66    our %reverse_genetic_code_DNA;
67    our %reverse_genetic_code_RNA;
68    our %DNA_letter_can_be;
69    our %RNA_letter_can_be;
70    our %one_letter_to_three_letter_aa;
71    our %three_letter_to_one_letter_aa;
72    
73  require Exporter;  require Exporter;
74    
75  our @ISA = qw(Exporter);  our @ISA = qw(Exporter);
# Line 57  Line 95 
95          complement_RNA_seq          complement_RNA_seq
96          to_DNA_seq          to_DNA_seq
97          to_RNA_seq          to_RNA_seq
98            pack_seq
99          clean_ae_sequence          clean_ae_sequence
100    
101          translate_seq          translate_seq
# Line 64  Line 103 
103          translate_seq_with_user_code          translate_seq_with_user_code
104    
105          read_intervals          read_intervals
106            standardize_intervals
107          join_intervals          join_intervals
108            locations_2_intervals
109            read_oriented_intervals
110            reverse_intervals
111          );          );
112    
113  use strict;  our @EXPORT_OK = qw(
114            @aa_1_letter_order
115            @aa_3_letter_order
116            @aa_n_codon_order
117            %genetic_code
118            %genetic_code_with_U
119            %amino_acid_codons_DNA
120            %amino_acid_codons_RNA
121            %n_codon_for_aa
122            %reverse_genetic_code_DNA
123            %reverse_genetic_code_RNA
124            %DNA_letter_can_be
125            %RNA_letter_can_be
126            %one_letter_to_three_letter_aa
127            %three_letter_to_one_letter_aa
128            );
129    
130    
131  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
# Line 90  Line 148 
148              ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");              ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");
149          }          }
150          else {          else {
151              tr/\t 0-9//d;              tr/     0-9//d;
152              $seq .= $_ ;              $seq .= $_ ;
153          }          }
154      }      }
# Line 107  Line 165 
165  #     @seq_entry = read_next_fasta_seq(*FILEHANDLE)  #     @seq_entry = read_next_fasta_seq(*FILEHANDLE)
166  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
167  #  Reading always overshoots, so save next id and description  #  Reading always overshoots, so save next id and description
168  #  (these should really be hashes indexed by file handle):  
169  #  {   #  Use bare block to scope the header hash
170  my $next_fasta_seq_header = "";  
171        my %next_header;
172    
173  sub read_next_fasta_seq {  sub read_next_fasta_seq {
     my $fh = shift;  
174      wantarray || die "read_next_fasta_seq requires list context\n";      wantarray || die "read_next_fasta_seq requires list context\n";
175    
176      my ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );          my $fh = shift;
177            my ( $id, $desc );
178    
179            if ( defined( $next_header{$fh} ) ) {
180                ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
181            }
182            else {
183                $next_header{$fh} = "";
184                ( $id, $desc ) = ( undef, "" );
185            }
186      my $seq = "";      my $seq = "";
187    
188      while ( <$fh> ) {      while ( <$fh> ) {
189          chomp;          chomp;
190          if ( /^>/ ) {        #  new id          if ( /^>/ ) {        #  new id
191              $next_fasta_seq_header = $_;                  $next_header{$fh} = $_;
192              if ( $id && $seq ) { return ($id, $desc, $seq) }                  if ( defined($id) && $seq ) { return ($id, $desc, $seq) }
193              ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
194              $seq = "";              $seq = "";
195          }          }
196          else {          else {
197              tr/\t 0-9//d;                  tr/     0-9//d;
198              $seq .= $_ ;              $seq .= $_ ;
199          }          }
200      }      }
201    
202      #  Done with file, clear "next header"          #  Done with file, delete "next header"
203    
204      $next_fasta_seq_header = "";          delete $next_header{$fh};
205      return ($id && $seq) ? ($id, $desc, $seq) : () ;          return (defined($id) && $seq) ? ($id, $desc, $seq) : () ;
206        }
207  }  }
208    
209    
# Line 151  Line 219 
219      if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {      if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {
220          return ($1, $3 ? $3 : "");          return ($1, $3 ? $3 : "");
221      }      }
222      else {      elsif ($title =~ /^>/) {
223          return ("", "");          return ("", "");
224      }      }
225        else {
226            return (undef, "");
227        }
228  }  }
229    
230  sub split_fasta_title {  sub split_fasta_title {
# Line 225  Line 296 
296  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
297  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
298  #  #
299  #     my %seq_ind  = index_seq_list(@seq_list);  #     my \%seq_ind  = index_seq_list(@seq_list);
300  #  #
301  #  Usage example:  #  Usage example:
302  #  #
303  #  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries  #  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries
304  #  my %seq_ind    = index_seq_list(@seq_list); # hash from names to pointers  #  my \%seq_ind   = index_seq_list(@seq_list); # hash from names to pointers
305  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry
306  #  #
307  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
# Line 332  Line 403 
403      }      }
404    
405      if ( $fix_id ) {      if ( $fix_id ) {
406          if ( ( $id =~ s/_([0-9]+)_([0-9]+)$// )          if ( ( $id =~ s/_(\d+)_(\d+)$// )
407            && ( abs($2-$1)+1 == $len ) ) {            && ( abs($2-$1)+1 == $len ) ) {
408              if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }              if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }
409              else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }              else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }
# Line 353  Line 424 
424      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
425                [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];                [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
426      if ($fix_id) {      if ($fix_id) {
427          if ($id =~ s/_([0-9]+)_([0-9]+)$//) {          if ($id =~ s/_(\d+)_(\d+)$//) {
428              $id .= "_" . $2 . "_" . $1;              $id .= "_" . $2 . "_" . $1;
429          }          }
430          else {          else {
# Line 374  Line 445 
445      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
446                [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];                [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
447      if ($fix_id) {      if ($fix_id) {
448          if ($id =~ s/_([0-9]+)_([0-9]+)$//) {          if ($id =~ s/_(\d+)_(\d+)$//) {
449              $id .= "_" . $2 . "_" . $1;              $id .= "_" . $2 . "_" . $1;
450          }          }
451          else {          else {
# Line 416  Line 487 
487  }  }
488    
489    
490    sub pack_seq {
491        my $seq = shift;
492        $seq =~ tr/A-Za-z//cd;
493        return $seq;
494    }
495    
496    
497  sub clean_ae_sequence {  sub clean_ae_sequence {
498      $_ = shift;      $_ = shift;
499      $_ = to7bit($_);      $_ = to7bit($_);
# Line 464  Line 542 
542  #  #
543  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
544    
545  our %genetic_code = (   # work as DNA  @aa_1_letter_order = qw( A C D E F G H I K L M N P Q R S T V W Y );  # Alpha by 1 letter
546    @aa_3_letter_order = qw( A R N D C Q E G H I L K M F P S T W Y V );  # PAM matrix order
547    @aa_n_codon_order  = qw( L R S A G P T V I C D E F H K N Q Y M W );
548    
549    %genetic_code = (
550    
551        # DNA version
552    
553      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",
554      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",
555      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",
# Line 482  Line 567 
567      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",
568      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",
569    
570        # RNA suppliment
571    
572        UUU => "F",  UCU => "S",  UAU => "Y",  UGU => "C",
573        UUC => "F",  UCC => "S",  UAC => "Y",  UGC => "C",
574        UUA => "L",  UCA => "S",  UAA => "*",  UGA => "*",
575        UUG => "L",  UCG => "S",  UAG => "*",  UGG => "W",
576        CUU => "L",  CCU => "P",  CAU => "H",  CGU => "R",
577        CUC => "L",
578        CUA => "L",
579        CUG => "L",
580        AUU => "I",  ACU => "T",  AAU => "N",  AGU => "S",
581        AUC => "I",
582        AUA => "I",
583        AUG => "M",
584        GUU => "V",  GCU => "A",  GAU => "D",  GGU => "G",
585        GUC => "V",
586        GUA => "V",
587        GUG => "V",
588    
589      #  The following ambiguous encodings are not necessary,  but      #  The following ambiguous encodings are not necessary,  but
590      #  speed up the processing of some common ambiguous triplets:      #  speed up the processing of some ambiguous triplets:
591    
592      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",
593      TTR => "L",  TCR => "S",  TAR => "*",      TTR => "L",  TCR => "S",  TAR => "*",
# Line 496  Line 600 
600                   ACN => "T",                   ACN => "T",
601      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",
602      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",
603      GTN => "V",  GCN => "A",               GGN => "G"      GTN => "V",  GCN => "A",               GGN => "G",
604    
605        UUY => "F",  UCY => "S",  UAY => "Y",  UGY => "C",
606        UUR => "L",  UCR => "S",  UAR => "*",
607                     UCN => "S",
608        CUY => "L",
609        CUR => "L",
610        CUN => "L",
611        AUY => "I",
612        GUY => "V",
613        GUR => "V",
614        GUN => "V"
615    );
616    
617    
618    #  Add lower case by construction:
619    
620    foreach ( keys %genetic_code ) {
621        $genetic_code{ lc $_ } = lc $genetic_code{ $_ }
622    }
623    
624    
625    #  Construct the genetic code with selanocysteine by difference:
626    
627    %genetic_code_with_U = map { $_ => $genetic_code{ $_ } } keys %genetic_code;
628    $genetic_code_with_U{ TGA } = "U";
629    $genetic_code_with_U{ tga } = "u";
630    $genetic_code_with_U{ UGA } = "U";
631    $genetic_code_with_U{ uga } = "u";
632    
633    
634    %amino_acid_codons_DNA = (
635             L  => [ qw( TTA TTG CTA CTG CTT CTC ) ],
636             R  => [ qw( AGA AGG CGA CGG CGT CGC ) ],
637             S  => [ qw( AGT AGC TCA TCG TCT TCC ) ],
638             A  => [ qw( GCA GCG GCT GCC ) ],
639             G  => [ qw( GGA GGG GGT GGC ) ],
640             P  => [ qw( CCA CCG CCT CCC ) ],
641             T  => [ qw( ACA ACG ACT ACC ) ],
642             V  => [ qw( GTA GTG GTT GTC ) ],
643             I  => [ qw( ATA ATT ATC ) ],
644             C  => [ qw( TGT TGC ) ],
645             D  => [ qw( GAT GAC ) ],
646             E  => [ qw( GAA GAG ) ],
647             F  => [ qw( TTT TTC ) ],
648             H  => [ qw( CAT CAC ) ],
649             K  => [ qw( AAA AAG ) ],
650             N  => [ qw( AAT AAC ) ],
651             Q  => [ qw( CAA CAG ) ],
652             Y  => [ qw( TAT TAC ) ],
653             M  => [ qw( ATG ) ],
654             U  => [ qw( TGA ) ],
655             W  => [ qw( TGG ) ],
656    
657             l  => [ qw( tta ttg cta ctg ctt ctc ) ],
658             r  => [ qw( aga agg cga cgg cgt cgc ) ],
659             s  => [ qw( agt agc tca tcg tct tcc ) ],
660             a  => [ qw( gca gcg gct gcc ) ],
661             g  => [ qw( gga ggg ggt ggc ) ],
662             p  => [ qw( cca ccg cct ccc ) ],
663             t  => [ qw( aca acg act acc ) ],
664             v  => [ qw( gta gtg gtt gtc ) ],
665             i  => [ qw( ata att atc ) ],
666             c  => [ qw( tgt tgc ) ],
667             d  => [ qw( gat gac ) ],
668             e  => [ qw( gaa gag ) ],
669             f  => [ qw( ttt ttc ) ],
670             h  => [ qw( cat cac ) ],
671             k  => [ qw( aaa aag ) ],
672             n  => [ qw( aat aac ) ],
673             q  => [ qw( caa cag ) ],
674             y  => [ qw( tat tac ) ],
675             m  => [ qw( atg ) ],
676             u  => [ qw( tga ) ],
677             w  => [ qw( tgg ) ],
678    
679            '*' => [ qw( TAA TAG TGA ) ]
680    );
681    
682    
683    
684    %amino_acid_codons_RNA = (
685             L  => [ qw( UUA UUG CUA CUG CUU CUC ) ],
686             R  => [ qw( AGA AGG CGA CGG CGU CGC ) ],
687             S  => [ qw( AGU AGC UCA UCG UCU UCC ) ],
688             A  => [ qw( GCA GCG GCU GCC ) ],
689             G  => [ qw( GGA GGG GGU GGC ) ],
690             P  => [ qw( CCA CCG CCU CCC ) ],
691             T  => [ qw( ACA ACG ACU ACC ) ],
692             V  => [ qw( GUA GUG GUU GUC ) ],
693             B  => [ qw( GAU GAC AAU AAC ) ],
694             Z  => [ qw( GAA GAG CAA CAG ) ],
695             I  => [ qw( AUA AUU AUC ) ],
696             C  => [ qw( UGU UGC ) ],
697             D  => [ qw( GAU GAC ) ],
698             E  => [ qw( GAA GAG ) ],
699             F  => [ qw( UUU UUC ) ],
700             H  => [ qw( CAU CAC ) ],
701             K  => [ qw( AAA AAG ) ],
702             N  => [ qw( AAU AAC ) ],
703             Q  => [ qw( CAA CAG ) ],
704             Y  => [ qw( UAU UAC ) ],
705             M  => [ qw( AUG ) ],
706             U  => [ qw( UGA ) ],
707             W  => [ qw( UGG ) ],
708    
709             l  => [ qw( uua uug cua cug cuu cuc ) ],
710             r  => [ qw( aga agg cga cgg cgu cgc ) ],
711             s  => [ qw( agu agc uca ucg ucu ucc ) ],
712             a  => [ qw( gca gcg gcu gcc ) ],
713             g  => [ qw( gga ggg ggu ggc ) ],
714             p  => [ qw( cca ccg ccu ccc ) ],
715             t  => [ qw( aca acg acu acc ) ],
716             v  => [ qw( gua gug guu guc ) ],
717             b  => [ qw( gau gac aau aac ) ],
718             z  => [ qw( gaa gag caa cag ) ],
719             i  => [ qw( aua auu auc ) ],
720             c  => [ qw( ugu ugc ) ],
721             d  => [ qw( gau gac ) ],
722             e  => [ qw( gaa gag ) ],
723             f  => [ qw( uuu uuc ) ],
724             h  => [ qw( cau cac ) ],
725             k  => [ qw( aaa aag ) ],
726             n  => [ qw( aau aac ) ],
727             q  => [ qw( caa cag ) ],
728             y  => [ qw( uau uac ) ],
729             m  => [ qw( aug ) ],
730             u  => [ qw( uga ) ],
731             w  => [ qw( ugg ) ],
732    
733            '*' => [ qw( UAA UAG UGA ) ]
734    );
735    
736    
737    %n_codon_for_aa = map {
738        $_ => scalar @{ $amino_acid_codons_DNA{ $_ } }
739        } keys %amino_acid_codons_DNA;
740    
741    
742    %reverse_genetic_code_DNA = (
743             A  => "GCN",  a  => "gcn",
744             C  => "TGY",  c  => "tgy",
745             D  => "GAY",  d  => "gay",
746             E  => "GAR",  e  => "gar",
747             F  => "TTY",  f  => "tty",
748             G  => "GGN",  g  => "ggn",
749             H  => "CAY",  h  => "cay",
750             I  => "ATH",  i  => "ath",
751             K  => "AAR",  k  => "aar",
752             L  => "YTN",  l  => "ytn",
753             M  => "ATG",  m  => "atg",
754             N  => "AAY",  n  => "aay",
755             P  => "CCN",  p  => "ccn",
756             Q  => "CAR",  q  => "car",
757             R  => "MGN",  r  => "mgn",
758             S  => "WSN",  s  => "wsn",
759             T  => "ACN",  t  => "acn",
760             U  => "TGA",  u  => "tga",
761             V  => "GTN",  v  => "gtn",
762             W  => "TGG",  w  => "tgg",
763             X  => "NNN",  x  => "nnn",
764             Y  => "TAY",  y  => "tay",
765            '*' => "TRR"
766    );
767    
768    %reverse_genetic_code_RNA = (
769             A  => "GCN",  a  => "gcn",
770             C  => "UGY",  c  => "ugy",
771             D  => "GAY",  d  => "gay",
772             E  => "GAR",  e  => "gar",
773             F  => "UUY",  f  => "uuy",
774             G  => "GGN",  g  => "ggn",
775             H  => "CAY",  h  => "cay",
776             I  => "AUH",  i  => "auh",
777             K  => "AAR",  k  => "aar",
778             L  => "YUN",  l  => "yun",
779             M  => "AUG",  m  => "aug",
780             N  => "AAY",  n  => "aay",
781             P  => "CCN",  p  => "ccn",
782             Q  => "CAR",  q  => "car",
783             R  => "MGN",  r  => "mgn",
784             S  => "WSN",  s  => "wsn",
785             T  => "ACN",  t  => "acn",
786             U  => "UGA",  u  => "uga",
787             V  => "GUN",  v  => "gun",
788             W  => "UGG",  w  => "ugg",
789             X  => "NNN",  x  => "nnn",
790             Y  => "UAY",  y  => "uay",
791            '*' => "URR"
792  );  );
793    
794    
795  my %DNA_letter_can_be = (  %DNA_letter_can_be = (
796      A => ["A"],                 a => ["a"],      A => ["A"],                 a => ["a"],
797      B => ["C", "G", "T"],       b => ["c", "g", "t"],      B => ["C", "G", "T"],       b => ["c", "g", "t"],
798      C => ["C"],                 c => ["c"],      C => ["C"],                 c => ["c"],
# Line 520  Line 812 
812  );  );
813    
814    
815  my %RNA_letter_can_be = (  %RNA_letter_can_be = (
816      A => ["A"],                 a => ["a"],      A => ["A"],                 a => ["a"],
817      B => ["C", "G", "U"],       b => ["c", "g", "u"],      B => ["C", "G", "U"],       b => ["c", "g", "u"],
818      C => ["C"],                 c => ["c"],      C => ["C"],                 c => ["c"],
# Line 540  Line 832 
832  );  );
833    
834    
835  my %one_letter_to_three_letter_aa = (  %one_letter_to_three_letter_aa = {
836       A  => "Ala",           A  => "Ala", a  => "Ala",
837       B  => "Asx",           B  => "Asx", b  => "Asx",
838       C  => "Cys",           C  => "Cys", c  => "Cys",
839       D  => "Asp",           D  => "Asp", d  => "Asp",
840       E  => "Glu",           E  => "Glu", e  => "Glu",
841       F  => "Phe",           F  => "Phe", f  => "Phe",
842       G  => "Gly",           G  => "Gly", g  => "Gly",
843       H  => "His",           H  => "His", h  => "His",
844       I  => "Ile",           I  => "Ile", i  => "Ile",
845       K  => "Lys",           K  => "Lys", k  => "Lys",
846       L  => "Leu",           L  => "Leu", l  => "Leu",
847       M  => "Met",           M  => "Met", m  => "Met",
848       N  => "Asn",           N  => "Asn", n  => "Asn",
849       P  => "Pro",           P  => "Pro", p  => "Pro",
850       Q  => "Gln",           Q  => "Gln", q  => "Gln",
851       R  => "Arg",           R  => "Arg", r  => "Arg",
852       S  => "Ser",           S  => "Ser", s  => "Ser",
853       T  => "Thr",           T  => "Thr", t  => "Thr",
854       U  => "Sec",           U  => "Sec", u  => "Sec",
855       V  => "Val",           V  => "Val", v  => "Val",
856       W  => "Trp",           W  => "Trp", w  => "Trp",
857       X  => "Xxx",           X  => "Xxx", x  => "Xxx",
858       Y  => "Tyr",           Y  => "Tyr", y  => "Tyr",
859       Z  => "Glx",           Z  => "Glx", z  => "Glx",
860      "*" => "***"          '*' => "***"
861            };
862    
863    
864    %three_letter_to_one_letter_aa = (
865         ALA  => "A",   Ala  => "A",   ala  => "a",
866         ARG  => "R",   Arg  => "R",   arg  => "r",
867         ASN  => "N",   Asn  => "N",   asn  => "n",
868         ASP  => "D",   Asp  => "D",   asp  => "d",
869         ASX  => "B",   Asx  => "B",   asx  => "b",
870         CYS  => "C",   Cys  => "C",   cys  => "c",
871         GLN  => "Q",   Gln  => "Q",   gln  => "q",
872         GLU  => "E",   Glu  => "E",   glu  => "e",
873         GLX  => "Z",   Glx  => "Z",   glx  => "z",
874         GLY  => "G",   Gly  => "G",   gly  => "g",
875         HIS  => "H",   His  => "H",   his  => "h",
876         ILE  => "I",   Ile  => "I",   ile  => "i",
877         LEU  => "L",   Leu  => "L",   leu  => "l",
878         LYS  => "K",   Lys  => "K",   lys  => "k",
879         MET  => "M",   Met  => "M",   met  => "m",
880         PHE  => "F",   Phe  => "F",   phe  => "f",
881         PRO  => "P",   Pro  => "P",   pro  => "p",
882         SEC  => "U",   Sec  => "U",   sec  => "u",
883         SER  => "S",   Ser  => "S",   ser  => "s",
884         THR  => "T",   Thr  => "T",   thr  => "t",
885         TRP  => "W",   Trp  => "W",   trp  => "w",
886         TYR  => "Y",   Tyr  => "Y",   tyr  => "y",
887         VAL  => "V",   Val  => "V",   val  => "v",
888         XAA  => "X",   Xaa  => "X",   xaa  => "x",
889         XXX  => "X",   Xxx  => "X",   xxx  => "x",
890        '***' => "*"
891  );  );
892    
893    
# Line 586  Line 908 
908                              #  (note: undef is false)                              #  (note: undef is false)
909    
910      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!
911      my $pep  = ($met && ($imax >= 0)) ? "M" : "";      my $pep = ( ($met && ($imax >= 0)) ? "M" : "" );
     my $aa;  
912      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {
913          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );
914      }      }
# Line 819  Line 1140 
1140    
1141    
1142  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1143    #  Convert a list of intervals to read [ id, left_end, right_end ].
1144    #
1145    #     @intervals = standardize_intervals( @interval_refs )
1146    #-----------------------------------------------------------------------------
1147    sub standardize_intervals {
1148        map { ( $_->[1] < $_->[2] ) ? $_ : [ $_->[0], $_->[2], $_->[1] ] } @_;
1149    }
1150    
1151    
1152    #-----------------------------------------------------------------------------
1153  #  Take the union of a list of intervals  #  Take the union of a list of intervals
1154  #  #
1155  #     @joined = join_intervals( @interval_refs )  #     @joined = join_intervals( @interval_refs )
# Line 858  Line 1189 
1189      return @joined;      return @joined;
1190  }  }
1191    
1192    
1193    #-----------------------------------------------------------------------------
1194    #  Split location strings to oriented intervals.
1195    #
1196    #     @intervals = locations_2_intervals( @locations )
1197    #     $interval  = locations_2_intervals( $location  )
1198    #-----------------------------------------------------------------------------
1199    sub locations_2_intervals {
1200        my @intervals = map { /^(\S+)_(\d+)_(\d+)(\s.*)?$/
1201                           || /^(\S+)_(\d+)-(\d+)(\s.*)?$/
1202                           || /^(\S+)=(\d+)=(\d+)(\s.*)?$/
1203                           || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/
1204                            ? [ $1, $2+0, $3+0 ]
1205                            : ()
1206                            } @_;
1207    
1208        return wantarray ? @intervals : $intervals[0];
1209    }
1210    
1211    
1212    #-----------------------------------------------------------------------------
1213    #  Read a list of oriented intervals from a file.
1214    #  Allow id_start_end, or id \s start \s end formats
1215    #
1216    #     @intervals = read_oriented_intervals( *FILEHANDLE )
1217    #-----------------------------------------------------------------------------
1218    sub read_oriented_intervals {
1219        my $fh = shift;
1220        my @intervals = ();
1221    
1222        while (<$fh>) {
1223            chomp;
1224               /^(\S+)_(\d+)_(\d+)(\s.*)?$/        #  id_start_end       WIT2
1225            || /^(\S+)_(\d+)-(\d+)(\s.*)?$/        #  id_start-end       ???
1226            || /^(\S+)=(\d+)=(\d+)(\s.*)?$/        #  id=start=end       Badger
1227            || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/    #  id \s start \s end
1228            || next;
1229    
1230            #  Matched a pattern.  Store reference to (id, start, end):
1231            push @intervals, [ $1, $2+0, $3+0 ];
1232        }
1233        return @intervals;
1234    }
1235    
1236    
1237    #-----------------------------------------------------------------------------
1238    #  Reverse the orientation of a list of intervals
1239    #
1240    #     @reversed = reverse_intervals( @interval_refs )
1241    #-----------------------------------------------------------------------------
1242    sub reverse_intervals {
1243        map { [ $_->[0], $_->[2], $_->[1] ] } @_;
1244    }
1245    
1246    
1247  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.2

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3