[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Mon Dec 1 16:54:26 2003 UTC revision 1.3, Mon Dec 5 19:06:30 2005 UTC
# Line 1  Line 1 
1    #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18  package gjoseqlib;  package gjoseqlib;
19    
20  #  read_fasta_seqs( *FILEHANDLE )  #  A sequence entry is ( $id, $def, $seq )
21  #  read_next_fasta_seq( *FILEHANDLE )  #  A list of entries is a list of references
22  #  parse_fasta_title( $title )  #
23  #  split_fasta_title( $title )  #  @seq_entry = read_next_fasta_seq( *FILEHANDLE )
24  #  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list )  #  @seq_entries = read_fasta_seqs( *FILEHANDLE )
25  #  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq )  #  $seq_ind = index_seq_list( @seq_entries ); # hash from ids to entry refs
26    #
27    #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
28    #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
29    #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );
30    #
31    #  ( $id, $def ) = parse_fasta_title( $title )
32    #  ( $id, $def ) = split_fasta_title( $title )
33    #
34    #  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list );
35    #  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq) ;
36    #  print_seq_as_fasta( *FILEHANDLE, @seq_entry );
37  #  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )
38    #
39    #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
40    #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
41    #  @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );
42    #  @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );
43    #  $DNAseq = complement_DNA_seq( $NA_seq );
44    #  $RNAseq = complement_RNA_seq( $NA_seq );
45    #  $DNAseq = to_DNA_seq( $NA_seq );
46    #  $RNAseq = to_RNA_seq( $NA_seq );
47    #  $seq    = pack_seq( $sequence )
48    #  $seq    = clean_ae_sequence( $seq )
49    #
50    #  $seq = translate_seq( $seq [, $met_start] )
51    #  $aa  = translate_codon( $triplet );
52    #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );
53    #
54    #  User-supplied genetic code must be upper case index and match the
55    #  DNA versus RNA type of sequence
56    #
57    #  Locations (= oriented intervals) are ( id, start, end )
58    #  Intervals are ( id, left, right )
59    #
60    #  @intervals = read_intervals( *FILEHANDLE )
61    #  @intervals = read_oriented_intervals( *FILEHANDLE )
62    #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)
63    #  @joined    = join_intervals( @interval_refs )
64    #  @intervals = locations_2_intervals( @locations )
65    #  $interval  = locations_2_intervals( $location  )
66    #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)
67    
 #  index_seq_list( @seq_entry_list )  
 #  seq_entry_by_id( \%seq_index, $seq_id )  
 #  seq_desc_by_id( \%seq_index, $seq_id )  
 #  seq_data_by_id( \%seq_index, $seq_id )  
   
 #  subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] )  
 #  subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] )  
 #  complement_DNA_entry( @seq_entry [, $fix_id] )  
 #  complement_RNA_entry( @seq_entry [, $fix_id] )  
 #  complement_DNA_seq( $NA_seq )  
 #  complement_RNA_seq( $NA_seq )  
 #  to_DNA_seq( $NA_seq )  
 #  to_RNA_seq( $NA_seq )  
 #  clean_ae_sequence( $seq )  
   
 #  translate_seq( $seq [, $met_start] )  
 #  translate_codon( $triplet )  
 #  translate_seq_with_user_code( $seq, \%gen_code [, $met_start] )  
68    
69  #  read_intervals( *FILEHANDLE )  use strict;
 #  join_intervals( @interval_refs )  
70    
71  use gjolib qw(wrap_text);  use gjolib qw(wrap_text);
72    
73    #  Exported global variables:
74    
75    our @aa_1_letter_order;  # Alpha by 1 letter
76    our @aa_3_letter_order;  # PAM matrix order
77    our @aa_n_codon_order;
78    our %genetic_code;
79    our %genetic_code_with_U;
80    our %amino_acid_codons_DNA;
81    our %amino_acid_codons_RNA;
82    our %n_codon_for_aa;
83    our %reverse_genetic_code_DNA;
84    our %reverse_genetic_code_RNA;
85    our %DNA_letter_can_be;
86    our %RNA_letter_can_be;
87    our %one_letter_to_three_letter_aa;
88    our %three_letter_to_one_letter_aa;
89    
90  require Exporter;  require Exporter;
91    
92  our @ISA = qw(Exporter);  our @ISA = qw(Exporter);
# Line 57  Line 112 
112          complement_RNA_seq          complement_RNA_seq
113          to_DNA_seq          to_DNA_seq
114          to_RNA_seq          to_RNA_seq
115            pack_seq
116          clean_ae_sequence          clean_ae_sequence
117    
118          translate_seq          translate_seq
# Line 64  Line 120 
120          translate_seq_with_user_code          translate_seq_with_user_code
121    
122          read_intervals          read_intervals
123            standardize_intervals
124          join_intervals          join_intervals
125            locations_2_intervals
126            read_oriented_intervals
127            reverse_intervals
128          );          );
129    
130  use strict;  our @EXPORT_OK = qw(
131            @aa_1_letter_order
132            @aa_3_letter_order
133            @aa_n_codon_order
134            %genetic_code
135            %genetic_code_with_U
136            %amino_acid_codons_DNA
137            %amino_acid_codons_RNA
138            %n_codon_for_aa
139            %reverse_genetic_code_DNA
140            %reverse_genetic_code_RNA
141            %DNA_letter_can_be
142            %RNA_letter_can_be
143            %one_letter_to_three_letter_aa
144            %three_letter_to_one_letter_aa
145            );
146    
147    
148  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
# Line 90  Line 165 
165              ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");              ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");
166          }          }
167          else {          else {
168              tr/\t 0-9//d;              tr/     0-9//d;
169              $seq .= $_ ;              $seq .= $_ ;
170          }          }
171      }      }
# Line 107  Line 182 
182  #     @seq_entry = read_next_fasta_seq(*FILEHANDLE)  #     @seq_entry = read_next_fasta_seq(*FILEHANDLE)
183  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
184  #  Reading always overshoots, so save next id and description  #  Reading always overshoots, so save next id and description
185  #  (these should really be hashes indexed by file handle):  
186  #  {   #  Use bare block to scope the header hash
187  my $next_fasta_seq_header = "";  
188        my %next_header;
189    
190  sub read_next_fasta_seq {  sub read_next_fasta_seq {
     my $fh = shift;  
191      wantarray || die "read_next_fasta_seq requires list context\n";      wantarray || die "read_next_fasta_seq requires list context\n";
192    
193      my ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );          my $fh = shift;
194            my ( $id, $desc );
195    
196            if ( defined( $next_header{$fh} ) ) {
197                ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
198            }
199            else {
200                $next_header{$fh} = "";
201                ( $id, $desc ) = ( undef, "" );
202            }
203      my $seq = "";      my $seq = "";
204    
205      while ( <$fh> ) {      while ( <$fh> ) {
206          chomp;          chomp;
207          if ( /^>/ ) {        #  new id          if ( /^>/ ) {        #  new id
208              $next_fasta_seq_header = $_;                  $next_header{$fh} = $_;
209              if ( $id && $seq ) { return ($id, $desc, $seq) }                  if ( defined($id) && $seq ) { return ($id, $desc, $seq) }
210              ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
211              $seq = "";              $seq = "";
212          }          }
213          else {          else {
214              tr/\t 0-9//d;                  tr/     0-9//d;
215              $seq .= $_ ;              $seq .= $_ ;
216          }          }
217      }      }
218    
219      #  Done with file, clear "next header"          #  Done with file, delete "next header"
220    
221      $next_fasta_seq_header = "";          delete $next_header{$fh};
222      return ($id && $seq) ? ($id, $desc, $seq) : () ;          return (defined($id) && $seq) ? ($id, $desc, $seq) : () ;
223        }
224  }  }
225    
226    
# Line 151  Line 236 
236      if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {      if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {
237          return ($1, $3 ? $3 : "");          return ($1, $3 ? $3 : "");
238      }      }
239      else {      elsif ($title =~ /^>/) {
240          return ("", "");          return ("", "");
241      }      }
242        else {
243            return (undef, "");
244        }
245  }  }
246    
247  sub split_fasta_title {  sub split_fasta_title {
# Line 225  Line 313 
313  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
314  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
315  #  #
316  #     my %seq_ind  = index_seq_list(@seq_list);  #     my \%seq_ind  = index_seq_list(@seq_list);
317  #  #
318  #  Usage example:  #  Usage example:
319  #  #
320  #  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries  #  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries
321  #  my %seq_ind    = index_seq_list(@seq_list); # hash from names to pointers  #  my \%seq_ind   = index_seq_list(@seq_list); # hash from names to pointers
322  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry
323  #  #
324  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
# Line 332  Line 420 
420      }      }
421    
422      if ( $fix_id ) {      if ( $fix_id ) {
423          if ( ( $id =~ s/_([0-9]+)_([0-9]+)$// )          if ( ( $id =~ s/_(\d+)_(\d+)$// )
424            && ( abs($2-$1)+1 == $len ) ) {            && ( abs($2-$1)+1 == $len ) ) {
425              if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }              if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }
426              else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }              else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }
# Line 353  Line 441 
441      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
442                [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];                [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
443      if ($fix_id) {      if ($fix_id) {
444          if ($id =~ s/_([0-9]+)_([0-9]+)$//) {          if ($id =~ s/_(\d+)_(\d+)$//) {
445              $id .= "_" . $2 . "_" . $1;              $id .= "_" . $2 . "_" . $1;
446          }          }
447          else {          else {
# Line 374  Line 462 
462      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
463                [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];                [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
464      if ($fix_id) {      if ($fix_id) {
465          if ($id =~ s/_([0-9]+)_([0-9]+)$//) {          if ($id =~ s/_(\d+)_(\d+)$//) {
466              $id .= "_" . $2 . "_" . $1;              $id .= "_" . $2 . "_" . $1;
467          }          }
468          else {          else {
# Line 416  Line 504 
504  }  }
505    
506    
507    sub pack_seq {
508        my $seq = shift;
509        $seq =~ tr/A-Za-z//cd;
510        return $seq;
511    }
512    
513    
514  sub clean_ae_sequence {  sub clean_ae_sequence {
515      $_ = shift;      $_ = shift;
516      $_ = to7bit($_);      $_ = to7bit($_);
# Line 464  Line 559 
559  #  #
560  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
561    
562  our %genetic_code = (   # work as DNA  @aa_1_letter_order = qw( A C D E F G H I K L M N P Q R S T V W Y );  # Alpha by 1 letter
563    @aa_3_letter_order = qw( A R N D C Q E G H I L K M F P S T W Y V );  # PAM matrix order
564    @aa_n_codon_order  = qw( L R S A G P T V I C D E F H K N Q Y M W );
565    
566    %genetic_code = (
567    
568        # DNA version
569    
570      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",
571      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",
572      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",
# Line 482  Line 584 
584      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",
585      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",
586    
587        # RNA suppliment
588    
589        UUU => "F",  UCU => "S",  UAU => "Y",  UGU => "C",
590        UUC => "F",  UCC => "S",  UAC => "Y",  UGC => "C",
591        UUA => "L",  UCA => "S",  UAA => "*",  UGA => "*",
592        UUG => "L",  UCG => "S",  UAG => "*",  UGG => "W",
593        CUU => "L",  CCU => "P",  CAU => "H",  CGU => "R",
594        CUC => "L",
595        CUA => "L",
596        CUG => "L",
597        AUU => "I",  ACU => "T",  AAU => "N",  AGU => "S",
598        AUC => "I",
599        AUA => "I",
600        AUG => "M",
601        GUU => "V",  GCU => "A",  GAU => "D",  GGU => "G",
602        GUC => "V",
603        GUA => "V",
604        GUG => "V",
605    
606      #  The following ambiguous encodings are not necessary,  but      #  The following ambiguous encodings are not necessary,  but
607      #  speed up the processing of some common ambiguous triplets:      #  speed up the processing of some ambiguous triplets:
608    
609      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",
610      TTR => "L",  TCR => "S",  TAR => "*",      TTR => "L",  TCR => "S",  TAR => "*",
# Line 496  Line 617 
617                   ACN => "T",                   ACN => "T",
618      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",
619      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",
620      GTN => "V",  GCN => "A",               GGN => "G"      GTN => "V",  GCN => "A",               GGN => "G",
621    
622        UUY => "F",  UCY => "S",  UAY => "Y",  UGY => "C",
623        UUR => "L",  UCR => "S",  UAR => "*",
624                     UCN => "S",
625        CUY => "L",
626        CUR => "L",
627        CUN => "L",
628        AUY => "I",
629        GUY => "V",
630        GUR => "V",
631        GUN => "V"
632    );
633    
634    
635    #  Add lower case by construction:
636    
637    foreach ( keys %genetic_code ) {
638        $genetic_code{ lc $_ } = lc $genetic_code{ $_ }
639    }
640    
641    
642    #  Construct the genetic code with selanocysteine by difference:
643    
644    %genetic_code_with_U = map { $_ => $genetic_code{ $_ } } keys %genetic_code;
645    $genetic_code_with_U{ TGA } = "U";
646    $genetic_code_with_U{ tga } = "u";
647    $genetic_code_with_U{ UGA } = "U";
648    $genetic_code_with_U{ uga } = "u";
649    
650    
651    %amino_acid_codons_DNA = (
652             L  => [ qw( TTA TTG CTA CTG CTT CTC ) ],
653             R  => [ qw( AGA AGG CGA CGG CGT CGC ) ],
654             S  => [ qw( AGT AGC TCA TCG TCT TCC ) ],
655             A  => [ qw( GCA GCG GCT GCC ) ],
656             G  => [ qw( GGA GGG GGT GGC ) ],
657             P  => [ qw( CCA CCG CCT CCC ) ],
658             T  => [ qw( ACA ACG ACT ACC ) ],
659             V  => [ qw( GTA GTG GTT GTC ) ],
660             I  => [ qw( ATA ATT ATC ) ],
661             C  => [ qw( TGT TGC ) ],
662             D  => [ qw( GAT GAC ) ],
663             E  => [ qw( GAA GAG ) ],
664             F  => [ qw( TTT TTC ) ],
665             H  => [ qw( CAT CAC ) ],
666             K  => [ qw( AAA AAG ) ],
667             N  => [ qw( AAT AAC ) ],
668             Q  => [ qw( CAA CAG ) ],
669             Y  => [ qw( TAT TAC ) ],
670             M  => [ qw( ATG ) ],
671             U  => [ qw( TGA ) ],
672             W  => [ qw( TGG ) ],
673    
674             l  => [ qw( tta ttg cta ctg ctt ctc ) ],
675             r  => [ qw( aga agg cga cgg cgt cgc ) ],
676             s  => [ qw( agt agc tca tcg tct tcc ) ],
677             a  => [ qw( gca gcg gct gcc ) ],
678             g  => [ qw( gga ggg ggt ggc ) ],
679             p  => [ qw( cca ccg cct ccc ) ],
680             t  => [ qw( aca acg act acc ) ],
681             v  => [ qw( gta gtg gtt gtc ) ],
682             i  => [ qw( ata att atc ) ],
683             c  => [ qw( tgt tgc ) ],
684             d  => [ qw( gat gac ) ],
685             e  => [ qw( gaa gag ) ],
686             f  => [ qw( ttt ttc ) ],
687             h  => [ qw( cat cac ) ],
688             k  => [ qw( aaa aag ) ],
689             n  => [ qw( aat aac ) ],
690             q  => [ qw( caa cag ) ],
691             y  => [ qw( tat tac ) ],
692             m  => [ qw( atg ) ],
693             u  => [ qw( tga ) ],
694             w  => [ qw( tgg ) ],
695    
696            '*' => [ qw( TAA TAG TGA ) ]
697    );
698    
699    
700    
701    %amino_acid_codons_RNA = (
702             L  => [ qw( UUA UUG CUA CUG CUU CUC ) ],
703             R  => [ qw( AGA AGG CGA CGG CGU CGC ) ],
704             S  => [ qw( AGU AGC UCA UCG UCU UCC ) ],
705             A  => [ qw( GCA GCG GCU GCC ) ],
706             G  => [ qw( GGA GGG GGU GGC ) ],
707             P  => [ qw( CCA CCG CCU CCC ) ],
708             T  => [ qw( ACA ACG ACU ACC ) ],
709             V  => [ qw( GUA GUG GUU GUC ) ],
710             B  => [ qw( GAU GAC AAU AAC ) ],
711             Z  => [ qw( GAA GAG CAA CAG ) ],
712             I  => [ qw( AUA AUU AUC ) ],
713             C  => [ qw( UGU UGC ) ],
714             D  => [ qw( GAU GAC ) ],
715             E  => [ qw( GAA GAG ) ],
716             F  => [ qw( UUU UUC ) ],
717             H  => [ qw( CAU CAC ) ],
718             K  => [ qw( AAA AAG ) ],
719             N  => [ qw( AAU AAC ) ],
720             Q  => [ qw( CAA CAG ) ],
721             Y  => [ qw( UAU UAC ) ],
722             M  => [ qw( AUG ) ],
723             U  => [ qw( UGA ) ],
724             W  => [ qw( UGG ) ],
725    
726             l  => [ qw( uua uug cua cug cuu cuc ) ],
727             r  => [ qw( aga agg cga cgg cgu cgc ) ],
728             s  => [ qw( agu agc uca ucg ucu ucc ) ],
729             a  => [ qw( gca gcg gcu gcc ) ],
730             g  => [ qw( gga ggg ggu ggc ) ],
731             p  => [ qw( cca ccg ccu ccc ) ],
732             t  => [ qw( aca acg acu acc ) ],
733             v  => [ qw( gua gug guu guc ) ],
734             b  => [ qw( gau gac aau aac ) ],
735             z  => [ qw( gaa gag caa cag ) ],
736             i  => [ qw( aua auu auc ) ],
737             c  => [ qw( ugu ugc ) ],
738             d  => [ qw( gau gac ) ],
739             e  => [ qw( gaa gag ) ],
740             f  => [ qw( uuu uuc ) ],
741             h  => [ qw( cau cac ) ],
742             k  => [ qw( aaa aag ) ],
743             n  => [ qw( aau aac ) ],
744             q  => [ qw( caa cag ) ],
745             y  => [ qw( uau uac ) ],
746             m  => [ qw( aug ) ],
747             u  => [ qw( uga ) ],
748             w  => [ qw( ugg ) ],
749    
750            '*' => [ qw( UAA UAG UGA ) ]
751  );  );
752    
753    
754  my %DNA_letter_can_be = (  %n_codon_for_aa = map {
755        $_ => scalar @{ $amino_acid_codons_DNA{ $_ } }
756        } keys %amino_acid_codons_DNA;
757    
758    
759    %reverse_genetic_code_DNA = (
760             A  => "GCN",  a  => "gcn",
761             C  => "TGY",  c  => "tgy",
762             D  => "GAY",  d  => "gay",
763             E  => "GAR",  e  => "gar",
764             F  => "TTY",  f  => "tty",
765             G  => "GGN",  g  => "ggn",
766             H  => "CAY",  h  => "cay",
767             I  => "ATH",  i  => "ath",
768             K  => "AAR",  k  => "aar",
769             L  => "YTN",  l  => "ytn",
770             M  => "ATG",  m  => "atg",
771             N  => "AAY",  n  => "aay",
772             P  => "CCN",  p  => "ccn",
773             Q  => "CAR",  q  => "car",
774             R  => "MGN",  r  => "mgn",
775             S  => "WSN",  s  => "wsn",
776             T  => "ACN",  t  => "acn",
777             U  => "TGA",  u  => "tga",
778             V  => "GTN",  v  => "gtn",
779             W  => "TGG",  w  => "tgg",
780             X  => "NNN",  x  => "nnn",
781             Y  => "TAY",  y  => "tay",
782            '*' => "TRR"
783    );
784    
785    %reverse_genetic_code_RNA = (
786             A  => "GCN",  a  => "gcn",
787             C  => "UGY",  c  => "ugy",
788             D  => "GAY",  d  => "gay",
789             E  => "GAR",  e  => "gar",
790             F  => "UUY",  f  => "uuy",
791             G  => "GGN",  g  => "ggn",
792             H  => "CAY",  h  => "cay",
793             I  => "AUH",  i  => "auh",
794             K  => "AAR",  k  => "aar",
795             L  => "YUN",  l  => "yun",
796             M  => "AUG",  m  => "aug",
797             N  => "AAY",  n  => "aay",
798             P  => "CCN",  p  => "ccn",
799             Q  => "CAR",  q  => "car",
800             R  => "MGN",  r  => "mgn",
801             S  => "WSN",  s  => "wsn",
802             T  => "ACN",  t  => "acn",
803             U  => "UGA",  u  => "uga",
804             V  => "GUN",  v  => "gun",
805             W  => "UGG",  w  => "ugg",
806             X  => "NNN",  x  => "nnn",
807             Y  => "UAY",  y  => "uay",
808            '*' => "URR"
809    );
810    
811    
812    %DNA_letter_can_be = (
813      A => ["A"],                 a => ["a"],      A => ["A"],                 a => ["a"],
814      B => ["C", "G", "T"],       b => ["c", "g", "t"],      B => ["C", "G", "T"],       b => ["c", "g", "t"],
815      C => ["C"],                 c => ["c"],      C => ["C"],                 c => ["c"],
# Line 520  Line 829 
829  );  );
830    
831    
832  my %RNA_letter_can_be = (  %RNA_letter_can_be = (
833      A => ["A"],                 a => ["a"],      A => ["A"],                 a => ["a"],
834      B => ["C", "G", "U"],       b => ["c", "g", "u"],      B => ["C", "G", "U"],       b => ["c", "g", "u"],
835      C => ["C"],                 c => ["c"],      C => ["C"],                 c => ["c"],
# Line 540  Line 849 
849  );  );
850    
851    
852  my %one_letter_to_three_letter_aa = (  %one_letter_to_three_letter_aa = {
853       A  => "Ala",           A  => "Ala", a  => "Ala",
854       B  => "Asx",           B  => "Asx", b  => "Asx",
855       C  => "Cys",           C  => "Cys", c  => "Cys",
856       D  => "Asp",           D  => "Asp", d  => "Asp",
857       E  => "Glu",           E  => "Glu", e  => "Glu",
858       F  => "Phe",           F  => "Phe", f  => "Phe",
859       G  => "Gly",           G  => "Gly", g  => "Gly",
860       H  => "His",           H  => "His", h  => "His",
861       I  => "Ile",           I  => "Ile", i  => "Ile",
862       K  => "Lys",           K  => "Lys", k  => "Lys",
863       L  => "Leu",           L  => "Leu", l  => "Leu",
864       M  => "Met",           M  => "Met", m  => "Met",
865       N  => "Asn",           N  => "Asn", n  => "Asn",
866       P  => "Pro",           P  => "Pro", p  => "Pro",
867       Q  => "Gln",           Q  => "Gln", q  => "Gln",
868       R  => "Arg",           R  => "Arg", r  => "Arg",
869       S  => "Ser",           S  => "Ser", s  => "Ser",
870       T  => "Thr",           T  => "Thr", t  => "Thr",
871       U  => "Sec",           U  => "Sec", u  => "Sec",
872       V  => "Val",           V  => "Val", v  => "Val",
873       W  => "Trp",           W  => "Trp", w  => "Trp",
874       X  => "Xxx",           X  => "Xxx", x  => "Xxx",
875       Y  => "Tyr",           Y  => "Tyr", y  => "Tyr",
876       Z  => "Glx",           Z  => "Glx", z  => "Glx",
877      "*" => "***"          '*' => "***"
878            };
879    
880    
881    %three_letter_to_one_letter_aa = (
882         ALA  => "A",   Ala  => "A",   ala  => "a",
883         ARG  => "R",   Arg  => "R",   arg  => "r",
884         ASN  => "N",   Asn  => "N",   asn  => "n",
885         ASP  => "D",   Asp  => "D",   asp  => "d",
886         ASX  => "B",   Asx  => "B",   asx  => "b",
887         CYS  => "C",   Cys  => "C",   cys  => "c",
888         GLN  => "Q",   Gln  => "Q",   gln  => "q",
889         GLU  => "E",   Glu  => "E",   glu  => "e",
890         GLX  => "Z",   Glx  => "Z",   glx  => "z",
891         GLY  => "G",   Gly  => "G",   gly  => "g",
892         HIS  => "H",   His  => "H",   his  => "h",
893         ILE  => "I",   Ile  => "I",   ile  => "i",
894         LEU  => "L",   Leu  => "L",   leu  => "l",
895         LYS  => "K",   Lys  => "K",   lys  => "k",
896         MET  => "M",   Met  => "M",   met  => "m",
897         PHE  => "F",   Phe  => "F",   phe  => "f",
898         PRO  => "P",   Pro  => "P",   pro  => "p",
899         SEC  => "U",   Sec  => "U",   sec  => "u",
900         SER  => "S",   Ser  => "S",   ser  => "s",
901         THR  => "T",   Thr  => "T",   thr  => "t",
902         TRP  => "W",   Trp  => "W",   trp  => "w",
903         TYR  => "Y",   Tyr  => "Y",   tyr  => "y",
904         VAL  => "V",   Val  => "V",   val  => "v",
905         XAA  => "X",   Xaa  => "X",   xaa  => "x",
906         XXX  => "X",   Xxx  => "X",   xxx  => "x",
907        '***' => "*"
908  );  );
909    
910    
# Line 586  Line 925 
925                              #  (note: undef is false)                              #  (note: undef is false)
926    
927      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!
928      my $pep  = ($met && ($imax >= 0)) ? "M" : "";      my $pep = ( ($met && ($imax >= 0)) ? "M" : "" );
     my $aa;  
929      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {
930          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );
931      }      }
# Line 819  Line 1157 
1157    
1158    
1159  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1160    #  Convert a list of intervals to read [ id, left_end, right_end ].
1161    #
1162    #     @intervals = standardize_intervals( @interval_refs )
1163    #-----------------------------------------------------------------------------
1164    sub standardize_intervals {
1165        map { ( $_->[1] < $_->[2] ) ? $_ : [ $_->[0], $_->[2], $_->[1] ] } @_;
1166    }
1167    
1168    
1169    #-----------------------------------------------------------------------------
1170  #  Take the union of a list of intervals  #  Take the union of a list of intervals
1171  #  #
1172  #     @joined = join_intervals( @interval_refs )  #     @joined = join_intervals( @interval_refs )
# Line 858  Line 1206 
1206      return @joined;      return @joined;
1207  }  }
1208    
1209    
1210    #-----------------------------------------------------------------------------
1211    #  Split location strings to oriented intervals.
1212    #
1213    #     @intervals = locations_2_intervals( @locations )
1214    #     $interval  = locations_2_intervals( $location  )
1215    #-----------------------------------------------------------------------------
1216    sub locations_2_intervals {
1217        my @intervals = map { /^(\S+)_(\d+)_(\d+)(\s.*)?$/
1218                           || /^(\S+)_(\d+)-(\d+)(\s.*)?$/
1219                           || /^(\S+)=(\d+)=(\d+)(\s.*)?$/
1220                           || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/
1221                            ? [ $1, $2+0, $3+0 ]
1222                            : ()
1223                            } @_;
1224    
1225        return wantarray ? @intervals : $intervals[0];
1226    }
1227    
1228    
1229    #-----------------------------------------------------------------------------
1230    #  Read a list of oriented intervals from a file.
1231    #  Allow id_start_end, or id \s start \s end formats
1232    #
1233    #     @intervals = read_oriented_intervals( *FILEHANDLE )
1234    #-----------------------------------------------------------------------------
1235    sub read_oriented_intervals {
1236        my $fh = shift;
1237        my @intervals = ();
1238    
1239        while (<$fh>) {
1240            chomp;
1241               /^(\S+)_(\d+)_(\d+)(\s.*)?$/        #  id_start_end       WIT2
1242            || /^(\S+)_(\d+)-(\d+)(\s.*)?$/        #  id_start-end       ???
1243            || /^(\S+)=(\d+)=(\d+)(\s.*)?$/        #  id=start=end       Badger
1244            || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/    #  id \s start \s end
1245            || next;
1246    
1247            #  Matched a pattern.  Store reference to (id, start, end):
1248            push @intervals, [ $1, $2+0, $3+0 ];
1249        }
1250        return @intervals;
1251    }
1252    
1253    
1254    #-----------------------------------------------------------------------------
1255    #  Reverse the orientation of a list of intervals
1256    #
1257    #     @reversed = reverse_intervals( @interval_refs )
1258    #-----------------------------------------------------------------------------
1259    sub reverse_intervals {
1260        map { [ $_->[0], $_->[2], $_->[1] ] } @_;
1261    }
1262    
1263    
1264  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.3

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3