[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

View of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Wed Nov 30 18:33:41 2005 UTC (14 years ago) by golsen
Branch: MAIN
Changes since 1.1: +476 -90 lines
This package is not currently used by any scripts, but I anticipate doing
so.  I had not been keeping the FIG version up-to-date relative to my
own.

package gjoseqlib;

#  A sequence entry is ( $id, $def, $seq )
#  A list of entries is a list of references
#
#  @seq_entry = read_next_fasta_seq( *FILEHANDLE )
#  @seq_entries = read_fasta_seqs( *FILEHANDLE )
#  $seq_ind = index_seq_list( @seq_entries ); # hash from ids to entry refs
#
#  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
#  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
#  $seq       = seq_data_by_id(  \%seq_index, $seq_id );
#
#  ( $id, $def ) = parse_fasta_title( $title )
#  ( $id, $def ) = split_fasta_title( $title )
#
#  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list );
#  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq) ;
#  print_seq_as_fasta( *FILEHANDLE, @seq_entry );
#  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )
#
#  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
#  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
#  @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );
#  @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );
#  $DNAseq = complement_DNA_seq( $NA_seq );
#  $RNAseq = complement_RNA_seq( $NA_seq );
#  $DNAseq = to_DNA_seq( $NA_seq );
#  $RNAseq = to_RNA_seq( $NA_seq );
#  $seq    = pack_seq( $sequence )
#  $seq    = clean_ae_sequence( $seq )
#
#  $seq = translate_seq( $seq [, $met_start] )
#  $aa  = translate_codon( $triplet );
#  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );
#
#  User-supplied genetic code must be upper case index and match the
#  DNA versus RNA type of sequence
#
#  Locations (= oriented intervals) are ( id, start, end )
#  Intervals are ( id, left, right )
#
#  @intervals = read_intervals( *FILEHANDLE )
#  @intervals = read_oriented_intervals( *FILEHANDLE )
#  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)
#  @joined    = join_intervals( @interval_refs )
#  @intervals = locations_2_intervals( @locations )
#  $interval  = locations_2_intervals( $location  )
#  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)


use strict;

use gjolib qw( wrap_text );

#  Exported global variables:

our @aa_1_letter_order;  # Alpha by 1 letter
our @aa_3_letter_order;  # PAM matrix order
our @aa_n_codon_order;  
our %genetic_code;
our %genetic_code_with_U;
our %amino_acid_codons_DNA;
our %amino_acid_codons_RNA;
our %n_codon_for_aa;
our %reverse_genetic_code_DNA;
our %reverse_genetic_code_RNA;
our %DNA_letter_can_be;
our %RNA_letter_can_be;
our %one_letter_to_three_letter_aa;
our %three_letter_to_one_letter_aa;

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT = qw(
        read_fasta_seqs
        read_next_fasta_seq
        parse_fasta_title
        split_fasta_title
        print_seq_list_as_fasta
        print_seq_as_fasta
        print_gb_locus

        index_seq_list
        seq_entry_by_id
        seq_desc_by_id
        seq_data_by_id

        subseq_DNA_entry
        subseq_RNA_entry
        complement_DNA_entry
        complement_RNA_entry
        complement_DNA_seq
        complement_RNA_seq
        to_DNA_seq
        to_RNA_seq
        pack_seq
        clean_ae_sequence

        translate_seq
        translate_codon
        translate_seq_with_user_code

        read_intervals
        standardize_intervals
        join_intervals
        locations_2_intervals
        read_oriented_intervals
        reverse_intervals
        );

our @EXPORT_OK = qw(
        @aa_1_letter_order
        @aa_3_letter_order
        @aa_n_codon_order
        %genetic_code
        %genetic_code_with_U
        %amino_acid_codons_DNA
        %amino_acid_codons_RNA
        %n_codon_for_aa
        %reverse_genetic_code_DNA
        %reverse_genetic_code_RNA
        %DNA_letter_can_be
        %RNA_letter_can_be
        %one_letter_to_three_letter_aa
        %three_letter_to_one_letter_aa
        );


#-----------------------------------------------------------------------------
#  Read fasta sequences from a file.
#  Save the contents in a list of refs to arrays:  (id, description, seq)
#
#     @seqs = read_fasta_seqs(*FILEHANDLE)
#-----------------------------------------------------------------------------
sub read_fasta_seqs {
    my $fh = shift;
    wantarray || die "read_fasta_seqs requires list context\n";

    my @seqs = ();
    my ($id, $desc, $seq) = ("", "", "");

    while (<$fh>) {
        chomp;
        if (/^>\s*(\S+)(\s+(.*))?$/) {        #  new id
            if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }
            ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");
        }
        else {
            tr/     0-9//d;
            $seq .= $_ ;
        }
    }

    if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }
    return @seqs;
}


#-----------------------------------------------------------------------------
#  Read one fasta sequence at a time from a file.
#  Return the contents as an array:  (id, description, seq)
#
#     @seq_entry = read_next_fasta_seq(*FILEHANDLE)
#-----------------------------------------------------------------------------
#  Reading always overshoots, so save next id and description

{   #  Use bare block to scope the header hash

    my %next_header;

    sub read_next_fasta_seq {
        wantarray || die "read_next_fasta_seq requires list context\n";

        my $fh = shift;
        my ( $id, $desc );

        if ( defined( $next_header{$fh} ) ) {
            ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
        }
        else {
            $next_header{$fh} = "";
            ( $id, $desc ) = ( undef, "" );
        }
        my $seq = "";

        while ( <$fh> ) {
            chomp;
            if ( /^>/ ) {        #  new id
                $next_header{$fh} = $_;
                if ( defined($id) && $seq ) { return ($id, $desc, $seq) }
                ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
                $seq = "";
            }
            else {
                tr/     0-9//d;
                $seq .= $_ ;
            }
        }

        #  Done with file, delete "next header"

        delete $next_header{$fh};
        return (defined($id) && $seq) ? ($id, $desc, $seq) : () ;
    }
}


#-----------------------------------------------------------------------------
#  Parse a fasta file header to id and definition parts
#
#     ($id, $def) = parse_fasta_title( $title )
#     ($id, $def) = split_fasta_title( $title )
#-----------------------------------------------------------------------------
sub parse_fasta_title {
    my $title = shift;
    chomp;
    if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {
        return ($1, $3 ? $3 : "");
    }
    elsif ($title =~ /^>/) {
        return ("", "");
    }
    else {
        return (undef, "");
    }
}

sub split_fasta_title {
    parse_fasta_title ( shift );
}


#-----------------------------------------------------------------------------
#  Print list of sequence entries in fasta format
#
#     print_seq_list_as_fasta(*FILEHANDLE, @seq_entry_list);
#-----------------------------------------------------------------------------
sub print_seq_list_as_fasta {
    my $fh = shift;
    my @seq_list = @_;

    foreach my $seq_ptr (@seq_list) {
        print_seq_as_fasta($fh, @$seq_ptr);
    }
}


#-----------------------------------------------------------------------------
#  Print one sequence in fasta format to an open file
#
#     print_seq_as_fasta(*FILEHANDLE, $id, $desc, $seq);
#     print_seq_as_fasta(*FILEHANDLE, @seq_entry);
#-----------------------------------------------------------------------------
sub print_seq_as_fasta {
    my $fh = shift;
    my ($id, $desc, $seq) = @_;

    printf $fh ($desc) ? ">$id $desc\n" : ">$id\n";
    my $len = length($seq);
    for (my $i = 0; $i < $len; $i += 60) {
        print $fh substr($seq, $i, 60) . "\n";
    }
}


#-----------------------------------------------------------------------------
#  Print one sequence in GenBank flat file format:
#
#     print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )
#-----------------------------------------------------------------------------
sub print_gb_locus {
    my ($fh, $loc, $def, $acc, $seq) = @_;
    my ($len, $i, $imax);
    my $istep = 10;

    $len = length($seq);
    printf $fh  "LOCUS       %-10s%7d bp\n", substr($loc,0,10), $len;
    print  $fh  "DEFINITION  " . substr(wrap_text($def,80,12), 12) . "\n";
    if ($acc) { print  $fh  "ACCESSION   $acc\n" }
    print  $fh "ORIGIN\n";

    for ($i = 1; $i <= $len; ) {
        printf $fh "%9d", $i;
        $imax = $i + 59; if ($imax > $len) { $imax = $len }
        for ( ; $i <= $imax; $i += $istep) {
            print $fh " " . substr($seq, $i-1, $istep);
        }
        print $fh "\n";
        }
    print $fh "//\n";
}


#-----------------------------------------------------------------------------
#  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
#
#     my \%seq_ind  = index_seq_list(@seq_list);
#
#  Usage example:
#
#  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries
#  my \%seq_ind   = index_seq_list(@seq_list); # hash from names to pointers
#  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry
#
#-----------------------------------------------------------------------------
sub index_seq_list {
    my %seq_index = map { @{$_}[0] => $_ } @_;
    return \%seq_index;
}


#-----------------------------------------------------------------------------
#  Three routines to access all or part of sequence entry by id:
#
#     my @seq_entry  = seq_entry_by_id( \%seq_index, $seq_id );
#     my $seq_desc   = seq_desc_by_id(  \%seq_index, $seq_id );
#     my $seq        = seq_data_by_id(  \%seq_index, $seq_id );
#
#-----------------------------------------------------------------------------
sub seq_entry_by_id {
    (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";
    (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";
    wantarray || die "entry_by_id requires list context\n";
    return @{ $ind_ref->{$id} };
}


sub seq_desc_by_id {
    (my $ind_ref = shift)  || die "No index supplied to seq_desc_by_id\n";
    (my $id      = shift)  || die "No id supplied to seq_desc_by_id\n";
    return ${ $ind_ref->{$id} }[1];
}


sub seq_data_by_id {
    (my $ind_ref = shift)  || die "No index supplied to seq_data_by_id\n";
    (my $id      = shift)  || die "No id supplied to seq_data_by_id\n";
    return ${ $ind_ref->{$id} }[2];
}


#-----------------------------------------------------------------------------
#  Some simple sequence manipulations:
#
#     @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
#     @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
#     @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );
#     @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );
#     $DNAseq = complement_DNA_seq( $NA_seq );
#     $RNAseq = complement_RNA_seq( $NA_seq );
#     $DNAseq = to_DNA_seq( $NA_seq );
#     $RNAseq = to_RNA_seq( $NA_seq );
#
#-----------------------------------------------------------------------------

sub subseq_DNA_entry {
    my ($id, $desc, @rest) = @_;
    wantarray || die "subseq_DNA_entry requires array context\n";

    my $seq;
    ($id, $seq) = subseq_nt(1, $id, @rest);  # 1 is for DNA, not RNA
    return ($id, $desc, $seq);
}


sub subseq_RNA_entry {
    my ($id, $desc, @rest) = @_;
    wantarray || die "subseq_RNA_entry requires array context\n";

    my $seq;
    ($id, $seq) = subseq_nt(0, $id, @rest);  # 0 is for not DNA, i.e., RNA
    return ($id, $desc, $seq);
}


sub subseq_nt {
    my ($DNA, $id, $seq, $from, $to, $fix_id) = @_;
    $fix_id ||= 0;     #  fix undef value

    my $len   = length($seq);
    if (         ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len }
    if (! $to || ( $to   eq '$' ) || ( $to   eq "" ) ) { $to   = $len }

    my $left  = ( $from < $to ) ? $from : $to;
    my $right = ( $from < $to ) ? $to   : $from;
    if ( ( $right < 1 ) || ( $left > $len ) ) { return ($id, "") }
    if ( $right > $len ) { $right = $len }
    if ( $left  < 1    ) { $left  = 1 }

    $seq = substr($seq, $left-1, $right-$left+1);
    if ( $from > $to ) {
        $seq = reverse $seq;
        if ( $DNA ) {
            $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
                      [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
        }
        else {
            $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
                      [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
        }
    }

    if ( $fix_id ) {
        if ( ( $id =~ s/_(\d+)_(\d+)$// )
          && ( abs($2-$1)+1 == $len ) ) {
            if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }
            else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }
        }
        $id .= "_" . $from . "_" . $to;
    }

    return ($id, $seq);
}


sub complement_DNA_entry {
    my ($id, $desc, $seq, $fix_id) = @_;
    $fix_id ||= 0;     #  fix undef values

    wantarray || die "complement_DNA_entry requires array context\n";
    $seq = reverse $seq;
    $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
              [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
    if ($fix_id) {
        if ($id =~ s/_(\d+)_(\d+)$//) {
            $id .= "_" . $2 . "_" . $1;
        }
        else {
            $id .= "_" . length($seq) . "_1";
        }
    }

    return ($id, $desc, $seq);
}


sub complement_RNA_entry {
    my ($id, $desc, $seq, $fix_id) = @_;
    $fix_id ||= 0;     #  fix undef values

    wantarray || die "complement_DNA_entry requires array context\n";
    $seq = reverse $seq;
    $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
              [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
    if ($fix_id) {
        if ($id =~ s/_(\d+)_(\d+)$//) {
            $id .= "_" . $2 . "_" . $1;
        }
        else {
            $id .= "_" . length($seq) . "_1";
        }
    }

    return ($id, $desc, $seq);
}


sub complement_DNA_seq {
    my $seq = reverse shift;
    $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
              [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
    return $seq;
}


sub complement_RNA_seq {
    my $seq = reverse shift;
    $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
              [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
    return $seq;
}


sub to_DNA_seq {
    my $seq = shift;
    $seq =~ tr/Uu/Tt/;
    return $seq;
}


sub to_RNA_seq {
    my $seq = shift;
    $seq =~ tr/Tt/Uu/;
    return $seq;
}


sub pack_seq {
    my $seq = shift;
    $seq =~ tr/A-Za-z//cd;
    return $seq;
}


sub clean_ae_sequence {
    $_ = shift;
    $_ = to7bit($_);
    s/[+]/1/g;
    s/[^0-9A-IK-NP-Za-ik-np-z~.-]/-/g;
    return $_;
}


sub to7bit {
    $_ = shift;
    my ($o, $c);
    while (/\\([0-3][0-7][0-7])/) {
        $o = oct($1) % 128;
        $c = sprintf("%c", $o);
        s/\\$1/$c/g;
    }
    return $_;
}


sub to8bit {
    $_ = shift;
    my ($o, $c);
    while (/\\([0-3][0-7][0-7])/) {
        $o = oct($1);
        $c = sprintf("%c", $o);
        s/\\$1/$c/g;
    }
    return $_;
}



#-----------------------------------------------------------------------------
#  Translate nucleotides to one letter protein:
#
#  $seq = translate_seq( $seq [, $met_start] )
#  $aa  = translate_codon( $triplet );
#  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );
#
#  User-supplied genetic code must be upper case index and match the
#  DNA versus RNA type of sequence
#
#  $seq = translate_seq_with_user_code( $seq, $gen_code_hash [, $met_start] )
#
#-----------------------------------------------------------------------------

@aa_1_letter_order = qw( A C D E F G H I K L M N P Q R S T V W Y );  # Alpha by 1 letter
@aa_3_letter_order = qw( A R N D C Q E G H I L K M F P S T W Y V );  # PAM matrix order
@aa_n_codon_order  = qw( L R S A G P T V I C D E F H K N Q Y M W );  

%genetic_code = (

    # DNA version

    TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",
    TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",
    TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",
    TTG => "L",  TCG => "S",  TAG => "*",  TGG => "W",
    CTT => "L",  CCT => "P",  CAT => "H",  CGT => "R",
    CTC => "L",  CCC => "P",  CAC => "H",  CGC => "R",
    CTA => "L",  CCA => "P",  CAA => "Q",  CGA => "R",
    CTG => "L",  CCG => "P",  CAG => "Q",  CGG => "R",
    ATT => "I",  ACT => "T",  AAT => "N",  AGT => "S",
    ATC => "I",  ACC => "T",  AAC => "N",  AGC => "S",
    ATA => "I",  ACA => "T",  AAA => "K",  AGA => "R",
    ATG => "M",  ACG => "T",  AAG => "K",  AGG => "R",
    GTT => "V",  GCT => "A",  GAT => "D",  GGT => "G",
    GTC => "V",  GCC => "A",  GAC => "D",  GGC => "G",
    GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",
    GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",

    # RNA suppliment

    UUU => "F",  UCU => "S",  UAU => "Y",  UGU => "C",
    UUC => "F",  UCC => "S",  UAC => "Y",  UGC => "C",
    UUA => "L",  UCA => "S",  UAA => "*",  UGA => "*",
    UUG => "L",  UCG => "S",  UAG => "*",  UGG => "W",
    CUU => "L",  CCU => "P",  CAU => "H",  CGU => "R",
    CUC => "L",
    CUA => "L",
    CUG => "L",
    AUU => "I",  ACU => "T",  AAU => "N",  AGU => "S",
    AUC => "I",
    AUA => "I",
    AUG => "M",
    GUU => "V",  GCU => "A",  GAU => "D",  GGU => "G",
    GUC => "V",
    GUA => "V",
    GUG => "V",

    #  The following ambiguous encodings are not necessary,  but
    #  speed up the processing of some ambiguous triplets:

    TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",
    TTR => "L",  TCR => "S",  TAR => "*",
                 TCN => "S",
    CTY => "L",  CCY => "P",  CAY => "H",  CGY => "R",
    CTR => "L",  CCR => "P",  CAR => "Q",  CGR => "R",
    CTN => "L",  CCN => "P",               CGN => "R",
    ATY => "I",  ACY => "T",  AAY => "N",  AGY => "S",
                 ACR => "T",  AAR => "K",  AGR => "R",
                 ACN => "T",
    GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",
    GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",
    GTN => "V",  GCN => "A",               GGN => "G",

    UUY => "F",  UCY => "S",  UAY => "Y",  UGY => "C",
    UUR => "L",  UCR => "S",  UAR => "*",
                 UCN => "S",
    CUY => "L",
    CUR => "L",
    CUN => "L",             
    AUY => "I",
    GUY => "V",
    GUR => "V",
    GUN => "V"
);


#  Add lower case by construction:

foreach ( keys %genetic_code ) {
    $genetic_code{ lc $_ } = lc $genetic_code{ $_ }
}


#  Construct the genetic code with selanocysteine by difference:

%genetic_code_with_U = map { $_ => $genetic_code{ $_ } } keys %genetic_code;
$genetic_code_with_U{ TGA } = "U";
$genetic_code_with_U{ tga } = "u";
$genetic_code_with_U{ UGA } = "U";
$genetic_code_with_U{ uga } = "u";


%amino_acid_codons_DNA = (
	 L  => [ qw( TTA TTG CTA CTG CTT CTC ) ],
	 R  => [ qw( AGA AGG CGA CGG CGT CGC ) ],
	 S  => [ qw( AGT AGC TCA TCG TCT TCC ) ],
	 A  => [ qw( GCA GCG GCT GCC ) ],
	 G  => [ qw( GGA GGG GGT GGC ) ],
	 P  => [ qw( CCA CCG CCT CCC ) ],
	 T  => [ qw( ACA ACG ACT ACC ) ],
	 V  => [ qw( GTA GTG GTT GTC ) ],
	 I  => [ qw( ATA ATT ATC ) ],
	 C  => [ qw( TGT TGC ) ],
	 D  => [ qw( GAT GAC ) ],
	 E  => [ qw( GAA GAG ) ],
	 F  => [ qw( TTT TTC ) ],
	 H  => [ qw( CAT CAC ) ],
	 K  => [ qw( AAA AAG ) ],
	 N  => [ qw( AAT AAC ) ],
	 Q  => [ qw( CAA CAG ) ],
	 Y  => [ qw( TAT TAC ) ],
	 M  => [ qw( ATG ) ],
	 U  => [ qw( TGA ) ],
	 W  => [ qw( TGG ) ],

	 l  => [ qw( tta ttg cta ctg ctt ctc ) ],
	 r  => [ qw( aga agg cga cgg cgt cgc ) ],
	 s  => [ qw( agt agc tca tcg tct tcc ) ],
	 a  => [ qw( gca gcg gct gcc ) ],
	 g  => [ qw( gga ggg ggt ggc ) ],
	 p  => [ qw( cca ccg cct ccc ) ],
	 t  => [ qw( aca acg act acc ) ],
	 v  => [ qw( gta gtg gtt gtc ) ],
	 i  => [ qw( ata att atc ) ],
	 c  => [ qw( tgt tgc ) ],
	 d  => [ qw( gat gac ) ],
	 e  => [ qw( gaa gag ) ],
	 f  => [ qw( ttt ttc ) ],
	 h  => [ qw( cat cac ) ],
	 k  => [ qw( aaa aag ) ],
	 n  => [ qw( aat aac ) ],
	 q  => [ qw( caa cag ) ],
	 y  => [ qw( tat tac ) ],
	 m  => [ qw( atg ) ],
	 u  => [ qw( tga ) ],
	 w  => [ qw( tgg ) ],

	'*' => [ qw( TAA TAG TGA ) ]
);



%amino_acid_codons_RNA = (
	 L  => [ qw( UUA UUG CUA CUG CUU CUC ) ],
	 R  => [ qw( AGA AGG CGA CGG CGU CGC ) ],
	 S  => [ qw( AGU AGC UCA UCG UCU UCC ) ],
	 A  => [ qw( GCA GCG GCU GCC ) ],
	 G  => [ qw( GGA GGG GGU GGC ) ],
	 P  => [ qw( CCA CCG CCU CCC ) ],
	 T  => [ qw( ACA ACG ACU ACC ) ],
	 V  => [ qw( GUA GUG GUU GUC ) ],
	 B  => [ qw( GAU GAC AAU AAC ) ],
	 Z  => [ qw( GAA GAG CAA CAG ) ],
	 I  => [ qw( AUA AUU AUC ) ],
	 C  => [ qw( UGU UGC ) ],
	 D  => [ qw( GAU GAC ) ],
	 E  => [ qw( GAA GAG ) ],
	 F  => [ qw( UUU UUC ) ],
	 H  => [ qw( CAU CAC ) ],
	 K  => [ qw( AAA AAG ) ],
	 N  => [ qw( AAU AAC ) ],
	 Q  => [ qw( CAA CAG ) ],
	 Y  => [ qw( UAU UAC ) ],
	 M  => [ qw( AUG ) ],
	 U  => [ qw( UGA ) ],
	 W  => [ qw( UGG ) ],

	 l  => [ qw( uua uug cua cug cuu cuc ) ],
	 r  => [ qw( aga agg cga cgg cgu cgc ) ],
	 s  => [ qw( agu agc uca ucg ucu ucc ) ],
	 a  => [ qw( gca gcg gcu gcc ) ],
	 g  => [ qw( gga ggg ggu ggc ) ],
	 p  => [ qw( cca ccg ccu ccc ) ],
	 t  => [ qw( aca acg acu acc ) ],
	 v  => [ qw( gua gug guu guc ) ],
	 b  => [ qw( gau gac aau aac ) ],
	 z  => [ qw( gaa gag caa cag ) ],
	 i  => [ qw( aua auu auc ) ],
	 c  => [ qw( ugu ugc ) ],
	 d  => [ qw( gau gac ) ],
	 e  => [ qw( gaa gag ) ],
	 f  => [ qw( uuu uuc ) ],
	 h  => [ qw( cau cac ) ],
	 k  => [ qw( aaa aag ) ],
	 n  => [ qw( aau aac ) ],
	 q  => [ qw( caa cag ) ],
	 y  => [ qw( uau uac ) ],
	 m  => [ qw( aug ) ],
	 u  => [ qw( uga ) ],
	 w  => [ qw( ugg ) ],

	'*' => [ qw( UAA UAG UGA ) ]
);


%n_codon_for_aa = map {
    $_ => scalar @{ $amino_acid_codons_DNA{ $_ } }
    } keys %amino_acid_codons_DNA;


%reverse_genetic_code_DNA = (
	 A  => "GCN",  a  => "gcn",
	 C  => "TGY",  c  => "tgy",
	 D  => "GAY",  d  => "gay",
	 E  => "GAR",  e  => "gar",
	 F  => "TTY",  f  => "tty",
	 G  => "GGN",  g  => "ggn",
	 H  => "CAY",  h  => "cay",
	 I  => "ATH",  i  => "ath",
	 K  => "AAR",  k  => "aar",
	 L  => "YTN",  l  => "ytn",
	 M  => "ATG",  m  => "atg",
	 N  => "AAY",  n  => "aay",
	 P  => "CCN",  p  => "ccn",
	 Q  => "CAR",  q  => "car",
	 R  => "MGN",  r  => "mgn",
	 S  => "WSN",  s  => "wsn",
	 T  => "ACN",  t  => "acn",
	 U  => "TGA",  u  => "tga",
	 V  => "GTN",  v  => "gtn",
	 W  => "TGG",  w  => "tgg",
	 X  => "NNN",  x  => "nnn",
	 Y  => "TAY",  y  => "tay",
	'*' => "TRR"
);

%reverse_genetic_code_RNA = (
	 A  => "GCN",  a  => "gcn",
	 C  => "UGY",  c  => "ugy",
	 D  => "GAY",  d  => "gay",
	 E  => "GAR",  e  => "gar",
	 F  => "UUY",  f  => "uuy",
	 G  => "GGN",  g  => "ggn",
	 H  => "CAY",  h  => "cay",
	 I  => "AUH",  i  => "auh",
	 K  => "AAR",  k  => "aar",
	 L  => "YUN",  l  => "yun",
	 M  => "AUG",  m  => "aug",
	 N  => "AAY",  n  => "aay",
	 P  => "CCN",  p  => "ccn",
	 Q  => "CAR",  q  => "car",
	 R  => "MGN",  r  => "mgn",
	 S  => "WSN",  s  => "wsn",
	 T  => "ACN",  t  => "acn",
	 U  => "UGA",  u  => "uga",
	 V  => "GUN",  v  => "gun",
	 W  => "UGG",  w  => "ugg",
	 X  => "NNN",  x  => "nnn",
	 Y  => "UAY",  y  => "uay",
	'*' => "URR"
);


%DNA_letter_can_be = (
    A => ["A"],                 a => ["a"],
    B => ["C", "G", "T"],       b => ["c", "g", "t"],
    C => ["C"],                 c => ["c"],
    D => ["A", "G", "T"],       d => ["a", "g", "t"],
    G => ["G"],                 g => ["g"],
    H => ["A", "C", "T"],       h => ["a", "c", "t"],
    K => ["G", "T"],            k => ["g", "t"],
    M => ["A", "C"],            m => ["a", "c"],
    N => ["A", "C", "G", "T"],  n => ["a", "c", "g", "t"],
    R => ["A", "G"],            r => ["a", "g"],
    S => ["C", "G"],            s => ["c", "g"],
    T => ["T"],                 t => ["t"],
    U => ["T"],                 u => ["t"],
    V => ["A", "C", "G"],       v => ["a", "c", "g"],
    W => ["A", "T"],            w => ["a", "t"],
    Y => ["C", "T"],            y => ["c", "t"]
);


%RNA_letter_can_be = (
    A => ["A"],                 a => ["a"],
    B => ["C", "G", "U"],       b => ["c", "g", "u"],
    C => ["C"],                 c => ["c"],
    D => ["A", "G", "U"],       d => ["a", "g", "u"],
    G => ["G"],                 g => ["g"],
    H => ["A", "C", "U"],       h => ["a", "c", "u"],
    K => ["G", "U"],            k => ["g", "u"],
    M => ["A", "C"],            m => ["a", "c"],
    N => ["A", "C", "G", "U"],  n => ["a", "c", "g", "u"],
    R => ["A", "G"],            r => ["a", "g"],
    S => ["C", "G"],            s => ["c", "g"],
    T => ["U"],                 t => ["u"],
    U => ["U"],                 u => ["u"],
    V => ["A", "C", "G"],       v => ["a", "c", "g"],
    W => ["A", "U"],            w => ["a", "u"],
    Y => ["C", "U"],            y => ["c", "u"]
);


%one_letter_to_three_letter_aa = {
	 A  => "Ala", a  => "Ala",
	 B  => "Asx", b  => "Asx",
	 C  => "Cys", c  => "Cys",
	 D  => "Asp", d  => "Asp",
	 E  => "Glu", e  => "Glu",
	 F  => "Phe", f  => "Phe",
	 G  => "Gly", g  => "Gly",
	 H  => "His", h  => "His",
	 I  => "Ile", i  => "Ile",
	 K  => "Lys", k  => "Lys",
	 L  => "Leu", l  => "Leu",
	 M  => "Met", m  => "Met",
	 N  => "Asn", n  => "Asn",
	 P  => "Pro", p  => "Pro",
	 Q  => "Gln", q  => "Gln",
	 R  => "Arg", r  => "Arg",
	 S  => "Ser", s  => "Ser",
	 T  => "Thr", t  => "Thr",
	 U  => "Sec", u  => "Sec",
	 V  => "Val", v  => "Val",
	 W  => "Trp", w  => "Trp",
	 X  => "Xxx", x  => "Xxx",
	 Y  => "Tyr", y  => "Tyr",
	 Z  => "Glx", z  => "Glx",
	'*' => "***"
	};


%three_letter_to_one_letter_aa = (
     ALA  => "A",   Ala  => "A",   ala  => "a",
     ARG  => "R",   Arg  => "R",   arg  => "r",
     ASN  => "N",   Asn  => "N",   asn  => "n",
     ASP  => "D",   Asp  => "D",   asp  => "d",
     ASX  => "B",   Asx  => "B",   asx  => "b",
     CYS  => "C",   Cys  => "C",   cys  => "c",
     GLN  => "Q",   Gln  => "Q",   gln  => "q",
     GLU  => "E",   Glu  => "E",   glu  => "e",
     GLX  => "Z",   Glx  => "Z",   glx  => "z",
     GLY  => "G",   Gly  => "G",   gly  => "g",
     HIS  => "H",   His  => "H",   his  => "h",
     ILE  => "I",   Ile  => "I",   ile  => "i",
     LEU  => "L",   Leu  => "L",   leu  => "l",
     LYS  => "K",   Lys  => "K",   lys  => "k",
     MET  => "M",   Met  => "M",   met  => "m",
     PHE  => "F",   Phe  => "F",   phe  => "f",
     PRO  => "P",   Pro  => "P",   pro  => "p",
     SEC  => "U",   Sec  => "U",   sec  => "u",
     SER  => "S",   Ser  => "S",   ser  => "s",
     THR  => "T",   Thr  => "T",   thr  => "t",
     TRP  => "W",   Trp  => "W",   trp  => "w",
     TYR  => "Y",   Tyr  => "Y",   tyr  => "y",
     VAL  => "V",   Val  => "V",   val  => "v",
     XAA  => "X",   Xaa  => "X",   xaa  => "x",
     XXX  => "X",   Xxx  => "X",   xxx  => "x",
    '***' => "*"
);


#-----------------------------------------------------------------------------
#  Translate nucleotides to one letter protein:
#
#      $seq = translate_seq( $seq [, $met_start] )
#
#-----------------------------------------------------------------------------

sub translate_seq {
    my $seq = uc shift;
    $seq =~ tr/UX/TN/;      #  make it DNA, and allow X
    $seq =~ tr/-//d;        #  remove gaps

    my $met = shift || 0;   #  a second argument that is true
                            #  forces first amino acid to be Met
                            #  (note: undef is false)

    my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!
    my $pep = ( ($met && ($imax >= 0)) ? "M" : "" );
    for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {
        $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );
    }

    return $pep;
}


#-----------------------------------------------------------------------------
#  Translate a single triplet with "universal" genetic code
#  Uppercase and DNA are performed, then translate_uc_DNA_codon
#  is called.
#
#      $aa = translate_codon( $triplet )
#
#-----------------------------------------------------------------------------

sub translate_codon {
    my $codon = uc shift;  #  Make it uppercase
    $codon =~ tr/UX/TN/;   #  Make it DNA, and allow X
    return translate_uc_DNA_codon($codon);
}


#-----------------------------------------------------------------------------
#  Translate a single triplet with "universal" genetic code
#  Uppercase and DNA assumed
#  Intended for private use by translate_codon and translate_seq
#
#      $aa = translate_uc_DNA_codon( $triplet )
#
#-----------------------------------------------------------------------------

sub translate_uc_DNA_codon {
    my $codon = shift;
    my $aa;

    #  Try a simple lookup:

    if ( $aa = $genetic_code{ $codon } ) { return $aa }

    #  With the expanded code defined above, this catches simple N, R
    #  and Y ambiguities in the third position.  Other codes like
    #  GG[KMSWBDHV], or even GG, might be unambiguously translated by
    #  converting the last position to N and seeing if this is in the
    #  (expanded) code table:

    if ( $aa = $genetic_code{ substr($codon,0,2) . "N" } ) { return $aa }

    #  Test that codon is valid and might have unambiguous aa:

    if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/ ) { return "X" }
    #                     ^^
    #                     |+- for leucine YTR
    #                     +-- for arginine MGR

    #  Expand all ambiguous nucleotides to see if they all yield same aa.
    #  Loop order tries to fail quickly with first position change.

    $aa = "";
    for my $n2 ( @{ $DNA_letter_can_be{ substr($codon,1,1) } } ) {
        for my $n3 ( @{ $DNA_letter_can_be{ substr($codon,2,1) } } ) {
            for my $n1 ( @{ $DNA_letter_can_be{ substr($codon,0,1) } } ) {
                #  set the first value of $aa
                if ($aa eq "") { $aa = $genetic_code{ $n1 . $n2 . $n3 } }
                #  or break out if any other amino acid is detected
                elsif ($aa ne $genetic_code{ $n1 . $n2 . $n3 } ) { return "X" }
            }
        }
    }

    return $aa || "X";
}


#-----------------------------------------------------------------------------
#  Translate with a user-supplied genetic code to translate a sequence.
#  Diagnose the use of upper versus lower, and T versus U in the supplied
#  code, and transform the supplied nucleotide sequence to match.
#
#  translate_seq_with_user_code($seq, \%gen_code [, $start_with_met] )
#
#-----------------------------------------------------------------------------

sub translate_seq_with_user_code {
    my $seq = shift;
    $seq =~ tr/-//d;     #  remove gaps  ***  Why?
    $seq =~ tr/Xx/Nn/;   #  allow X

    my $gc = shift;      #  Reference to hash of DNA alphabet code
    if (! $gc || ref($gc) ne "HASH") {
        die "translate_seq_with_user_code needs genetic code hash as secondargument.";
    }

    #  Test the type of code supplied: uppercase versus lowercase

    my ($RNA_F, $DNA_F, $M, $N, $X);

    if ($gc->{ "AAA" }) {     #  Looks like uppercase code table
        $seq   = uc $seq;     #  Uppercase sequence
        $RNA_F = "UUU";       #  Uppercase RNA Phe codon
        $DNA_F = "TTT";       #  Uppercase DNA Phe codon
        $M     = "M";         #  Uppercase initiator
        $N     = "N";         #  Uppercase ambiguous nuc
        $X     = "X";         #  Uppercase ambiguous aa
    }
    elsif ($gc->{ "aaa" }) {  #  Looks like lowercase code table
        $seq   = lc $seq;     #  Lowercase sequence
        $RNA_F = "uuu";       #  Lowercase RNA Phe codon
        $DNA_F = "ttt";       #  Lowercase DNA Phe codon
        $M     = "m";         #  Lowercase initiator
        $N     = "n";         #  Lowercase ambiguous nuc
        $X     = "x";         #  Lowercase ambiguous aa
    }
    else {
        die "User-supplied genetic code does not have aaa or AAA\n";
    }

    #  Test the type of code supplied:  UUU versus TTT

    my ($ambigs);

    if ($gc->{ $RNA_F }) {     #  Looks like RNA code table
        $seq =~ tr/Tt/Uu/;
        $ambigs = \%RNA_letter_can_be;
    }
    elsif ($gc->{ $DNA_F }) {  #  Looks like DNA code table
        $seq =~ tr/Uu/Tt/;
        $ambigs = \%DNA_letter_can_be;
    }
    else {
        die "User-supplied genetic code does not have $RNA_F or $DNA_F\n";
    }

    my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!

    my $met = shift;     #  a third argument that is true
                         #  forces first amino acid to be Met
                         #  (note: undef is false)
    my $pep  = ($met && ($imax >= 0)) ? $M : "";
    my $aa;

    for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {
        $pep .= translate_codon_with_user_code( substr($seq,$i,3), $gc, $N, $X, $ambigs );
    }

    return $pep;
}


#-----------------------------------------------------------------------------
#  Translate with user-supplied genetic code hash.  For speed, no error
#  check on the hash.  Calling programs should check for the hash at a
#  higher level.
#
#  Should only be called through translate_seq_with_user_code
#
#   translate_codon_with_user_code( $triplet, \%code, $N, $X, $ambig_table )
#
#  $triplet      speaks for itself
#  $code         ref to the hash with the codon translations
#  $N            character to use for ambiguous nucleotide
#  $X            character to use for ambiguous amino acid
#  $ambig_table  ref to hash with lists of nucleotides for each ambig code
#-----------------------------------------------------------------------------


sub translate_codon_with_user_code {
    my $codon = shift;
    my $gc    = shift;
    my $aa;

    #  Try a simple lookup:

    if ( $aa = $gc->{ $codon } ) { return $aa }

    #  Test that codon is valid and might have unambiguous aa:

    my ($N, $X, $ambigs) = @_;
    if ( $codon =~ m/^[ACGTUMY][ACGTU]$/i ) { $codon .= $N }
    if ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) { return $X }
    #                          ^^
    #                          |+- for leucine YTR
    #                          +-- for arginine MGR

    #  Expand all ambiguous nucleotides to see if they all yield same aa.
    #  Loop order tries to fail quickly with first position change.

    $aa = "";
    for my $n2 ( @{ $ambigs->{ substr($codon,1,1) } } ) {
        for my $n3 ( @{ $ambigs->{ substr($codon,2,1) } } ) {
            for my $n1 ( @{ $ambigs->{ substr($codon,0,1) } } ) {
                #  set the first value of $aa
                if ($aa eq "") { $aa = $gc->{ $n1 . $n2 . $n3 } }
                #  break out if any other amino acid is detected
                elsif ($aa ne $gc->{ $n1 . $n2 . $n3 } ) { return "X" }
            }
        }
    }

    return $aa || $X;
}


#-----------------------------------------------------------------------------
#  Read a list of intervals from a file.
#  Allow id_start_end, or id \s start \s end formats
#
#     @intervals = read_intervals( *FILEHANDLE )
#-----------------------------------------------------------------------------
sub read_intervals {
    my $fh = shift;
    my @intervals = ();

    while (<$fh>) {
        chomp;
           /^(\S+)_(\d+)_(\d+)(\s.*)?$/        #  id_start_end       WIT2
        || /^(\S+)_(\d+)-(\d+)(\s.*)?$/        #  id_start-end       ???
        || /^(\S+)=(\d+)=(\d+)(\s.*)?$/        #  id=start=end       Badger
        || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/    #  id \s start \s end
        || next;

        #  Matched a pattern.  Store reference to (id, left, right):
        push @intervals, ($2 < $3) ? [ $1, $2+0, $3+0 ]
                                   : [ $1, $3+0, $2+0 ];
    }
    return @intervals;
}


#-----------------------------------------------------------------------------
#  Convert a list of intervals to read [ id, left_end, right_end ].
#
#     @intervals = standardize_intervals( @interval_refs )
#-----------------------------------------------------------------------------
sub standardize_intervals {
    map { ( $_->[1] < $_->[2] ) ? $_ : [ $_->[0], $_->[2], $_->[1] ] } @_;
}


#-----------------------------------------------------------------------------
#  Take the union of a list of intervals
#
#     @joined = join_intervals( @interval_refs )
#-----------------------------------------------------------------------------
sub join_intervals {
    my @ordered = sort { $a->[0] cmp $b->[0]   # first by id
                      || $a->[1] <=> $b->[1]   # next by left end
                      || $b->[2] <=> $a->[2]   # finally longest first
                       } @_;

    my @joined = ();
    my $n_int = @ordered;

    my ($cur_id)    = "";
    my ($cur_left)  = -1;
    my ($cur_right) = -1;
    my ($new_id, $new_left, $new_right);

    for (my $i = 0; $i < $n_int; $i++) {
        ($new_id, $new_left, $new_right) = @{$ordered[$i]};  # get the new data

        if ( ( $new_id ne $cur_id)          # if new contig
          || ( $new_left > $cur_right + 1)  #    or not touching previous
           ) {                              # push the previous interval
            if ($cur_id) { push (@joined, [ $cur_id, $cur_left, $cur_right ]) }
            $cur_id = $new_id;              # update the current interval
            $cur_left = $new_left;
            $cur_right = $new_right;
        }

        elsif ($new_right > $cur_right) {   # extend the right end if necessary
            $cur_right = $new_right;
        }
    }

    if ($cur_id) { push (@joined, [$cur_id, $cur_left, $cur_right]) }
    return @joined;
}


#-----------------------------------------------------------------------------
#  Split location strings to oriented intervals.
#
#     @intervals = locations_2_intervals( @locations )
#     $interval  = locations_2_intervals( $location  )
#-----------------------------------------------------------------------------
sub locations_2_intervals {
    my @intervals = map { /^(\S+)_(\d+)_(\d+)(\s.*)?$/
                       || /^(\S+)_(\d+)-(\d+)(\s.*)?$/
                       || /^(\S+)=(\d+)=(\d+)(\s.*)?$/
                       || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/
                        ? [ $1, $2+0, $3+0 ]
                        : ()
                        } @_;

    return wantarray ? @intervals : $intervals[0];
}


#-----------------------------------------------------------------------------
#  Read a list of oriented intervals from a file.
#  Allow id_start_end, or id \s start \s end formats
#
#     @intervals = read_oriented_intervals( *FILEHANDLE )
#-----------------------------------------------------------------------------
sub read_oriented_intervals {
    my $fh = shift;
    my @intervals = ();

    while (<$fh>) {
        chomp;
           /^(\S+)_(\d+)_(\d+)(\s.*)?$/        #  id_start_end       WIT2
        || /^(\S+)_(\d+)-(\d+)(\s.*)?$/        #  id_start-end       ???
        || /^(\S+)=(\d+)=(\d+)(\s.*)?$/        #  id=start=end       Badger
        || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/    #  id \s start \s end
        || next;

        #  Matched a pattern.  Store reference to (id, start, end):
        push @intervals, [ $1, $2+0, $3+0 ];
    }
    return @intervals;
}


#-----------------------------------------------------------------------------
#  Reverse the orientation of a list of intervals
#
#     @reversed = reverse_intervals( @interval_refs )
#-----------------------------------------------------------------------------
sub reverse_intervals {
    map { [ $_->[0], $_->[2], $_->[1] ] } @_;
}


1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3