[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

View of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon Dec 1 16:54:26 2003 UTC (16 years, 2 months ago) by efrank
Branch: MAIN
Branch point for: initial
Initial revision

package gjoseqlib;

#  read_fasta_seqs( *FILEHANDLE )
#  read_next_fasta_seq( *FILEHANDLE )
#  parse_fasta_title( $title )
#  split_fasta_title( $title )
#  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list )
#  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq )
#  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )

#  index_seq_list( @seq_entry_list )
#  seq_entry_by_id( \%seq_index, $seq_id )
#  seq_desc_by_id( \%seq_index, $seq_id )
#  seq_data_by_id( \%seq_index, $seq_id )

#  subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] )
#  subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] )
#  complement_DNA_entry( @seq_entry [, $fix_id] )
#  complement_RNA_entry( @seq_entry [, $fix_id] )
#  complement_DNA_seq( $NA_seq )
#  complement_RNA_seq( $NA_seq )
#  to_DNA_seq( $NA_seq )
#  to_RNA_seq( $NA_seq )
#  clean_ae_sequence( $seq )

#  translate_seq( $seq [, $met_start] )
#  translate_codon( $triplet )
#  translate_seq_with_user_code( $seq, \%gen_code [, $met_start] )

#  read_intervals( *FILEHANDLE )
#  join_intervals( @interval_refs )

use gjolib qw(wrap_text);

require Exporter;

our @ISA = qw(Exporter);
our @EXPORT = qw(
        read_fasta_seqs
        read_next_fasta_seq
        parse_fasta_title
        split_fasta_title
        print_seq_list_as_fasta
        print_seq_as_fasta
        print_gb_locus

        index_seq_list
        seq_entry_by_id
        seq_desc_by_id
        seq_data_by_id

        subseq_DNA_entry
        subseq_RNA_entry
        complement_DNA_entry
        complement_RNA_entry
        complement_DNA_seq
        complement_RNA_seq
        to_DNA_seq
        to_RNA_seq
        clean_ae_sequence

        translate_seq
        translate_codon
        translate_seq_with_user_code

        read_intervals
        join_intervals
        );

use strict;


#-----------------------------------------------------------------------------
#  Read fasta sequences from a file.
#  Save the contents in a list of refs to arrays:  (id, description, seq)
#
#     @seqs = read_fasta_seqs(*FILEHANDLE)
#-----------------------------------------------------------------------------
sub read_fasta_seqs {
    my $fh = shift;
    wantarray || die "read_fasta_seqs requires list context\n";

    my @seqs = ();
    my ($id, $desc, $seq) = ("", "", "");

    while (<$fh>) {
        chomp;
        if (/^>\s*(\S+)(\s+(.*))?$/) {        #  new id
            if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }
            ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");
        }
        else {
            tr/\t 0-9//d;
            $seq .= $_ ;
        }
    }

    if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }
    return @seqs;
}


#-----------------------------------------------------------------------------
#  Read one fasta sequence at a time from a file.
#  Return the contents as an array:  (id, description, seq)
#
#     @seq_entry = read_next_fasta_seq(*FILEHANDLE)
#-----------------------------------------------------------------------------
#  Reading always overshoots, so save next id and description
#  (these should really be hashes indexed by file handle):
#
my $next_fasta_seq_header = "";

sub read_next_fasta_seq {
    my $fh = shift;
    wantarray || die "read_next_fasta_seq requires list context\n";

    my ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );
    my $seq = "";

    while ( <$fh> ) {
        chomp;
        if ( /^>/ ) {        #  new id
            $next_fasta_seq_header = $_;
            if ( $id && $seq ) { return ($id, $desc, $seq) }
            ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );
            $seq = "";
        }
        else {
            tr/\t 0-9//d;
            $seq .= $_ ;
        }
    }

    #  Done with file, clear "next header"

    $next_fasta_seq_header = "";
    return ($id && $seq) ? ($id, $desc, $seq) : () ;
}


#-----------------------------------------------------------------------------
#  Parse a fasta file header to id and definition parts
#
#     ($id, $def) = parse_fasta_title( $title )
#     ($id, $def) = split_fasta_title( $title )
#-----------------------------------------------------------------------------
sub parse_fasta_title {
    my $title = shift;
    chomp;
    if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {
        return ($1, $3 ? $3 : "");
    }
    else {
        return ("", "");
    }
}

sub split_fasta_title {
    parse_fasta_title ( shift );
}


#-----------------------------------------------------------------------------
#  Print list of sequence entries in fasta format
#
#     print_seq_list_as_fasta(*FILEHANDLE, @seq_entry_list);
#-----------------------------------------------------------------------------
sub print_seq_list_as_fasta {
    my $fh = shift;
    my @seq_list = @_;

    foreach my $seq_ptr (@seq_list) {
        print_seq_as_fasta($fh, @$seq_ptr);
    }
}


#-----------------------------------------------------------------------------
#  Print one sequence in fasta format to an open file
#
#     print_seq_as_fasta(*FILEHANDLE, $id, $desc, $seq);
#     print_seq_as_fasta(*FILEHANDLE, @seq_entry);
#-----------------------------------------------------------------------------
sub print_seq_as_fasta {
    my $fh = shift;
    my ($id, $desc, $seq) = @_;

    printf $fh ($desc) ? ">$id $desc\n" : ">$id\n";
    my $len = length($seq);
    for (my $i = 0; $i < $len; $i += 60) {
        print $fh substr($seq, $i, 60) . "\n";
    }
}


#-----------------------------------------------------------------------------
#  Print one sequence in GenBank flat file format:
#
#     print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )
#-----------------------------------------------------------------------------
sub print_gb_locus {
    my ($fh, $loc, $def, $acc, $seq) = @_;
    my ($len, $i, $imax);
    my $istep = 10;

    $len = length($seq);
    printf $fh  "LOCUS       %-10s%7d bp\n", substr($loc,0,10), $len;
    print  $fh  "DEFINITION  " . substr(wrap_text($def,80,12), 12) . "\n";
    if ($acc) { print  $fh  "ACCESSION   $acc\n" }
    print  $fh "ORIGIN\n";

    for ($i = 1; $i <= $len; ) {
        printf $fh "%9d", $i;
        $imax = $i + 59; if ($imax > $len) { $imax = $len }
        for ( ; $i <= $imax; $i += $istep) {
            print $fh " " . substr($seq, $i-1, $istep);
        }
        print $fh "\n";
        }
    print $fh "//\n";
}


#-----------------------------------------------------------------------------
#  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
#
#     my %seq_ind  = index_seq_list(@seq_list);
#
#  Usage example:
#
#  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries
#  my %seq_ind    = index_seq_list(@seq_list); # hash from names to pointers
#  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry
#
#-----------------------------------------------------------------------------
sub index_seq_list {
    my %seq_index = map { @{$_}[0] => $_ } @_;
    return \%seq_index;
}


#-----------------------------------------------------------------------------
#  Three routines to access all or part of sequence entry by id:
#
#     my @seq_entry  = seq_entry_by_id( \%seq_index, $seq_id );
#     my $seq_desc   = seq_desc_by_id(  \%seq_index, $seq_id );
#     my $seq        = seq_data_by_id(  \%seq_index, $seq_id );
#
#-----------------------------------------------------------------------------
sub seq_entry_by_id {
    (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";
    (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";
    wantarray || die "entry_by_id requires list context\n";
    return @{ $ind_ref->{$id} };
}


sub seq_desc_by_id {
    (my $ind_ref = shift)  || die "No index supplied to seq_desc_by_id\n";
    (my $id      = shift)  || die "No id supplied to seq_desc_by_id\n";
    return ${ $ind_ref->{$id} }[1];
}


sub seq_data_by_id {
    (my $ind_ref = shift)  || die "No index supplied to seq_data_by_id\n";
    (my $id      = shift)  || die "No id supplied to seq_data_by_id\n";
    return ${ $ind_ref->{$id} }[2];
}


#-----------------------------------------------------------------------------
#  Some simple sequence manipulations:
#
#     @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
#     @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
#     @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );
#     @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );
#     $DNAseq = complement_DNA_seq( $NA_seq );
#     $RNAseq = complement_RNA_seq( $NA_seq );
#     $DNAseq = to_DNA_seq( $NA_seq );
#     $RNAseq = to_RNA_seq( $NA_seq );
#
#-----------------------------------------------------------------------------

sub subseq_DNA_entry {
    my ($id, $desc, @rest) = @_;
    wantarray || die "subseq_DNA_entry requires array context\n";

    my $seq;
    ($id, $seq) = subseq_nt(1, $id, @rest);  # 1 is for DNA, not RNA
    return ($id, $desc, $seq);
}


sub subseq_RNA_entry {
    my ($id, $desc, @rest) = @_;
    wantarray || die "subseq_RNA_entry requires array context\n";

    my $seq;
    ($id, $seq) = subseq_nt(0, $id, @rest);  # 0 is for not DNA, i.e., RNA
    return ($id, $desc, $seq);
}


sub subseq_nt {
    my ($DNA, $id, $seq, $from, $to, $fix_id) = @_;
    $fix_id ||= 0;     #  fix undef value

    my $len   = length($seq);
    if (         ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len }
    if (! $to || ( $to   eq '$' ) || ( $to   eq "" ) ) { $to   = $len }

    my $left  = ( $from < $to ) ? $from : $to;
    my $right = ( $from < $to ) ? $to   : $from;
    if ( ( $right < 1 ) || ( $left > $len ) ) { return ($id, "") }
    if ( $right > $len ) { $right = $len }
    if ( $left  < 1    ) { $left  = 1 }

    $seq = substr($seq, $left-1, $right-$left+1);
    if ( $from > $to ) {
        $seq = reverse $seq;
        if ( $DNA ) {
            $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
                      [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
        }
        else {
            $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
                      [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
        }
    }

    if ( $fix_id ) {
        if ( ( $id =~ s/_([0-9]+)_([0-9]+)$// )
          && ( abs($2-$1)+1 == $len ) ) {
            if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }
            else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }
        }
        $id .= "_" . $from . "_" . $to;
    }

    return ($id, $seq);
}


sub complement_DNA_entry {
    my ($id, $desc, $seq, $fix_id) = @_;
    $fix_id ||= 0;     #  fix undef values

    wantarray || die "complement_DNA_entry requires array context\n";
    $seq = reverse $seq;
    $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
              [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
    if ($fix_id) {
        if ($id =~ s/_([0-9]+)_([0-9]+)$//) {
            $id .= "_" . $2 . "_" . $1;
        }
        else {
            $id .= "_" . length($seq) . "_1";
        }
    }

    return ($id, $desc, $seq);
}


sub complement_RNA_entry {
    my ($id, $desc, $seq, $fix_id) = @_;
    $fix_id ||= 0;     #  fix undef values

    wantarray || die "complement_DNA_entry requires array context\n";
    $seq = reverse $seq;
    $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
              [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
    if ($fix_id) {
        if ($id =~ s/_([0-9]+)_([0-9]+)$//) {
            $id .= "_" . $2 . "_" . $1;
        }
        else {
            $id .= "_" . length($seq) . "_1";
        }
    }

    return ($id, $desc, $seq);
}


sub complement_DNA_seq {
    my $seq = reverse shift;
    $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
              [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
    return $seq;
}


sub complement_RNA_seq {
    my $seq = reverse shift;
    $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
              [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
    return $seq;
}


sub to_DNA_seq {
    my $seq = shift;
    $seq =~ tr/Uu/Tt/;
    return $seq;
}


sub to_RNA_seq {
    my $seq = shift;
    $seq =~ tr/Tt/Uu/;
    return $seq;
}


sub clean_ae_sequence {
    $_ = shift;
    $_ = to7bit($_);
    s/[+]/1/g;
    s/[^0-9A-IK-NP-Za-ik-np-z~.-]/-/g;
    return $_;
}


sub to7bit {
    $_ = shift;
    my ($o, $c);
    while (/\\([0-3][0-7][0-7])/) {
        $o = oct($1) % 128;
        $c = sprintf("%c", $o);
        s/\\$1/$c/g;
    }
    return $_;
}


sub to8bit {
    $_ = shift;
    my ($o, $c);
    while (/\\([0-3][0-7][0-7])/) {
        $o = oct($1);
        $c = sprintf("%c", $o);
        s/\\$1/$c/g;
    }
    return $_;
}



#-----------------------------------------------------------------------------
#  Translate nucleotides to one letter protein:
#
#  $seq = translate_seq( $seq [, $met_start] )
#  $aa  = translate_codon( $triplet );
#  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );
#
#  User-supplied genetic code must be upper case index and match the
#  DNA versus RNA type of sequence
#
#  $seq = translate_seq_with_user_code( $seq, $gen_code_hash [, $met_start] )
#
#-----------------------------------------------------------------------------

our %genetic_code = (   # work as DNA
    TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",
    TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",
    TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",
    TTG => "L",  TCG => "S",  TAG => "*",  TGG => "W",
    CTT => "L",  CCT => "P",  CAT => "H",  CGT => "R",
    CTC => "L",  CCC => "P",  CAC => "H",  CGC => "R",
    CTA => "L",  CCA => "P",  CAA => "Q",  CGA => "R",
    CTG => "L",  CCG => "P",  CAG => "Q",  CGG => "R",
    ATT => "I",  ACT => "T",  AAT => "N",  AGT => "S",
    ATC => "I",  ACC => "T",  AAC => "N",  AGC => "S",
    ATA => "I",  ACA => "T",  AAA => "K",  AGA => "R",
    ATG => "M",  ACG => "T",  AAG => "K",  AGG => "R",
    GTT => "V",  GCT => "A",  GAT => "D",  GGT => "G",
    GTC => "V",  GCC => "A",  GAC => "D",  GGC => "G",
    GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",
    GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",

    #  The following ambiguous encodings are not necessary,  but
    #  speed up the processing of some common ambiguous triplets:

    TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",
    TTR => "L",  TCR => "S",  TAR => "*",
                 TCN => "S",
    CTY => "L",  CCY => "P",  CAY => "H",  CGY => "R",
    CTR => "L",  CCR => "P",  CAR => "Q",  CGR => "R",
    CTN => "L",  CCN => "P",               CGN => "R",
    ATY => "I",  ACY => "T",  AAY => "N",  AGY => "S",
                 ACR => "T",  AAR => "K",  AGR => "R",
                 ACN => "T",
    GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",
    GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",
    GTN => "V",  GCN => "A",               GGN => "G"
);


my %DNA_letter_can_be = (
    A => ["A"],                 a => ["a"],
    B => ["C", "G", "T"],       b => ["c", "g", "t"],
    C => ["C"],                 c => ["c"],
    D => ["A", "G", "T"],       d => ["a", "g", "t"],
    G => ["G"],                 g => ["g"],
    H => ["A", "C", "T"],       h => ["a", "c", "t"],
    K => ["G", "T"],            k => ["g", "t"],
    M => ["A", "C"],            m => ["a", "c"],
    N => ["A", "C", "G", "T"],  n => ["a", "c", "g", "t"],
    R => ["A", "G"],            r => ["a", "g"],
    S => ["C", "G"],            s => ["c", "g"],
    T => ["T"],                 t => ["t"],
    U => ["T"],                 u => ["t"],
    V => ["A", "C", "G"],       v => ["a", "c", "g"],
    W => ["A", "T"],            w => ["a", "t"],
    Y => ["C", "T"],            y => ["c", "t"]
);


my %RNA_letter_can_be = (
    A => ["A"],                 a => ["a"],
    B => ["C", "G", "U"],       b => ["c", "g", "u"],
    C => ["C"],                 c => ["c"],
    D => ["A", "G", "U"],       d => ["a", "g", "u"],
    G => ["G"],                 g => ["g"],
    H => ["A", "C", "U"],       h => ["a", "c", "u"],
    K => ["G", "U"],            k => ["g", "u"],
    M => ["A", "C"],            m => ["a", "c"],
    N => ["A", "C", "G", "U"],  n => ["a", "c", "g", "u"],
    R => ["A", "G"],            r => ["a", "g"],
    S => ["C", "G"],            s => ["c", "g"],
    T => ["U"],                 t => ["u"],
    U => ["U"],                 u => ["u"],
    V => ["A", "C", "G"],       v => ["a", "c", "g"],
    W => ["A", "U"],            w => ["a", "u"],
    Y => ["C", "U"],            y => ["c", "u"]
);


my %one_letter_to_three_letter_aa = (
     A  => "Ala",
     B  => "Asx",
     C  => "Cys",
     D  => "Asp",
     E  => "Glu",
     F  => "Phe",
     G  => "Gly",
     H  => "His",
     I  => "Ile",
     K  => "Lys",
     L  => "Leu",
     M  => "Met",
     N  => "Asn",
     P  => "Pro",
     Q  => "Gln",
     R  => "Arg",
     S  => "Ser",
     T  => "Thr",
     U  => "Sec",
     V  => "Val",
     W  => "Trp",
     X  => "Xxx",
     Y  => "Tyr",
     Z  => "Glx",
    "*" => "***"
);


#-----------------------------------------------------------------------------
#  Translate nucleotides to one letter protein:
#
#      $seq = translate_seq( $seq [, $met_start] )
#
#-----------------------------------------------------------------------------

sub translate_seq {
    my $seq = uc shift;
    $seq =~ tr/UX/TN/;      #  make it DNA, and allow X
    $seq =~ tr/-//d;        #  remove gaps

    my $met = shift || 0;   #  a second argument that is true
                            #  forces first amino acid to be Met
                            #  (note: undef is false)

    my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!
    my $pep  = ($met && ($imax >= 0)) ? "M" : "";
    my $aa;
    for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {
        $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );
    }

    return $pep;
}


#-----------------------------------------------------------------------------
#  Translate a single triplet with "universal" genetic code
#  Uppercase and DNA are performed, then translate_uc_DNA_codon
#  is called.
#
#      $aa = translate_codon( $triplet )
#
#-----------------------------------------------------------------------------

sub translate_codon {
    my $codon = uc shift;  #  Make it uppercase
    $codon =~ tr/UX/TN/;   #  Make it DNA, and allow X
    return translate_uc_DNA_codon($codon);
}


#-----------------------------------------------------------------------------
#  Translate a single triplet with "universal" genetic code
#  Uppercase and DNA assumed
#  Intended for private use by translate_codon and translate_seq
#
#      $aa = translate_uc_DNA_codon( $triplet )
#
#-----------------------------------------------------------------------------

sub translate_uc_DNA_codon {
    my $codon = shift;
    my $aa;

    #  Try a simple lookup:

    if ( $aa = $genetic_code{ $codon } ) { return $aa }

    #  With the expanded code defined above, this catches simple N, R
    #  and Y ambiguities in the third position.  Other codes like
    #  GG[KMSWBDHV], or even GG, might be unambiguously translated by
    #  converting the last position to N and seeing if this is in the
    #  (expanded) code table:

    if ( $aa = $genetic_code{ substr($codon,0,2) . "N" } ) { return $aa }

    #  Test that codon is valid and might have unambiguous aa:

    if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/ ) { return "X" }
    #                     ^^
    #                     |+- for leucine YTR
    #                     +-- for arginine MGR

    #  Expand all ambiguous nucleotides to see if they all yield same aa.
    #  Loop order tries to fail quickly with first position change.

    $aa = "";
    for my $n2 ( @{ $DNA_letter_can_be{ substr($codon,1,1) } } ) {
        for my $n3 ( @{ $DNA_letter_can_be{ substr($codon,2,1) } } ) {
            for my $n1 ( @{ $DNA_letter_can_be{ substr($codon,0,1) } } ) {
                #  set the first value of $aa
                if ($aa eq "") { $aa = $genetic_code{ $n1 . $n2 . $n3 } }
                #  or break out if any other amino acid is detected
                elsif ($aa ne $genetic_code{ $n1 . $n2 . $n3 } ) { return "X" }
            }
        }
    }

    return $aa || "X";
}


#-----------------------------------------------------------------------------
#  Translate with a user-supplied genetic code to translate a sequence.
#  Diagnose the use of upper versus lower, and T versus U in the supplied
#  code, and transform the supplied nucleotide sequence to match.
#
#  translate_seq_with_user_code($seq, \%gen_code [, $start_with_met] )
#
#-----------------------------------------------------------------------------

sub translate_seq_with_user_code {
    my $seq = shift;
    $seq =~ tr/-//d;     #  remove gaps  ***  Why?
    $seq =~ tr/Xx/Nn/;   #  allow X

    my $gc = shift;      #  Reference to hash of DNA alphabet code
    if (! $gc || ref($gc) ne "HASH") {
        die "translate_seq_with_user_code needs genetic code hash as secondargument.";
    }

    #  Test the type of code supplied: uppercase versus lowercase

    my ($RNA_F, $DNA_F, $M, $N, $X);

    if ($gc->{ "AAA" }) {     #  Looks like uppercase code table
        $seq   = uc $seq;     #  Uppercase sequence
        $RNA_F = "UUU";       #  Uppercase RNA Phe codon
        $DNA_F = "TTT";       #  Uppercase DNA Phe codon
        $M     = "M";         #  Uppercase initiator
        $N     = "N";         #  Uppercase ambiguous nuc
        $X     = "X";         #  Uppercase ambiguous aa
    }
    elsif ($gc->{ "aaa" }) {  #  Looks like lowercase code table
        $seq   = lc $seq;     #  Lowercase sequence
        $RNA_F = "uuu";       #  Lowercase RNA Phe codon
        $DNA_F = "ttt";       #  Lowercase DNA Phe codon
        $M     = "m";         #  Lowercase initiator
        $N     = "n";         #  Lowercase ambiguous nuc
        $X     = "x";         #  Lowercase ambiguous aa
    }
    else {
        die "User-supplied genetic code does not have aaa or AAA\n";
    }

    #  Test the type of code supplied:  UUU versus TTT

    my ($ambigs);

    if ($gc->{ $RNA_F }) {     #  Looks like RNA code table
        $seq =~ tr/Tt/Uu/;
        $ambigs = \%RNA_letter_can_be;
    }
    elsif ($gc->{ $DNA_F }) {  #  Looks like DNA code table
        $seq =~ tr/Uu/Tt/;
        $ambigs = \%DNA_letter_can_be;
    }
    else {
        die "User-supplied genetic code does not have $RNA_F or $DNA_F\n";
    }

    my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!

    my $met = shift;     #  a third argument that is true
                         #  forces first amino acid to be Met
                         #  (note: undef is false)
    my $pep  = ($met && ($imax >= 0)) ? $M : "";
    my $aa;

    for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {
        $pep .= translate_codon_with_user_code( substr($seq,$i,3), $gc, $N, $X, $ambigs );
    }

    return $pep;
}


#-----------------------------------------------------------------------------
#  Translate with user-supplied genetic code hash.  For speed, no error
#  check on the hash.  Calling programs should check for the hash at a
#  higher level.
#
#  Should only be called through translate_seq_with_user_code
#
#   translate_codon_with_user_code( $triplet, \%code, $N, $X, $ambig_table )
#
#  $triplet      speaks for itself
#  $code         ref to the hash with the codon translations
#  $N            character to use for ambiguous nucleotide
#  $X            character to use for ambiguous amino acid
#  $ambig_table  ref to hash with lists of nucleotides for each ambig code
#-----------------------------------------------------------------------------


sub translate_codon_with_user_code {
    my $codon = shift;
    my $gc    = shift;
    my $aa;

    #  Try a simple lookup:

    if ( $aa = $gc->{ $codon } ) { return $aa }

    #  Test that codon is valid and might have unambiguous aa:

    my ($N, $X, $ambigs) = @_;
    if ( $codon =~ m/^[ACGTUMY][ACGTU]$/i ) { $codon .= $N }
    if ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) { return $X }
    #                          ^^
    #                          |+- for leucine YTR
    #                          +-- for arginine MGR

    #  Expand all ambiguous nucleotides to see if they all yield same aa.
    #  Loop order tries to fail quickly with first position change.

    $aa = "";
    for my $n2 ( @{ $ambigs->{ substr($codon,1,1) } } ) {
        for my $n3 ( @{ $ambigs->{ substr($codon,2,1) } } ) {
            for my $n1 ( @{ $ambigs->{ substr($codon,0,1) } } ) {
                #  set the first value of $aa
                if ($aa eq "") { $aa = $gc->{ $n1 . $n2 . $n3 } }
                #  break out if any other amino acid is detected
                elsif ($aa ne $gc->{ $n1 . $n2 . $n3 } ) { return "X" }
            }
        }
    }

    return $aa || $X;
}


#-----------------------------------------------------------------------------
#  Read a list of intervals from a file.
#  Allow id_start_end, or id \s start \s end formats
#
#     @intervals = read_intervals(*FILEHANDLE)
#-----------------------------------------------------------------------------
sub read_intervals {
    my $fh = shift;
    my @intervals = ();

    while (<$fh>) {
        chomp;
           /^(\S+)_(\d+)_(\d+)(\s.*)?$/        #  id_start_end       WIT2
        || /^(\S+)_(\d+)-(\d+)(\s.*)?$/        #  id_start-end       ???
        || /^(\S+)=(\d+)=(\d+)(\s.*)?$/        #  id=start=end       Badger
        || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/    #  id \s start \s end
        || next;

        #  Matched a pattern.  Store reference to (id, left, right):
        push @intervals, ($2 < $3) ? [ $1, $2+0, $3+0 ]
                                   : [ $1, $3+0, $2+0 ];
    }
    return @intervals;
}


#-----------------------------------------------------------------------------
#  Take the union of a list of intervals
#
#     @joined = join_intervals( @interval_refs )
#-----------------------------------------------------------------------------
sub join_intervals {
    my @ordered = sort { $a->[0] cmp $b->[0]   # first by id
                      || $a->[1] <=> $b->[1]   # next by left end
                      || $b->[2] <=> $a->[2]   # finally longest first
                       } @_;

    my @joined = ();
    my $n_int = @ordered;

    my ($cur_id)    = "";
    my ($cur_left)  = -1;
    my ($cur_right) = -1;
    my ($new_id, $new_left, $new_right);

    for (my $i = 0; $i < $n_int; $i++) {
        ($new_id, $new_left, $new_right) = @{$ordered[$i]};  # get the new data

        if ( ( $new_id ne $cur_id)          # if new contig
          || ( $new_left > $cur_right + 1)  #    or not touching previous
           ) {                              # push the previous interval
            if ($cur_id) { push (@joined, [ $cur_id, $cur_left, $cur_right ]) }
            $cur_id = $new_id;              # update the current interval
            $cur_left = $new_left;
            $cur_right = $new_right;
        }

        elsif ($new_right > $cur_right) {   # extend the right end if necessary
            $cur_right = $new_right;
        }
    }

    if ($cur_id) { push (@joined, [$cur_id, $cur_left, $cur_right]) }
    return @joined;
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3