package gjoseqlib; # # This is a SAS component. # =head1 Sequence-IO Utilities A sequence entry is ( $id, $def, $seq ) A list of entries is a list of references Efficient reading of an entire file of sequences: @seq_entries = read_fasta( ) # STDIN @seq_entries = read_fasta( \*FILEHANDLE ) @seq_entries = read_fasta( $filename ) Legacy interface: @seq_entries = read_fasta_seqs( \*FILEHANDLE ) # Original form Reading sequences one at a time to conserve memory. Calls to different files can be intermixed. @entry = next_fasta( \*FILEHANDLE ) \@entry = next_fasta( \*FILEHANDLE ) @entry = next_fasta( $filename ) \@entry = next_fasta( $filename ) @entry = next_fasta() # STDIN \@entry = next_fasta() # STDIN @entry = read_next_fasta( \*FILEHANDLE ) \@entry = read_next_fasta( \*FILEHANDLE ) @entry = read_next_fasta( $filename ) \@entry = read_next_fasta( $filename ) @entry = read_next_fasta() # STDIN \@entry = read_next_fasta() # STDIN Close an open file that has not been read to the end: close_fasta( $filename ) Legacy interface: @entry = read_next_fasta_seq( \*FILEHANDLE ) \@entry = read_next_fasta_seq( \*FILEHANDLE ) @entry = read_next_fasta_seq( $filename ) \@entry = read_next_fasta_seq( $filename ) @entry = read_next_fasta_seq() # STDIN \@entry = read_next_fasta_seq() # STDIN Reading clustal alignment. @seq_entries = read_clustal( ) # STDIN @seq_entries = read_clustal( \*FILEHANDLE ) @seq_entries = read_clustal( $filename ) Legacy interface: @seq_entries = read_clustal_file( $filename ) $seq_ind = index_seq_list( @seq_entries ); # hash from ids to entries @seq_entry = seq_entry_by_id( \%seq_index, $seq_id ); $seq_desc = seq_desc_by_id( \%seq_index, $seq_id ); $seq = seq_data_by_id( \%seq_index, $seq_id ); ( $id, $def ) = parse_fasta_title( $title ) ( $id, $def ) = split_fasta_title( $title ) @id_def_org = nr_def_as_id_def_org( $nr_seq_title ) @id_def_org = split_id_def_org( @seq_titles ); Write a fasta format file from sequences. write_fasta( @seq_entry_list ); # STDOUT write_fasta( \@seq_entry_list ); # STDOUT write_fasta( \*FILEHANDLE, @seq_entry_list ); write_fasta( \*FILEHANDLE, \@seq_entry_list ); write_fasta( $filename, @seq_entry_list ); write_fasta( $filename, \@seq_entry_list ); Legacy interface: print_alignment_as_fasta( @seq_entry_list ); # STDOUT print_alignment_as_fasta( \@seq_entry_list ); # STDOUT print_alignment_as_fasta( \*FILEHANDLE, @seq_entry_list ); print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list ); print_alignment_as_fasta( $filename, @seq_entry_list ); print_alignment_as_fasta( $filename, \@seq_entry_list ); Legacy interface: print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list ); # Original form Interface that it really meant for internal use to write the next sequence to an open file: print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq ); print_seq_as_fasta( $id, $desc, $seq ); print_seq_as_fasta( \*FILEHANDLE, $id, $seq ); print_seq_as_fasta( $id, $seq ); Write an interleaved alignment to a file (or string) $n_seq = write_alignment_as_text( \@seqs, \%opts ); # STDOUT $n_seq = write_alignment_as_text( \*FH, \@seqs, \%opts ); # open file handle $n_seq = write_alignment_as_text( $file, \@seqs, \%opts ); # file $n_seq = write_alignment_as_text( \$str, \@seqs, \%opts ); # string Options: columns => int # Alignment columns for line definitions => bool # Include definition with each ID (ugly) legend => bool # Add a legend with the definition of each ID Write PHYLIP alignment. Names might be altered to fit 10 character limit: print_alignment_as_phylip( @seq_entry_list ); # STDOUT print_alignment_as_phylip( \@seq_entry_list ); # STDOUT print_alignment_as_phylip( \*FILEHANDLE, @seq_entry_list ); print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list ); print_alignment_as_phylip( $filename, @seq_entry_list ); print_alignment_as_phylip( $filename, \@seq_entry_list ); Write basic NEXUS alignment for PAUP. print_alignment_as_nexus( [ \%label_hash, ] @seq_entry_list ); print_alignment_as_nexus( [ \%label_hash, ] \@seq_entry_list ); print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] @seq_entry_list ); print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list ); print_alignment_as_nexus( $filename, [ \%label_hash, ] @seq_entry_list ); print_alignment_as_nexus( $filename, [ \%label_hash, ] \@seq_entry_list ); print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq ); Remove extra columns of alignment gaps from an alignment: @packed_seqs = pack_alignment( @seqs ) @packed_seqs = pack_alignment( \@seqs ) \@packed_seqs = pack_alignment( @seqs ) \@packed_seqs = pack_alignment( \@seqs ) Pack mask for an alignment (gap = 0x00, others are 0xFF) $mask = alignment_gap_mask( @seqs ) $mask = alignment_gap_mask( \@seqs ) Pack a sequence alignment according to a mask: @packed = pack_alignment_by_mask( $mask, @align ) @packed = pack_alignment_by_mask( $mask, \@align ) \@packed = pack_alignment_by_mask( $mask, @align ) \@packed = pack_alignment_by_mask( $mask, \@align ) Expand sequence by a mask, adding indel at "\000" or '-' in mask: $expanded = expand_sequence_by_mask( $seq, $mask ) Remove all alignment gaps from sequences (modify a copy): @packed_seqs = pack_sequences( @seqs ) # Works for one sequence, too @packed_seqs = pack_sequences( \@seqs ) \@packed_seqs = pack_sequences( @seqs ) \@packed_seqs = pack_sequences( \@seqs ) Basic sequence manipulation functions: @entry = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] ) @entry = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] ) $DNAseq = DNA_subseq( $seq, $from, $to, $dir ) # In these functions, $dir $DNAseq = DNA_subseq( \$seq, $from, $to, $dir ) # only matters if $from == $to, $RNAseq = RNA_subseq( $seq, $from, $to, $dir ) # in which case the direction $RNAseq = RNA_subseq( \$seq, $from, $to, $dir ) # would be ambiguous. @entry = complement_DNA_entry( @seq_entry [, $fix_id] ) @entry = complement_RNA_entry( @seq_entry [, $fix_id] ) $DNAseq = complement_DNA_seq( $NA_seq ) $RNAseq = complement_RNA_seq( $NA_seq ) $DNAseq = to_DNA_seq( $NA_seq ) $RNAseq = to_RNA_seq( $NA_seq ) $seq = pack_seq( $sequence ) # modifies a copy $seq = clean_ae_sequence( $seq ) $aa_seq = aa_subseq( $seq, $from, $to ) $aa = translate_seq( $nt, $met_start ) $aa = translate_seq( $nt ) $aa = translate_codon( $triplet ); User-supplied genetic code. The supplied code needs to be complete in RNA and/or DNA, and upper and/or lower case. The program guesses based on lysine and phenylalanine codons. $aa = translate_seq_with_user_code( $nt, $gen_code_hash, $met_start ) $aa = translate_seq_with_user_code( $nt, $gen_code_hash ) Locations (= oriented intervals) are ( id, start, end ). Intervals are ( id, left, right ). @intervals = read_intervals( \*FILEHANDLE ) @intervals = read_oriented_intervals( \*FILEHANDLE ) @intervals = standardize_intervals( @interval_refs ) # (id, left, right) @joined = join_intervals( @interval_refs ) @intervals = locations_2_intervals( @locations ) $interval = locations_2_intervals( $location ) @reversed = reverse_intervals( @interval_refs ) # (id, end, start) Convert GenBank locations to SEED locations @seed_locs = gb_location_2_seed( $contig, @gb_locs ) Read quality scores from a fasta-like file: @seq_entries = read_qual( ) # STDIN \@seq_entries = read_qual( ) # STDIN @seq_entries = read_qual( \*FILEHANDLE ) \@seq_entries = read_qual( \*FILEHANDLE ) @seq_entries = read_qual( $filename ) \@seq_entries = read_qual( $filename ) Evaluate alignments: $fraction_diff = fraction_nt_diff( $seq1, $seq2, \%options ) $fraction_diff = fraction_nt_diff( $seq1, $seq2 ) $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_weight ) $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_open, $gap_extend ) ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( $seq1, $seq2 ) ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_aa_align( $seq1, $seq2 ) Remove alignment columns with a nonvalid character or shared gaps ( $seq1, $seq2 ) = useful_nt_align( $seq1, $seq2 ); ( $seq1, $seq2 ) = useful_aa_align( $seq1, $seq2 ); Remove columns with terminal gaps ( $seq1, $seq2 ) = trim_terminal_gap_columns( $seq1, $seq2 ); Analysis of shared oligomers @sims = oligomer_similarity( $seq1, $seq2, \%opts ) Guess type of a sequence: $type = guess_seq_type( $sequence ) $bool = is_dna( $sequence ) # not RNA or prot $bool = is_rna( $sequence ) # not DNA or prot $bool = is_na( $sequence ) # nucleic acid, not prot $bool = is_prot( $sequence ) # not DNA or RNA $sequence can be a sequence string, a reference to a sequence string, or a [$id, $def, $seq] triple. $type will be keyword 'DNA', 'RNA' or 'protein', or undef in case of error. Verify the structure of an [ id, desc, sequence ] triple and the structure of an array of sequence triples: $bool = is_sequence_triple( $triple ) $bool = is_array_of_sequence_triples( \@triples ) =cut use strict; use Carp; use Data::Dumper; # Exported global variables: our @aa_1_letter_order; # Alpha by 1 letter our @aa_3_letter_order; # PAM matrix order our @aa_n_codon_order; our %genetic_code; our %genetic_code_with_U; our %amino_acid_codons_DNA; our %amino_acid_codons_RNA; our %n_codon_for_aa; our %reverse_genetic_code_DNA; our %reverse_genetic_code_RNA; our %DNA_letter_can_be; our %RNA_letter_can_be; our %one_letter_to_three_letter_aa; our %three_letter_to_one_letter_aa; require Exporter; our @ISA = qw(Exporter); our @EXPORT = qw( read_fasta read_next_fasta next_fasta close_fasta read_fasta_seqs read_next_fasta_seq read_clustal_file read_clustal parse_fasta_title split_fasta_title nr_def_as_id_def_org split_id_def_org print_seq_list_as_fasta print_alignment_as_fasta write_fasta write_alignment_as_text print_alignment_as_phylip print_alignment_as_nexus print_seq_as_fasta print_gb_locus index_seq_list seq_entry_by_id seq_desc_by_id seq_data_by_id pack_alignment alignment_gap_mask pack_alignment_by_mask expand_sequence_by_mask pack_sequences subseq_DNA_entry subseq_RNA_entry DNA_subseq RNA_subseq complement_DNA_entry complement_RNA_entry complement_DNA_seq complement_RNA_seq to_DNA_seq to_RNA_seq pack_seq clean_ae_sequence aa_subseq translate_seq translate_codon translate_seq_with_user_code read_intervals standardize_intervals join_intervals locations_2_intervals read_oriented_intervals reverse_intervals gb_location_2_seed read_qual fraction_nt_diff interpret_nt_align interpret_aa_align oligomer_similarity ); our @EXPORT_OK = qw( @aa_1_letter_order @aa_3_letter_order @aa_n_codon_order %genetic_code %genetic_code_with_U %amino_acid_codons_DNA %amino_acid_codons_RNA %n_codon_for_aa %reverse_genetic_code_DNA %reverse_genetic_code_RNA %DNA_letter_can_be %RNA_letter_can_be %one_letter_to_three_letter_aa %three_letter_to_one_letter_aa ); =head2 input_file_handle Helper function for defining an input filehandle: =over 4 =item 1 filehandle is passed through =item 2 string is taken as file name to be openend =item 3 undef or "" defaults to STDOUT =back ( \*FH, $name, $close [, $file] ) = input_filehandle( $file ); =cut sub input_filehandle { my $file = shift; # FILEHANDLE return ( $file, $file, 0 ) if ( is_filehandle($file) ); # Null string or undef return ( \*STDIN, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) ); # File name if ( ! ref( $file ) ) { my $fh; if ( -f $file ) { } elsif ( $file =~ /^<(.+)$/ && -f $1 ) { $file = $1 } else { die "Could not find input file '$file'\n" } open( $fh, '<', $file ) || die "Could not open '$file' for input\n"; return ( $fh, $file, 1 ); } # Some other kind of reference; return the unused value return ( \*STDIN, undef, 0, $file ); } =head2 read_fasta_seqs Read fasta sequences from a filehandle (legacy interface; use read_fasta) Save the contents in a list of refs to arrays: (id, description, seq) @seq_entries = read_fasta_seqs( \*FILEHANDLE ) =cut sub read_fasta_seqs { read_fasta( @_ ) } =head2 read_fasta Read fasta sequences. Save the contents in a list of refs to arrays: $seq_entry = [ id, description, seq ] @seq_entries = read_fasta( ) # STDIN \@seq_entries = read_fasta( ) # STDIN @seq_entries = read_fasta( \*FILEHANDLE ) \@seq_entries = read_fasta( \*FILEHANDLE ) @seq_entries = read_fasta( $filename ) \@seq_entries = read_fasta( $filename ) @seq_entries = read_fasta( \$string ) # reference to file as string \@seq_entries = read_fasta( \$string ) # reference to file as string =cut sub read_fasta { my $dataR = ( $_[0] && ref $_[0] eq 'SCALAR' ) ? $_[0] : slurp( @_ ); $dataR && $$dataR or return wantarray ? () : []; my $is_fasta = $$dataR =~ m/^[\s\r]*>/; my @seqs = map { $_->[2] =~ tr/ \n\r\t//d; $_ } map { /^(\S+)([ \t]+([^\n\r]+)?)?[\n\r]+(.*)$/s ? [ $1, $3 || '', $4 || '' ] : () } split /[\n\r]+>[ \t]*/m, $$dataR; # Fix the first sequence, if necessary if ( @seqs ) { if ( $is_fasta ) { $seqs[0]->[0] =~ s/^>//; # remove > if present } elsif ( @seqs == 1 ) { $seqs[0]->[1] =~ s/\s+//g; @{ $seqs[0] } = ( 'raw_seq', '', join( '', @{$seqs[0]} ) ); } else # First sequence is not fasta, but others are! Throw it away. { shift @seqs; } } wantarray ? @seqs : \@seqs; } =head2 slurp A fast file reader: $dataR = slurp( ) # \*STDIN $dataR = slurp( \*FILEHANDLE ) # an open file handle $dataR = slurp( $filename ) # a file name $dataR = slurp( "<$filename" ) # file with explicit direction =head3 Note It is faster to read lines by reading the file and splitting than by reading the lines sequentially. If space is not an issue, this is the way to go. If space is an issue, then lines or records should be processed one-by-one (rather than loading the whole input into a string or array). =cut sub slurp { my ( $fh, $close ); if ( $_[0] && is_filehandle($_[0])) { $fh = shift; } elsif ( $_[0] && ! ref $_[0] ) { my $file = shift; if ( -f $file ) { } elsif ( $file =~ /^<(.*)$/ && -f $1 ) { $file = $1 } # Explicit read else { return undef } open( $fh, '<', $file ) or return undef; $close = 1; } else { $fh = \*STDIN; $close = 0; } my $out = ''; my $inc = 1048576; my $end = 0; my $read; while ( $read = read( $fh, $out, $inc, $end ) ) { $end += $read } close( $fh ) if $close; \$out; } =head2 read_fasta_0 Previous, 50% slower fasta reader: =cut sub read_fasta_0 { my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] ); $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n"; my @seqs = (); my ($id, $desc, $seq) = ("", "", ""); while ( <$fh> ) { chomp; if (/^>\s*(\S+)(\s+(.*))?$/) { # new id if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] } ($id, $desc, $seq) = ($1, $3 ? $3 : "", ""); } else { tr/ 0-9//d; $seq .= $_ ; } } close( $fh ) if $close; if ( $id && $seq ) { push @seqs, [ $id, $desc, $seq ] } return wantarray ? @seqs : \@seqs; } =head2 read_next_fasta Read one fasta sequence at a time from a file. This is half as fast a read_fasta(), but can handle an arbitrarily large file. State information is retained in hashes, so any number of streams can be interlaced. @entry = read_next_fasta( \*FILEHANDLE ) \@entry = read_next_fasta( \*FILEHANDLE ) @entry = read_next_fasta( $filename ) \@entry = read_next_fasta( $filename ) @entry = read_next_fasta() # \*STDIN \@entry = read_next_fasta() # \*STDIN Where, @entry = ( $id, $description, $seq ) Longer, legacy name: @entry = read_next_fasta_seq( \*FILEHANDLE ) \@entry = read_next_fasta_seq( \*FILEHANDLE ) @entry = read_next_fasta_seq( $filename ) \@entry = read_next_fasta_seq( $filename ) @entry = read_next_fasta_seq() # \*STDIN \@entry = read_next_fasta_seq() # \*STDIN When reading at the end of file, () is returned. With a filename, reading past this will reopen the file at the beginning. Beware: openning a filename, and not reading it to the end leaves the file open, which can hit the file system limit for open files. So, use close_fasta( $filename ) =cut # Reading always overshoots, so save next id and description { # Use bare block to scope the header hash my %next_header; my %file_handle; my %close_file; sub close_fasta { $_[0] or return 0; my $fh = $file_handle{ $_[0] } or return 0; if ( $close_file{ $fh } ) { close $fh; delete $close_file{ $fh } } delete $file_handle{ $_[0] }; 1; } sub read_next_fasta_seq { next_fasta( @_ ) } sub read_next_fasta { next_fasta( @_ ) } sub next_fasta { $_[0] ||= \*STDIN; # Undefined $_[0] fails with use warn my $fh = $file_handle{ $_[0] }; if ( ! $fh ) { if ( ref $_[0] ) { return () if ! is_filehandle($_[0]); $fh = $_[0]; } else { my $file = $_[0]; if ( -f $file ) { } elsif ( $file =~ /^<(.+)$/ && -f $1 ) { $file = $1 } # Explicit read else { return () } open( $fh, '<', $file ) or return (); $close_file{ $fh } = 1; } $file_handle{ $_[0] } = $fh; } my ( $id, $desc, $seq ) = ( undef, '', '' ); if ( defined( $next_header{$fh} ) ) { ( $id, $desc ) = parse_fasta_title( $next_header{$fh} ); } else { $next_header{$fh} = ''; } local $_; while ( <$fh> ) { chomp; if ( /^>/ ) # new id { $next_header{$fh} = $_; if ( defined($id) && $seq ) { return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq] } ( $id, $desc ) = parse_fasta_title( $next_header{$fh} ); $seq = ''; } else { tr/ \t\r//d; $seq .= $_; } } # Done with file; there is no next header: delete $next_header{ $fh }; # Return last set of data: if ( defined($id) && $seq ) { return wantarray ? ($id,$desc,$seq) : [$id,$desc,$seq] } # Or close everything out (returning the empty list tells caller # that we are done) if ( $close_file{ $fh } ) { close $fh; delete $close_file{ $fh } } delete $file_handle{ $_[0] }; return (); } } =head2 read_clustal_file Read a clustal alignment from a file. Save the contents in a list of refs to arrays: (id, description, seq) @seq_entries = read_clustal_file( $filename ) =cut sub read_clustal_file { read_clustal( @_ ) } =head2 read_clustal Read a clustal alignment. Save the contents in a list of refs to arrays: (id, description, seq) @seq_entries = read_clustal( ) # STDIN @seq_entries = read_clustal( \*FILEHANDLE ) @seq_entries = read_clustal( $filename ) =cut sub read_clustal { my ( $fh, undef, $close, $unused ) = input_filehandle( shift ); $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_clustal_file\n"; my ( %seq, @ids, $line ); while ( defined( $line = <$fh> ) ) { ( $line =~ /^[A-Za-z0-9]/ ) or next; chomp $line; my @flds = split /\s+/, $line; if ( @flds == 2 ) { $seq{ $flds[0] } or push @ids, $flds[0]; push @{ $seq{ $flds[0] } }, $flds[1]; } } close( $fh ) if $close; map { [ $_, "", join( "", @{$seq{$_}} ) ] } @ids; } =head2 parse_fasta_title Parse a fasta file header to id and definition parts ($id, $def) = parse_fasta_title( $title ) ($id, $def) = split_fasta_title( $title ) =cut sub parse_fasta_title { my $title = shift; chomp $title; return $title =~ /^>?\s*(\S+)(\s+(.*\S)?\s*)?$/ ? ( $1, $3 || '' ) : $title =~ /^>/ ? ( '', '' ) : ( undef, undef ) } sub split_fasta_title { parse_fasta_title( @_ ) } =head2 nr_def_as_id_def_org Some functions to split fasta file def lines into [id, definition, organism] These forms DO NOT want a > at the beinging of the line (it will become part of the id). @id_def_org = nr_def_as_id_def_org( $nr_seq_title ) @id_def_org = split_id_def_org( @seq_titles ); =cut # # @id_def_org = nr_def_as_id_def_org( $nr_id_def_line ); # sub nr_def_as_id_def_org { defined($_[0]) ? split_id_def_org( split /\001/, $_[0] ) : (); } =head2 split_id_def_org @id_def_org = split_id_def_org( @id_def_org_lines ); =cut sub split_id_def_org { map { ! defined( $_ ) ? () : ! /^\s*\S/ ? () : /^\s*(\S+)\s+(.*\S)\s+\[([^\[\]]+)\]\s*$/ ? [ $1, $2, $3 ] # id def org : /^\s*(\S+)\s+\[([^\[\]]+)\]\s*$/ ? [ $1, '', $2 ] # id org : /^\s*(\S+)\s+(.*[^\]\s])\s*$/ ? [ $1, $2, '' ] # id def : /^\s*(\S+)\s*$/ ? [ $1, '', '' ] # id : split_id_def_org_2( $_ ); } @_; } # # @id_def_org = split_id_def_org( @id_def_org_lines ); # sub split_id_def_org_2 { local $_ = $_[0]; s/^\s+//; s/\s+$//; # split into segments starting with [ or ] my @parts = grep { defined($_) && length($_) } split /([\]\[][^\]\[]*)/; my $n = 0; for ( my $i = @parts-1; $i > 0; $i-- ) { # a part starts with [ or ]. if ( $parts[$i] =~ /^\[/ ) { $n--; # If the open bracket balances, and there # is white space before, we are done if ( $n == 0 && $parts[$i-1] =~ s/\s+$// ) { my $def = join( '', @parts[ 0 .. $i-1 ] ); $parts[$i] =~ s/^\[\s*//; # remove open bracket $parts[@parts-1] =~ s/\s*\]$//; # remove close bracket my $org = join( '', @parts[ $i .. @parts-1 ] ); return $def =~ /^(\S+)\s+(\S.*)$/ ? [ $1, $2, $org ] : [ $def, '', $org ]; } } elsif ( $parts[$i] =~ /^\]/ ) { $n++; } else { carp "Logic error in split_id_def_org_2()"; return m/^(\S+)\s+(\S.*)$/ ? [ $1, $2, '' ] : [ $_, '', '' ]; } } # Fall through means that we failed to balance organism brackets m/^(\S+)\s+(\S.*)$/ ? [ $1, $2, '' ] : [ $_, '', '' ]; } =head2 output_filehandle Helper function for defining an output filehandle: =over 4 =item 1 filehandle is passed through =item 2 string is taken as file name to be openend =item 3 undef or "" defaults to STDOUT =back ( \*FH, $close [, $file] ) = output_filehandle( $file ); =cut sub output_filehandle { my $file = shift; # Null string or undef return ( \*STDOUT, 0 ) if ( ! defined( $file ) || ( $file eq "" ) ); # FILEHANDLE return ( $file, 0 ) if ( is_filehandle($file) ); # Some other kind of reference; return the unused value return ( \*STDOUT, 0, $file ) if ref( $file ); # File name my $fh; open( $fh, '>', $file ) || die "Could not open output $file\n"; return ( $fh, 1 ); } =head2 print_seq_list_as_fasta Legacy function for printing fasta sequence set: print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list ); =cut sub print_seq_list_as_fasta { print_alignment_as_fasta( @_ ) } =head2 print_alignment_as_fasta Print list of sequence entries in fasta format. Missing, undef or "" filename defaults to STDOUT. print_alignment_as_fasta( @seq_entry_list ); # STDOUT print_alignment_as_fasta( \@seq_entry_list ); # STDOUT print_alignment_as_fasta( \*FILEHANDLE, @seq_entry_list ); print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list ); print_alignment_as_fasta( $filename, @seq_entry_list ); print_alignment_as_fasta( $filename, \@seq_entry_list ); =cut sub print_alignment_as_fasta { return &write_fasta(@_); } =head2 write_fasta Print list of sequence entries in fasta format. Missing, undef or "" filename defaults to STDOUT. write_fasta( @seq_entry_list ); # STDOUT write_fasta( \@seq_entry_list ); # STDOUT write_fasta( \*FILEHANDLE, @seq_entry_list ); write_fasta( \*FILEHANDLE, \@seq_entry_list ); write_fasta( $filename, @seq_entry_list ); write_fasta( $filename, \@seq_entry_list ); =cut sub write_fasta { my ( $fh, $close, $unused ) = output_filehandle( shift ); ( unshift @_, $unused ) if $unused; ( ref( $_[0] ) eq "ARRAY" ) or confess "Bad sequence entry passed to print_alignment_as_fasta\n"; # Expand the sequence entry list if necessary: if ( ref( $_[0]->[0] ) eq "ARRAY" ) { @_ = @{ $_[0] } } foreach my $seq_ptr ( @_ ) { print_seq_as_fasta( $fh, @$seq_ptr ) } close( $fh ) if $close; } =head2 write_alignment_as_text Write an interleaved alignment to a file (or string) $n_seq = write_alignment_as_text( \@seqs, \%opts ); # STDOUT $n_seq = write_alignment_as_text( \*FH, \@seqs, \%opts ); # open file handle $n_seq = write_alignment_as_text( $file, \@seqs, \%opts ); # file $n_seq = write_alignment_as_text( \$str, \@seqs, \%opts ); # string Options: columns => int # Alignment columns for line definitions => bool # Include definition with each ID (ugly) legend => bool # Add a legend with the definition of each ID =cut sub write_alignment_as_text { my $opts = ref($_[-1]) eq 'HASH' ? pop : {}; my ( $outFH, $close, $unused ) = output_filehandle( shift ); ( unshift @_, $unused ) if $unused; ref( $_[0] ) eq 'ARRAY' or die "Bad sequence alignment format."; my @seq = @{ scalar shift }; my $col_per_line = $opts->{ columns } || 60; # Alignment columns my $show_def = $opts->{ definitions }; # Show id and def (ugly) my $legend = $opts->{ legend }; # Add legend with id and def my @ids; my $id_len; if ( $legend ) { @ids = map { $_->[0] } @seq; $id_len = 0; foreach ( @ids ) { $id_len = length( $_ ) if length( $_ ) > $id_len } my $fmt = "%-${id_len}s %s\n"; printf $outFH $fmt, 'ID', 'Definition'; for ( my $i = 0; $i < @ids; $i++ ) { printf $outFH $fmt, $ids[$i], $seq[$i]->[1]; } print $outFH "\n\n"; # Fix the IDs if they need to be ugly if ( $show_def ) { @ids = map { "$_->[0] $_->[1]" } @seq; $id_len = 0; foreach ( @ids ) { $id_len = length( $_ ) if length( $_ ) > $id_len } } } else { @ids = map { $show_def ? "$_->[0] $_->[1]" : $_->[0] } @seq; $id_len = 0; foreach ( @ids ) { $id_len = length( $_ ) if length( $_ ) > $id_len } } my @segs = map { [ $_->[2] =~ m/(.{1,$col_per_line})/go ] } @seq; my $fmt = "%-${id_len}s %s\n"; while ( @{ $segs[0] } ) { for ( my $i = 0; $i < @ids; $i++ ) { printf $outFH $fmt, $ids[$i], shift @{$segs[$i]}; } print $outFH "\n"; } close( $outFH ) if $close; scalar @seq; } =print_alignment_as_phylip Print list of sequence entries in phylip format. Missing, undef or "" filename defaults to STDOUT. print_alignment_as_phylip( @seq_entry_list ); # STDOUT print_alignment_as_phylip( \@seq_entry_list ); # STDOUT print_alignment_as_phylip( \*FILEHANDLE, @seq_entry_list ); print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list ); print_alignment_as_phylip( $filename, @seq_entry_list ); print_alignment_as_phylip( $filename, \@seq_entry_list ); =cut sub print_alignment_as_phylip { my ( $fh, $close, $unused ) = output_filehandle( shift ); ( unshift @_, $unused ) if $unused; ( ref( $_[0] ) eq "ARRAY" ) or die die "Bad sequence entry passed to print_alignment_as_phylip\n"; my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_; my ( %id2, %used ); my $maxlen = 0; foreach ( @seq_list ) { my ( $id, undef, $seq ) = @$_; # Need a name that is unique within 10 characters my $id2 = substr( $id, 0, 10 ); $id2 =~ s/_/ /g; # PHYLIP sequence files accept spaces my $n = "0"; while ( $used{ $id2 } ) { $n++; $id2 = substr( $id, 0, 10 - length( $n ) ) . $n; } $used{ $id2 } = 1; $id2{ $id } = $id2; # Prepare to pad sequences (should not be necessary, but ...) my $len = length( $seq ); $maxlen = $len if ( $len > $maxlen ); } my $nseq = @seq_list; print $fh "$nseq $maxlen\n"; foreach ( @seq_list ) { my ( $id, undef, $seq ) = @$_; my $len = length( $seq ); printf $fh "%-10s %s%s\n", $id2{ $id }, $seq, $len<$maxlen ? ("?" x ($maxlen-$len)) : ""; } close( $fh ) if $close; } =head2 print_alignment_as_nexus Print list of sequence entries in nexus format. Missing, undef or "" filename defaults to STDOUT. print_alignment_as_nexus( [ \%label_hash, ] @seq_entry_list ); print_alignment_as_nexus( [ \%label_hash, ] \@seq_entry_list ); print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] @seq_entry_list ); print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list ); print_alignment_as_nexus( $filename, [ \%label_hash, ] @seq_entry_list ); print_alignment_as_nexus( $filename, [ \%label_hash, ] \@seq_entry_list ); =cut sub print_alignment_as_nexus { my ( $fh, $close, $unused ) = output_filehandle( shift ); ( unshift @_, $unused ) if $unused; my $lbls = ( ref( $_[0] ) eq "HASH" ) ? shift : undef; ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_nexus\n"; my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_; my %id2; my ( $maxidlen, $maxseqlen ) = ( 0, 0 ); my ( $n1, $n2, $nt, $nu ) = ( 0, 0, 0, 0 ); foreach ( @seq_list ) { my ( $id, undef, $seq ) = @$_; my $id2 = $lbls ? ( $lbls->{ $id } || $id ) : $id; if ( $id2 !~ /^[-+.0-9A-Za-z~_|]+$/ ) { $id2 =~ s/'/''/g; $id2 = qq('$id2'); } $id2{ $id } = $id2; my $idlen = length( $id2 ); $maxidlen = $idlen if ( $idlen > $maxidlen ); my $seqlen = length( $seq ); $maxseqlen = $seqlen if ( $seqlen > $maxseqlen ); $nt += $seq =~ tr/Tt//d; $nu += $seq =~ tr/Uu//d; $n1 += $seq =~ tr/ACGNacgn//d; $n2 += $seq =~ tr/A-Za-z//d; } my $nseq = @seq_list; my $type = ( $n1 < 2 * $n2 ) ? 'protein' : ($nt>$nu) ? 'DNA' : 'RNA'; print $fh <<"END_HEAD"; #NEXUS BEGIN Data; Dimensions NTax=$nseq NChar=$maxseqlen ; Format DataType=$type Gap=- Missing=? ; Matrix END_HEAD foreach ( @seq_list ) { my ( $id, undef, $seq ) = @$_; my $len = length( $seq ); printf $fh "%-${maxidlen}s %s%s\n", $id2{ $id }, $seq, $len<$maxseqlen ? ("?" x ($maxseqlen-$len)) : ""; } print $fh <<"END_TAIL"; ; END; END_TAIL close( $fh ) if $close; } =head2 print_seq_as_fasta Print one sequence in fasta format to an open file. print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq ); print_seq_as_fasta( $id, $desc, $seq ); print_seq_as_fasta( \*FILEHANDLE, $id, $seq ); print_seq_as_fasta( $id, $seq ); print_seq_as_fasta() is meant more as a internal support routine than an external interface. To print a single sequence to a named file use: print_alignment_as_fasta( $filename, [ $id, $desc, $seq ] ); print_alignment_as_fasta( $filename, [ $id, $seq ] ); =cut sub print_seq_as_fasta { my $fh = ( is_filehandle($_[0]) ) ? shift : \*STDOUT; return if ( @_ < 2 ) || ( @_ > 3 ) || ! ( defined $_[0] && defined $_[-1] ); # Print header line print $fh ( @_ == 3 && defined $_[1] && $_[1] =~ /\S/ ) ? ">$_[0] $_[1]\n" : ">$_[0]\n"; # Print sequence, 60 chars per line print $fh join( "\n", $_[-1] =~ m/.{1,60}/g ), "\n"; } =head2 print_gb_locus Print one sequence in GenBank flat file format: print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq ) =cut sub print_gb_locus { my ($fh, $loc, $def, $acc, $seq) = @_; my ($len, $i, $imax); my $istep = 10; $len = length($seq); printf $fh "LOCUS %-10s%7d bp\n", substr($loc,0,10), $len; print $fh "DEFINITION " . substr(wrap_text($def,80,12), 12) . "\n"; if ($acc) { print $fh "ACCESSION $acc\n" } print $fh "ORIGIN\n"; for ($i = 1; $i <= $len; ) { printf $fh "%9d", $i; $imax = $i + 59; if ($imax > $len) { $imax = $len } for ( ; $i <= $imax; $i += $istep) { print $fh " " . substr($seq, $i-1, $istep); } print $fh "\n"; } print $fh "//\n"; } =head2 wrap_text Return a string with text wrapped to defined line lengths: $wrapped_text = wrap_text( $str ) # default len = 80 $wrapped_text = wrap_text( $str, $len ) # default ind = 0 $wrapped_text = wrap_text( $str, $len, $indent ) # default ind_n = ind $wrapped_text = wrap_text( $str, $len, $indent_1, $indent_n ) =cut sub wrap_text { my ($str, $len, $ind, $indn) = @_; defined($str) || die "wrap_text called without a string\n"; defined($len) || ($len = 80); defined($ind) || ($ind = 0); ($ind < $len) || die "wrap error: indent greater than line length\n"; defined($indn) || ($indn = $ind); ($indn < $len) || die "wrap error: indent_n greater than line length\n"; $str =~ s/\s+$//; $str =~ s/^\s+//; my ($maxchr, $maxchr1); my (@lines) = (); while ($str) { $maxchr1 = ($maxchr = $len - $ind) - 1; if ($maxchr >= length($str)) { push @lines, (" " x $ind) . $str; last; } elsif ($str =~ /^(.{0,$maxchr1}\S)\s+(\S.*)$/) { # no expr in {} push @lines, (" " x $ind) . $1; $str = $2; } elsif ($str =~ /^(.{0,$maxchr1}-)(.*)$/) { push @lines, (" " x $ind) . $1; $str = $2; } else { push @lines, (" " x $ind) . substr($str, 0, $maxchr); $str = substr($str, $maxchr); } $ind = $indn; } return join("\n", @lines); } =head2 index_seq_list Build an index from seq_id to pointer to sequence entry: (id, desc, seq) my \%seq_ind = index_seq_list( @seq_list ); my \%seq_ind = index_seq_list( \@seq_list ); Usage example: my @seq_list = read_fasta_seqs(\*STDIN); # list of pointers to entries my \%seq_ind = index_seq_list(@seq_list); # hash from names to pointers my @chosen_seq = @{%seq_ind{"contig1"}}; # extract one entry =cut sub index_seq_list { ( ref( $_[0] ) ne 'ARRAY' ) ? {} : ( ref( $_[0]->[0] ) ne 'ARRAY' ) ? { map { $_->[0] => $_ } @_ } : { map { $_->[0] => $_ } @{ $_[0] } } } =head2 seq access methods Three routines to access all or part of sequence entry by id: @seq_entry = seq_entry_by_id( \%seq_index, $seq_id ); \@seq_entry = seq_entry_by_id( \%seq_index, $seq_id ); $seq_desc = seq_desc_by_id( \%seq_index, $seq_id ); $seq = seq_data_by_id( \%seq_index, $seq_id ); =cut sub seq_entry_by_id { (my $ind_ref = shift) || die "No index supplied to seq_entry_by_id\n"; (my $id = shift) || die "No id supplied to seq_entry_by_id\n"; return wantarray ? @{ $ind_ref->{$id} } : $ind_ref->{$id}; } sub seq_desc_by_id { (my $ind_ref = shift) || die "No index supplied to seq_desc_by_id\n"; (my $id = shift) || die "No id supplied to seq_desc_by_id\n"; return ${ $ind_ref->{$id} }[1]; } sub seq_data_by_id { (my $ind_ref = shift) || die "No index supplied to seq_data_by_id\n"; (my $id = shift) || die "No id supplied to seq_data_by_id\n"; return ${ $ind_ref->{$id} }[2]; } =head2 pack_alignment Remove columns of alignment gaps from sequences: @packed_seqs = pack_alignment( @seqs ) @packed_seqs = pack_alignment( \@seqs ) \@packed_seqs = pack_alignment( @seqs ) \@packed_seqs = pack_alignment( \@seqs ) Gap characters are defined below as '-', '~', '.', and ' '. =cut sub pack_alignment { $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] ) or return (); my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_; @seqs or return wantarray ? () : []; my $mask = gap_mask( $seqs[0]->[2] ); foreach ( @seqs[ 1 .. (@seqs-1) ] ) { $mask |= gap_mask( $_->[2] ); } my $seq; my @seqs2 = map { $seq = $_->[2] & $mask; $seq =~ tr/\000//d; [ $_->[0], $_->[1], $seq ] } @seqs; wantarray ? @seqs2 : \@seqs2; } =head2 alignment_gap_mask Produce a packing mask for columns of alignment gaps in sequences. Gap columns are 0x00 characters, and all others are 0xFF. $mask = alignment_gap_mask( @seqs ) $mask = alignment_gap_mask( \@seqs ) =cut sub alignment_gap_mask { $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] ) or return undef; my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_; @seqs or return undef; my $mask = gap_mask( $seqs[0]->[2] ); foreach ( @seqs[ 1 .. (@seqs-1) ] ) { $mask |= gap_mask( $_->[2] ) } $mask; } =head2 pack_alignment_by_mask Pack a sequence alignment according to a mask, removing positions where mask has 0x00 (or '-') characters @packed = pack_alignment_by_mask( $mask, @align ) @packed = pack_alignment_by_mask( $mask, \@align ) \@packed = pack_alignment_by_mask( $mask, @align ) \@packed = pack_alignment_by_mask( $mask, \@align ) =cut sub pack_alignment_by_mask { my $mask = shift; defined $mask && ! ref( $mask ) && length( $mask ) or return (); $mask =~ tr/-/\000/; # Allow '-' as a column to be removed $mask =~ tr/\000/\377/c; # Make sure that everything not null is 0xFF $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] ) or return (); my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_; my $seq; my @seqs2 = map { $seq = $_->[2] & $mask; # Apply mask to sequence $seq =~ tr/\000//d; # Delete null characters [ $_->[0], $_->[1], $seq ] # Rebuild sequence entries } @seqs; wantarray ? @seqs2 : \@seqs2; } =head2 weight_alignment_by_mask Weight a sequence alignment according to a mask of digits, 0-9. @packed = weight_alignment_by_mask( $mask, @align ) @packed = weight_alignment_by_mask( $mask, \@align ) \@packed = weight_alignment_by_mask( $mask, @align ) \@packed = weight_alignment_by_mask( $mask, \@align ) =cut sub weight_alignment_by_mask { my $mask = shift; defined $mask && ! ref( $mask ) && length( $mask ) or return (); $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] ) or return (); my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_; my @seqs2 = map { [ $_->[0], $_->[1], weight_seq_by_mask_0( $mask, $_->[2] ) ] } @seqs; wantarray ? @seqs2 : \@seqs2; } # # Assume data are valid # sub weight_seq_by_mask_0 { my ( $mask, $seq ) = @_; # Remove 0 weight columns, which is fast and easy: my $m0 = $mask; $m0 =~ tr/123456789/\377/; $m0 =~ tr/\377/\000/c; ( $mask &= $m0 ) =~ tr/\000//d; ( $seq &= $m0 ) =~ tr/\000//d; # If all remaining cols are weight 1, we are done: return $seq if $mask =~ /^1*$/; my @seq; for ( my $i = 0; $i < length( $mask ); $i++ ) { push @seq, substr( $seq, $i, 1 ) x substr( $mask, $i, 1 ); } join( '', @seq ); } =head2 gap_mask Make a mask in which gap characters ('-', '~', '.', and ' ') are converted to 0x00, and all other characters to 0xFF. $mask = gap_mask( $seq ) =cut sub gap_mask { my $mask = shift; defined $mask or return ''; $mask =~ tr/-~. /\000/; # Gap characters (might be extended) $mask =~ tr/\000/\377/c; # Non-gap characters $mask; } =head2 expand_sequence_by_mask Expand sequences with the local gap character in a manner that reverses the pack by mask function. $expanded = expand_sequence_by_mask( $seq, $mask ) The columns to be added can be marked by '-' or "\000" in the mask. =head3 Code Note The function attempts to match the local gap character in the sequence. $c0 and $c1 are the previous and next characters in the sequence being expanded. (($c0,$c1)=($c1,shift @s1))[0] updates the values and evaluates to what was the next character, and becomes the new previous character. The following really does print w, x, y, and z, one per line: ( $a, $b, @c ) = ("", split //, "wxyz"); while ( defined $b ) { print( (($a,$b)=($b,shift @c))[0], "\n" ) } =cut sub expand_sequence_by_mask { my ( $seq, $mask ) = @_; $mask =~ tr/-/\000/; # Allow hyphen or null in mask at added positions. my ( $c0, $c1, @s1 ) = ( '', split( //, $seq ), '' ); join '', map { $_ ne "\000" ? (($c0,$c1)=($c1,shift @s1))[0] : $c0 eq '~' || $c1 eq '~' ? '~' : $c0 eq '.' || $c1 eq '.' ? '.' : '-' } split //, $mask; } =head2 pack_sequences Remove all alignment gaps from sequences. This function minimally rewrites sequence entries, saving memory if nothing else. @packed_seqs = pack_sequences( @seqs ) # Also handles single sequence @packed_seqs = pack_sequences( \@seqs ) \@packed_seqs = pack_sequences( @seqs ) \@packed_seqs = pack_sequences( \@seqs ) =cut sub pack_sequences { $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] ) or return (); my @seqs = map { $_->[2] =~ /[^A-Za-z*]/ ? [ @$_[0,1], pack_seq( $_->[2] ) ] : $_ } ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_; wantarray ? @seqs : \@seqs; } =head2 Simple Sequence Manipulation @entry = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] ); \@entry = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] ); @entry = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] ); \@entry = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] ); @entry = complement_DNA_entry( @seq_entry [, $fix_id] ); @entry = complement_RNA_entry( @seq_entry [, $fix_id] ); $AAsub = aa_subseq($AAseq, $from, $to); $DNAseq = complement_DNA_seq( $NA_seq ); $RNAseq = complement_RNA_seq( $NA_seq ); $DNAseq = to_DNA_seq( $NA_seq ); $RNAseq = to_RNA_seq( $NA_seq ); =cut sub subseq_DNA_entry { my ($id, $desc, @rest) = @_; my $seq; ($id, $seq) = subseq_nt(1, $id, @rest); # 1 is for DNA, not RNA wantarray ? ( $id, $desc, $seq ) : [ $id, $desc, $seq ]; } sub subseq_RNA_entry { my ($id, $desc, @rest) = @_; my $seq; ($id, $seq) = subseq_nt(0, $id, @rest); # 0 is for not DNA, i.e., RNA wantarray ? ( $id, $desc, $seq ) : [ $id, $desc, $seq ]; } sub subseq_nt { my ($DNA, $id, $seq, $from, $to, $fix_id) = @_; $fix_id ||= 0; # fix undef value my $len = length($seq); if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len } if (! $to || ( $to eq '$' ) || ( $to eq "" ) ) { $to = $len } my $left = ( $from < $to ) ? $from : $to; my $right = ( $from < $to ) ? $to : $from; if ( ( $right < 1 ) || ( $left > $len ) ) { return ($id, "") } if ( $right > $len ) { $right = $len } if ( $left < 1 ) { $left = 1 } $seq = substr($seq, $left-1, $right-$left+1); if ( $from > $to ) { $seq = reverse $seq; if ( $DNA ) { $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn]; } else { $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn]; } } if ( $fix_id ) { if ( ( $id =~ s/_(\d+)_(\d+)$// ) && ( abs($2-$1)+1 == $len ) ) { if ( $1 <= $2 ) { $from += $1 - 1; $to += $1 - 1 } else { $from = $1 + 1 - $from; $to = $1 + 1 - $to } } $id .= "_" . $from . "_" . $to; } return ($id, $seq); } sub DNA_subseq { my $seqR = ref( $_[0] ) eq 'SCALAR' ? $_[0] : \$_[0]; my ( $from, $to, $dir ) = @_[1,2,3]; my $len = length( $$seqR ); if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len } if ( ( $to eq '$' ) || ( ! $to ) ) { $to = $len } my ( $left, $right ) = min_max( $from, $to ); return undef if ( $left < 1 ) || ( $right > $len ); my $subseq = substr( $$seqR, $left-1, $right-$left+1 ); # $dir is only used if the direction is ambiguous if ( $from > $to || ( $from == $to && $dir =~ /^-/ ) ) { $subseq = reverse $subseq; $subseq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn]; } $subseq; } sub RNA_subseq { my $seqR = ref( $_[0] ) eq 'SCALAR' ? $_[0] : \$_[0]; my ( $from, $to, $dir ) = @_[1,2,3]; my $len = length( $$seqR ); if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len } if ( ( $to eq '$' ) || ( ! $to ) ) { $to = $len } my ( $left, $right ) = min_max( $from, $to ); return undef if ( $left < 1 ) || ( $right > $len ); my $subseq = substr( $$seqR, $left-1, $right-$left+1 ); # $dir is only used if the direction is ambiguous if ( $from > $to || ( $from == $to && $dir =~ /^-/ ) ) { $subseq = reverse $subseq; $subseq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn]; } $subseq; } sub min_max { $_[0] <= $_[1] ? @_[0,1] : @_[1,0] } sub aa_subseq { my $seqR = ref( $_[0] ) eq 'SCALAR' ? $_[0] : \$_[0]; my ( $from, $to ) = @_[1,2]; my $len = length( $$seqR ); $from = $from =~ /(\d+)/ && $1 ? $1 : 1; $to = $to =~ /(\d+)/ && $1 < $len ? $1 : $len; return undef if $from > $to; substr( $$seqR, $from-1, $to-$from+1 ); } sub complement_DNA_entry { my ($id, $desc, $seq, $fix_id) = @_; $fix_id ||= 0; # fix undef values wantarray || die "complement_DNA_entry requires array context\n"; $seq = reverse $seq; $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn]; if ($fix_id) { if ($id =~ s/_(\d+)_(\d+)$//) { $id .= "_" . $2 . "_" . $1; } else { $id .= "_" . length($seq) . "_1"; } } return ($id, $desc, $seq); } sub complement_RNA_entry { my ($id, $desc, $seq, $fix_id) = @_; $fix_id ||= 0; # fix undef values wantarray || die "complement_DNA_entry requires array context\n"; $seq = reverse $seq; $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn]; if ($fix_id) { if ($id =~ s/_(\d+)_(\d+)$//) { $id .= "_" . $2 . "_" . $1; } else { $id .= "_" . length($seq) . "_1"; } } return ($id, $desc, $seq); } sub complement_DNA_seq { my $seq = reverse shift; $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn]; return $seq; } sub complement_RNA_seq { my $seq = reverse shift; $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn] [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn]; return $seq; } sub to_DNA_seq { my $seq = shift; $seq =~ tr/Uu/Tt/; return $seq; } sub to_RNA_seq { my $seq = shift; $seq =~ tr/Tt/Uu/; return $seq; } =head2 Sequence Cleaning Utilities $seqx = pack_seq($seq); $cleaned = clean_ae_sequence($seq); $utf8 = to7bit($seq); $ascii = to8bit($seq); =cut sub pack_seq { my $seq = shift; $seq =~ tr/A-Za-z*//cd; $seq; } sub clean_ae_sequence { local $_ = shift; $_ = to7bit($_); s/\+/1/g; s/[^0-9A-IK-NP-Za-ik-np-z~.-]/-/g; return $_; } sub to7bit { local $_ = shift; my ($o, $c); while (/\\([0-3][0-7][0-7])/) { $o = oct($1) % 128; $c = sprintf("%c", $o); s/\\$1/$c/g; } return $_; } sub to8bit { local $_ = shift; my ($o, $c); while (/\\([0-3][0-7][0-7])/) { $o = oct($1); $c = sprintf("%c", $o); s/\\$1/$c/g; } return $_; } =head2 Protein Translation Utilities Translate nucleotides to one letter protein: $seq = translate_seq( $seq [, $met_start] ) Translate a single triplet with "universal" genetic code. $aa = translate_codon( $triplet ) $aa = translate_DNA_codon( $triplet ) # Does not rely on DNA $aa = translate_uc_DNA_codon( $triplet ) # Does not rely on uc or DNA User-supplied genetic code must be upper case index and match the DNA versus RNA type of sequence $seq = translate_seq_with_user_code( $seq, $gen_code_hash [, $met_start] ) =cut @aa_1_letter_order = qw( A C D E F G H I K L M N P Q R S T V W Y ); # Alpha by 1 letter @aa_3_letter_order = qw( A R N D C Q E G H I L K M F P S T W Y V ); # PAM matrix order @aa_n_codon_order = qw( L R S A G P T V I C D E F H K N Q Y M W ); %genetic_code = ( # DNA version TTT => 'F', TCT => 'S', TAT => 'Y', TGT => 'C', TTC => 'F', TCC => 'S', TAC => 'Y', TGC => 'C', TTA => 'L', TCA => 'S', TAA => '*', TGA => '*', TTG => 'L', TCG => 'S', TAG => '*', TGG => 'W', CTT => 'L', CCT => 'P', CAT => 'H', CGT => 'R', CTC => 'L', CCC => 'P', CAC => 'H', CGC => 'R', CTA => 'L', CCA => 'P', CAA => 'Q', CGA => 'R', CTG => 'L', CCG => 'P', CAG => 'Q', CGG => 'R', ATT => 'I', ACT => 'T', AAT => 'N', AGT => 'S', ATC => 'I', ACC => 'T', AAC => 'N', AGC => 'S', ATA => 'I', ACA => 'T', AAA => 'K', AGA => 'R', ATG => 'M', ACG => 'T', AAG => 'K', AGG => 'R', GTT => 'V', GCT => 'A', GAT => 'D', GGT => 'G', GTC => 'V', GCC => 'A', GAC => 'D', GGC => 'G', GTA => 'V', GCA => 'A', GAA => 'E', GGA => 'G', GTG => 'V', GCG => 'A', GAG => 'E', GGG => 'G', # The following ambiguous encodings are not necessary, but # speed up the processing of some ambiguous triplets: TTY => 'F', TCY => 'S', TAY => 'Y', TGY => 'C', TTR => 'L', TCR => 'S', TAR => '*', TCN => 'S', CTY => 'L', CCY => 'P', CAY => 'H', CGY => 'R', CTR => 'L', CCR => 'P', CAR => 'Q', CGR => 'R', CTN => 'L', CCN => 'P', CGN => 'R', ATY => 'I', ACY => 'T', AAY => 'N', AGY => 'S', ACR => 'T', AAR => 'K', AGR => 'R', ACN => 'T', GTY => 'V', GCY => 'A', GAY => 'D', GGY => 'G', GTR => 'V', GCR => 'A', GAR => 'E', GGR => 'G', GTN => 'V', GCN => 'A', GGN => 'G' ); # Add RNA by construction: foreach ( grep { /T/ } keys %genetic_code ) { my $codon = $_; $codon =~ s/T/U/g; $genetic_code{ $codon } = lc $genetic_code{ $_ } } # Add lower case by construction: foreach ( keys %genetic_code ) { $genetic_code{ lc $_ } = lc $genetic_code{ $_ } } # Construct the genetic code with selenocysteine by difference: %genetic_code_with_U = %genetic_code; $genetic_code_with_U{ TGA } = 'U'; $genetic_code_with_U{ tga } = 'u'; $genetic_code_with_U{ UGA } = 'U'; $genetic_code_with_U{ uga } = 'u'; %amino_acid_codons_DNA = ( L => [ qw( TTA TTG CTA CTG CTT CTC ) ], R => [ qw( AGA AGG CGA CGG CGT CGC ) ], S => [ qw( AGT AGC TCA TCG TCT TCC ) ], A => [ qw( GCA GCG GCT GCC ) ], G => [ qw( GGA GGG GGT GGC ) ], P => [ qw( CCA CCG CCT CCC ) ], T => [ qw( ACA ACG ACT ACC ) ], V => [ qw( GTA GTG GTT GTC ) ], I => [ qw( ATA ATT ATC ) ], C => [ qw( TGT TGC ) ], D => [ qw( GAT GAC ) ], E => [ qw( GAA GAG ) ], F => [ qw( TTT TTC ) ], H => [ qw( CAT CAC ) ], K => [ qw( AAA AAG ) ], N => [ qw( AAT AAC ) ], Q => [ qw( CAA CAG ) ], Y => [ qw( TAT TAC ) ], M => [ qw( ATG ) ], U => [ qw( TGA ) ], W => [ qw( TGG ) ], l => [ qw( tta ttg cta ctg ctt ctc ) ], r => [ qw( aga agg cga cgg cgt cgc ) ], s => [ qw( agt agc tca tcg tct tcc ) ], a => [ qw( gca gcg gct gcc ) ], g => [ qw( gga ggg ggt ggc ) ], p => [ qw( cca ccg cct ccc ) ], t => [ qw( aca acg act acc ) ], v => [ qw( gta gtg gtt gtc ) ], i => [ qw( ata att atc ) ], c => [ qw( tgt tgc ) ], d => [ qw( gat gac ) ], e => [ qw( gaa gag ) ], f => [ qw( ttt ttc ) ], h => [ qw( cat cac ) ], k => [ qw( aaa aag ) ], n => [ qw( aat aac ) ], q => [ qw( caa cag ) ], y => [ qw( tat tac ) ], m => [ qw( atg ) ], u => [ qw( tga ) ], w => [ qw( tgg ) ], '*' => [ qw( TAA TAG TGA ) ] ); %amino_acid_codons_RNA = ( L => [ qw( UUA UUG CUA CUG CUU CUC ) ], R => [ qw( AGA AGG CGA CGG CGU CGC ) ], S => [ qw( AGU AGC UCA UCG UCU UCC ) ], A => [ qw( GCA GCG GCU GCC ) ], G => [ qw( GGA GGG GGU GGC ) ], P => [ qw( CCA CCG CCU CCC ) ], T => [ qw( ACA ACG ACU ACC ) ], V => [ qw( GUA GUG GUU GUC ) ], B => [ qw( GAU GAC AAU AAC ) ], Z => [ qw( GAA GAG CAA CAG ) ], I => [ qw( AUA AUU AUC ) ], C => [ qw( UGU UGC ) ], D => [ qw( GAU GAC ) ], E => [ qw( GAA GAG ) ], F => [ qw( UUU UUC ) ], H => [ qw( CAU CAC ) ], K => [ qw( AAA AAG ) ], N => [ qw( AAU AAC ) ], Q => [ qw( CAA CAG ) ], Y => [ qw( UAU UAC ) ], M => [ qw( AUG ) ], U => [ qw( UGA ) ], W => [ qw( UGG ) ], l => [ qw( uua uug cua cug cuu cuc ) ], r => [ qw( aga agg cga cgg cgu cgc ) ], s => [ qw( agu agc uca ucg ucu ucc ) ], a => [ qw( gca gcg gcu gcc ) ], g => [ qw( gga ggg ggu ggc ) ], p => [ qw( cca ccg ccu ccc ) ], t => [ qw( aca acg acu acc ) ], v => [ qw( gua gug guu guc ) ], b => [ qw( gau gac aau aac ) ], z => [ qw( gaa gag caa cag ) ], i => [ qw( aua auu auc ) ], c => [ qw( ugu ugc ) ], d => [ qw( gau gac ) ], e => [ qw( gaa gag ) ], f => [ qw( uuu uuc ) ], h => [ qw( cau cac ) ], k => [ qw( aaa aag ) ], n => [ qw( aau aac ) ], q => [ qw( caa cag ) ], y => [ qw( uau uac ) ], m => [ qw( aug ) ], u => [ qw( uga ) ], w => [ qw( ugg ) ], '*' => [ qw( UAA UAG UGA ) ] ); %n_codon_for_aa = map { $_ => scalar @{ $amino_acid_codons_DNA{ $_ } } } keys %amino_acid_codons_DNA; %reverse_genetic_code_DNA = ( A => "GCN", a => "gcn", C => "TGY", c => "tgy", D => "GAY", d => "gay", E => "GAR", e => "gar", F => "TTY", f => "tty", G => "GGN", g => "ggn", H => "CAY", h => "cay", I => "ATH", i => "ath", K => "AAR", k => "aar", L => "YTN", l => "ytn", M => "ATG", m => "atg", N => "AAY", n => "aay", P => "CCN", p => "ccn", Q => "CAR", q => "car", R => "MGN", r => "mgn", S => "WSN", s => "wsn", T => "ACN", t => "acn", U => "TGA", u => "tga", V => "GTN", v => "gtn", W => "TGG", w => "tgg", X => "NNN", x => "nnn", Y => "TAY", y => "tay", '*' => "TRR" ); %reverse_genetic_code_RNA = ( A => "GCN", a => "gcn", C => "UGY", c => "ugy", D => "GAY", d => "gay", E => "GAR", e => "gar", F => "UUY", f => "uuy", G => "GGN", g => "ggn", H => "CAY", h => "cay", I => "AUH", i => "auh", K => "AAR", k => "aar", L => "YUN", l => "yun", M => "AUG", m => "aug", N => "AAY", n => "aay", P => "CCN", p => "ccn", Q => "CAR", q => "car", R => "MGN", r => "mgn", S => "WSN", s => "wsn", T => "ACN", t => "acn", U => "UGA", u => "uga", V => "GUN", v => "gun", W => "UGG", w => "ugg", X => "NNN", x => "nnn", Y => "UAY", y => "uay", '*' => "URR" ); %DNA_letter_can_be = ( A => ["A"], a => ["a"], B => ["C", "G", "T"], b => ["c", "g", "t"], C => ["C"], c => ["c"], D => ["A", "G", "T"], d => ["a", "g", "t"], G => ["G"], g => ["g"], H => ["A", "C", "T"], h => ["a", "c", "t"], K => ["G", "T"], k => ["g", "t"], M => ["A", "C"], m => ["a", "c"], N => ["A", "C", "G", "T"], n => ["a", "c", "g", "t"], R => ["A", "G"], r => ["a", "g"], S => ["C", "G"], s => ["c", "g"], T => ["T"], t => ["t"], U => ["T"], u => ["t"], V => ["A", "C", "G"], v => ["a", "c", "g"], W => ["A", "T"], w => ["a", "t"], Y => ["C", "T"], y => ["c", "t"] ); %RNA_letter_can_be = ( A => ["A"], a => ["a"], B => ["C", "G", "U"], b => ["c", "g", "u"], C => ["C"], c => ["c"], D => ["A", "G", "U"], d => ["a", "g", "u"], G => ["G"], g => ["g"], H => ["A", "C", "U"], h => ["a", "c", "u"], K => ["G", "U"], k => ["g", "u"], M => ["A", "C"], m => ["a", "c"], N => ["A", "C", "G", "U"], n => ["a", "c", "g", "u"], R => ["A", "G"], r => ["a", "g"], S => ["C", "G"], s => ["c", "g"], T => ["U"], t => ["u"], U => ["U"], u => ["u"], V => ["A", "C", "G"], v => ["a", "c", "g"], W => ["A", "U"], w => ["a", "u"], Y => ["C", "U"], y => ["c", "u"] ); %one_letter_to_three_letter_aa = ( A => "Ala", a => "Ala", B => "Asx", b => "Asx", C => "Cys", c => "Cys", D => "Asp", d => "Asp", E => "Glu", e => "Glu", F => "Phe", f => "Phe", G => "Gly", g => "Gly", H => "His", h => "His", I => "Ile", i => "Ile", K => "Lys", k => "Lys", L => "Leu", l => "Leu", M => "Met", m => "Met", N => "Asn", n => "Asn", P => "Pro", p => "Pro", Q => "Gln", q => "Gln", R => "Arg", r => "Arg", S => "Ser", s => "Ser", T => "Thr", t => "Thr", U => "Sec", u => "Sec", V => "Val", v => "Val", W => "Trp", w => "Trp", X => "Xxx", x => "Xxx", Y => "Tyr", y => "Tyr", Z => "Glx", z => "Glx", '*' => "***" ); %three_letter_to_one_letter_aa = ( ALA => "A", Ala => "A", ala => "a", ARG => "R", Arg => "R", arg => "r", ASN => "N", Asn => "N", asn => "n", ASP => "D", Asp => "D", asp => "d", ASX => "B", Asx => "B", asx => "b", CYS => "C", Cys => "C", cys => "c", GLN => "Q", Gln => "Q", gln => "q", GLU => "E", Glu => "E", glu => "e", GLX => "Z", Glx => "Z", glx => "z", GLY => "G", Gly => "G", gly => "g", HIS => "H", His => "H", his => "h", ILE => "I", Ile => "I", ile => "i", LEU => "L", Leu => "L", leu => "l", LYS => "K", Lys => "K", lys => "k", MET => "M", Met => "M", met => "m", PHE => "F", Phe => "F", phe => "f", PRO => "P", Pro => "P", pro => "p", SEC => "U", Sec => "U", sec => "u", SER => "S", Ser => "S", ser => "s", THR => "T", Thr => "T", thr => "t", TRP => "W", Trp => "W", trp => "w", TYR => "Y", Tyr => "Y", tyr => "y", VAL => "V", Val => "V", val => "v", XAA => "X", Xaa => "X", xaa => "x", XXX => "X", Xxx => "X", xxx => "x", '***' => "*" ); sub translate_seq { my $seq = shift; $seq =~ tr/-//d; # remove gaps my @codons = $seq =~ m/(...?)/g; # Will try to translate last 2 nt # A second argument that is true forces first amino acid to be Met my @met; if ( ( shift @_ ) && ( my $codon1 = shift @codons ) ) { push @met, ( $codon1 =~ /[a-z]/ ? 'm' : 'M' ); } join( '', @met, map { translate_codon( $_ ) } @codons ) } =pod Translate a single triplet with "universal" genetic code. $aa = translate_codon( $triplet ) $aa = translate_DNA_codon( $triplet ) $aa = translate_uc_DNA_codon( $triplet ) =cut sub translate_DNA_codon { translate_codon( @_ ) } sub translate_uc_DNA_codon { translate_codon( uc $_[0] ) } sub translate_codon { my $codon = shift; $codon =~ tr/Uu/Tt/; # Make it DNA # Try a simple lookup: my $aa; if ( $aa = $genetic_code{ $codon } ) { return $aa } # Attempt to recover from mixed-case codons: $codon = ( $codon =~ /^.[a-z]/ ) ? lc $codon : uc $codon; if ( $aa = $genetic_code{ $codon } ) { return $aa } # The code defined above catches simple N, R and Y ambiguities in the # third position. Other codons (e.g., GG[KMSWBDHV], or even GG) might # be unambiguously translated by converting the last position to N and # seeing if this is in the code table: my $N = ( $codon =~ /^.[a-z]/ ) ? 'n' : 'N'; if ( $aa = $genetic_code{ substr($codon,0,2) . $N } ) { return $aa } # Test that codon is valid for an unambiguous aa: my $X = ( $codon =~ /^.[a-z]/ ) ? 'x' : 'X'; if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/i && $codon !~ m/^YT[AGR]$/i # Leu YTR && $codon !~ m/^MG[AGR]$/i # Arg MGR ) { return $X; } # Expand all ambiguous nucleotides to see if they all yield same aa. my @n1 = @{ $DNA_letter_can_be{ substr( $codon, 0, 1 ) } }; my $n2 = substr( $codon, 1, 1 ); my @n3 = @{ $DNA_letter_can_be{ substr( $codon, 2, 1 ) } }; my @triples = map { my $n12 = $_ . $n2; map { $n12 . $_ } @n3 } @n1; my $triple = shift @triples; $aa = $genetic_code{ $triple }; $aa or return $X; foreach $triple ( @triples ) { return $X if $aa ne $genetic_code{$triple} } return $aa; } =pod Translate with a user-supplied genetic code to translate a sequence. Diagnose the use of upper versus lower, and T versus U in the supplied code, and transform the supplied nucleotide sequence to match. $aa = translate_seq_with_user_code( $nt, \%gen_code ) $aa = translate_seq_with_user_code( $nt, \%gen_code, $start_with_met ) Modified 2007-11-22 to be less intrusive in these diagnoses by sensing the presence of both versions in the user code. =cut sub translate_seq_with_user_code { my $seq = shift; $seq =~ tr/-//d; # remove gaps *** Why? my $gc = shift; # Reference to hash of code if (! $gc || ref($gc) ne "HASH") { print STDERR "translate_seq_with_user_code needs genetic code hash as second argument."; return undef; } # Test code support for upper vs lower case: my ( $TTT, $UUU ); if ( $gc->{AAA} && ! $gc->{aaa} ) # Uppercase only code table { $seq = uc $seq; # Uppercase sequence ( $TTT, $UUU ) = ( 'TTT', 'UUU' ); } elsif ( $gc->{aaa} && ! $gc->{AAA} ) # Lowercase only code table { $seq = lc $seq; # Lowercase sequence ( $TTT, $UUU ) = ( 'ttt', 'uuu' ); } elsif ( $gc->{aaa} ) { ( $TTT, $UUU ) = ( 'ttt', 'uuu' ); } else { print STDERR "User-supplied genetic code does not have aaa or AAA\n"; return undef; } # Test code support for U vs T: my $ambigs; if ( $gc->{$UUU} && ! $gc->{$TTT} ) # RNA only code table { $seq =~ tr/Tt/Uu/; $ambigs = \%RNA_letter_can_be; } elsif ( $gc->{$TTT} && ! $gc->{$UUU} ) # DNA only code table { $seq =~ tr/Uu/Tt/; $ambigs = \%DNA_letter_can_be; } else { my $t = $seq =~ tr/Tt//; my $u = $seq =~ tr/Uu//; $ambigs = ( $t > $u ) ? \%DNA_letter_can_be : \%RNA_letter_can_be; } # We can now do the codon-by-codon translation: my @codons = $seq =~ m/(...?)/g; # will try to translate last 2 nt # A third argument that is true forces first amino acid to be Met my @met; if ( ( shift @_ ) && ( my $codon1 = shift @codons ) ) { push @met, ( $codon1 =~ /[a-z]/ ? 'm' : 'M' ); } join( '', @met, map { translate_codon_with_user_code( $_, $gc, $ambigs ) } @codons ) } =head3 translate_codon_with_user_code Translate with user-supplied genetic code hash. No error check on the code. Should only be called through translate_seq_with_user_code. $aa = translate_codon_with_user_code( $triplet, \%code, \%ambig_table ) =over 4 =item $triplet speaks for itself =item \%code ref to the hash with the codon translations =item \%ambig_table ref to hash with lists of nucleotides for each ambig code =back =cut sub translate_codon_with_user_code { my ( $codon, $gc, $ambigs ) = @_; # Try a simple lookup: my $aa; if ( $aa = $gc->{ $codon } ) { return $aa } # Attempt to recover from mixed-case codons: if ( $aa = $gc->{ lc $codon } ) { return( $gc->{ $codon } = $aa ) } if ( $aa = $gc->{ uc $codon } ) { return( $gc->{ $codon } = $aa ) } $codon = ( $codon =~ /[a-z]/ ) ? lc $codon : uc $codon; # Test that codon is valid for an unambiguous aa: my $X = ( $codon =~ /[a-z]/ ) ? 'x' : 'X'; if ( $codon =~ m/^[ACGTU][ACGTU]$/i ) # Add N? { $codon .= ( $codon =~ /[a-z]/ ) ? 'n' : 'N'; } # This makes assumptions about the user code, but tranlating ambiguous # codons is really a bit off the wall to start with: elsif ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) # Valid? { return( $gc->{ $codon } = $X ); } # Expand all ambiguous nucleotides to see if they all yield same aa. my @n1 = @{ $ambigs->{ substr( $codon, 0, 1 ) } }; my $n2 = substr( $codon, 1, 1 ); my @n3 = @{ $ambigs->{ substr( $codon, 2, 1 ) } }; my @triples = map { my $n12 = $_ . $n2; map { $n12 . $_ } @n3 } @n1; my $triple = shift @triples; $aa = $gc->{ $triple } || $gc->{ lc $triple } || $gc->{ uc $triple }; $aa or return( $gc->{ $codon } = $X ); foreach $triple ( @triples ) { return( $gc->{ $codon } = $X ) if $aa ne ( $gc->{$triple} || $gc->{lc $triple} || $gc->{uc $triple} ); } return( $gc->{ $codon } = $aa ); } =head2 read_intervals Read a list of intervals from a file. Allow id_start_end, or id \s start \s end formats @intervals = read_intervals( \*FILEHANDLE ) =cut sub read_intervals { my $fh = shift; my @intervals = (); while (<$fh>) { chomp; /^(\S+)_(\d+)_(\d+)(\s.*)?$/ # id_start_end WIT2 || /^(\S+)_(\d+)-(\d+)(\s.*)?$/ # id_start-end ??? || /^(\S+)=(\d+)=(\d+)(\s.*)?$/ # id=start=end Badger || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/ # id \s start \s end || next; # Matched a pattern. Store reference to (id, left, right): push @intervals, ($2 < $3) ? [ $1, $2+0, $3+0 ] : [ $1, $3+0, $2+0 ]; } return @intervals; } =head2 standardize_intervals Convert a list of intervals to read [ id, left_end, right_end ]. @intervals = standardize_intervals( @interval_refs ) =cut sub standardize_intervals { map { ( $_->[1] < $_->[2] ) ? $_ : [ $_->[0], $_->[2], $_->[1] ] } @_; } =head2 join_intervals Take the union of a list of intervals @joined = join_intervals( @interval_refs ) =cut sub join_intervals { my @ordered = sort { $a->[0] cmp $b->[0] # first by id || $a->[1] <=> $b->[1] # next by left end || $b->[2] <=> $a->[2] # finally longest first } @_; my @joined = (); my $n_int = @ordered; my ($cur_id) = ""; my ($cur_left) = -1; my ($cur_right) = -1; my ($new_id, $new_left, $new_right); for (my $i = 0; $i < $n_int; $i++) { ($new_id, $new_left, $new_right) = @{$ordered[$i]}; # get the new data if ( ( $new_id ne $cur_id) # if new contig || ( $new_left > $cur_right + 1) # or not touching previous ) { # push the previous interval if ($cur_id) { push (@joined, [ $cur_id, $cur_left, $cur_right ]) } $cur_id = $new_id; # update the current interval $cur_left = $new_left; $cur_right = $new_right; } elsif ($new_right > $cur_right) { # extend the right end if necessary $cur_right = $new_right; } } if ($cur_id) { push (@joined, [$cur_id, $cur_left, $cur_right]) } return @joined; } =head2 locations_2_intervals Split location strings to oriented intervals. @intervals = locations_2_intervals( @locations ) $interval = locations_2_intervals( $location ) =cut sub locations_2_intervals { my @intervals = map { /^(\S+)_(\d+)_(\d+)(\s.*)?$/ || /^(\S+)_(\d+)-(\d+)(\s.*)?$/ || /^(\S+)=(\d+)=(\d+)(\s.*)?$/ || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/ ? [ $1, $2+0, $3+0 ] : () } @_; return wantarray ? @intervals : $intervals[0]; } =head2 read_oriented_intervals Read a list of oriented intervals from a file. Allow id_start_end, or id \s start \s end formats @intervals = read_oriented_intervals( \*FILEHANDLE ) =cut sub read_oriented_intervals { my $fh = shift; my @intervals = (); while (<$fh>) { chomp; /^(\S+)_(\d+)_(\d+)(\s.*)?$/ # id_start_end WIT2 || /^(\S+)_(\d+)-(\d+)(\s.*)?$/ # id_start-end ??? || /^(\S+)=(\d+)=(\d+)(\s.*)?$/ # id=start=end Badger || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/ # id \s start \s end || next; # Matched a pattern. Store reference to (id, start, end): push @intervals, [ $1, $2+0, $3+0 ]; } return @intervals; } =head2 reverse_intervals Reverse the orientation of a list of intervals @reversed = reverse_intervals( @interval_refs ) =cut sub reverse_intervals { map { [ $_->[0], $_->[2], $_->[1] ] } @_; } =head2 gb_location_2_seed Convert GenBank locations to SEED locations @seed_locs = gb_location_2_seed( $contig, @gb_locs ) =cut sub gb_location_2_seed { my $contig = shift @_; $contig or die "First arg of gb_location_2_seed must be contig_id\n"; map { join( ',', gb_loc_2_seed_2( $contig, $_ ) ) || undef } @_ } sub gb_loc_2_seed_2 { my ( $contig, $loc ) = @_; if ( $loc =~ /^(\d+)\.\.(\d+)$/ ) { join( '_', $contig, $1, $2 ) } elsif ( $loc =~ /^join\((.*)\)$/ ) { $loc = $1; my $lvl = 0; for ( my $i = length( $loc )-1; $i >= 0; $i-- ) { for ( substr( $loc, $i, 1 ) ) { /,/ && ! $lvl and substr( $loc, $i, 1 ) = "\t"; /\(/ and $lvl--; /\)/ and $lvl++; } } $lvl == 0 or print STDERR "Paren matching error: $loc\n" and die; map { gb_loc_2_seed_2( $contig, $_ ) } split /\t/, $loc } elsif ( $loc =~ /^complement\((.*)\)$/ ) { map { s/_(\d+)_(\d+)$/_$2_$1/; $_ } reverse gb_loc_2_seed_2( $contig, $1 ) } else { () } } =head2 read_qual Save the contents in a list of refs to arrays: [ $id, $descript, \@qual ] @seq_entries = read_qual( ) # STDIN \@seq_entries = read_qual( ) # STDIN @seq_entries = read_qual( \*FILEHANDLE ) \@seq_entries = read_qual( \*FILEHANDLE ) @seq_entries = read_qual( $filename ) \@seq_entries = read_qual( $filename ) =cut sub read_qual { my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] ); $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_qual\n"; my @quals = (); my ($id, $desc, $qual) = ("", "", []); while ( <$fh> ) { chomp; if (/^>\s*(\S+)(\s+(.*))?$/) { # new id if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] } ($id, $desc, $qual) = ($1, $3 ? $3 : "", []); } else { push @$qual, split; } } close( $fh ) if $close; if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] } return wantarray ? @quals : \@quals; } =head2 fraction_nt_diff Fraction difference for an alignment of two nucleotide sequences in terms of number of differing residues, number of gaps, and number of gap opennings. $fraction_diff = fraction_nt_diff( $seq1, $seq2, \%options ) or $fraction_diff = fraction_nt_diff( $seq1, $seq2 ) $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_wgt ) $fraction_diff = fraction_nt_diff( $seq1, $seq2, $open_wgt, $extend_wgt ) Options: gap => $gap_wgt # Gap open and extend weight (D = 0.5) open => $open_wgt # Gap openning weight (D = gap_wgt) extend => $extend_wgt # Gap extension weight (D = open_wgt) t_gap => $term_gap_wgt # Terminal open and extend weight t_open => $term_open_wgt # Terminal gap open weight (D = open_wgt) t_extend => $term_extend_wgt # Terminal gap extend weight (D = extend_wgt) Default gap open and gap extend weights are 1/2. Beware that $fraction_diff = fraction_nt_diff( $seq1, $seq2, 1 ) and $fraction_diff = fraction_nt_diff( $seq1, $seq2, 1, 0 ) are different. The first has equal openning and extension weights, whereas the second has an openning weight of 1, and and extension weight of 0 (it only penalizes the number of runs of gaps). =cut sub fraction_nt_diff { my ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( @_[0,1] ); my $diff_scr; if ( ref( $_[2] ) eq 'HASH' ) { my $opts = $_[2]; my $gap_open = defined $opts->{ open } ? $opts->{ open } : defined $opts->{ gap } ? $opts->{ gap } : 0.5; my $gap_extend = defined $opts->{ extend } ? $opts->{ extend } : $gap_open; my $term_open = defined $opts->{ t_open } ? $opts->{ t_open } : defined $opts->{ t_gap } ? $opts->{ t_gap } : $gap_open; my $term_extend = defined $opts->{ t_extend } ? $opts->{ t_extend } : defined $opts->{ t_gap } ? $opts->{ t_gap } : $gap_extend; $nopen -= $topen; $ngap -= $tgap; $diff_scr = $ndif + $gap_open * $nopen + $gap_extend * ($ngap-$nopen) + $term_open * $topen + $term_extend * ($tgap-$topen); } else { my $gap_open = defined( $_[2] ) ? $_[2] : 0.5; my $gap_extend = defined( $_[3] ) ? $_[3] : $gap_open; $diff_scr = $ndif + $gap_open * $nopen + $gap_extend * ($ngap-$nopen); } my $ttl_scr = $nid + $diff_scr; $ttl_scr ? $diff_scr / $ttl_scr : undef } =head2 interpret_nt_align Interpret an alignment of two nucleotide sequences in terms of: useful aligned positions (unambiguous, and not a common gap), number of identical residues, number of differing residues, number of gaps, and number of gap opennings. ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( $seq1, $seq2 ) $npos = total aligned positons (= $nid + $ndif + $ngap) $nid = number of positions with identical nucleotides (ignoring case) $ndif = number of positions with differing nucleotides $ngap = number of positions with gap in one sequence but not the other $nopen = number of runs of gaps $tgap = number of gaps in runs adjacent to a terminus $topen = number of alignment ends with gaps Some of the methods might seem overly complex, but are necessary for cases in which the gaps switch strands in the alignment: seq1 ---ACGTGAC--TTGCAGAG seq2 TTT---TGACGG--GCAGGG mask 00000011110000111111 npos = 20 nid = 9 ndif = 1 ngap = 10 nopen = 4 tgap = 3 topen = 1 Although there are 4 gap opennings, there are only 2 runs in the mask, and the terminal run is length 6, not 3. (Why handle these? Because pairs of sequences from a multiple sequence alignment can look like this.) =cut sub interpret_nt_align { # Remove alignment columns that are not informative: my ( $s1, $s2 ) = useful_nt_align( @_[0,1] ); my $nmat = length( $s1 ); # Useful alignment length my $m1 = $s1; $m1 =~ tr/ACGT/\377/; # Nucleotides to all 1 bits $m1 =~ tr/\377/\000/c; # Others (gaps) to null byte my $m2 = $s2; $m2 =~ tr/ACGT/\377/; # Nucleotides to all 1 bits $m2 =~ tr/\377/\000/c; # Others (gaps) to null byte $m1 &= $m2; # Gap in either sequence becomes null $s1 &= $m1; # Apply mask to sequence 1 $s2 &= $m1; # Apply mask to sequence 2 my $nopen = @{[ $s1 =~ /\000+/g ]} # Gap opens in sequence 1 + @{[ $s2 =~ /\000+/g ]}; # Gap opens in sequence 2 my ( $tgap, $topen ) = ( 0, 0 ); if ( $s1 =~ /^(\000+)/ || $s2 =~ /^(\000+)/ ) { $tgap += length( $1 ); $topen++ } if ( $s1 =~ /(\000+)$/ || $s2 =~ /(\000+)$/ ) { $tgap += length( $1 ); $topen++ } $s1 =~ tr/\000//d; # Remove nulls (former gaps) $s2 =~ tr/\000//d; # Remove nulls (former gaps) my $ngap = $nmat - length( $s1 ); # Total gaps my $xor = $s1 ^ $s2; # xor of identical residues is null byte my $nid = ( $xor =~ tr/\000//d ); # Count the nulls (identical residues) my $ndif = $nmat - $nid - $ngap; ( $nmat, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) } sub useful_nt_align { my ( $s1, $s2 ) = map { uc $_ } @_; $s1 =~ tr/U/T/; # Convert U to T my $m1 = $s1; $m1 =~ tr/ACGT-/\377/; # Allowed symbols to hex FF byte $m1 =~ tr/\377/\000/c; # All else to null byte $s2 =~ tr/U/T/; # Convert U to T my $m2 = $s2; $m2 =~ tr/ACGT-/\377/; # Allowed symbols to hex FF byte $m2 =~ tr/\377/\000/c; # All else to null byte $m1 &= $m2; # Invalid in either sequence becomes null $s1 &= $m1; # Apply mask to sequence 1 $s1 =~ tr/\000//d; # Delete nulls in sequence 1 $s2 &= $m1; # Apply mask to sequence 2 $s2 =~ tr/\000//d; # Delete nulls in sequence 2 ( $s1, $s2 ) } =head2 interpret_aa_align Interpret an alignment of two protein sequences in terms of: useful aligned positions (unambiguous, and not a common gap), number of identical residues, number of differing residues, number of gaps, and number of gap openings. ( $nmat, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_aa_align( $seq1, $seq2 ) $nmat = total aligned positons (= $nid + $ndif + $ngap) $nid = number of positions with identical amino acids (ignoring case) $ndif = number of positions with differing amino acids $ngap = number of positions with gap in one sequence but not the other $nopen = number of runs of gaps $tgap = number of gaps in runs adjacent to a terminus $topen = number of alignment ends with gaps =cut sub interpret_aa_align { # Remove alignment columns that are not informative: my ( $s1, $s2 ) = useful_aa_align( @_[0,1] ); my $nmat = length( $s1 ); # Useful alignment length my $m1 = $s1; $m1 =~ tr/ACDEFGHIKLMNPQRSTUVWY/\377/; # Amino acids to all 1 bits $m1 =~ tr/\377/\000/c; # Others (gaps) to null byte my $m2 = $s2; $m2 =~ tr/ACDEFGHIKLMNPQRSTUVWY/\377/; # Amino acids to all 1 bits $m2 =~ tr/\377/\000/c; # Others (gaps) to null byte $m1 &= $m2; # Gap in either sequence becomes null $s1 &= $m1; # Apply mask to sequence 1 $s2 &= $m1; # Apply mask to sequence 2 my $nopen = @{[ $s1 =~ /\000+/g ]} # Gap opens in sequence 1 + @{[ $s2 =~ /\000+/g ]}; # Gap opens in sequence 2 my ( $tgap, $topen ) = ( 0, 0 ); if ( $s1 =~ /^(\000+)/ || $s2 =~ /^(\000+)/ ) { $tgap += length( $1 ); $topen++ } if ( $s1 =~ /(\000+)$/ || $s2 =~ /(\000+)$/ ) { $tgap += length( $1 ); $topen++ } $s1 =~ tr/\000//d; # Remove nulls (former gaps) $s2 =~ tr/\000//d; # Remove nulls (former gaps) my $ngap = $nmat - length( $s1 ); # Total gaps my $xor = $s1 ^ $s2; # xor of identical residues is null byte my $nid = ( $xor =~ tr/\000//d ); # Count the nulls (identical residues) my $ndif = $nmat - $nid - $ngap; ( $nmat, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) } =head2 useful_aa_align Remove alignment columns with a nonvalid character or shared gaps ( $seq1, $seq2 ) = useful_aa_align( $seq1, $seq2 ); =cut sub useful_aa_align { defined( $_[0] ) && defined( $_[1] ) or return (); my $m1 = my $s1 = uc $_[0]; my $m2 = my $s2 = uc $_[1]; # Valid characters in seq1 and seq2 $m1 =~ tr/ACDEFGHIKLMNPQRSTUVWY-/\377/; $m1 =~ tr/\377/\000/c; $m2 =~ tr/ACDEFGHIKLMNPQRSTUVWY-/\377/; $m2 =~ tr/\377/\000/c; $m1 &= $m2; # Valid in seq1 and seq2 # Shared gaps my $m3 = $s1; $m3 =~ tr/-/\000/; $m3 =~ tr/\000/\377/c; my $m4 = $s2; $m4 =~ tr/-/\000/; $m4 =~ tr/\000/\377/c; # Valid and not shared gap; $m1 &= $m3 | $m4; $s1 &= $m1; # Apply mask to sequence 1 $s1 =~ tr/\000//d; # Delete nulls in sequence 1 $s2 &= $m1; # Apply mask to sequence 2 $s2 =~ tr/\000//d; # Delete nulls in sequence 2 ( $s1, $s2 ); } =head2 trim_terminal_gap_columns Remove columns with terminal gaps ( $seq1, $seq2 ) = trim_terminal_gap_columns( $seq1, $seq2 ); =cut sub trim_terminal_gap_columns { defined( $_[0] ) && defined( $_[1] ) or return (); my $beg = max( $_[0] =~ /^(-+)/ ? length( $1 ) : 0, $_[1] =~ /^(-+)/ ? length( $1 ) : 0 ); my $end = max( $_[0] =~ /(-+)$/ ? length( $1 ) : 0, $_[1] =~ /(-+)$/ ? length( $1 ) : 0 ); my $len0 = length( $_[0] ); my $len = $len0 - ( $beg + $end ); return $len == $len0 ? @_[0,1] : $len > 0 ? ( substr( $_[0], $beg, $len ), substr( $_[1], $beg, $len ) ) : ( '', '' ); } sub max { $_[0] >= $_[1] ? $_[0] : $_[1] } =head2 oligomer_similarity Return the fraction identity for oligomers over a range of lengths @sims = oligomer_similarity( $seq1, $seq2, \%opts ) =cut sub oligomer_similarity { my ( $seq1, $seq2, $opts ) = @_; $seq1 && $seq2 or return (); $seq1 = $seq1->[2] if ref( $seq1 ) eq 'ARRAY'; $seq2 = $seq2->[2] if ref( $seq2 ) eq 'ARRAY'; $opts && ref( $opts ) eq 'HASH' or ( $opts = {} ); my $min = $opts->{ min } || $opts->{ max } || 2; my $max = $opts->{ max } || $min; return () if $max < $min; # Remove shared gaps my $mask1 = gap_mask( $seq1 ); my $mask2 = gap_mask( $seq2 ); my $mask = $mask1 | $mask2; $seq1 = $seq1 & $mask; # Apply mask to sequence $seq1 =~ tr/\000//d; # Delete null characters $seq2 = $seq2 & $mask; # Apply mask to sequence $seq2 =~ tr/\000//d; # Delete null characters # Remove terminal gaps $mask = $mask1 & $mask2; my $n1 = $mask =~ /^(\000+)/ ? length( $1 ) : 0; my $n2 = $mask =~ /(\000+)$/ ? length( $1 ) : 0; if ( $n1 || $n2 ) { my $len = length( $seq1 ) - ( $n1 + $n2 ); $seq1 = substr( $seq1, $n1, $len ); $seq2 = substr( $seq2, $n1, $len ); } # Find the runs of identity my $xor = $seq1 ^ $seq2; # xor of identical residues is null byte # Runs with one or more identities my %cnt; foreach ( $xor =~ m/(\000+)/g ) { $cnt{ length($_) }++ } map { my $n = $_; my $ttl = 0; for ( grep { $_ >= $n } keys %cnt ) { $ttl += $cnt{$_} * ( $_ - ($n-1) ) } my $nmax = length( $xor ) - ($n-1); $nmax > 0 ? $ttl / $nmax : undef; } ( $min .. $max ); } =head2 Guess Type of a Sequence $type = guess_seq_type( $sequence ) $bool = is_dna( $sequence ) # not RNA or prot $bool = is_rna( $sequence ) # not DNA or prot $bool = is_na( $sequence ) # nucleic acid, not prot $bool = is_prot( $sequence ) # not DNA or RNA $sequence can be a string, a reference to a string, or an id_def_seq triple. $type will be a keyword: 'DNA', 'RNA' or 'protein', or '' in case of error. =cut sub guess_seq_type { local $_ = ( ! $_[0] ) ? undef : ( ! ref( $_[0] ) ) ? $_[0] : ( ref( $_[0] ) eq 'SCALAR' ) ? ${$_[0]} : ( ref( $_[0] ) eq 'ARRAY' ) ? $_[0]->[2] : undef; return '' unless $_; my $nt = tr/ACGNTUacgntu//; # nucleotides my $aa = tr/ACDEFGHIKLMNPQRSTVWXYacdefghiklmnpqrstvwxy//; # amino acids return '' unless $nt + $aa > 10; return $nt < 0.75 * $aa ? 'protein' # amino acids > nucleotides : tr/EFILPQefilpq// ? '' # nonnucleotides are very bad : tr/Uu// > tr/Tt// ? 'RNA' : 'DNA'; } sub is_dna { guess_seq_type( $_[0] ) eq 'DNA' } sub is_rna { guess_seq_type( $_[0] ) eq 'RNA' } sub is_na { guess_seq_type( $_[0] ) =~ /^.NA$/ } sub is_prot { guess_seq_type( $_[0] ) =~ /^prot/ } =head2 Is (Array Of) Sequence Triple Verify the structure of an [ id, desc, sequence ] triple and the structure of an array of sequence triples $bool = is_sequence_triple( $triple ) $bool = is_array_of_sequence_triples( \@triples ) =cut sub is_sequence_triple { local $_ = shift; $_ && ( ref($_) eq 'ARRAY' ) && ( @$_ == 3 ) && defined( $_->[0] ) && defined( $_->[2] ); } sub is_array_of_sequence_triples { local $_ = $_[0]; $_ && ref( $_ ) eq 'ARRAY' && @$_ == grep { is_sequence_triple( $_ ) } @$_; } sub is_filehandle { my( $fh ) = @_; ref($fh) && ref($fh) ne 'HASH' && ref($fh) ne 'ARRAY' && ref($fh) ne 'SCALAR' && ((ref($fh) eq 'GLOB') || eval { $fh->can('print') }); } 1;