[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.6, Sun Jun 10 17:24:38 2007 UTC revision 1.22, Fri Sep 17 20:18:15 2010 UTC
# Line 1  Line 1 
1  package gjoseqlib;  package gjoseqlib;
2    
3    # This is a SAS component.
4    
5  #  A sequence entry is ( $id, $def, $seq )  #  A sequence entry is ( $id, $def, $seq )
6  #  A list of entries is a list of references  #  A list of entries is a list of references
7  #  #
8  #  @seq_entry   = read_next_fasta_seq( \*FILEHANDLE )  #  Efficient reading of an entire file of sequences:
9  #  @seq_entries = read_fasta_seqs( \*FILEHANDLE )   # Original form  #
10  #  @seq_entries = read_fasta( )                     # STDIN  #  @seq_entries = read_fasta( )                     # STDIN
11  #  @seq_entries = read_fasta( \*FILEHANDLE )  #  @seq_entries = read_fasta( \*FILEHANDLE )
12  #  @seq_entries = read_fasta(  $filename )  #  @seq_entries = read_fasta(  $filename )
13    #
14    #  Reading sequences one at a time to conserve memory.  Calls to different
15    #  files can be intermixed.
16    #
17    #  @entry = read_next_fasta_seq( \*FILEHANDLE )
18    # \@entry = read_next_fasta_seq( \*FILEHANDLE )
19    #  @entry = read_next_fasta_seq(  $filename )
20    # \@entry = read_next_fasta_seq(  $filename )
21    #  @entry = read_next_fasta_seq()                   # STDIN
22    # \@entry = read_next_fasta_seq()                   # STDIN
23    #
24    #  Legacy interface:
25    #  @seq_entries = read_fasta_seqs( \*FILEHANDLE )   # Original form
26    #
27    #  Reading clustal alignment.
28    #
29  #  @seq_entries = read_clustal( )                   # STDIN  #  @seq_entries = read_clustal( )                   # STDIN
30  #  @seq_entries = read_clustal( \*FILEHANDLE )  #  @seq_entries = read_clustal( \*FILEHANDLE )
31  #  @seq_entries = read_clustal(  $filename )  #  @seq_entries = read_clustal(  $filename )
32    #
33    #  Legacy interface:
34  #  @seq_entries = read_clustal_file(  $filename )  #  @seq_entries = read_clustal_file(  $filename )
35  #  #
36  #  $seq_ind   = index_seq_list( @seq_entries );   # hash from ids to entries  #  $seq_ind   = index_seq_list( @seq_entries );   # hash from ids to entries
# Line 21  Line 41 
41  #  ( $id, $def ) = parse_fasta_title( $title )  #  ( $id, $def ) = parse_fasta_title( $title )
42  #  ( $id, $def ) = split_fasta_title( $title )  #  ( $id, $def ) = split_fasta_title( $title )
43  #  #
44  #  print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );  # Original form  #  Write a fasta format file from sequences.
45    #
46  #  print_alignment_as_fasta(                @seq_entry_list ); # STDOUT  #  print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
47  #  print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT  #  print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
48  #  print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );  #  print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
49  #  print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );  #  print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
50  #  print_alignment_as_fasta(  $filename,    @seq_entry_list );  #  print_alignment_as_fasta(  $filename,    @seq_entry_list );
51  #  print_alignment_as_fasta(  $filename,   \@seq_entry_list );  #  print_alignment_as_fasta(  $filename,   \@seq_entry_list );
52    #
53    #  Legacy interface:
54    #  print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );  # Original form
55    #
56    #  Interface that it really meant for internal use to write the next sequence
57    #  to an open file:
58    #
59    #  print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq );
60    #  print_seq_as_fasta(               $id, $desc, $seq );
61    #  print_seq_as_fasta( \*FILEHANDLE, $id,        $seq );
62    #  print_seq_as_fasta(               $id,        $seq );
63    #
64    #  Write PHYLIP alignment.  Names might be altered to fit 10 character limit:
65    #
66  #  print_alignment_as_phylip(                @seq_entry_list ); # STDOUT  #  print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
67  #  print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT  #  print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
68  #  print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );  #  print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
69  #  print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );  #  print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
70  #  print_alignment_as_phylip(  $filename,    @seq_entry_list );  #  print_alignment_as_phylip(  $filename,    @seq_entry_list );
71  #  print_alignment_as_phylip(  $filename,   \@seq_entry_list );  #  print_alignment_as_phylip(  $filename,   \@seq_entry_list );
72    #
73    #  Write basic NEXUS alignment for PAUP.
74    #
75  #  print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );  #  print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
76  #  print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );  #  print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
77  #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );  #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
78  #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );  #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
79  #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );  #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
80  #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );  #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
81  #  print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq) ;  #
 #  print_seq_as_fasta( \*FILEHANDLE, @seq_entry );  
82  #  print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq );  #  print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq );
83  #  #
84    #  Remove extra columns of alignment gaps from an alignment:
85    #
86  #   @packed_seqs = pack_alignment(  @seqs )  #   @packed_seqs = pack_alignment(  @seqs )
87  #   @packed_seqs = pack_alignment( \@seqs )  #   @packed_seqs = pack_alignment( \@seqs )
88  #  \@packed_seqs = pack_alignment(  @seqs )  #  \@packed_seqs = pack_alignment(  @seqs )
89  #  \@packed_seqs = pack_alignment( \@seqs )  #  \@packed_seqs = pack_alignment( \@seqs )
90  #  #
91    #  Pack mask for an alignment (gap = 0x00, others are 0xFF)
92    #
93    #   $mask = alignment_gap_mask(  @seqs )
94    #   $mask = alignment_gap_mask( \@seqs )
95    #
96    #  Pack a sequence alignment according to a mask:
97    #
98    #   @packed = pack_alignment_by_mask( $mask,  @align )
99    #   @packed = pack_alignment_by_mask( $mask, \@align )
100    #  \@packed = pack_alignment_by_mask( $mask,  @align )
101    #  \@packed = pack_alignment_by_mask( $mask, \@align )
102    #
103    #  Expand sequence by a mask, adding indel at "\000" or '-' in mask:
104    #
105    #   $expanded = expand_sequence_by_mask( $seq, $mask )
106    #
107    #  Remove all alignment gaps from sequences:
108    #
109    #   @packed_seqs = pack_sequences(  @seqs )  #  Works for one sequence, too
110    #   @packed_seqs = pack_sequences( \@seqs )
111    #  \@packed_seqs = pack_sequences(  @seqs )
112    #  \@packed_seqs = pack_sequences( \@seqs )
113    #
114    # Basic sequence manipulation functions:
115    #
116  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
117  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
118    #  $DNAseq = DNA_subseq(  $seq, $from, $to );
119    #  $DNAseq = DNA_subseq( \$seq, $from, $to );
120    #  $RNAseq = RNA_subseq(  $seq, $from, $to );
121    #  $RNAseq = RNA_subseq( \$seq, $from, $to );
122  #  @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );  #  @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );
123  #  @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );  #  @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );
124  #  $DNAseq = complement_DNA_seq( $NA_seq );  #  $DNAseq = complement_DNA_seq( $NA_seq );
# Line 60  Line 128 
128  #  $seq    = pack_seq( $sequence )  #  $seq    = pack_seq( $sequence )
129  #  $seq    = clean_ae_sequence( $seq )  #  $seq    = clean_ae_sequence( $seq )
130  #  #
131  #  $seq = translate_seq( $seq [, $met_start] )  #  $aa = translate_seq( $nt, $met_start )
132    #  $aa = translate_seq( $nt )
133  #  $aa  = translate_codon( $triplet );  #  $aa  = translate_codon( $triplet );
 #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );  
134  #  #
135  #  User-supplied genetic code must be upper case index and match the  #  User-supplied genetic code.  The supplied code needs to be complete in
136  #  DNA versus RNA type of sequence  #  RNA and/or DNA, and upper and/or lower case.  The program guesses based
137    #  on lysine and phenylalanine codons.
138    #
139    #  $aa = translate_seq_with_user_code( $nt, $gen_code_hash, $met_start )
140    #  $aa = translate_seq_with_user_code( $nt, $gen_code_hash )
141  #  #
142  #  Locations (= oriented intervals) are ( id, start, end )  #  Locations (= oriented intervals) are ( id, start, end )
143  #  Intervals are ( id, left, right )  #  Intervals are ( id, left, right )
# Line 81  Line 153 
153  #  Convert GenBank locations to SEED locations  #  Convert GenBank locations to SEED locations
154  #  #
155  #  @seed_locs = gb_location_2_seed( $contig, @gb_locs )  #  @seed_locs = gb_location_2_seed( $contig, @gb_locs )
156    #
157    #  Read quality scores from a fasta-like file:
158    #
159    #  @seq_entries = read_qual( )               #  STDIN
160    # \@seq_entries = read_qual( )               #  STDIN
161    #  @seq_entries = read_qual( \*FILEHANDLE )
162    # \@seq_entries = read_qual( \*FILEHANDLE )
163    #  @seq_entries = read_qual(  $filename )
164    # \@seq_entries = read_qual(  $filename )
165    #
166    #  Evaluate alignments:
167    #
168    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2, \%options )
169    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2 )
170    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_weight )
171    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_open, $gap_extend )
172    #
173    #  ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( $seq1, $seq2 )
174    #  ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_aa_align( $seq1, $seq2 )
175    #
176    #  @sims = oligomer_similarity( $seq1, $seq2, \%opts )
177    #
178    #===============================================================================
179    
180  use strict;  use strict;
181    use Carp;
182  use gjolib qw( wrap_text );  use Data::Dumper;
183    
184  #  Exported global variables:  #  Exported global variables:
185    
# Line 128  Line 222 
222          seq_data_by_id          seq_data_by_id
223    
224          pack_alignment          pack_alignment
225            alignment_gap_mask
226            pack_alignment_by_mask
227            expand_sequence_by_mask
228            pack_sequences
229    
230          subseq_DNA_entry          subseq_DNA_entry
231          subseq_RNA_entry          subseq_RNA_entry
232            DNA_subseq
233            RNA_subseq
234          complement_DNA_entry          complement_DNA_entry
235          complement_RNA_entry          complement_RNA_entry
236          complement_DNA_seq          complement_DNA_seq
# Line 152  Line 252 
252          reverse_intervals          reverse_intervals
253    
254          gb_location_2_seed          gb_location_2_seed
255    
256            read_qual
257    
258            fraction_nt_diff
259            interpret_nt_align
260            interpret_aa_align
261            oligomer_similarity
262          );          );
263    
264  our @EXPORT_OK = qw(  our @EXPORT_OK = qw(
# Line 198  Line 305 
305      if ( ! ref( $file ) )      if ( ! ref( $file ) )
306      {      {
307          my $fh;          my $fh;
308          -f $file or die "Could not find input file \"$file\"\n";          if    ( -f $file                       ) { }
309          open( $fh, "<$file" ) || die "Could not open \"$file\" for input\n";          elsif (    $file =~ /^>(.+)$/ && -f $1 ) { $file = $1 }
310            else { die "Could not find input file '$file'\n" }
311            open( $fh, "<$file" ) || die "Could not open '$file' for input\n";
312          return ( $fh, $file, 1 );          return ( $fh, $file, 1 );
313      }      }
314    
# Line 219  Line 328 
328    
329    
330  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
331  #  Read fasta sequences.  #  Read fasta sequences.  Save the contents in a list of refs to arrays:
332  #  Save the contents in a list of refs to arrays:  (id, description, seq)  #
333    #     $seq_entry = [ id, description, seq ]
334  #  #
335  #     @seq_entries = read_fasta( )               #  STDIN  #     @seq_entries = read_fasta( )               #  STDIN
336  #    \@seq_entries = read_fasta( )               #  STDIN  #    \@seq_entries = read_fasta( )               #  STDIN
# Line 228  Line 338 
338  #    \@seq_entries = read_fasta( \*FILEHANDLE )  #    \@seq_entries = read_fasta( \*FILEHANDLE )
339  #     @seq_entries = read_fasta(  $filename )  #     @seq_entries = read_fasta(  $filename )
340  #    \@seq_entries = read_fasta(  $filename )  #    \@seq_entries = read_fasta(  $filename )
341    #  #  @seq_entries = read_fasta( "command |" )   #  open and read from pipe
342    #  # \@seq_entries = read_fasta( "command |" )   #  open and read from pipe
343    #     @seq_entries = read_fasta( \$string )      #  reference to file as string
344    #    \@seq_entries = read_fasta( \$string )      #  reference to file as string
345    #
346    #-----------------------------------------------------------------------------
347    sub read_fasta
348    {
349        my @seqs;
350        if ( $_[0] && ref $_[0] eq 'SCALAR' )
351        {
352            @seqs = map { $_->[2] =~ tr/ \n\r\t//d; $_ }
353                    map { /^(\S+)([ \t]+([^\n]*\S)?\s*)?\n(.+)$/s ? [ $1, $3 || '', $4 ] : () }
354                    split /^>\s*/m, ${$_[0]};
355        }
356        else
357        {
358            @seqs = map { $_->[2] =~ tr/ \n\r\t//d; $_ }
359                    map { /^(\S+)([ \t]+([^\n]*\S)?\s*)?\n(.+)$/s ? [ $1, $3 || '', $4 ] : () }
360                    split /^>\s*/m, slurp( @_ );
361        }
362    
363        wantarray() ? @seqs : \@seqs;
364    }
365    
366    #-----------------------------------------------------------------------------
367    #  A fast file reader:
368    #
369    #     $data = slurp( )               #  \*STDIN
370    #     $data = slurp( \*FILEHANDLE )  #  an open file handle
371    #     $data = slurp(  $filename )    #  a file name
372    #     $data = slurp( "<$filename" )  #  file with explicit direction
373    #   # $data = slurp( "$command |" )  #  open and read from pipe
374    #
375    #  Note:  It is faster to read lines by reading the file and splitting
376    #         than by reading the lines sequentially.  If space is not an
377    #         issue, this is the way to go.  If space is an issue, then lines
378    #         or records should be processed one-by-one (rather than loading
379    #         the whole input into a string or array).
380    #-----------------------------------------------------------------------------
381    sub slurp
382    {
383        my ( $fh, $close );
384        if ( ref $_[0] eq 'GLOB' )
385        {
386            $fh = shift;
387        }
388        elsif ( $_[0] && ! ref $_[0] )
389        {
390            my $file = shift;
391            if    ( -f $file                       ) { $file = "<$file" }
392            elsif (    $file =~ /^<(.*)$/ && -f $1 ) { }  # Explicit read
393          # elsif (    $file =~ /\S\s*\|$/         ) { }  # Read from a pipe
394            else                                     { return undef }
395            open $fh, $file or return undef;
396            $close = 1;
397        }
398        else
399        {
400            $fh = \*STDIN;
401            $close = 0;
402        }
403    
404        my $out = '';
405        my $inc = 1048576;
406        my $end =       0;
407        my $read;
408        while ( $read = read( $fh, $out, $inc, $end ) ) { $end += $read }
409        close( $fh ) if $close;
410    
411        $out;
412    }
413    
414    
415    #-----------------------------------------------------------------------------
416    #  Previous, 50% slower fasta reader:
417  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
418  sub read_fasta {  sub read_fasta_0
419    {
420      my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );      my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
421      $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n";      $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n";
422    
# Line 255  Line 442 
442    
443    
444  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
445  #  Read one fasta sequence at a time from a file.  #  Read one fasta sequence at a time from a file.  This is half as fast a
446  #  Return the contents as an array:  (id, description, seq)  #  read_fasta(), but can handle an arbitrarily large file.  State information
447    #  is retained in hashes, so any number of streams can be interlaced.
448    #
449    #      @entry = read_next_fasta_seq( \*FILEHANDLE )
450    #     \@entry = read_next_fasta_seq( \*FILEHANDLE )
451    #      @entry = read_next_fasta_seq(  $filename )
452    #     \@entry = read_next_fasta_seq(  $filename )
453    #      @entry = read_next_fasta_seq()                # \*STDIN
454    #     \@entry = read_next_fasta_seq()                # \*STDIN
455  #  #
456  #     @seq_entry = read_next_fasta_seq( \*FILEHANDLE )  #      @entry = ( $id, $description, $seq )
457    #
458    #  When reading at the end of file, () is returned.
459    #  With a filename, reading past this will reopen the file at the beginning.
460  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
461  #  Reading always overshoots, so save next id and description  #  Reading always overshoots, so save next id and description
462    
463  {   #  Use bare block to scope the header hash  {   #  Use bare block to scope the header hash
464    
465      my %next_header;      my %next_header;
466        my %file_handle;
467        my %close_file;
468    
469      sub read_next_fasta_seq {      sub read_next_fasta_seq
470          my $fh = shift;      {
471          my ( $id, $desc );          $_[0] ||= \*STDIN;               #  Undefined $_[0] fails with use warn
472            my $fh = $file_handle{ $_[0] };
473            if ( ! $fh )
474            {
475                if ( ref $_[0] )
476                {
477                    return () if ref $_[0] ne 'GLOB';
478                    $fh = $_[0];
479                }
480                else
481                {
482                    my $file = $_[0];
483                    if    ( -f $file                       ) { $file = "<$file" }
484                    elsif (    $file =~ /^<(.*)$/ && -f $1 ) { }  # Explicit read
485                  # elsif (    $file =~ /\S\s*\|$/         ) { }  # Read from a pipe
486                    else                                     { return () }
487                    open $fh, $file or return ();
488                    $close_file{ $fh } = 1;
489                }
490                $file_handle{ $_[0] } = $fh;
491            }
492    
493          if ( defined( $next_header{$fh} ) ) {          my ( $id, $desc, $seq ) = ( undef, '', '' );
494            if ( defined( $next_header{$fh} ) )
495            {
496              ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );              ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
497          }          }
498          else {          else
499              $next_header{$fh} = "";          {
500              ( $id, $desc ) = ( undef, "" );              $next_header{$fh} = '';
501          }          }
         my $seq = "";  
502    
503          while ( <$fh> ) {          while ( <$fh> )
504            {
505              chomp;              chomp;
506              if ( /^>/ ) {        #  new id              if ( /^>/ )        #  new id
507                {
508                  $next_header{$fh} = $_;                  $next_header{$fh} = $_;
509                  if ( defined($id) && $seq )                  if ( defined($id) && $seq )
510                  {                  {
511                      return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq]                      return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq]
512                  }                  }
513                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
514                  $seq = "";                  $seq = '';
515              }              }
516              else {              else
517                  tr/     0-9//d;              {
518                    tr/ \t\r//d;
519                  $seq .= $_ ;                  $seq .= $_ ;
520              }              }
521          }          }
522    
523          #  Done with file, delete "next header"          #  Done with file; there is no next header:
524    
525          delete $next_header{$fh};          delete $next_header{$fh};
526          return ( defined($id) && $seq ) ? ( wantarray ? ($id, $desc, $seq)  
527                                                        : [$id, $desc, $seq]          #  Return last set of data:
528                                            )  
529                                          : () ;          if ( defined($id) && $seq )
530            {
531                return wantarray ? ($id,$desc,$seq) : [$id,$desc,$seq]
532            }
533    
534            #  Or close everything out (returning the empty list tells caller
535            #  that we are done)
536    
537            if ( $close_file{ $fh } ) { close $fh; delete $close_file{ $fh } }
538            delete $file_handle{ $_[0] };
539    
540            return ();
541      }      }
542  }  }
543    
# Line 352  Line 587 
587  #     ($id, $def) = parse_fasta_title( $title )  #     ($id, $def) = parse_fasta_title( $title )
588  #     ($id, $def) = split_fasta_title( $title )  #     ($id, $def) = split_fasta_title( $title )
589  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
590  sub parse_fasta_title {  sub parse_fasta_title
591    {
592      my $title = shift;      my $title = shift;
593      chomp;      chomp $title;
     if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {  
         return ($1, $3 ? $3 : "");  
     }  
     elsif ($title =~ /^>/) {  
         return ("", "");  
     }  
     else {  
         return (undef, "");  
     }  
 }  
594    
595  sub split_fasta_title {      return $title =~ /^>?\s*(\S+)(\s+(.*\S)?\s*)?$/ ? ( $1, $3 || '' )
596      parse_fasta_title ( shift );           : $title =~ /^>/                           ? ( '', '' )
597             :                                            ( undef, undef )
598  }  }
599    
600    sub split_fasta_title { parse_fasta_title( @_ ) }
601    
602    
603  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
604  #  Helper function for defining an output filehandle:  #  Helper function for defining an output filehandle:
# Line 377  Line 606 
606  #     string is taken as file name to be openend  #     string is taken as file name to be openend
607  #     undef or "" defaults to STDOUT  #     undef or "" defaults to STDOUT
608  #  #
609  #    ( \*FH, $name, $close [, $file] ) = output_filehandle( $file );  #    ( \*FH, $close [, $file] ) = output_filehandle( $file );
610  #  #
611  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
612  sub output_filehandle  sub output_filehandle
613  {  {
614      my $file = shift;      my $file = shift;
615    
616        #  Null string or undef
617    
618        return ( \*STDOUT, 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
619    
620      #  FILEHANDLE      #  FILEHANDLE
621    
622      return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );      return ( $file, 0 ) if ( ref( $file ) eq "GLOB" );
623    
624      #  Null string or undef      #  Some other kind of reference; return the unused value
625    
626      return ( \*STDOUT, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );      return ( \*STDOUT, 0, $file ) if ref( $file );
627    
628      #  File name      #  File name
629    
     if ( ! ref( $file ) )  
     {  
630          my $fh;          my $fh;
631          open( $fh, ">$file" ) || die "Could not open output $file\n";          open( $fh, ">$file" ) || die "Could not open output $file\n";
632          return ( $fh, $file, 1 );      return ( $fh, 1 );
     }  
   
     #  Some other kind of reference; return the unused value  
   
     return ( \*STDOUT, undef, 0, $file );  
633  }  }
634    
635    
# Line 427  Line 653 
653  #     print_alignment_as_fasta(  $filename,   \@seq_entry_list );  #     print_alignment_as_fasta(  $filename,   \@seq_entry_list );
654  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
655  sub print_alignment_as_fasta {  sub print_alignment_as_fasta {
656      my ( $fh, undef, $close, $unused ) = output_filehandle( shift );      my ( $fh, $close, $unused ) = output_filehandle( shift );
657      ( unshift @_, $unused ) if $unused;      ( unshift @_, $unused ) if $unused;
658    
659      ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_fasta\n";      ( ref( $_[0] ) eq "ARRAY" ) or confess "Bad sequence entry passed to print_alignment_as_fasta\n";
660    
661      #  Expand the sequence entry list if necessary:      #  Expand the sequence entry list if necessary:
662    
# Line 454  Line 680 
680  #     print_alignment_as_phylip(  $filename,   \@seq_entry_list );  #     print_alignment_as_phylip(  $filename,   \@seq_entry_list );
681  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
682  sub print_alignment_as_phylip {  sub print_alignment_as_phylip {
683      my ( $fh, undef, $close, $unused ) = output_filehandle( shift );      my ( $fh, $close, $unused ) = output_filehandle( shift );
684      ( unshift @_, $unused ) if $unused;      ( unshift @_, $unused ) if $unused;
685    
686      ( ref( $_[0] ) eq "ARRAY" ) or die die "Bad sequence entry passed to print_alignment_as_phylip\n";      ( ref( $_[0] ) eq "ARRAY" ) or die die "Bad sequence entry passed to print_alignment_as_phylip\n";
# Line 513  Line 739 
739  #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );  #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
740  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
741  sub print_alignment_as_nexus {  sub print_alignment_as_nexus {
742      my ( $fh, undef, $close, $unused ) = output_filehandle( shift );      my ( $fh, $close, $unused ) = output_filehandle( shift );
743      ( unshift @_, $unused ) if $unused;      ( unshift @_, $unused ) if $unused;
744    
745      my $lbls = ( ref( $_[0] ) eq "HASH" ) ? shift : undef;      my $lbls = ( ref( $_[0] ) eq "HASH" ) ? shift : undef;
# Line 587  Line 813 
813    
814    
815  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
816  #  Print one sequence in fasta format to an open file  #  Print one sequence in fasta format to an open file.
817  #  #
818  #     print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq );  #     print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq );
819  #     print_seq_as_fasta( \*FILEHANDLE, @seq_entry );  #     print_seq_as_fasta(               $id, $desc, $seq );
820  #-----------------------------------------------------------------------------  #     print_seq_as_fasta( \*FILEHANDLE, $id,        $seq );
821  sub print_seq_as_fasta {  #     print_seq_as_fasta(               $id,        $seq );
822      my $fh = shift;  #
823      my ($id, $desc, $seq) = @_;  #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
824    #  print_seq_as_fasta() is meant more as a internal support routine than an
825      printf $fh ($desc) ? ">$id $desc\n" : ">$id\n";  #  external interface.  To print a single sequence to a named file use:
826      my $len = length($seq);  #
827      for (my $i = 0; $i < $len; $i += 60) {  #     print_alignment_as_fasta( $filename, [ $id, $desc, $seq ] );
828          print $fh substr($seq, $i, 60) . "\n";  #     print_alignment_as_fasta( $filename, [ $id,        $seq ] );
829      }  #-----------------------------------------------------------------------------
830    sub print_seq_as_fasta
831    {
832        my $fh = ( ref $_[0] eq 'GLOB' ) ? shift : \*STDOUT;
833        return if ( @_ < 2 ) || ( @_ > 3 ) || ! ( defined $_[0] && defined $_[-1] );
834        #  Print header line
835        print $fh  ( @_ == 3 && defined $_[1] && $_[1] =~ /\S/ ) ? ">$_[0] $_[1]\n" : ">$_[0]\n";
836        #  Print sequence, 60 chars per line
837        print $fh  join( "\n", $_[-1] =~ m/.{1,60}/g ), "\n";
838  }  }
839    
840    
# Line 633  Line 867 
867    
868    
869  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
870    #  Return a string with text wrapped to defined line lengths:
871    #
872    #     $wrapped_text = wrap_text( $str )                  # default len   =  80
873    #     $wrapped_text = wrap_text( $str, $len )            # default ind   =   0
874    #     $wrapped_text = wrap_text( $str, $len, $indent )   # default ind_n = ind
875    #     $wrapped_text = wrap_text( $str, $len, $indent_1, $indent_n )
876    #-----------------------------------------------------------------------------
877    sub wrap_text {
878        my ($str, $len, $ind, $indn) = @_;
879    
880        defined($str)  || die "wrap_text called without a string\n";
881        defined($len)  || ($len  =   80);
882        defined($ind)  || ($ind  =    0);
883        ($ind  < $len) || die "wrap error: indent greater than line length\n";
884        defined($indn) || ($indn = $ind);
885        ($indn < $len) || die "wrap error: indent_n greater than line length\n";
886    
887        $str =~ s/\s+$//;
888        $str =~ s/^\s+//;
889        my ($maxchr, $maxchr1);
890        my (@lines) = ();
891    
892        while ($str) {
893            $maxchr1 = ($maxchr = $len - $ind) - 1;
894            if ($maxchr >= length($str)) {
895                push @lines, (" " x $ind) . $str;
896                last;
897            }
898            elsif ($str =~ /^(.{0,$maxchr1}\S)\s+(\S.*)$/) { # no expr in {}
899                push @lines, (" " x $ind) . $1;
900                $str = $2;
901            }
902            elsif ($str =~ /^(.{0,$maxchr1}-)(.*)$/) {
903                push @lines, (" " x $ind) . $1;
904                $str = $2;
905            }
906            else {
907                push @lines, (" " x $ind) . substr($str, 0, $maxchr);
908                $str = substr($str, $maxchr);
909            }
910            $ind = $indn;
911        }
912    
913        return join("\n", @lines);
914    }
915    
916    
917    #-----------------------------------------------------------------------------
918  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
919  #  #
920  #     my \%seq_ind  = index_seq_list(  @seq_list );  #     my \%seq_ind  = index_seq_list(  @seq_list );
# Line 681  Line 963 
963      return ${ $ind_ref->{$id} }[2];      return ${ $ind_ref->{$id} }[2];
964  }  }
965    
966    
967  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
968  #  Remove columns of alignment gaps from sequences:  #  Remove columns of alignment gaps from sequences:
969  #  #
# Line 689  Line 972 
972  #  \@packed_seqs = pack_alignment(  @seqs )  #  \@packed_seqs = pack_alignment(  @seqs )
973  #  \@packed_seqs = pack_alignment( \@seqs )  #  \@packed_seqs = pack_alignment( \@seqs )
974  #  #
975    #  Gap characters are defined below as '-', '~', '.', and ' '.
976  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
   
977  sub pack_alignment  sub pack_alignment
978  {  {
979      my @seqs = ( ref( $_[0] ) eq 'ARRAY' and ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;      $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] )
980            or return ();
981    
982        my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
983      @seqs or return wantarray ? () : [];      @seqs or return wantarray ? () : [];
984    
985      my $mask  = pack_mask( $seqs[0]->[2] );      my $mask  = gap_mask( $seqs[0]->[2] );
986      foreach ( @seqs[ 1 .. (@seqs-1) ] )      foreach ( @seqs[ 1 .. (@seqs-1) ] )
987      {      {
988          $mask |= pack_mask( $_->[2] );          $mask |= gap_mask( $_->[2] );
989      }      }
990    
991      my $seq;      my $seq;
# Line 709  Line 995 
995                      }                      }
996                  @seqs;                  @seqs;
997    
998      return wantarray ? @seqs2 : \@seqs2;      wantarray ? @seqs2 : \@seqs2;
999    }
1000    
1001    
1002    #-------------------------------------------------------------------------------
1003    #  Produce a packing mask for columns of alignment gaps in sequences.  Gap
1004    #  columns are 0x00 characters, and all others are 0xFF.
1005    #
1006    #   $mask = alignment_gap_mask(  @seqs )
1007    #   $mask = alignment_gap_mask( \@seqs )
1008    #
1009    #-------------------------------------------------------------------------------
1010    sub alignment_gap_mask
1011    {
1012        $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] )
1013            or return undef;
1014    
1015        my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
1016        @seqs or return undef;
1017    
1018        my $mask = gap_mask( $seqs[0]->[2] );
1019        foreach ( @seqs[ 1 .. (@seqs-1) ] ) { $mask |= gap_mask( $_->[2] ) }
1020    
1021        $mask;
1022    }
1023    
1024    
1025    #-------------------------------------------------------------------------------
1026    #  Pack a sequence alignment according to a mask, removing positions where
1027    #  mask has 0x00 (or '-') characters
1028    #
1029    #   @packed = pack_alignment_by_mask( $mask,  @align )
1030    #   @packed = pack_alignment_by_mask( $mask, \@align )
1031    #  \@packed = pack_alignment_by_mask( $mask,  @align )
1032    #  \@packed = pack_alignment_by_mask( $mask, \@align )
1033    #
1034    #-------------------------------------------------------------------------------
1035    sub pack_alignment_by_mask
1036    {
1037        my $mask = shift;
1038        defined $mask && ! ref( $mask ) && length( $mask )
1039            or return ();
1040        $mask =~ tr/-/\000/;      # Allow '-' as a column to be removed
1041        $mask =~ tr/\000/\377/c;  # Make sure that everything not null is 0xFF
1042    
1043        $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] )
1044            or return ();
1045        my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
1046    
1047        my $seq;
1048        my @seqs2 = map { $seq = $_->[2] & $mask;     # Apply mask to sequence
1049                          $seq =~ tr/\000//d;         # Delete null characters
1050                          [ $_->[0], $_->[1], $seq ]  # Rebuild sequence entries
1051                        }
1052                    @seqs;
1053    
1054        wantarray ? @seqs2 : \@seqs2;
1055  }  }
1056    
1057  sub pack_mask  
1058    #-------------------------------------------------------------------------------
1059    #  Weight a sequence alignment according to a mask of digits, 0-9.
1060    #
1061    #   @packed = weight_alignment_by_mask( $mask,  @align )
1062    #   @packed = weight_alignment_by_mask( $mask, \@align )
1063    #  \@packed = weight_alignment_by_mask( $mask,  @align )
1064    #  \@packed = weight_alignment_by_mask( $mask, \@align )
1065    #
1066    #-------------------------------------------------------------------------------
1067    sub weight_alignment_by_mask
1068  {  {
1069      my $mask = shift;      my $mask = shift;
1070      $mask =~ tr/-/\000/;      defined $mask && ! ref( $mask ) && length( $mask )
1071      $mask =~ tr/\000/\377/c;          or return ();
1072      return $mask;  
1073        $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] )
1074            or return ();
1075        my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
1076    
1077        my @seqs2 = map { [ $_->[0], $_->[1], weight_seq_by_mask_0( $mask, $_->[2] ) ] } @seqs;
1078    
1079        wantarray ? @seqs2 : \@seqs2;
1080  }  }
1081    
1082    
1083    #
1084    #  Assume data are valid
1085    #
1086    sub weight_seq_by_mask_0
1087    {
1088        my ( $mask, $seq ) = @_;
1089    
1090        #  Remove 0 weight columns, which is fast and easy:
1091        my $m0 = $mask;
1092        $m0 =~ tr/123456789/\377/;
1093        $m0 =~ tr/\377/\000/c;
1094        ( $mask &= $m0 ) =~ tr/\000//d;
1095        ( $seq  &= $m0 ) =~ tr/\000//d;
1096    
1097        #  If all remaining cols are weight 1, we are done:
1098        return $seq if $mask =~ /^1*$/;
1099    
1100        my @seq;
1101        for ( my $i = 0; $i < length( $mask ); $i++ )
1102        {
1103            push @seq, substr( $seq, $i, 1 ) x substr( $mask, $i, 1 );
1104        }
1105    
1106        join( '', @seq );
1107    }
1108    
1109    
1110    #-----------------------------------------------------------------------------
1111    #  Make a mask in which gap characters ('-', '~', '.', and ' ') are converted
1112    #  to 0x00, and all other characters to 0xFF.
1113    #
1114    #      $mask = gap_mask( $seq )
1115    #
1116    #-----------------------------------------------------------------------------
1117    sub gap_mask
1118    {
1119        my $mask = shift;
1120        defined $mask or return '';
1121    
1122        $mask =~ tr/-~. /\000/;    #  Gap characters (might be extended)
1123        $mask =~ tr/\000/\377/c;   #  Non-gap characters
1124        $mask;
1125    }
1126    
1127    
1128    #===============================================================================
1129    #  Expand sequences with the local gap character in a manner that reverses the
1130    #  pack by mask function.
1131    #
1132    #      $expanded = expand_sequence_by_mask( $seq, $mask )
1133    #
1134    #  The columns to be added can be marked by '-' or "\000" in the mask.
1135    #
1136    #  Code note:
1137    #
1138    #  The function attempts to match the local gap character in the sequence.
1139    #  $c0 and $c1 are the previous and next characters in the sequence being
1140    #  expanded.  (($c0,$c1)=($c1,shift @s1))[0] updates the values and evaluates
1141    #  to what was the next character, and becomes the new previous character.
1142    #  The following really does print w, x, y, and z, one per line:
1143    #
1144    #     ( $a, $b, @c ) = ("", split //, "wxyz");
1145    #     while ( defined $b ) { print( (($a,$b)=($b,shift @c))[0], "\n" ) }
1146    #===============================================================================
1147    
1148    sub expand_sequence_by_mask
1149    {
1150        my ( $seq, $mask ) = @_;
1151    
1152        $mask =~ tr/-/\000/;  #  Allow hyphen or null in mask at added positions.
1153        my ( $c0, $c1, @s1 ) = ( '', split( //, $seq ), '' );
1154    
1155        join '', map { $_  ne "\000"            ? (($c0,$c1)=($c1,shift @s1))[0]
1156                     : $c0 eq '~' || $c1 eq '~' ? '~'
1157                     : $c0 eq '.' || $c1 eq '.' ? '.'
1158                     :                            '-'
1159                     }
1160                 split //, $mask;
1161    }
1162    
1163    
1164    #-----------------------------------------------------------------------------
1165    #  Remove all alignment gaps from sequences:
1166    #
1167    #   @packed_seqs = pack_sequences(  @seqs )  # Also handles single sequence
1168    #   @packed_seqs = pack_sequences( \@seqs )
1169    #  \@packed_seqs = pack_sequences(  @seqs )
1170    #  \@packed_seqs = pack_sequences( \@seqs )
1171    #
1172    #-----------------------------------------------------------------------------
1173    sub pack_sequences
1174    {
1175        $_[0] && ( ref( $_[0] ) eq 'ARRAY' ) && @{$_[0]} && defined( $_[0]->[0] )
1176            or return ();
1177    
1178        my @seqs = ( ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
1179    
1180        my @seqs2 = map { [ $_->[0], $_->[1], pack_seq( $_->[2] ) ] } @seqs;
1181    
1182        wantarray ? @seqs2 : \@seqs2;
1183    }
1184    
1185    
1186  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1187  #  Some simple sequence manipulations:  #  Some simple sequence manipulations:
1188  #  #
# Line 794  Line 1257 
1257  }  }
1258    
1259    
1260    sub DNA_subseq
1261    {
1262        my ( $seq, $from, $to ) = @_;
1263    
1264        my $len = ref( $seq ) eq 'SCALAR' ? length( $$seq )
1265                                          : length(  $seq );
1266        if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len }
1267        if ( ( $to   eq '$' ) || ( ! $to       ) ) { $to   = $len }
1268    
1269        my $left  = ( $from < $to ) ? $from : $to;
1270        my $right = ( $from < $to ) ? $to   : $from;
1271        if ( ( $right < 1 ) || ( $left > $len ) ) { return "" }
1272        if ( $right > $len ) { $right = $len }
1273        if ( $left  < 1    ) { $left  =    1 }
1274    
1275        my $subseq = ref( $seq ) eq 'SCALAR' ? substr( $$seq, $left-1, $right-$left+1 )
1276                                             : substr(  $seq, $left-1, $right-$left+1 );
1277    
1278        if ( $from > $to )
1279        {
1280            $subseq = reverse $subseq;
1281            $subseq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
1282                         [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
1283        }
1284    
1285        $subseq
1286    }
1287    
1288    
1289    sub RNA_subseq
1290    {
1291        my ( $seq, $from, $to ) = @_;
1292    
1293        my $len = ref( $seq ) eq 'SCALAR' ? length( $$seq )
1294                                          : length(  $seq );
1295        if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len }
1296        if ( ( $to   eq '$' ) || ( ! $to       ) ) { $to   = $len }
1297    
1298        my $left  = ( $from < $to ) ? $from : $to;
1299        my $right = ( $from < $to ) ? $to   : $from;
1300        if ( ( $right < 1 ) || ( $left > $len ) ) { return "" }
1301        if ( $right > $len ) { $right = $len }
1302        if ( $left  < 1    ) { $left  =    1 }
1303    
1304        my $subseq = ref( $seq ) eq 'SCALAR' ? substr( $$seq, $left-1, $right-$left+1 )
1305                                             : substr(  $seq, $left-1, $right-$left+1 );
1306    
1307        if ( $from > $to )
1308        {
1309            $subseq = reverse $subseq;
1310            $subseq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
1311                         [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
1312        }
1313    
1314        $subseq
1315    }
1316    
1317    
1318  sub complement_DNA_entry {  sub complement_DNA_entry {
1319      my ($id, $desc, $seq, $fix_id) = @_;      my ($id, $desc, $seq, $fix_id) = @_;
1320      $fix_id ||= 0;     #  fix undef values      $fix_id ||= 0;     #  fix undef values
# Line 868  Line 1389 
1389    
1390  sub pack_seq {  sub pack_seq {
1391      my $seq = shift;      my $seq = shift;
1392      $seq =~ tr/A-Za-z//cd;      $seq =~ tr/A-Za-z*//cd;
1393      return $seq;      $seq;
1394  }  }
1395    
1396    
1397  sub clean_ae_sequence {  sub clean_ae_sequence {
1398      $_ = shift;      local $_ = shift;
1399      $_ = to7bit($_);      $_ = to7bit($_);
1400      s/[+]/1/g;      s/\+/1/g;
1401      s/[^0-9A-IK-NP-Za-ik-np-z~.-]/-/g;      s/[^0-9A-IK-NP-Za-ik-np-z~.-]/-/g;
1402      return $_;      return $_;
1403  }  }
1404    
1405    
1406  sub to7bit {  sub to7bit {
1407      $_ = shift;      local $_ = shift;
1408      my ($o, $c);      my ($o, $c);
1409      while (/\\([0-3][0-7][0-7])/) {      while (/\\([0-3][0-7][0-7])/) {
1410          $o = oct($1) % 128;          $o = oct($1) % 128;
# Line 895  Line 1416 
1416    
1417    
1418  sub to8bit {  sub to8bit {
1419      $_ = shift;      local $_ = shift;
1420      my ($o, $c);      my ($o, $c);
1421      while (/\\([0-3][0-7][0-7])/) {      while (/\\([0-3][0-7][0-7])/) {
1422          $o = oct($1);          $o = oct($1);
# Line 911  Line 1432 
1432  #  Translate nucleotides to one letter protein:  #  Translate nucleotides to one letter protein:
1433  #  #
1434  #  $seq = translate_seq( $seq [, $met_start] )  #  $seq = translate_seq( $seq [, $met_start] )
1435  #  $aa  = translate_codon( $triplet );  #     $aa  = translate_codon( $triplet )
1436  #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );  #     $aa  = translate_DNA_codon( $triplet )     # Does not rely on DNA
1437    #     $aa  = translate_uc_DNA_codon( $triplet )  # Does not rely on uc or DNA
1438  #  #
1439  #  User-supplied genetic code must be upper case index and match the  #  User-supplied genetic code must be upper case index and match the
1440  #  DNA versus RNA type of sequence  #  DNA versus RNA type of sequence
# Line 929  Line 1451 
1451    
1452      # DNA version      # DNA version
1453    
1454      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",      TTT => 'F',  TCT => 'S',  TAT => 'Y',  TGT => 'C',
1455      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",      TTC => 'F',  TCC => 'S',  TAC => 'Y',  TGC => 'C',
1456      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",      TTA => 'L',  TCA => 'S',  TAA => '*',  TGA => '*',
1457      TTG => "L",  TCG => "S",  TAG => "*",  TGG => "W",      TTG => 'L',  TCG => 'S',  TAG => '*',  TGG => 'W',
1458      CTT => "L",  CCT => "P",  CAT => "H",  CGT => "R",      CTT => 'L',  CCT => 'P',  CAT => 'H',  CGT => 'R',
1459      CTC => "L",  CCC => "P",  CAC => "H",  CGC => "R",      CTC => 'L',  CCC => 'P',  CAC => 'H',  CGC => 'R',
1460      CTA => "L",  CCA => "P",  CAA => "Q",  CGA => "R",      CTA => 'L',  CCA => 'P',  CAA => 'Q',  CGA => 'R',
1461      CTG => "L",  CCG => "P",  CAG => "Q",  CGG => "R",      CTG => 'L',  CCG => 'P',  CAG => 'Q',  CGG => 'R',
1462      ATT => "I",  ACT => "T",  AAT => "N",  AGT => "S",      ATT => 'I',  ACT => 'T',  AAT => 'N',  AGT => 'S',
1463      ATC => "I",  ACC => "T",  AAC => "N",  AGC => "S",      ATC => 'I',  ACC => 'T',  AAC => 'N',  AGC => 'S',
1464      ATA => "I",  ACA => "T",  AAA => "K",  AGA => "R",      ATA => 'I',  ACA => 'T',  AAA => 'K',  AGA => 'R',
1465      ATG => "M",  ACG => "T",  AAG => "K",  AGG => "R",      ATG => 'M',  ACG => 'T',  AAG => 'K',  AGG => 'R',
1466      GTT => "V",  GCT => "A",  GAT => "D",  GGT => "G",      GTT => 'V',  GCT => 'A',  GAT => 'D',  GGT => 'G',
1467      GTC => "V",  GCC => "A",  GAC => "D",  GGC => "G",      GTC => 'V',  GCC => 'A',  GAC => 'D',  GGC => 'G',
1468      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",      GTA => 'V',  GCA => 'A',  GAA => 'E',  GGA => 'G',
1469      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",      GTG => 'V',  GCG => 'A',  GAG => 'E',  GGG => 'G',
   
     # RNA suppliment  
   
     UUU => "F",  UCU => "S",  UAU => "Y",  UGU => "C",  
     UUC => "F",  UCC => "S",  UAC => "Y",  UGC => "C",  
     UUA => "L",  UCA => "S",  UAA => "*",  UGA => "*",  
     UUG => "L",  UCG => "S",  UAG => "*",  UGG => "W",  
     CUU => "L",  CCU => "P",  CAU => "H",  CGU => "R",  
     CUC => "L",  
     CUA => "L",  
     CUG => "L",  
     AUU => "I",  ACU => "T",  AAU => "N",  AGU => "S",  
     AUC => "I",  
     AUA => "I",  
     AUG => "M",  
     GUU => "V",  GCU => "A",  GAU => "D",  GGU => "G",  
     GUC => "V",  
     GUA => "V",  
     GUG => "V",  
1470    
1471      #  The following ambiguous encodings are not necessary,  but      #  The following ambiguous encodings are not necessary,  but
1472      #  speed up the processing of some ambiguous triplets:      #  speed up the processing of some ambiguous triplets:
1473    
1474      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",      TTY => 'F',  TCY => 'S',  TAY => 'Y',  TGY => 'C',
1475      TTR => "L",  TCR => "S",  TAR => "*",      TTR => 'L',  TCR => 'S',  TAR => '*',
1476                   TCN => "S",                   TCN => 'S',
1477      CTY => "L",  CCY => "P",  CAY => "H",  CGY => "R",      CTY => 'L',  CCY => 'P',  CAY => 'H',  CGY => 'R',
1478      CTR => "L",  CCR => "P",  CAR => "Q",  CGR => "R",      CTR => 'L',  CCR => 'P',  CAR => 'Q',  CGR => 'R',
1479      CTN => "L",  CCN => "P",               CGN => "R",      CTN => 'L',  CCN => 'P',               CGN => 'R',
1480      ATY => "I",  ACY => "T",  AAY => "N",  AGY => "S",      ATY => 'I',  ACY => 'T',  AAY => 'N',  AGY => 'S',
1481                   ACR => "T",  AAR => "K",  AGR => "R",                   ACR => 'T',  AAR => 'K',  AGR => 'R',
1482                   ACN => "T",                   ACN => 'T',
1483      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",      GTY => 'V',  GCY => 'A',  GAY => 'D',  GGY => 'G',
1484      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",      GTR => 'V',  GCR => 'A',  GAR => 'E',  GGR => 'G',
1485      GTN => "V",  GCN => "A",               GGN => "G",      GTN => 'V',  GCN => 'A',               GGN => 'G'
   
     UUY => "F",  UCY => "S",  UAY => "Y",  UGY => "C",  
     UUR => "L",  UCR => "S",  UAR => "*",  
                  UCN => "S",  
     CUY => "L",  
     CUR => "L",  
     CUN => "L",  
     AUY => "I",  
     GUY => "V",  
     GUR => "V",  
     GUN => "V"  
1486  );  );
1487    
1488    #  Add RNA by construction:
1489    
1490    foreach ( grep { /T/ } keys %genetic_code )
1491    {
1492        my $codon = $_;
1493        $codon =~ s/T/U/g;
1494        $genetic_code{ $codon } = lc $genetic_code{ $_ }
1495    }
1496    
1497  #  Add lower case by construction:  #  Add lower case by construction:
1498    
1499  foreach ( keys %genetic_code ) {  foreach ( keys %genetic_code )
1500    {
1501      $genetic_code{ lc $_ } = lc $genetic_code{ $_ }      $genetic_code{ lc $_ } = lc $genetic_code{ $_ }
1502  }  }
1503    
1504    
1505  #  Construct the genetic code with selanocysteine by difference:  #  Construct the genetic code with selenocysteine by difference:
1506    
1507  %genetic_code_with_U = map { $_ => $genetic_code{ $_ } } keys %genetic_code;  %genetic_code_with_U = %genetic_code;
1508  $genetic_code_with_U{ TGA } = "U";  $genetic_code_with_U{ TGA } = 'U';
1509  $genetic_code_with_U{ tga } = "u";  $genetic_code_with_U{ tga } = 'u';
1510  $genetic_code_with_U{ UGA } = "U";  $genetic_code_with_U{ UGA } = 'U';
1511  $genetic_code_with_U{ uga } = "u";  $genetic_code_with_U{ uga } = 'u';
1512    
1513    
1514  %amino_acid_codons_DNA = (  %amino_acid_codons_DNA = (
# Line 1271  Line 1772 
1772    
1773    
1774  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1775  #  Translate nucleotides to one letter protein:  #  Translate nucleotides to one letter protein.  Respects case of the
1776    #  nucleotide sequence.
1777  #  #
1778  #      $seq = translate_seq( $seq [, $met_start] )  #      $aa = translate_seq( $nt, $met_start )
1779    #      $aa = translate_seq( $nt )
1780  #  #
1781  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1782    
1783  sub translate_seq {  sub translate_seq
1784      my $seq = uc shift;  {
1785      $seq =~ tr/UX/TN/;      #  make it DNA, and allow X      my $seq = shift;
1786      $seq =~ tr/-//d;        #  remove gaps      $seq =~ tr/-//d;        #  remove gaps
1787    
1788      my $met = shift || 0;   #  a second argument that is true      my @codons = $seq =~ m/(...?)/g;  #  Will try to translate last 2 nt
                             #  forces first amino acid to be Met  
                             #  (note: undef is false)  
1789    
1790      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      #  A second argument that is true forces first amino acid to be Met
1791      my $pep = ( ($met && ($imax >= 0)) ? "M" : "" );  
1792      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      my @met;
1793          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );      if ( ( shift @_ ) && ( my $codon1 = shift @codons ) )
1794        {
1795            push @met, ( $codon1 =~ /[a-z]/ ? 'm' : 'M' );
1796      }      }
1797    
1798      return $pep;      join( '', @met, map { translate_codon( $_ ) } @codons )
1799  }  }
1800    
1801    
1802  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1803  #  Translate a single triplet with "universal" genetic code  #  Translate a single triplet with "universal" genetic code.
 #  Uppercase and DNA are performed, then translate_uc_DNA_codon  
 #  is called.  
1804  #  #
1805  #      $aa = translate_codon( $triplet )  #      $aa = translate_codon( $triplet )
1806    #      $aa = translate_DNA_codon( $triplet )
1807    #      $aa = translate_uc_DNA_codon( $triplet )
1808  #  #
1809  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1810    
1811  sub translate_codon {  sub translate_DNA_codon { translate_codon( @_ ) }
     my $codon = uc shift;  #  Make it uppercase  
     $codon =~ tr/UX/TN/;   #  Make it DNA, and allow X  
     return translate_uc_DNA_codon($codon);  
 }  
1812    
1813    sub translate_uc_DNA_codon { translate_codon( uc $_[0] ) }
1814    
1815  #-----------------------------------------------------------------------------  sub translate_codon
1816  #  Translate a single triplet with "universal" genetic code  {
 #  Uppercase and DNA assumed  
 #  Intended for private use by translate_codon and translate_seq  
 #  
 #      $aa = translate_uc_DNA_codon( $triplet )  
 #  
 #-----------------------------------------------------------------------------  
   
 sub translate_uc_DNA_codon {  
1817      my $codon = shift;      my $codon = shift;
1818      my $aa;      $codon =~ tr/Uu/Tt/;     #  Make it DNA
1819    
1820      #  Try a simple lookup:      #  Try a simple lookup:
1821    
1822        my $aa;
1823      if ( $aa = $genetic_code{ $codon } ) { return $aa }      if ( $aa = $genetic_code{ $codon } ) { return $aa }
1824    
1825      #  With the expanded code defined above, this catches simple N, R      #  Attempt to recover from mixed-case codons:
     #  and Y ambiguities in the third position.  Other codes like  
     #  GG[KMSWBDHV], or even GG, might be unambiguously translated by  
     #  converting the last position to N and seeing if this is in the  
     #  (expanded) code table:  
   
     if ( $aa = $genetic_code{ substr($codon,0,2) . "N" } ) { return $aa }  
   
     #  Test that codon is valid and might have unambiguous aa:  
   
     if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/ ) { return "X" }  
     #                     ^^  
     #                     |+- for leucine YTR  
     #                     +-- for arginine MGR  
1826    
1827      #  Expand all ambiguous nucleotides to see if they all yield same aa.      $codon = ( $codon =~ /[a-z]/ ) ? lc $codon : uc $codon;
1828      #  Loop order tries to fail quickly with first position change.      if ( $aa = $genetic_code{ $codon } ) { return $aa }
1829    
1830      $aa = "";      #  The code defined above catches simple N, R and Y ambiguities in the
1831      for my $n2 ( @{ $DNA_letter_can_be{ substr($codon,1,1) } } ) {      #  third position.  Other codons (e.g., GG[KMSWBDHV], or even GG) might
1832          for my $n3 ( @{ $DNA_letter_can_be{ substr($codon,2,1) } } ) {      #  be unambiguously translated by converting the last position to N and
1833              for my $n1 ( @{ $DNA_letter_can_be{ substr($codon,0,1) } } ) {      #  seeing if this is in the code table:
1834                  #  set the first value of $aa  
1835                  if ($aa eq "") { $aa = $genetic_code{ $n1 . $n2 . $n3 } }      my $N = ( $codon =~ /[a-z]/ ) ? 'n' : 'N';
1836                  #  or break out if any other amino acid is detected      if ( $aa = $genetic_code{ substr($codon,0,2) . $N } ) { return $aa }
1837                  elsif ($aa ne $genetic_code{ $n1 . $n2 . $n3 } ) { return "X" }  
1838              }      #  Test that codon is valid for an unambiguous aa:
1839          }  
1840        my $X = ( $codon =~ /[a-z]/ ) ? 'x' : 'X';
1841        if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/i
1842          && $codon !~ m/^YT[AGR]$/i     #  Leu YTR
1843          && $codon !~ m/^MG[AGR]$/i     #  Arg MGR
1844           )
1845        {
1846            return $X;
1847      }      }
1848    
1849      return $aa || "X";      #  Expand all ambiguous nucleotides to see if they all yield same aa.
1850    
1851        my @n1 = @{ $DNA_letter_can_be{ substr( $codon, 0, 1 ) } };
1852        my $n2 =                        substr( $codon, 1, 1 );
1853        my @n3 = @{ $DNA_letter_can_be{ substr( $codon, 2, 1 ) } };
1854        my @triples = map { my $n12 = $_ . $n2; map { $n12 . $_ } @n3 } @n1;
1855    
1856        my $triple = shift @triples;
1857        $aa = $genetic_code{ $triple };
1858        $aa or return $X;
1859    
1860        foreach $triple ( @triples ) { return $X if $aa ne $genetic_code{$triple} }
1861    
1862        return $aa;
1863  }  }
1864    
1865    
# Line 1368  Line 1868 
1868  #  Diagnose the use of upper versus lower, and T versus U in the supplied  #  Diagnose the use of upper versus lower, and T versus U in the supplied
1869  #  code, and transform the supplied nucleotide sequence to match.  #  code, and transform the supplied nucleotide sequence to match.
1870  #  #
1871  #  translate_seq_with_user_code($seq, \%gen_code [, $start_with_met] )  #     $aa = translate_seq_with_user_code( $nt, \%gen_code )
1872    #     $aa = translate_seq_with_user_code( $nt, \%gen_code, $start_with_met )
1873  #  #
1874    #  Modified 2007-11-22 to be less intrusive in these diagnoses by sensing
1875    #  the presence of both versions in the user code.
1876  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1877    
1878  sub translate_seq_with_user_code {  sub translate_seq_with_user_code
1879    {
1880      my $seq = shift;      my $seq = shift;
1881      $seq =~ tr/-//d;     #  remove gaps  ***  Why?      $seq =~ tr/-//d;     #  remove gaps  ***  Why?
     $seq =~ tr/Xx/Nn/;   #  allow X  
1882    
1883      my $gc = shift;      #  Reference to hash of DNA alphabet code      my $gc = shift;      #  Reference to hash of code
1884      if (! $gc || ref($gc) ne "HASH") {      if (! $gc || ref($gc) ne "HASH")
1885          die "translate_seq_with_user_code needs genetic code hash as secondargument.";      {
1886            print STDERR "translate_seq_with_user_code needs genetic code hash as second argument.";
1887            return undef;
1888      }      }
1889    
1890      #  Test the type of code supplied: uppercase versus lowercase      #  Test code support for upper vs lower case:
   
     my ($RNA_F, $DNA_F, $M, $N, $X);  
1891    
1892      if ($gc->{ "AAA" }) {     #  Looks like uppercase code table      my ( $TTT, $UUU );
1893        if    ( $gc->{AAA} && ! $gc->{aaa} )   #  Uppercase only code table
1894        {
1895          $seq   = uc $seq;     #  Uppercase sequence          $seq   = uc $seq;     #  Uppercase sequence
1896          $RNA_F = "UUU";       #  Uppercase RNA Phe codon          ( $TTT, $UUU ) = ( 'TTT', 'UUU' );
         $DNA_F = "TTT";       #  Uppercase DNA Phe codon  
         $M     = "M";         #  Uppercase initiator  
         $N     = "N";         #  Uppercase ambiguous nuc  
         $X     = "X";         #  Uppercase ambiguous aa  
1897      }      }
1898      elsif ($gc->{ "aaa" }) {  #  Looks like lowercase code table      elsif ( $gc->{aaa} && ! $gc->{AAA} )   #  Lowercase only code table
1899        {
1900          $seq   = lc $seq;     #  Lowercase sequence          $seq   = lc $seq;     #  Lowercase sequence
1901          $RNA_F = "uuu";       #  Lowercase RNA Phe codon          ( $TTT, $UUU ) = ( 'ttt', 'uuu' );
         $DNA_F = "ttt";       #  Lowercase DNA Phe codon  
         $M     = "m";         #  Lowercase initiator  
         $N     = "n";         #  Lowercase ambiguous nuc  
         $X     = "x";         #  Lowercase ambiguous aa  
1902      }      }
1903      else {      elsif ( $gc->{aaa} )
1904          die "User-supplied genetic code does not have aaa or AAA\n";      {
1905            ( $TTT, $UUU ) = ( 'ttt', 'uuu' );
1906        }
1907        else
1908        {
1909            print STDERR "User-supplied genetic code does not have aaa or AAA\n";
1910            return undef;
1911      }      }
1912    
1913      #  Test the type of code supplied:  UUU versus TTT      #  Test code support for U vs T:
   
     my ($ambigs);  
1914    
1915      if ($gc->{ $RNA_F }) {     #  Looks like RNA code table      my $ambigs;
1916          $seq =~ tr/Tt/Uu/;      if    ( $gc->{$UUU} && ! $gc->{$TTT} )  # RNA only code table
1917        {
1918            $seq = tr/Tt/Uu/;
1919          $ambigs = \%RNA_letter_can_be;          $ambigs = \%RNA_letter_can_be;
1920      }      }
1921      elsif ($gc->{ $DNA_F }) {  #  Looks like DNA code table      elsif ( $gc->{$TTT} && ! $gc->{$UUU} )  # DNA only code table
1922          $seq =~ tr/Uu/Tt/;      {
1923            $seq = tr/Uu/Tt/;
1924          $ambigs = \%DNA_letter_can_be;          $ambigs = \%DNA_letter_can_be;
1925      }      }
1926      else {      else
1927          die "User-supplied genetic code does not have $RNA_F or $DNA_F\n";      {
1928            my $t = $seq =~ tr/Tt//;
1929            my $u = $seq =~ tr/Uu//;
1930            $ambigs = ( $t > $u ) ? \%DNA_letter_can_be : \%RNA_letter_can_be;
1931      }      }
1932    
1933      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      #  We can now do the codon-by-codon translation:
1934    
1935      my $met = shift;     #  a third argument that is true      my @codons = $seq =~ m/(...?)/g;  #  will try to translate last 2 nt
1936                           #  forces first amino acid to be Met  
1937                           #  (note: undef is false)      #  A third argument that is true forces first amino acid to be Met
     my $pep  = ($met && ($imax >= 0)) ? $M : "";  
     my $aa;  
1938    
1939      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      my @met;
1940          $pep .= translate_codon_with_user_code( substr($seq,$i,3), $gc, $N, $X, $ambigs );      if ( ( shift @_ ) && ( my $codon1 = shift @codons ) )
1941        {
1942            push @met, ( $codon1 =~ /[a-z]/ ? 'm' : 'M' );
1943      }      }
1944    
1945      return $pep;      join( '', @met, map { translate_codon_with_user_code( $_, $gc, $ambigs ) } @codons )
1946  }  }
1947    
1948    
1949  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1950  #  Translate with user-supplied genetic code hash.  For speed, no error  #  Translate with user-supplied genetic code hash.  No error check on the code.
1951  #  check on the hash.  Calling programs should check for the hash at a  #  Should only be called through translate_seq_with_user_code.
 #  higher level.  
 #  
 #  Should only be called through translate_seq_with_user_code  
1952  #  #
1953  #   translate_codon_with_user_code( $triplet, \%code, $N, $X, $ambig_table )  #     $aa = translate_codon_with_user_code( $triplet, \%code, \%ambig_table )
1954  #  #
1955  #  $triplet      speaks for itself  #  $triplet      speaks for itself
1956  #  $code         ref to the hash with the codon translations  #  \%code         ref to the hash with the codon translations
1957  #  $N            character to use for ambiguous nucleotide  #  \%ambig_table  ref to hash with lists of nucleotides for each ambig code
 #  $X            character to use for ambiguous amino acid  
 #  $ambig_table  ref to hash with lists of nucleotides for each ambig code  
1958  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1959    
1960    sub translate_codon_with_user_code
1961  sub translate_codon_with_user_code {  {
1962      my $codon = shift;      my ( $codon, $gc, $ambigs ) = @_;
     my $gc    = shift;  
     my $aa;  
1963    
1964      #  Try a simple lookup:      #  Try a simple lookup:
1965    
1966        my $aa;
1967      if ( $aa = $gc->{ $codon } ) { return $aa }      if ( $aa = $gc->{ $codon } ) { return $aa }
1968    
1969      #  Test that codon is valid and might have unambiguous aa:      #  Attempt to recover from mixed-case codons:
1970    
1971      my ($N, $X, $ambigs) = @_;      $codon = ( $codon =~ /[a-z]/ ) ? lc $codon : uc $codon;
1972      if ( $codon =~ m/^[ACGTUMY][ACGTU]$/i ) { $codon .= $N }      if ( $aa = $genetic_code{ $codon } ) { return $aa }
     if ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) { return $X }  
     #                          ^^  
     #                          |+- for leucine YTR  
     #                          +-- for arginine MGR  
1973    
1974      #  Expand all ambiguous nucleotides to see if they all yield same aa.      #  Test that codon is valid for an unambiguous aa:
     #  Loop order tries to fail quickly with first position change.  
1975    
1976      $aa = "";      my $X = ( $codon =~ /[a-z]/ ) ? 'x' : 'X';
1977      for my $n2 ( @{ $ambigs->{ substr($codon,1,1) } } ) {  
1978          for my $n3 ( @{ $ambigs->{ substr($codon,2,1) } } ) {      if ( $codon =~ m/^[ACGTU][ACGTU]$/i )  # Add N?
1979              for my $n1 ( @{ $ambigs->{ substr($codon,0,1) } } ) {      {
1980                  #  set the first value of $aa          $codon .= ( $codon =~ /[a-z]/ ) ? 'n' : 'N';
                 if ($aa eq "") { $aa = $gc->{ $n1 . $n2 . $n3 } }  
                 #  break out if any other amino acid is detected  
                 elsif ($aa ne $gc->{ $n1 . $n2 . $n3 } ) { return "X" }  
1981              }              }
1982        #  This makes assumptions about the user code, but tranlating ambiguous
1983        #  codons is really a bit off the wall to start with:
1984        elsif ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) # Valid?
1985        {
1986            return $X;
1987          }          }
1988    
1989        #  Expand all ambiguous nucleotides to see if they all yield same aa.
1990    
1991        my @n1 = @{ $ambigs->{ substr( $codon, 0, 1 ) } };
1992        my $n2 =               substr( $codon, 1, 1 );
1993        my @n3 = @{ $ambigs->{ substr( $codon, 2, 1 ) } };
1994        my @triples = map { my $n12 = $_ . $n2; map { $n12 . $_ } @n3 } @n1;
1995    
1996        my $triple = shift @triples;
1997        $aa = $gc->{ $triple } || $gc->{ lc $triple } || $gc->{ uc $triple };
1998        $aa or return $X;
1999    
2000        foreach $triple ( @triples )
2001        {
2002            return $X if $aa ne ( $gc->{$triple} || $gc->{lc $triple} || $gc->{uc $triple} );
2003      }      }
2004    
2005      return $aa || $X;      return $aa;
2006  }  }
2007    
2008    
# Line 1676  Line 2190 
2190  }  }
2191    
2192    
2193    #-----------------------------------------------------------------------------
2194    #  Read qual.
2195    #
2196    #  Save the contents in a list of refs to arrays: [ $id, $descript, \@qual ]
2197    #
2198    #     @seq_entries = read_qual( )               #  STDIN
2199    #    \@seq_entries = read_qual( )               #  STDIN
2200    #     @seq_entries = read_qual( \*FILEHANDLE )
2201    #    \@seq_entries = read_qual( \*FILEHANDLE )
2202    #     @seq_entries = read_qual(  $filename )
2203    #    \@seq_entries = read_qual(  $filename )
2204    #-----------------------------------------------------------------------------
2205    sub read_qual {
2206        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
2207        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_qual\n";
2208    
2209        my @quals = ();
2210        my ($id, $desc, $qual) = ("", "", []);
2211    
2212        while ( <$fh> ) {
2213            chomp;
2214            if (/^>\s*(\S+)(\s+(.*))?$/) {        #  new id
2215                if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
2216                ($id, $desc, $qual) = ($1, $3 ? $3 : "", []);
2217            }
2218            else {
2219                push @$qual, split;
2220            }
2221        }
2222        close( $fh ) if $close;
2223    
2224        if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
2225        return wantarray ? @quals : \@quals;
2226    }
2227    
2228    
2229    #-------------------------------------------------------------------------------
2230    #  Fraction difference for an alignment of two nucleotide sequences in terms of
2231    #  number of differing residues, number of gaps, and number of gap opennings.
2232    #
2233    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, \%options )
2234    #
2235    #  or
2236    #
2237    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2 )
2238    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_wgt )
2239    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, $open_wgt, $extend_wgt )
2240    #
2241    #  Options:
2242    #
2243    #      gap      => $gap_wgt          # Gap open and extend weight (D = 0.5)
2244    #      open     => $open_wgt         # Gap openning weight (D = gap_wgt)
2245    #      extend   => $extend_wgt       # Gap extension weight (D = open_wgt)
2246    #      t_gap    => $term_gap_wgt     # Terminal open and extend weight
2247    #      t_open   => $term_open_wgt    # Terminal gap open weight (D = open_wgt)
2248    #      t_extend => $term_extend_wgt  # Terminal gap extend weight (D = extend_wgt)
2249    #
2250    #  Default gap open and gap extend weights are 1/2.  Beware that
2251    #
2252    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, 1 )
2253    #
2254    #  and
2255    #
2256    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, 1, 0 )
2257    #
2258    #  are different.  The first has equal openning and extension weights, whereas
2259    #  the second has an openning weight of 1, and and extension weight of 0 (it
2260    #  only penalizes the number of runs of gaps).
2261    #-------------------------------------------------------------------------------
2262    sub fraction_nt_diff
2263    {
2264        my ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( @_[0,1] );
2265    
2266        my $diff_scr;
2267        if ( ref( $_[2] ) eq 'HASH' )
2268        {
2269            my $opts = $_[2];
2270            my $gap_open    = defined $opts->{ open }     ? $opts->{ open }
2271                            : defined $opts->{ gap }      ? $opts->{ gap }
2272                            : 0.5;
2273            my $gap_extend  = defined $opts->{ extend }   ? $opts->{ extend }
2274                            : $gap_open;
2275            my $term_open   = defined $opts->{ t_open }   ? $opts->{ t_open }
2276                            : defined $opts->{ t_gap }    ? $opts->{ t_gap }
2277                            : $gap_open;
2278            my $term_extend = defined $opts->{ t_extend } ? $opts->{ t_extend }
2279                            : defined $opts->{ t_gap }    ? $opts->{ t_gap }
2280                            : $gap_extend;
2281    
2282            $nopen -= $topen;
2283            $ngap  -= $tgap;
2284            $diff_scr = $ndif + $gap_open  * $nopen + $gap_extend  * ($ngap-$nopen)
2285                              + $term_open * $topen + $term_extend * ($tgap-$topen);
2286        }
2287        else
2288        {
2289            my $gap_open   = defined( $_[2] ) ? $_[2] : 0.5;
2290            my $gap_extend = defined( $_[3] ) ? $_[3] : $gap_open;
2291            $diff_scr = $ndif + $gap_open * $nopen + $gap_extend * ($ngap-$nopen);
2292        }
2293        my $ttl_scr = $nid + $diff_scr;
2294    
2295        $ttl_scr ? $diff_scr / $ttl_scr : undef
2296    }
2297    
2298    
2299    #-------------------------------------------------------------------------------
2300    #  Interpret an alignment of two nucleotide sequences in terms of: useful
2301    #  aligned positions (unambiguous, and not a common gap), number of identical
2302    #  residues, number of differing residues, number of gaps, and number of gap
2303    #  opennings.
2304    #
2305    #     ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( $seq1, $seq2 )
2306    #
2307    #  $npos  = total aligned positons (= $nid + $ndif + $ngap)
2308    #  $nid   = number of positions with identical nucleotides (ignoring case)
2309    #  $ndif  = number of positions with differing nucleotides
2310    #  $ngap  = number of positions with gap in one sequence but not the other
2311    #  $nopen = number of runs of gaps
2312    #  $tgap  = number of gaps in runs adjacent to a terminus
2313    #  $topen = number of alignment ends with gaps
2314    #
2315    #  Some of the methods might seem overly complex, but are necessary for cases
2316    #  in which the gaps switch strands in the alignment:
2317    #
2318    #     seq1  ---ACGTGAC--TTGCAGAG
2319    #     seq2  TTT---TGACGG--GCAGGG
2320    #     mask  00000011110000111111
2321    #
2322    #     npos  = 20
2323    #     nid   =  9
2324    #     ndif  =  1
2325    #     ngap  = 10
2326    #     nopen =  4
2327    #     tgap  =  3
2328    #     topen =  1
2329    #
2330    #  Although there are 4 gap opennings, there are only 2 runs in the mask,
2331    #  and the terminal run is length 6, not 3.  (Why handle these?  Because
2332    #  pairs of sequences from a multiple sequence alignment can look like this.)
2333    #-------------------------------------------------------------------------------
2334    sub interpret_nt_align
2335    {
2336        #  Remove alignment columns that are not informative:
2337        my ( $s1, $s2 ) = useful_nt_align( @_[0,1] );
2338        my $nmat = length( $s1 );          # Useful alignment length
2339    
2340        my $m1 = $s1;
2341        $m1 =~ tr/ACGT/\377/;              # Nucleotides to all 1 bits
2342        $m1 =~ tr/\377/\000/c;             # Others (gaps) to null byte
2343        my $m2 = $s2;
2344        $m2 =~ tr/ACGT/\377/;              # Nucleotides to all 1 bits
2345        $m2 =~ tr/\377/\000/c;             # Others (gaps) to null byte
2346        $m1 &= $m2;                        # Gap in either sequence becomes null
2347        $s1 &= $m1;                        # Apply mask to sequence 1
2348        $s2 &= $m1;                        # Apply mask to sequence 2
2349        my $nopen = @{[ $s1 =~ /\000+/g ]}   # Gap opens in sequence 1
2350                  + @{[ $s2 =~ /\000+/g ]};  # Gap opens in sequence 2
2351        my ( $tgap, $topen ) = ( 0, 0 );
2352        if ( $s1 =~ /^(\000+)/ || $s2 =~ /^(\000+)/ ) { $tgap += length( $1 ); $topen++ }
2353        if ( $s1 =~ /(\000+)$/ || $s2 =~ /(\000+)$/ ) { $tgap += length( $1 ); $topen++ }
2354        $s1 =~ tr/\000//d;                 # Remove nulls (former gaps)
2355        $s2 =~ tr/\000//d;                 # Remove nulls (former gaps)
2356        my $ngap = $nmat - length( $s1 );  # Total gaps
2357    
2358        my $xor = $s1 ^ $s2;               # xor of identical residues is null byte
2359        my $nid = ( $xor =~ tr/\000//d );  # Count the nulls (identical residues)
2360        my $ndif = $nmat - $nid - $ngap;
2361    
2362        ( $nmat, $nid, $ndif, $ngap, $nopen, $tgap, $topen )
2363    }
2364    
2365    
2366    sub useful_nt_align
2367    {
2368        my ( $s1, $s2 ) = map { uc $_ } @_;
2369        $s1 =~ tr/U/T/;         # Convert U to T
2370        my $m1 = $s1;
2371        $m1 =~ tr/ACGT-/\377/;  # Allowed symbols to hex FF byte
2372        $m1 =~ tr/\377/\000/c;  # All else to null byte
2373        $s2 =~ tr/U/T/;         # Convert U to T
2374        my $m2 = $s2;
2375        $m2 =~ tr/ACGT-/\377/;  # Allowed symbols to hex FF byte
2376        $m2 =~ tr/\377/\000/c;  # All else to null byte
2377        $m1 &= $m2;             # Invalid in either sequence becomes null
2378        $s1 &= $m1;             # Apply mask to sequence 1
2379        $s1 =~ tr/\000//d;      # Delete nulls in sequence 1
2380        $s2 &= $m1;             # Apply mask to sequence 2
2381        $s2 =~ tr/\000//d;      # Delete nulls in sequence 2
2382        ( $s1, $s2 )
2383    }
2384    
2385    
2386    #-------------------------------------------------------------------------------
2387    #  Interpret an alignment of two protein sequences in terms of: useful
2388    #  aligned positions (unambiguous, and not a common gap), number of identical
2389    #  residues, number of differing residues, number of gaps, and number of gap
2390    #  opennings.
2391    #
2392    #     ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_aa_align( $seq1, $seq2 )
2393    #
2394    #  $npos  = total aligned positons (= $nid + $ndif + $ngap)
2395    #  $nid   = number of positions with identical amino acids (ignoring case)
2396    #  $ndif  = number of positions with differing amino acids
2397    #  $ngap  = number of positions with gap in one sequence but not the other
2398    #  $nopen = number of runs of gaps
2399    #  $tgap  = number of gaps in runs adjacent to a terminus
2400    #  $topen = number of alignment ends with gaps
2401    #
2402    #-------------------------------------------------------------------------------
2403    sub interpret_aa_align
2404    {
2405        #  Remove alignment columns that are not informative:
2406        my ( $s1, $s2 ) = useful_aa_align( @_[0,1] );
2407        my $nmat = length( $s1 );            # Useful alignment length
2408    
2409        my $m1 = $s1;
2410        $m1 =~ tr/ACDEFGHIKLMNPQRSTUVWY/\377/;  # Amino acids to all 1 bits
2411        $m1 =~ tr/\377/\000/c;               # Others (gaps) to null byte
2412        my $m2 = $s2;
2413        $m2 =~ tr/ACDEFGHIKLMNPQRSTUVWY/\377/;  # Amino acids to all 1 bits
2414        $m2 =~ tr/\377/\000/c;               # Others (gaps) to null byte
2415        $m1 &= $m2;                          # Gap in either sequence becomes null
2416        $s1 &= $m1;                          # Apply mask to sequence 1
2417        $s2 &= $m1;                          # Apply mask to sequence 2
2418        my $nopen = @{[ $s1 =~ /\000+/g ]}   # Gap opens in sequence 1
2419                  + @{[ $s2 =~ /\000+/g ]};  # Gap opens in sequence 2
2420        my ( $tgap, $topen ) = ( 0, 0 );
2421        if ( $s1 =~ /^(\000+)/ || $s2 =~ /^(\000+)/ ) { $tgap += length( $1 ); $topen++ }
2422        if ( $s1 =~ /(\000+)$/ || $s2 =~ /(\000+)$/ ) { $tgap += length( $1 ); $topen++ }
2423        $s1 =~ tr/\000//d;                 # Remove nulls (former gaps)
2424        $s2 =~ tr/\000//d;                 # Remove nulls (former gaps)
2425        my $ngap = $nmat - length( $s1 );  # Total gaps
2426    
2427        my $xor = $s1 ^ $s2;               # xor of identical residues is null byte
2428        my $nid = ( $xor =~ tr/\000//d );  # Count the nulls (identical residues)
2429        my $ndif = $nmat - $nid - $ngap;
2430    
2431        ( $nmat, $nid, $ndif, $ngap, $nopen, $tgap, $topen )
2432    }
2433    
2434    
2435    sub useful_aa_align
2436    {
2437        my ( $s1, $s2 ) = map { uc $_ } @_;
2438        my $m1 = $s1;
2439        $m1 =~ tr/ACDEFGHIKLMNPQRSTUVWY-/\377/;  # Allowed symbols to hex FF byte
2440        $m1 =~ tr/\377/\000/c;  # All else to null byte
2441        my $m2 = $s2;
2442        $m2 =~ tr/ACDEFGHIKLMNPQRSTUVWY-/\377/;  # Allowed symbols to hex FF byte
2443        $m2 =~ tr/\377/\000/c;  # All else to null byte
2444        $m1 &= $m2;             # Invalid in either sequence becomes null
2445        $s1 &= $m1;             # Apply mask to sequence 1
2446        $s1 =~ tr/\000//d;      # Delete nulls in sequence 1
2447        $s2 &= $m1;             # Apply mask to sequence 2
2448        $s2 =~ tr/\000//d;      # Delete nulls in sequence 2
2449        ( $s1, $s2 )
2450    }
2451    
2452    
2453    #-------------------------------------------------------------------------------
2454    #  Return the fraction identity for oligomers over a range of lengths
2455    #
2456    #     @sims = oligomer_similarity( $seq1, $seq2, \%opts )
2457    #
2458    #-------------------------------------------------------------------------------
2459    sub oligomer_similarity
2460    {
2461        my ( $seq1, $seq2, $opts ) = @_;
2462        $seq1 && $seq2 or return ();
2463        $seq1 = $seq1->[2] if ref( $seq1 ) eq 'ARRAY';
2464        $seq2 = $seq2->[2] if ref( $seq2 ) eq 'ARRAY';
2465        $opts && ref( $opts ) eq 'HASH' or ( $opts = {} );
2466    
2467        my $min = $opts->{ min } || $opts->{ max } || 2;
2468        my $max = $opts->{ max } || $min;
2469        return () if $max < $min;
2470    
2471        #  Remove shared gaps
2472    
2473        my $mask1 = gap_mask( $seq1 );
2474        my $mask2 = gap_mask( $seq2 );
2475        my $mask  = $mask1 | $mask2;
2476        $seq1     = $seq1 & $mask;          # Apply mask to sequence
2477        $seq1     =~ tr/\000//d;     # Delete null characters
2478        $seq2     = $seq2 & $mask;          # Apply mask to sequence
2479        $seq2     =~ tr/\000//d;     # Delete null characters
2480        #  Remove terminal gaps
2481    
2482        $mask  = $mask1 & $mask2;
2483        my $n1 = $mask =~ /^(\000+)/ ? length( $1 ) : 0;
2484        my $n2 = $mask =~ /(\000+)$/ ? length( $1 ) : 0;
2485        if ( $n1 || $n2 )
2486        {
2487            my $len = length( $seq1 ) - ( $n1 + $n2 );
2488            $seq1 = substr( $seq1, $n1, $len );
2489            $seq2 = substr( $seq2, $n1, $len );
2490        }
2491    
2492        #  Find the runs of identity
2493    
2494        my $xor = $seq1 ^ $seq2;           # xor of identical residues is null byte
2495    
2496        # Runs with one or more identities
2497    
2498        my %cnt;
2499        foreach ( $xor =~ m/(\000+)/g ) { $cnt{ length($_) }++ }
2500    print Dumper( \%cnt );
2501        map { my $n = $_;
2502              my $ttl = 0;
2503              for ( grep { $_ >= $n } keys %cnt ) { $ttl += $cnt{$_} * ( $_ - ($n-1) ) }
2504              my $nmax = length( $xor ) - ($n-1);
2505              $nmax > 0 ? $ttl / $nmax : undef;
2506            } ( $min .. $max );
2507    }
2508    
2509    
2510  1;  1;

Legend:
Removed from v.1.6  
changed lines
  Added in v.1.22

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3