[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Mon Dec 1 16:54:26 2003 UTC revision 1.12, Mon Feb 11 20:37:27 2008 UTC
# Line 1  Line 1 
1  package gjoseqlib;  package gjoseqlib;
2    
3  #  read_fasta_seqs( *FILEHANDLE )  #  A sequence entry is ( $id, $def, $seq )
4  #  read_next_fasta_seq( *FILEHANDLE )  #  A list of entries is a list of references
5  #  parse_fasta_title( $title )  #
6  #  split_fasta_title( $title )  #  @seq_entries = read_fasta( )                     # STDIN
7  #  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list )  #  @seq_entries = read_fasta( \*FILEHANDLE )
8  #  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq )  #  @seq_entries = read_fasta(  $filename )
9  #  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #  @entry = read_next_fasta_seq( \*FILEHANDLE )
10    # \@entry = read_next_fasta_seq( \*FILEHANDLE )
11  #  index_seq_list( @seq_entry_list )  #  @entry = read_next_fasta_seq(  $filename )
12  #  seq_entry_by_id( \%seq_index, $seq_id )  # \@entry = read_next_fasta_seq(  $filename )
13  #  seq_desc_by_id( \%seq_index, $seq_id )  #  @entry = read_next_fasta_seq()                   # STDIN
14  #  seq_data_by_id( \%seq_index, $seq_id )  # \@entry = read_next_fasta_seq()                   # STDIN
15    #  @seq_entries = read_fasta_seqs( \*FILEHANDLE )   # Original form
16  #  subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] )  #  @seq_entries = read_clustal( )                   # STDIN
17  #  subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] )  #  @seq_entries = read_clustal( \*FILEHANDLE )
18  #  complement_DNA_entry( @seq_entry [, $fix_id] )  #  @seq_entries = read_clustal(  $filename )
19  #  complement_RNA_entry( @seq_entry [, $fix_id] )  #  @seq_entries = read_clustal_file(  $filename )
20  #  complement_DNA_seq( $NA_seq )  #
21  #  complement_RNA_seq( $NA_seq )  #  $seq_ind   = index_seq_list( @seq_entries );   # hash from ids to entries
22  #  to_DNA_seq( $NA_seq )  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
23  #  to_RNA_seq( $NA_seq )  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
24  #  clean_ae_sequence( $seq )  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );
25    #
26  #  translate_seq( $seq [, $met_start] )  #  ( $id, $def ) = parse_fasta_title( $title )
27  #  translate_codon( $triplet )  #  ( $id, $def ) = split_fasta_title( $title )
28  #  translate_seq_with_user_code( $seq, \%gen_code [, $met_start] )  #
29    #  print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );  # Original form
30    #  print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
31    #  print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
32    #  print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
33    #  print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
34    #  print_alignment_as_fasta(  $filename,    @seq_entry_list );
35    #  print_alignment_as_fasta(  $filename,   \@seq_entry_list );
36    #  print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
37    #  print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
38    #  print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
39    #  print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
40    #  print_alignment_as_phylip(  $filename,    @seq_entry_list );
41    #  print_alignment_as_phylip(  $filename,   \@seq_entry_list );
42    #  print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
43    #  print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
44    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
45    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
46    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
47    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
48    #  print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq) ;
49    #  print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
50    #  print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq );
51    #
52    #   @packed_seqs = pack_alignment(  @seqs )
53    #   @packed_seqs = pack_alignment( \@seqs )
54    #  \@packed_seqs = pack_alignment(  @seqs )
55    #  \@packed_seqs = pack_alignment( \@seqs )
56    #
57    #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
58    #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
59    #  $DNAseq = DNA_subseq(  $seq, $from, $to );
60    #  $DNAseq = DNA_subseq( \$seq, $from, $to );
61    #  $RNAseq = RNA_subseq(  $seq, $from, $to );
62    #  $RNAseq = RNA_subseq( \$seq, $from, $to );
63    #  @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );
64    #  @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );
65    #  $DNAseq = complement_DNA_seq( $NA_seq );
66    #  $RNAseq = complement_RNA_seq( $NA_seq );
67    #  $DNAseq = to_DNA_seq( $NA_seq );
68    #  $RNAseq = to_RNA_seq( $NA_seq );
69    #  $seq    = pack_seq( $sequence )
70    #  $seq    = clean_ae_sequence( $seq )
71    #
72    #  $aa = translate_seq( $nt, $met_start )
73    #  $aa = translate_seq( $nt )
74    #  $aa = translate_codon( $triplet );
75    #
76    #  User-supplied genetic code.  The supplied code needs to be complete in
77    #  RNA and/or DNA, and upper and/or lower case.  The program guesses based
78    #  on lysine and phenylalanine codons.
79    #
80    #  $aa = translate_seq_with_user_code( $nt, $gen_code_hash, $met_start )
81    #  $aa = translate_seq_with_user_code( $nt, $gen_code_hash )
82    #
83    #  Locations (= oriented intervals) are ( id, start, end )
84    #  Intervals are ( id, left, right )
85    #
86    #  @intervals = read_intervals( \*FILEHANDLE )
87    #  @intervals = read_oriented_intervals( \*FILEHANDLE )
88    #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)
89    #  @joined    = join_intervals( @interval_refs )
90    #  @intervals = locations_2_intervals( @locations )
91    #  $interval  = locations_2_intervals( $location  )
92    #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)
93    #
94    #  Convert GenBank locations to SEED locations
95    #
96    #  @seed_locs = gb_location_2_seed( $contig, @gb_locs )
97    #
98    #  Read quality scores from a fasta-like file:
99    #
100    #  @seq_entries = read_qual( )               #  STDIN
101    # \@seq_entries = read_qual( )               #  STDIN
102    #  @seq_entries = read_qual( \*FILEHANDLE )
103    # \@seq_entries = read_qual( \*FILEHANDLE )
104    #  @seq_entries = read_qual(  $filename )
105    # \@seq_entries = read_qual(  $filename )
106    #
107    #  Evaluate nucleotide alignments:
108    #
109    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2, \%options )
110    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2 )
111    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_weight )
112    #  $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_open, $gap_extend )
113    #
114    #  ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( $seq1, $seq2 )
115    #
116    
117    use strict;
118    use Carp;
119    
120  #  read_intervals( *FILEHANDLE )  #  Exported global variables:
 #  join_intervals( @interval_refs )  
121    
122  use gjolib qw(wrap_text);  our @aa_1_letter_order;  # Alpha by 1 letter
123    our @aa_3_letter_order;  # PAM matrix order
124    our @aa_n_codon_order;
125    our %genetic_code;
126    our %genetic_code_with_U;
127    our %amino_acid_codons_DNA;
128    our %amino_acid_codons_RNA;
129    our %n_codon_for_aa;
130    our %reverse_genetic_code_DNA;
131    our %reverse_genetic_code_RNA;
132    our %DNA_letter_can_be;
133    our %RNA_letter_can_be;
134    our %one_letter_to_three_letter_aa;
135    our %three_letter_to_one_letter_aa;
136    
137  require Exporter;  require Exporter;
138    
139  our @ISA = qw(Exporter);  our @ISA = qw(Exporter);
140  our @EXPORT = qw(  our @EXPORT = qw(
141          read_fasta_seqs          read_fasta_seqs
142            read_fasta
143          read_next_fasta_seq          read_next_fasta_seq
144            read_clustal_file
145            read_clustal
146          parse_fasta_title          parse_fasta_title
147          split_fasta_title          split_fasta_title
148          print_seq_list_as_fasta          print_seq_list_as_fasta
149            print_alignment_as_fasta
150            print_alignment_as_phylip
151            print_alignment_as_nexus
152          print_seq_as_fasta          print_seq_as_fasta
153          print_gb_locus          print_gb_locus
154    
# Line 49  Line 157 
157          seq_desc_by_id          seq_desc_by_id
158          seq_data_by_id          seq_data_by_id
159    
160            pack_alignment
161    
162          subseq_DNA_entry          subseq_DNA_entry
163          subseq_RNA_entry          subseq_RNA_entry
164            DNA_subseq
165            RNA_subseq
166          complement_DNA_entry          complement_DNA_entry
167          complement_RNA_entry          complement_RNA_entry
168          complement_DNA_seq          complement_DNA_seq
169          complement_RNA_seq          complement_RNA_seq
170          to_DNA_seq          to_DNA_seq
171          to_RNA_seq          to_RNA_seq
172            pack_seq
173          clean_ae_sequence          clean_ae_sequence
174    
175          translate_seq          translate_seq
# Line 64  Line 177 
177          translate_seq_with_user_code          translate_seq_with_user_code
178    
179          read_intervals          read_intervals
180            standardize_intervals
181          join_intervals          join_intervals
182            locations_2_intervals
183            read_oriented_intervals
184            reverse_intervals
185    
186            gb_location_2_seed
187    
188            read_qual
189    
190            fraction_nt_diff
191            interpret_nt_align
192          );          );
193    
194  use strict;  our @EXPORT_OK = qw(
195            @aa_1_letter_order
196            @aa_3_letter_order
197            @aa_n_codon_order
198            %genetic_code
199            %genetic_code_with_U
200            %amino_acid_codons_DNA
201            %amino_acid_codons_RNA
202            %n_codon_for_aa
203            %reverse_genetic_code_DNA
204            %reverse_genetic_code_RNA
205            %DNA_letter_can_be
206            %RNA_letter_can_be
207            %one_letter_to_three_letter_aa
208            %three_letter_to_one_letter_aa
209            );
210    
211    
212  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
213  #  Read fasta sequences from a file.  #  Helper function for defining an input filehandle:
214    #     filehandle is passed through
215    #     string is taken as file name to be openend
216    #     undef or "" defaults to STDOUT
217    #
218    #    ( \*FH, $name, $close [, $file] ) = input_filehandle( $file );
219    #
220    #-----------------------------------------------------------------------------
221    sub input_filehandle
222    {
223        my $file = shift;
224    
225        #  FILEHANDLE
226    
227        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
228    
229        #  Null string or undef
230    
231        return ( \*STDIN, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
232    
233        #  File name
234    
235        if ( ! ref( $file ) )
236        {
237            my $fh;
238            if    ( -f $file                       ) { }
239            elsif (    $file =~ /^>(.+)$/ && -f $1 ) { $file = $1 }
240            else { die "Could not find input file '$file'\n" }
241            open( $fh, "<$file" ) || die "Could not open '$file' for input\n";
242            return ( $fh, $file, 1 );
243        }
244    
245        #  Some other kind of reference; return the unused value
246    
247        return ( \*STDIN, undef, 0, $file );
248    }
249    
250    
251    #-----------------------------------------------------------------------------
252    #  Read fasta sequences from a filehandle (legacy interface; use read_fasta)
253  #  Save the contents in a list of refs to arrays:  (id, description, seq)  #  Save the contents in a list of refs to arrays:  (id, description, seq)
254  #  #
255  #     @seqs = read_fasta_seqs(*FILEHANDLE)  #     @seq_entries = read_fasta_seqs( \*FILEHANDLE )
256  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
257  sub read_fasta_seqs {  sub read_fasta_seqs { read_fasta( @_ ) }
258      my $fh = shift;  
259      wantarray || die "read_fasta_seqs requires list context\n";  
260    #-----------------------------------------------------------------------------
261    #  Read fasta sequences.  Save the contents in a list of refs to arrays:
262    #
263    #     $seq_entry = [ id, description, seq ]
264    #
265    #     @seq_entries = read_fasta( )               #  STDIN
266    #    \@seq_entries = read_fasta( )               #  STDIN
267    #     @seq_entries = read_fasta( \*FILEHANDLE )
268    #    \@seq_entries = read_fasta( \*FILEHANDLE )
269    #     @seq_entries = read_fasta(  $filename )
270    #    \@seq_entries = read_fasta(  $filename )
271    #  #  @seq_entries = read_fasta( "command |" )   #  open and read from pipe
272    #  # \@seq_entries = read_fasta( "command |" )   #  open and read from pipe
273    #
274    #-----------------------------------------------------------------------------
275    sub read_fasta
276    {
277        my @seqs = map { $_->[2] =~ tr/ \n\r\t//d; $_ }
278                   map { /^(\S+)(\s+([^\n]*\S)?\s*)?\n(.+)$/s ? [ $1, $3 || '', $4 ] : () }
279                   split /^>\s*/m, slurp( @_ );
280        wantarray() ? @seqs : \@seqs;
281    }
282    
283    #-----------------------------------------------------------------------------
284    #  A fast file reader:
285    #
286    #     $data = slurp( )               #  \*STDIN
287    #     $data = slurp( \*FILEHANDLE )  #  an open file handle
288    #     $data = slurp(  $filename )    #  a file name
289    #     $data = slurp( "<$filename" )  #  file with explicit direction
290    #   # $data = slurp( "$command |" )  #  open and read from pipe
291    #
292    #  Note:  It is faster to read lines by reading the file and splitting
293    #         than by reading the lines sequentially.  If space is not an
294    #         issue, this is the way to go.  If space is an issue, then lines
295    #         or records should be processed one-by-one (rather than loading
296    #         the whole input into a string or array).
297    #-----------------------------------------------------------------------------
298    sub slurp
299    {
300        my ( $fh, $close );
301        if ( ref $_[0] eq 'GLOB' )
302        {
303            $fh = shift;
304        }
305        elsif ( $_[0] && ! ref $_[0] )
306        {
307            my $file = shift;
308            if    ( -f $file                       ) { $file = "<$file" }
309            elsif (    $file =~ /^<(.*)$/ && -f $1 ) { }  # Explicit read
310          # elsif (    $file =~ /\S\s*\|$/         ) { }  # Read from a pipe
311            else                                     { return undef }
312            open $fh, $file or return undef;
313            $close = 1;
314        }
315        else
316        {
317            $fh = \*STDIN;
318        }
319    
320        my $out = '';
321        my $inc = 1048576;
322        my $end =       0;
323        my $read;
324        while ( $read = read( $fh, $out, $inc, $end ) ) { $end += $read }
325        close( $fh ) if $close;
326    
327        $out;
328    }
329    
330    
331    #-----------------------------------------------------------------------------
332    #  Previous, 50% slower fasta reader:
333    #-----------------------------------------------------------------------------
334    sub read_fasta_0
335    {
336        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
337        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n";
338    
339      my @seqs = ();      my @seqs = ();
340      my ($id, $desc, $seq) = ("", "", "");      my ($id, $desc, $seq) = ("", "", "");
# Line 90  Line 346 
346              ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");              ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");
347          }          }
348          else {          else {
349              tr/\t 0-9//d;              tr/     0-9//d;
350              $seq .= $_ ;              $seq .= $_ ;
351          }          }
352      }      }
353        close( $fh ) if $close;
354    
355      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }
356      return @seqs;      return wantarray ? @seqs : \@seqs;
357  }  }
358    
359    
360  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
361  #  Read one fasta sequence at a time from a file.  #  Read one fasta sequence at a time from a file.  This is half as fast a
362  #  Return the contents as an array:  (id, description, seq)  #  read_fasta(), but can handle an arbitrarily large file.  State information
363    #  is retained in hashes, so any number of streams can be interlaced.
364    #
365    #      @entry = read_next_fasta_seq( \*FILEHANDLE )
366    #     \@entry = read_next_fasta_seq( \*FILEHANDLE )
367    #      @entry = read_next_fasta_seq(  $filename )
368    #     \@entry = read_next_fasta_seq(  $filename )
369    #      @entry = read_next_fasta_seq()                # \*STDIN
370    #     \@entry = read_next_fasta_seq()                # \*STDIN
371  #  #
372  #     @seq_entry = read_next_fasta_seq(*FILEHANDLE)  #      @entry = ( $id, $description, $seq )
373    #
374    #  When reading at the end of file, () is returned.
375    #  With a filename, reading past this will reopen the file at the beginning.
376  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
377  #  Reading always overshoots, so save next id and description  #  Reading always overshoots, so save next id and description
 #  (these should really be hashes indexed by file handle):  
 #  
 my $next_fasta_seq_header = "";  
378    
379  sub read_next_fasta_seq {  {   #  Use bare block to scope the header hash
     my $fh = shift;  
     wantarray || die "read_next_fasta_seq requires list context\n";  
380    
381      my ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );      my %next_header;
382      my $seq = "";      my %file_handle;
383        my %close_file;
384    
385        sub read_next_fasta_seq
386        {
387            my $fh = $file_handle{ $_[0] };
388            if ( ! $fh )
389            {
390                if ( ref $_[0] )
391                {
392                    return () if ref $_[0] ne 'GLOB';
393                    $fh = $_[0];
394                }
395                elsif ( $_[0] )
396                {
397                    my $file = $_[0];
398                    if    ( -f $file                       ) { $file = "<$file" }
399                    elsif (    $file =~ /^<(.*)$/ && -f $1 ) { }  # Explicit read
400                  # elsif (    $file =~ /\S\s*\|$/         ) { }  # Read from a pipe
401                    else                                     { return () }
402                    open $fh, $file or return ();
403                    $close_file{ $fh } = 1;
404                }
405                else
406                {
407                    $fh = \*STDIN;
408                }
409                $file_handle{ $_[0] } = $fh;
410            }
411    
412            my ( $id, $desc, $seq ) = ( undef, '', '' );
413            if ( defined( $next_header{$fh} ) )
414            {
415                ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
416            }
417            else
418            {
419                $next_header{$fh} = '';
420            }
421    
422      while ( <$fh> ) {          while ( <$fh> )
423            {
424          chomp;          chomp;
425          if ( /^>/ ) {        #  new id              if ( /^>/ )        #  new id
426              $next_fasta_seq_header = $_;              {
427              if ( $id && $seq ) { return ($id, $desc, $seq) }                  $next_header{$fh} = $_;
428              ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );                  if ( defined($id) && $seq )
429              $seq = "";                  {
430          }                      return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq]
431          else {                  }
432              tr/\t 0-9//d;                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
433                    $seq = '';
434                }
435                else
436                {
437                    tr/ \t\r//d;
438              $seq .= $_ ;              $seq .= $_ ;
439          }          }
440      }      }
441    
442      #  Done with file, clear "next header"          #  Done with file; there is no next header:
443    
444            delete $next_header{ $fh };
445    
446            #  Return last set of data:
447    
448      $next_fasta_seq_header = "";          if ( defined($id) && $seq )
449      return ($id && $seq) ? ($id, $desc, $seq) : () ;          {
450                return wantarray ? ($id,$desc,$seq) : [$id,$desc,$seq]
451            }
452    
453            #  Or close everything out (returning the empty list tells caller
454            #  that we are done)
455    
456            if ( $close_file{ $fh } ) { close $fh; delete $close_file{ $fh } }
457            delete $file_handle{ $_[0] };
458    
459            return ();
460        }
461    }
462    
463    
464    #-----------------------------------------------------------------------------
465    #  Read a clustal alignment from a file.
466    #  Save the contents in a list of refs to arrays:  (id, description, seq)
467    #
468    #     @seq_entries = read_clustal_file( $filename )
469    #-----------------------------------------------------------------------------
470    sub read_clustal_file { read_clustal( @_ ) }
471    
472    
473    #-----------------------------------------------------------------------------
474    #  Read a clustal alignment.
475    #  Save the contents in a list of refs to arrays:  (id, description, seq)
476    #
477    #     @seq_entries = read_clustal( )              # STDIN
478    #     @seq_entries = read_clustal( \*FILEHANDLE )
479    #     @seq_entries = read_clustal(  $filename )
480    #-----------------------------------------------------------------------------
481    sub read_clustal {
482        my ( $fh, undef, $close, $unused ) = input_filehandle( shift );
483        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_clustal_file\n";
484    
485        my ( %seq, @ids, $line );
486        while ( defined( $line = <$fh> ) )
487        {
488            ( $line =~ /^[A-Za-z0-9]/ ) or next;
489            chomp $line;
490            my @flds = split /\s+/, $line;
491            if ( @flds == 2 )
492            {
493                $seq{ $flds[0] } or push @ids, $flds[0];
494                push @{ $seq{ $flds[0] } }, $flds[1];
495            }
496        }
497        close( $fh ) if $close;
498    
499        map { [ $_, "", join( "", @{$seq{$_}} ) ] } @ids;
500  }  }
501    
502    
# Line 145  Line 506 
506  #     ($id, $def) = parse_fasta_title( $title )  #     ($id, $def) = parse_fasta_title( $title )
507  #     ($id, $def) = split_fasta_title( $title )  #     ($id, $def) = split_fasta_title( $title )
508  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
509  sub parse_fasta_title {  sub parse_fasta_title
510    {
511      my $title = shift;      my $title = shift;
512      chomp;      chomp $title;
513      if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {  
514          return ($1, $3 ? $3 : "");      return $title =~ /^>?\s*(\S+)(\s+(.*\S)?\s*)?$/ ? ( $1, $3 || '' )
515             : $title =~ /^>/                           ? ( '', '' )
516             :                                            ( undef, undef )
517      }      }
518      else {  
519          return ("", "");  sub split_fasta_title { parse_fasta_title( @_ ) }
520    
521    
522    #-----------------------------------------------------------------------------
523    #  Helper function for defining an output filehandle:
524    #     filehandle is passed through
525    #     string is taken as file name to be openend
526    #     undef or "" defaults to STDOUT
527    #
528    #    ( \*FH, $name, $close [, $file] ) = output_filehandle( $file );
529    #
530    #-----------------------------------------------------------------------------
531    sub output_filehandle
532    {
533        my $file = shift;
534    
535        #  FILEHANDLE
536    
537        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
538    
539        #  Null string or undef
540    
541        return ( \*STDOUT, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
542    
543        #  File name
544    
545        if ( ! ref( $file ) )
546        {
547            my $fh;
548            open( $fh, ">$file" ) || die "Could not open output $file\n";
549            return ( $fh, $file, 1 );
550      }      }
551    
552        #  Some other kind of reference; return the unused value
553    
554        return ( \*STDOUT, undef, 0, $file );
555  }  }
556    
557  sub split_fasta_title {  
558      parse_fasta_title ( shift );  #-----------------------------------------------------------------------------
559    #  Legacy function for printing fasta sequence set:
560    #
561    #     print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );
562    #-----------------------------------------------------------------------------
563    sub print_seq_list_as_fasta { print_alignment_as_fasta( @_ ) }
564    
565    
566    #-----------------------------------------------------------------------------
567    #  Print list of sequence entries in fasta format.
568    #  Missing, undef or "" filename defaults to STDOUT.
569    #
570    #     print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
571    #     print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
572    #     print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
573    #     print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
574    #     print_alignment_as_fasta(  $filename,    @seq_entry_list );
575    #     print_alignment_as_fasta(  $filename,   \@seq_entry_list );
576    #-----------------------------------------------------------------------------
577    sub print_alignment_as_fasta {
578        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
579        ( unshift @_, $unused ) if $unused;
580    
581        ( ref( $_[0] ) eq "ARRAY" ) or confess "Bad sequence entry passed to print_alignment_as_fasta\n";
582    
583        #  Expand the sequence entry list if necessary:
584    
585        if ( ref( $_[0]->[0] ) eq "ARRAY" ) { @_ = @{ $_[0] } }
586    
587        foreach my $seq_ptr ( @_ ) { print_seq_as_fasta( $fh, @$seq_ptr ) }
588    
589        close( $fh ) if $close;
590  }  }
591    
592    
593  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
594  #  Print list of sequence entries in fasta format  #  Print list of sequence entries in phylip format.
595    #  Missing, undef or "" filename defaults to STDOUT.
596  #  #
597  #     print_seq_list_as_fasta(*FILEHANDLE, @seq_entry_list);  #     print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
598    #     print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
599    #     print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
600    #     print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
601    #     print_alignment_as_phylip(  $filename,    @seq_entry_list );
602    #     print_alignment_as_phylip(  $filename,   \@seq_entry_list );
603  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
604  sub print_seq_list_as_fasta {  sub print_alignment_as_phylip {
605      my $fh = shift;      my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
606      my @seq_list = @_;      ( unshift @_, $unused ) if $unused;
607    
608        ( ref( $_[0] ) eq "ARRAY" ) or die die "Bad sequence entry passed to print_alignment_as_phylip\n";
609    
610        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
611    
612        my ( %id2, %used );
613        my $maxlen = 0;
614        foreach ( @seq_list )
615        {
616            my ( $id, undef, $seq ) = @$_;
617    
618            #  Need a name that is unique within 10 characters
619    
620            my $id2 = substr( $id, 0, 10 );
621            $id2 =~ s/_/ /g;  # PHYLIP sequence files accept spaces
622            my $n = "0";
623            while ( $used{ $id2 } )
624            {
625                $n++;
626                $id2 = substr( $id, 0, 10 - length( $n ) ) . $n;
627            }
628            $used{ $id2 } = 1;
629            $id2{ $id } = $id2;
630    
631                    #  Prepare to pad sequences (should not be necessary, but ...)
632    
633            my $len = length( $seq );
634            $maxlen = $len if ( $len > $maxlen );
635        }
636    
637      foreach my $seq_ptr (@seq_list) {      my $nseq = @seq_list;
638          print_seq_as_fasta($fh, @$seq_ptr);      print $fh "$nseq  $maxlen\n";
639        foreach ( @seq_list )
640        {
641            my ( $id, undef, $seq ) = @$_;
642            my $len = length( $seq );
643            printf $fh "%-10s  %s%s\n", $id2{ $id },
644                                        $seq,
645                                        $len<$maxlen ? ("?" x ($maxlen-$len)) : "";
646      }      }
647    
648        close( $fh ) if $close;
649    }
650    
651    
652    #-----------------------------------------------------------------------------
653    #  Print list of sequence entries in nexus format.
654    #  Missing, undef or "" filename defaults to STDOUT.
655    #
656    #     print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
657    #     print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
658    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
659    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
660    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
661    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
662    #-----------------------------------------------------------------------------
663    sub print_alignment_as_nexus {
664        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
665        ( unshift @_, $unused ) if $unused;
666    
667        my $lbls = ( ref( $_[0] ) eq "HASH" ) ? shift : undef;
668    
669        ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_nexus\n";
670    
671        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
672    
673        my %id2;
674        my ( $maxidlen, $maxseqlen ) = ( 0, 0 );
675        my ( $n1, $n2, $nt, $nu ) = ( 0, 0, 0, 0 );
676        foreach ( @seq_list )
677        {
678            my ( $id, undef, $seq ) = @$_;
679            my $id2 = $lbls ? ( $lbls->{ $id } || $id ) : $id;
680            if ( $id2 !~ /^[-+.0-9A-Za-z~_|]+$/ )
681            {
682                    $id2 =~ s/'/''/g;
683                    $id2 = qq('$id2');
684                }
685            $id2{ $id } = $id2;
686            my $idlen = length( $id2 );
687            $maxidlen = $idlen if ( $idlen > $maxidlen );
688    
689            my $seqlen = length( $seq );
690            $maxseqlen = $seqlen if ( $seqlen > $maxseqlen );
691    
692            $nt += $seq =~ tr/Tt//d;
693            $nu += $seq =~ tr/Uu//d;
694            $n1 += $seq =~ tr/ACGNacgn//d;
695            $n2 += $seq =~ tr/A-Za-z//d;
696        }
697    
698        my $nseq = @seq_list;
699        my $type = ( $n1 < 2 * $n2 ) ?  'protein' : ($nt>$nu) ? 'DNA' : 'RNA';
700    
701        print $fh <<"END_HEAD";
702    #NEXUS
703    
704    BEGIN Data;
705        Dimensions
706            NTax=$nseq
707            NChar=$maxseqlen
708            ;
709        Format
710            DataType=$type
711            Gap=-
712            Missing=?
713            ;
714        Matrix
715    
716    END_HEAD
717    
718        foreach ( @seq_list )
719        {
720            my ( $id, undef, $seq ) = @$_;
721            my $len = length( $seq );
722            printf  $fh  "%-${maxidlen}s  %s%s\n",
723                         $id2{ $id },
724                         $seq,
725                         $len<$maxseqlen ? ("?" x ($maxseqlen-$len)) : "";
726        }
727    
728        print $fh <<"END_TAIL";
729    ;
730    END;
731    END_TAIL
732    
733        close( $fh ) if $close;
734  }  }
735    
736    
737  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
738  #  Print one sequence in fasta format to an open file  #  Print one sequence in fasta format to an open file
739  #  #
740  #     print_seq_as_fasta(*FILEHANDLE, $id, $desc, $seq);  #     print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq );
741  #     print_seq_as_fasta(*FILEHANDLE, @seq_entry);  #     print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
742  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
743  sub print_seq_as_fasta {  sub print_seq_as_fasta {
744      my $fh = shift;      my $fh = shift;
# Line 197  Line 755 
755  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
756  #  Print one sequence in GenBank flat file format:  #  Print one sequence in GenBank flat file format:
757  #  #
758  #     print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #     print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq )
759  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
760  sub print_gb_locus {  sub print_gb_locus {
761      my ($fh, $loc, $def, $acc, $seq) = @_;      my ($fh, $loc, $def, $acc, $seq) = @_;
# Line 223  Line 781 
781    
782    
783  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
784    #  Return a string with text wrapped to defined line lengths:
785    #
786    #     $wrapped_text = wrap_text( $str )                  # default len   =  80
787    #     $wrapped_text = wrap_text( $str, $len )            # default ind   =   0
788    #     $wrapped_text = wrap_text( $str, $len, $indent )   # default ind_n = ind
789    #     $wrapped_text = wrap_text( $str, $len, $indent_1, $indent_n )
790    #-----------------------------------------------------------------------------
791    sub wrap_text {
792        my ($str, $len, $ind, $indn) = @_;
793    
794        defined($str)  || die "wrap_text called without a string\n";
795        defined($len)  || ($len  =   80);
796        defined($ind)  || ($ind  =    0);
797        ($ind  < $len) || die "wrap error: indent greater than line length\n";
798        defined($indn) || ($indn = $ind);
799        ($indn < $len) || die "wrap error: indent_n greater than line length\n";
800    
801        $str =~ s/\s+$//;
802        $str =~ s/^\s+//;
803        my ($maxchr, $maxchr1);
804        my (@lines) = ();
805    
806        while ($str) {
807            $maxchr1 = ($maxchr = $len - $ind) - 1;
808            if ($maxchr >= length($str)) {
809                push @lines, (" " x $ind) . $str;
810                last;
811            }
812            elsif ($str =~ /^(.{0,$maxchr1}\S)\s+(\S.*)$/) { # no expr in {}
813                push @lines, (" " x $ind) . $1;
814                $str = $2;
815            }
816            elsif ($str =~ /^(.{0,$maxchr1}-)(.*)$/) {
817                push @lines, (" " x $ind) . $1;
818                $str = $2;
819            }
820            else {
821                push @lines, (" " x $ind) . substr($str, 0, $maxchr);
822                $str = substr($str, $maxchr);
823            }
824            $ind = $indn;
825        }
826    
827        return join("\n", @lines);
828    }
829    
830    
831    #-----------------------------------------------------------------------------
832  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
833  #  #
834  #     my %seq_ind  = index_seq_list(@seq_list);  #     my \%seq_ind  = index_seq_list(  @seq_list );
835    #     my \%seq_ind  = index_seq_list( \@seq_list );
836  #  #
837  #  Usage example:  #  Usage example:
838  #  #
839  #  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries  #  my  @seq_list   = read_fasta_seqs(\*STDIN);  # list of pointers to entries
840  #  my %seq_ind    = index_seq_list(@seq_list); # hash from names to pointers  #  my \%seq_ind    = index_seq_list(@seq_list); # hash from names to pointers
841  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry
842  #  #
843  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
844  sub index_seq_list {  sub index_seq_list {
845      my %seq_index = map { @{$_}[0] => $_ } @_;      ( ref( $_[0] )      ne 'ARRAY' ) ? {}
846      return \%seq_index;    : ( ref( $_[0]->[0] ) ne 'ARRAY' ) ? { map { $_->[0] => $_ } @_ }
847      :                                    { map { $_->[0] => $_ } @{ $_[0] } }
848  }  }
849    
850    
851  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
852  #  Three routines to access all or part of sequence entry by id:  #  Three routines to access all or part of sequence entry by id:
853  #  #
854  #     my @seq_entry  = seq_entry_by_id( \%seq_index, $seq_id );  #     @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
855  #     my $seq_desc   = seq_desc_by_id(  \%seq_index, $seq_id );  #    \@seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
856  #     my $seq        = seq_data_by_id(  \%seq_index, $seq_id );  #     $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
857    #     $seq       = seq_data_by_id(  \%seq_index, $seq_id );
858  #  #
859  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
860  sub seq_entry_by_id {  sub seq_entry_by_id {
861      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";
862      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";
863      wantarray || die "entry_by_id requires list context\n";      return wantarray ? @{ $ind_ref->{$id} } : $ind_ref->{$id};
     return @{ $ind_ref->{$id} };  
864  }  }
865    
866    
# Line 269  Line 877 
877      return ${ $ind_ref->{$id} }[2];      return ${ $ind_ref->{$id} }[2];
878  }  }
879    
880    #-----------------------------------------------------------------------------
881    #  Remove columns of alignment gaps from sequences:
882    #
883    #   @packed_seqs = pack_alignment(  @seqs )
884    #   @packed_seqs = pack_alignment( \@seqs )
885    #  \@packed_seqs = pack_alignment(  @seqs )
886    #  \@packed_seqs = pack_alignment( \@seqs )
887    #
888    #-----------------------------------------------------------------------------
889    
890    sub pack_alignment
891    {
892        my @seqs = ( ref( $_[0] ) eq 'ARRAY' and ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
893        @seqs or return wantarray ? () : [];
894    
895        my $mask  = pack_mask( $seqs[0]->[2] );
896        foreach ( @seqs[ 1 .. (@seqs-1) ] )
897        {
898            $mask |= pack_mask( $_->[2] );
899        }
900    
901        my $seq;
902        my @seqs2 = map { $seq = $_->[2] & $mask;
903                          $seq =~ tr/\000//d;
904                          [ $_->[0], $_->[1], $seq ]
905                        }
906                    @seqs;
907    
908        return wantarray ? @seqs2 : \@seqs2;
909    }
910    
911    sub pack_mask
912    {
913        my $mask = shift;
914        $mask =~ tr/-/\000/;
915        $mask =~ tr/\000/\377/c;
916        return $mask;
917    }
918    
919  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
920  #  Some simple sequence manipulations:  #  Some simple sequence manipulations:
# Line 332  Line 978 
978      }      }
979    
980      if ( $fix_id ) {      if ( $fix_id ) {
981          if ( ( $id =~ s/_([0-9]+)_([0-9]+)$// )          if ( ( $id =~ s/_(\d+)_(\d+)$// )
982            && ( abs($2-$1)+1 == $len ) ) {            && ( abs($2-$1)+1 == $len ) ) {
983              if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }              if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }
984              else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }              else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }
# Line 344  Line 990 
990  }  }
991    
992    
993    sub DNA_subseq
994    {
995        my ( $seq, $from, $to ) = @_;
996    
997        my $len = ref( $seq ) eq 'SCALAR' ? length( $$seq )
998                                          : length(  $seq );
999        if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len }
1000        if ( ( $to   eq '$' ) || ( ! $to       ) ) { $to   = $len }
1001    
1002        my $left  = ( $from < $to ) ? $from : $to;
1003        my $right = ( $from < $to ) ? $to   : $from;
1004        if ( ( $right < 1 ) || ( $left > $len ) ) { return "" }
1005        if ( $right > $len ) { $right = $len }
1006        if ( $left  < 1    ) { $left  =    1 }
1007    
1008        my $subseq = ref( $seq ) eq 'SCALAR' ? substr( $$seq, $left-1, $right-$left+1 )
1009                                             : substr(  $seq, $left-1, $right-$left+1 );
1010    
1011        if ( $from > $to )
1012        {
1013            $subseq = reverse $subseq;
1014            $subseq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
1015                         [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
1016        }
1017    
1018        $subseq
1019    }
1020    
1021    
1022    sub RNA_subseq
1023    {
1024        my ( $seq, $from, $to ) = @_;
1025    
1026        my $len = ref( $seq ) eq 'SCALAR' ? length( $$seq )
1027                                          : length(  $seq );
1028        if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len }
1029        if ( ( $to   eq '$' ) || ( ! $to       ) ) { $to   = $len }
1030    
1031        my $left  = ( $from < $to ) ? $from : $to;
1032        my $right = ( $from < $to ) ? $to   : $from;
1033        if ( ( $right < 1 ) || ( $left > $len ) ) { return "" }
1034        if ( $right > $len ) { $right = $len }
1035        if ( $left  < 1    ) { $left  =    1 }
1036    
1037        my $subseq = ref( $seq ) eq 'SCALAR' ? substr( $$seq, $left-1, $right-$left+1 )
1038                                             : substr(  $seq, $left-1, $right-$left+1 );
1039    
1040        if ( $from > $to )
1041        {
1042            $subseq = reverse $subseq;
1043            $subseq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
1044                         [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
1045        }
1046    
1047        $subseq
1048    }
1049    
1050    
1051  sub complement_DNA_entry {  sub complement_DNA_entry {
1052      my ($id, $desc, $seq, $fix_id) = @_;      my ($id, $desc, $seq, $fix_id) = @_;
1053      $fix_id ||= 0;     #  fix undef values      $fix_id ||= 0;     #  fix undef values
# Line 353  Line 1057 
1057      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
1058                [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];                [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
1059      if ($fix_id) {      if ($fix_id) {
1060          if ($id =~ s/_([0-9]+)_([0-9]+)$//) {          if ($id =~ s/_(\d+)_(\d+)$//) {
1061              $id .= "_" . $2 . "_" . $1;              $id .= "_" . $2 . "_" . $1;
1062          }          }
1063          else {          else {
# Line 374  Line 1078 
1078      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
1079                [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];                [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
1080      if ($fix_id) {      if ($fix_id) {
1081          if ($id =~ s/_([0-9]+)_([0-9]+)$//) {          if ($id =~ s/_(\d+)_(\d+)$//) {
1082              $id .= "_" . $2 . "_" . $1;              $id .= "_" . $2 . "_" . $1;
1083          }          }
1084          else {          else {
# Line 416  Line 1120 
1120  }  }
1121    
1122    
1123    sub pack_seq {
1124        my $seq = shift;
1125        $seq =~ tr/A-Za-z//cd;
1126        return $seq;
1127    }
1128    
1129    
1130  sub clean_ae_sequence {  sub clean_ae_sequence {
1131      $_ = shift;      $_ = shift;
1132      $_ = to7bit($_);      $_ = to7bit($_);
# Line 454  Line 1165 
1165  #  Translate nucleotides to one letter protein:  #  Translate nucleotides to one letter protein:
1166  #  #
1167  #  $seq = translate_seq( $seq [, $met_start] )  #  $seq = translate_seq( $seq [, $met_start] )
1168  #  $aa  = translate_codon( $triplet );  #     $aa  = translate_codon( $triplet )
1169  #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );  #     $aa  = translate_DNA_codon( $triplet )     # Does not rely on DNA
1170    #     $aa  = translate_uc_DNA_codon( $triplet )  # Does not rely on uc or DNA
1171  #  #
1172  #  User-supplied genetic code must be upper case index and match the  #  User-supplied genetic code must be upper case index and match the
1173  #  DNA versus RNA type of sequence  #  DNA versus RNA type of sequence
# Line 464  Line 1176 
1176  #  #
1177  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1178    
1179  our %genetic_code = (   # work as DNA  @aa_1_letter_order = qw( A C D E F G H I K L M N P Q R S T V W Y );  # Alpha by 1 letter
1180      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",  @aa_3_letter_order = qw( A R N D C Q E G H I L K M F P S T W Y V );  # PAM matrix order
1181      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",  @aa_n_codon_order  = qw( L R S A G P T V I C D E F H K N Q Y M W );
1182      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",  
1183      TTG => "L",  TCG => "S",  TAG => "*",  TGG => "W",  %genetic_code = (
1184      CTT => "L",  CCT => "P",  CAT => "H",  CGT => "R",  
1185      CTC => "L",  CCC => "P",  CAC => "H",  CGC => "R",      # DNA version
1186      CTA => "L",  CCA => "P",  CAA => "Q",  CGA => "R",  
1187      CTG => "L",  CCG => "P",  CAG => "Q",  CGG => "R",      TTT => 'F',  TCT => 'S',  TAT => 'Y',  TGT => 'C',
1188      ATT => "I",  ACT => "T",  AAT => "N",  AGT => "S",      TTC => 'F',  TCC => 'S',  TAC => 'Y',  TGC => 'C',
1189      ATC => "I",  ACC => "T",  AAC => "N",  AGC => "S",      TTA => 'L',  TCA => 'S',  TAA => '*',  TGA => '*',
1190      ATA => "I",  ACA => "T",  AAA => "K",  AGA => "R",      TTG => 'L',  TCG => 'S',  TAG => '*',  TGG => 'W',
1191      ATG => "M",  ACG => "T",  AAG => "K",  AGG => "R",      CTT => 'L',  CCT => 'P',  CAT => 'H',  CGT => 'R',
1192      GTT => "V",  GCT => "A",  GAT => "D",  GGT => "G",      CTC => 'L',  CCC => 'P',  CAC => 'H',  CGC => 'R',
1193      GTC => "V",  GCC => "A",  GAC => "D",  GGC => "G",      CTA => 'L',  CCA => 'P',  CAA => 'Q',  CGA => 'R',
1194      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",      CTG => 'L',  CCG => 'P',  CAG => 'Q',  CGG => 'R',
1195      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",      ATT => 'I',  ACT => 'T',  AAT => 'N',  AGT => 'S',
1196        ATC => 'I',  ACC => 'T',  AAC => 'N',  AGC => 'S',
1197        ATA => 'I',  ACA => 'T',  AAA => 'K',  AGA => 'R',
1198        ATG => 'M',  ACG => 'T',  AAG => 'K',  AGG => 'R',
1199        GTT => 'V',  GCT => 'A',  GAT => 'D',  GGT => 'G',
1200        GTC => 'V',  GCC => 'A',  GAC => 'D',  GGC => 'G',
1201        GTA => 'V',  GCA => 'A',  GAA => 'E',  GGA => 'G',
1202        GTG => 'V',  GCG => 'A',  GAG => 'E',  GGG => 'G',
1203    
1204      #  The following ambiguous encodings are not necessary,  but      #  The following ambiguous encodings are not necessary,  but
1205      #  speed up the processing of some common ambiguous triplets:      #  speed up the processing of some ambiguous triplets:
1206    
1207        TTY => 'F',  TCY => 'S',  TAY => 'Y',  TGY => 'C',
1208        TTR => 'L',  TCR => 'S',  TAR => '*',
1209                     TCN => 'S',
1210        CTY => 'L',  CCY => 'P',  CAY => 'H',  CGY => 'R',
1211        CTR => 'L',  CCR => 'P',  CAR => 'Q',  CGR => 'R',
1212        CTN => 'L',  CCN => 'P',               CGN => 'R',
1213        ATY => 'I',  ACY => 'T',  AAY => 'N',  AGY => 'S',
1214                     ACR => 'T',  AAR => 'K',  AGR => 'R',
1215                     ACN => 'T',
1216        GTY => 'V',  GCY => 'A',  GAY => 'D',  GGY => 'G',
1217        GTR => 'V',  GCR => 'A',  GAR => 'E',  GGR => 'G',
1218        GTN => 'V',  GCN => 'A',               GGN => 'G'
1219    );
1220    
1221    #  Add RNA by construction:
1222    
1223      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",  foreach ( grep { /T/ } keys %genetic_code )
1224      TTR => "L",  TCR => "S",  TAR => "*",  {
1225                   TCN => "S",      my $codon = $_;
1226      CTY => "L",  CCY => "P",  CAY => "H",  CGY => "R",      $codon =~ s/T/U/g;
1227      CTR => "L",  CCR => "P",  CAR => "Q",  CGR => "R",      $genetic_code{ $codon } = lc $genetic_code{ $_ }
1228      CTN => "L",  CCN => "P",               CGN => "R",  }
1229      ATY => "I",  ACY => "T",  AAY => "N",  AGY => "S",  
1230                   ACR => "T",  AAR => "K",  AGR => "R",  #  Add lower case by construction:
1231                   ACN => "T",  
1232      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",  foreach ( keys %genetic_code )
1233      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",  {
1234      GTN => "V",  GCN => "A",               GGN => "G"      $genetic_code{ lc $_ } = lc $genetic_code{ $_ }
1235    }
1236    
1237    
1238    #  Construct the genetic code with selenocysteine by difference:
1239    
1240    %genetic_code_with_U = %genetic_code;
1241    $genetic_code_with_U{ TGA } = 'U';
1242    $genetic_code_with_U{ tga } = 'u';
1243    $genetic_code_with_U{ UGA } = 'U';
1244    $genetic_code_with_U{ uga } = 'u';
1245    
1246    
1247    %amino_acid_codons_DNA = (
1248             L  => [ qw( TTA TTG CTA CTG CTT CTC ) ],
1249             R  => [ qw( AGA AGG CGA CGG CGT CGC ) ],
1250             S  => [ qw( AGT AGC TCA TCG TCT TCC ) ],
1251             A  => [ qw( GCA GCG GCT GCC ) ],
1252             G  => [ qw( GGA GGG GGT GGC ) ],
1253             P  => [ qw( CCA CCG CCT CCC ) ],
1254             T  => [ qw( ACA ACG ACT ACC ) ],
1255             V  => [ qw( GTA GTG GTT GTC ) ],
1256             I  => [ qw( ATA ATT ATC ) ],
1257             C  => [ qw( TGT TGC ) ],
1258             D  => [ qw( GAT GAC ) ],
1259             E  => [ qw( GAA GAG ) ],
1260             F  => [ qw( TTT TTC ) ],
1261             H  => [ qw( CAT CAC ) ],
1262             K  => [ qw( AAA AAG ) ],
1263             N  => [ qw( AAT AAC ) ],
1264             Q  => [ qw( CAA CAG ) ],
1265             Y  => [ qw( TAT TAC ) ],
1266             M  => [ qw( ATG ) ],
1267             U  => [ qw( TGA ) ],
1268             W  => [ qw( TGG ) ],
1269    
1270             l  => [ qw( tta ttg cta ctg ctt ctc ) ],
1271             r  => [ qw( aga agg cga cgg cgt cgc ) ],
1272             s  => [ qw( agt agc tca tcg tct tcc ) ],
1273             a  => [ qw( gca gcg gct gcc ) ],
1274             g  => [ qw( gga ggg ggt ggc ) ],
1275             p  => [ qw( cca ccg cct ccc ) ],
1276             t  => [ qw( aca acg act acc ) ],
1277             v  => [ qw( gta gtg gtt gtc ) ],
1278             i  => [ qw( ata att atc ) ],
1279             c  => [ qw( tgt tgc ) ],
1280             d  => [ qw( gat gac ) ],
1281             e  => [ qw( gaa gag ) ],
1282             f  => [ qw( ttt ttc ) ],
1283             h  => [ qw( cat cac ) ],
1284             k  => [ qw( aaa aag ) ],
1285             n  => [ qw( aat aac ) ],
1286             q  => [ qw( caa cag ) ],
1287             y  => [ qw( tat tac ) ],
1288             m  => [ qw( atg ) ],
1289             u  => [ qw( tga ) ],
1290             w  => [ qw( tgg ) ],
1291    
1292            '*' => [ qw( TAA TAG TGA ) ]
1293    );
1294    
1295    
1296    
1297    %amino_acid_codons_RNA = (
1298             L  => [ qw( UUA UUG CUA CUG CUU CUC ) ],
1299             R  => [ qw( AGA AGG CGA CGG CGU CGC ) ],
1300             S  => [ qw( AGU AGC UCA UCG UCU UCC ) ],
1301             A  => [ qw( GCA GCG GCU GCC ) ],
1302             G  => [ qw( GGA GGG GGU GGC ) ],
1303             P  => [ qw( CCA CCG CCU CCC ) ],
1304             T  => [ qw( ACA ACG ACU ACC ) ],
1305             V  => [ qw( GUA GUG GUU GUC ) ],
1306             B  => [ qw( GAU GAC AAU AAC ) ],
1307             Z  => [ qw( GAA GAG CAA CAG ) ],
1308             I  => [ qw( AUA AUU AUC ) ],
1309             C  => [ qw( UGU UGC ) ],
1310             D  => [ qw( GAU GAC ) ],
1311             E  => [ qw( GAA GAG ) ],
1312             F  => [ qw( UUU UUC ) ],
1313             H  => [ qw( CAU CAC ) ],
1314             K  => [ qw( AAA AAG ) ],
1315             N  => [ qw( AAU AAC ) ],
1316             Q  => [ qw( CAA CAG ) ],
1317             Y  => [ qw( UAU UAC ) ],
1318             M  => [ qw( AUG ) ],
1319             U  => [ qw( UGA ) ],
1320             W  => [ qw( UGG ) ],
1321    
1322             l  => [ qw( uua uug cua cug cuu cuc ) ],
1323             r  => [ qw( aga agg cga cgg cgu cgc ) ],
1324             s  => [ qw( agu agc uca ucg ucu ucc ) ],
1325             a  => [ qw( gca gcg gcu gcc ) ],
1326             g  => [ qw( gga ggg ggu ggc ) ],
1327             p  => [ qw( cca ccg ccu ccc ) ],
1328             t  => [ qw( aca acg acu acc ) ],
1329             v  => [ qw( gua gug guu guc ) ],
1330             b  => [ qw( gau gac aau aac ) ],
1331             z  => [ qw( gaa gag caa cag ) ],
1332             i  => [ qw( aua auu auc ) ],
1333             c  => [ qw( ugu ugc ) ],
1334             d  => [ qw( gau gac ) ],
1335             e  => [ qw( gaa gag ) ],
1336             f  => [ qw( uuu uuc ) ],
1337             h  => [ qw( cau cac ) ],
1338             k  => [ qw( aaa aag ) ],
1339             n  => [ qw( aau aac ) ],
1340             q  => [ qw( caa cag ) ],
1341             y  => [ qw( uau uac ) ],
1342             m  => [ qw( aug ) ],
1343             u  => [ qw( uga ) ],
1344             w  => [ qw( ugg ) ],
1345    
1346            '*' => [ qw( UAA UAG UGA ) ]
1347  );  );
1348    
1349    
1350  my %DNA_letter_can_be = (  %n_codon_for_aa = map {
1351        $_ => scalar @{ $amino_acid_codons_DNA{ $_ } }
1352        } keys %amino_acid_codons_DNA;
1353    
1354    
1355    %reverse_genetic_code_DNA = (
1356             A  => "GCN",  a  => "gcn",
1357             C  => "TGY",  c  => "tgy",
1358             D  => "GAY",  d  => "gay",
1359             E  => "GAR",  e  => "gar",
1360             F  => "TTY",  f  => "tty",
1361             G  => "GGN",  g  => "ggn",
1362             H  => "CAY",  h  => "cay",
1363             I  => "ATH",  i  => "ath",
1364             K  => "AAR",  k  => "aar",
1365             L  => "YTN",  l  => "ytn",
1366             M  => "ATG",  m  => "atg",
1367             N  => "AAY",  n  => "aay",
1368             P  => "CCN",  p  => "ccn",
1369             Q  => "CAR",  q  => "car",
1370             R  => "MGN",  r  => "mgn",
1371             S  => "WSN",  s  => "wsn",
1372             T  => "ACN",  t  => "acn",
1373             U  => "TGA",  u  => "tga",
1374             V  => "GTN",  v  => "gtn",
1375             W  => "TGG",  w  => "tgg",
1376             X  => "NNN",  x  => "nnn",
1377             Y  => "TAY",  y  => "tay",
1378            '*' => "TRR"
1379    );
1380    
1381    %reverse_genetic_code_RNA = (
1382             A  => "GCN",  a  => "gcn",
1383             C  => "UGY",  c  => "ugy",
1384             D  => "GAY",  d  => "gay",
1385             E  => "GAR",  e  => "gar",
1386             F  => "UUY",  f  => "uuy",
1387             G  => "GGN",  g  => "ggn",
1388             H  => "CAY",  h  => "cay",
1389             I  => "AUH",  i  => "auh",
1390             K  => "AAR",  k  => "aar",
1391             L  => "YUN",  l  => "yun",
1392             M  => "AUG",  m  => "aug",
1393             N  => "AAY",  n  => "aay",
1394             P  => "CCN",  p  => "ccn",
1395             Q  => "CAR",  q  => "car",
1396             R  => "MGN",  r  => "mgn",
1397             S  => "WSN",  s  => "wsn",
1398             T  => "ACN",  t  => "acn",
1399             U  => "UGA",  u  => "uga",
1400             V  => "GUN",  v  => "gun",
1401             W  => "UGG",  w  => "ugg",
1402             X  => "NNN",  x  => "nnn",
1403             Y  => "UAY",  y  => "uay",
1404            '*' => "URR"
1405    );
1406    
1407    
1408    %DNA_letter_can_be = (
1409      A => ["A"],                 a => ["a"],      A => ["A"],                 a => ["a"],
1410      B => ["C", "G", "T"],       b => ["c", "g", "t"],      B => ["C", "G", "T"],       b => ["c", "g", "t"],
1411      C => ["C"],                 c => ["c"],      C => ["C"],                 c => ["c"],
# Line 520  Line 1425 
1425  );  );
1426    
1427    
1428  my %RNA_letter_can_be = (  %RNA_letter_can_be = (
1429      A => ["A"],                 a => ["a"],      A => ["A"],                 a => ["a"],
1430      B => ["C", "G", "U"],       b => ["c", "g", "u"],      B => ["C", "G", "U"],       b => ["c", "g", "u"],
1431      C => ["C"],                 c => ["c"],      C => ["C"],                 c => ["c"],
# Line 540  Line 1445 
1445  );  );
1446    
1447    
1448  my %one_letter_to_three_letter_aa = (  %one_letter_to_three_letter_aa = (
1449       A  => "Ala",           A  => "Ala", a  => "Ala",
1450       B  => "Asx",           B  => "Asx", b  => "Asx",
1451       C  => "Cys",           C  => "Cys", c  => "Cys",
1452       D  => "Asp",           D  => "Asp", d  => "Asp",
1453       E  => "Glu",           E  => "Glu", e  => "Glu",
1454       F  => "Phe",           F  => "Phe", f  => "Phe",
1455       G  => "Gly",           G  => "Gly", g  => "Gly",
1456       H  => "His",           H  => "His", h  => "His",
1457       I  => "Ile",           I  => "Ile", i  => "Ile",
1458       K  => "Lys",           K  => "Lys", k  => "Lys",
1459       L  => "Leu",           L  => "Leu", l  => "Leu",
1460       M  => "Met",           M  => "Met", m  => "Met",
1461       N  => "Asn",           N  => "Asn", n  => "Asn",
1462       P  => "Pro",           P  => "Pro", p  => "Pro",
1463       Q  => "Gln",           Q  => "Gln", q  => "Gln",
1464       R  => "Arg",           R  => "Arg", r  => "Arg",
1465       S  => "Ser",           S  => "Ser", s  => "Ser",
1466       T  => "Thr",           T  => "Thr", t  => "Thr",
1467       U  => "Sec",           U  => "Sec", u  => "Sec",
1468       V  => "Val",           V  => "Val", v  => "Val",
1469       W  => "Trp",           W  => "Trp", w  => "Trp",
1470       X  => "Xxx",           X  => "Xxx", x  => "Xxx",
1471       Y  => "Tyr",           Y  => "Tyr", y  => "Tyr",
1472       Z  => "Glx",           Z  => "Glx", z  => "Glx",
1473      "*" => "***"          '*' => "***"
1474            );
1475    
1476    
1477    %three_letter_to_one_letter_aa = (
1478         ALA  => "A",   Ala  => "A",   ala  => "a",
1479         ARG  => "R",   Arg  => "R",   arg  => "r",
1480         ASN  => "N",   Asn  => "N",   asn  => "n",
1481         ASP  => "D",   Asp  => "D",   asp  => "d",
1482         ASX  => "B",   Asx  => "B",   asx  => "b",
1483         CYS  => "C",   Cys  => "C",   cys  => "c",
1484         GLN  => "Q",   Gln  => "Q",   gln  => "q",
1485         GLU  => "E",   Glu  => "E",   glu  => "e",
1486         GLX  => "Z",   Glx  => "Z",   glx  => "z",
1487         GLY  => "G",   Gly  => "G",   gly  => "g",
1488         HIS  => "H",   His  => "H",   his  => "h",
1489         ILE  => "I",   Ile  => "I",   ile  => "i",
1490         LEU  => "L",   Leu  => "L",   leu  => "l",
1491         LYS  => "K",   Lys  => "K",   lys  => "k",
1492         MET  => "M",   Met  => "M",   met  => "m",
1493         PHE  => "F",   Phe  => "F",   phe  => "f",
1494         PRO  => "P",   Pro  => "P",   pro  => "p",
1495         SEC  => "U",   Sec  => "U",   sec  => "u",
1496         SER  => "S",   Ser  => "S",   ser  => "s",
1497         THR  => "T",   Thr  => "T",   thr  => "t",
1498         TRP  => "W",   Trp  => "W",   trp  => "w",
1499         TYR  => "Y",   Tyr  => "Y",   tyr  => "y",
1500         VAL  => "V",   Val  => "V",   val  => "v",
1501         XAA  => "X",   Xaa  => "X",   xaa  => "x",
1502         XXX  => "X",   Xxx  => "X",   xxx  => "x",
1503        '***' => "*"
1504  );  );
1505    
1506    
1507  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1508  #  Translate nucleotides to one letter protein:  #  Translate nucleotides to one letter protein.  Respects case of the
1509    #  nucleotide sequence.
1510  #  #
1511  #      $seq = translate_seq( $seq [, $met_start] )  #      $aa = translate_seq( $nt, $met_start )
1512    #      $aa = translate_seq( $nt )
1513  #  #
1514  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1515    
1516  sub translate_seq {  sub translate_seq
1517      my $seq = uc shift;  {
1518      $seq =~ tr/UX/TN/;      #  make it DNA, and allow X      my $seq = shift;
1519      $seq =~ tr/-//d;        #  remove gaps      $seq =~ tr/-//d;        #  remove gaps
1520    
1521      my $met = shift || 0;   #  a second argument that is true      my @codons = $seq =~ m/(...?)/g;  #  Will try to translate last 2 nt
                             #  forces first amino acid to be Met  
                             #  (note: undef is false)  
1522    
1523      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      #  A second argument that is true forces first amino acid to be Met
1524      my $pep  = ($met && ($imax >= 0)) ? "M" : "";  
1525      my $aa;      my @met;
1526      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      if ( ( shift @_ ) && ( my $codon1 = shift @codons ) )
1527          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );      {
1528            push @met, ( $codon1 =~ /[a-z]/ ? 'm' : 'M' );
1529      }      }
1530    
1531      return $pep;      join( '', @met, map { translate_codon( $_ ) } @codons )
1532  }  }
1533    
1534    
1535  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1536  #  Translate a single triplet with "universal" genetic code  #  Translate a single triplet with "universal" genetic code.
 #  Uppercase and DNA are performed, then translate_uc_DNA_codon  
 #  is called.  
1537  #  #
1538  #      $aa = translate_codon( $triplet )  #      $aa = translate_codon( $triplet )
1539    #      $aa = translate_DNA_codon( $triplet )
1540    #      $aa = translate_uc_DNA_codon( $triplet )
1541  #  #
1542  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1543    
1544  sub translate_codon {  sub translate_DNA_codon { translate_codon( @_ ) }
     my $codon = uc shift;  #  Make it uppercase  
     $codon =~ tr/UX/TN/;   #  Make it DNA, and allow X  
     return translate_uc_DNA_codon($codon);  
 }  
1545    
1546    sub translate_uc_DNA_codon { translate_codon( uc $_[0] ) }
1547    
1548  #-----------------------------------------------------------------------------  sub translate_codon
1549  #  Translate a single triplet with "universal" genetic code  {
 #  Uppercase and DNA assumed  
 #  Intended for private use by translate_codon and translate_seq  
 #  
 #      $aa = translate_uc_DNA_codon( $triplet )  
 #  
 #-----------------------------------------------------------------------------  
   
 sub translate_uc_DNA_codon {  
1550      my $codon = shift;      my $codon = shift;
1551      my $aa;      $codon =~ tr/Uu/Tt/;     #  Make it DNA
1552    
1553      #  Try a simple lookup:      #  Try a simple lookup:
1554    
1555        my $aa;
1556      if ( $aa = $genetic_code{ $codon } ) { return $aa }      if ( $aa = $genetic_code{ $codon } ) { return $aa }
1557    
1558      #  With the expanded code defined above, this catches simple N, R      #  Attempt to recover from mixed-case codons:
     #  and Y ambiguities in the third position.  Other codes like  
     #  GG[KMSWBDHV], or even GG, might be unambiguously translated by  
     #  converting the last position to N and seeing if this is in the  
     #  (expanded) code table:  
   
     if ( $aa = $genetic_code{ substr($codon,0,2) . "N" } ) { return $aa }  
   
     #  Test that codon is valid and might have unambiguous aa:  
   
     if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/ ) { return "X" }  
     #                     ^^  
     #                     |+- for leucine YTR  
     #                     +-- for arginine MGR  
1559    
1560      #  Expand all ambiguous nucleotides to see if they all yield same aa.      $codon = ( $codon =~ /[a-z]/ ) ? lc $codon : uc $codon;
1561      #  Loop order tries to fail quickly with first position change.      if ( $aa = $genetic_code{ $codon } ) { return $aa }
1562    
1563      $aa = "";      #  The code defined above catches simple N, R and Y ambiguities in the
1564      for my $n2 ( @{ $DNA_letter_can_be{ substr($codon,1,1) } } ) {      #  third position.  Other codons (e.g., GG[KMSWBDHV], or even GG) might
1565          for my $n3 ( @{ $DNA_letter_can_be{ substr($codon,2,1) } } ) {      #  be unambiguously translated by converting the last position to N and
1566              for my $n1 ( @{ $DNA_letter_can_be{ substr($codon,0,1) } } ) {      #  seeing if this is in the code table:
1567                  #  set the first value of $aa  
1568                  if ($aa eq "") { $aa = $genetic_code{ $n1 . $n2 . $n3 } }      my $N = ( $codon =~ /[a-z]/ ) ? 'n' : 'N';
1569                  #  or break out if any other amino acid is detected      if ( $aa = $genetic_code{ substr($codon,0,2) . $N } ) { return $aa }
1570                  elsif ($aa ne $genetic_code{ $n1 . $n2 . $n3 } ) { return "X" }  
1571              }      #  Test that codon is valid for an unambiguous aa:
1572          }  
1573        my $X = ( $codon =~ /[a-z]/ ) ? 'x' : 'X';
1574        if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/i
1575          && $codon !~ m/^YT[AGR]$/i     #  Leu YTR
1576          && $codon !~ m/^MG[AGR]$/i     #  Arg MGR
1577           )
1578        {
1579            return $X;
1580      }      }
1581    
1582      return $aa || "X";      #  Expand all ambiguous nucleotides to see if they all yield same aa.
1583    
1584        my @n1 = @{ $DNA_letter_can_be{ substr( $codon, 0, 1 ) } };
1585        my $n2 =                        substr( $codon, 1, 1 );
1586        my @n3 = @{ $DNA_letter_can_be{ substr( $codon, 2, 1 ) } };
1587        my @triples = map { my $n12 = $_ . $n2; map { $n12 . $_ } @n3 } @n1;
1588    
1589        my $triple = shift @triples;
1590        $aa = $genetic_code{ $triple };
1591        $aa or return $X;
1592    
1593        foreach $triple ( @triples ) { return $X if $aa ne $genetic_code{$triple} }
1594    
1595        return $aa;
1596  }  }
1597    
1598    
# Line 668  Line 1601 
1601  #  Diagnose the use of upper versus lower, and T versus U in the supplied  #  Diagnose the use of upper versus lower, and T versus U in the supplied
1602  #  code, and transform the supplied nucleotide sequence to match.  #  code, and transform the supplied nucleotide sequence to match.
1603  #  #
1604  #  translate_seq_with_user_code($seq, \%gen_code [, $start_with_met] )  #     $aa = translate_seq_with_user_code( $nt, \%gen_code )
1605    #     $aa = translate_seq_with_user_code( $nt, \%gen_code, $start_with_met )
1606  #  #
1607    #  Modified 2007-11-22 to be less intrusive in these diagnoses by sensing
1608    #  the presence of both versions in the user code.
1609  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1610    
1611  sub translate_seq_with_user_code {  sub translate_seq_with_user_code
1612    {
1613      my $seq = shift;      my $seq = shift;
1614      $seq =~ tr/-//d;     #  remove gaps  ***  Why?      $seq =~ tr/-//d;     #  remove gaps  ***  Why?
     $seq =~ tr/Xx/Nn/;   #  allow X  
1615    
1616      my $gc = shift;      #  Reference to hash of DNA alphabet code      my $gc = shift;      #  Reference to hash of code
1617      if (! $gc || ref($gc) ne "HASH") {      if (! $gc || ref($gc) ne "HASH")
1618          die "translate_seq_with_user_code needs genetic code hash as secondargument.";      {
1619            print STDERR "translate_seq_with_user_code needs genetic code hash as second argument.";
1620            return undef;
1621      }      }
1622    
1623      #  Test the type of code supplied: uppercase versus lowercase      #  Test code support for upper vs lower case:
1624    
1625      my ($RNA_F, $DNA_F, $M, $N, $X);      my ( $TTT, $UUU );
1626        if    ( $gc->{AAA} && ! $gc->{aaa} )   #  Uppercase only code table
1627      if ($gc->{ "AAA" }) {     #  Looks like uppercase code table      {
1628          $seq   = uc $seq;     #  Uppercase sequence          $seq   = uc $seq;     #  Uppercase sequence
1629          $RNA_F = "UUU";       #  Uppercase RNA Phe codon          ( $TTT, $UUU ) = ( 'TTT', 'UUU' );
         $DNA_F = "TTT";       #  Uppercase DNA Phe codon  
         $M     = "M";         #  Uppercase initiator  
         $N     = "N";         #  Uppercase ambiguous nuc  
         $X     = "X";         #  Uppercase ambiguous aa  
1630      }      }
1631      elsif ($gc->{ "aaa" }) {  #  Looks like lowercase code table      elsif ( $gc->{aaa} && ! $gc->{AAA} )   #  Lowercase only code table
1632        {
1633          $seq   = lc $seq;     #  Lowercase sequence          $seq   = lc $seq;     #  Lowercase sequence
1634          $RNA_F = "uuu";       #  Lowercase RNA Phe codon          ( $TTT, $UUU ) = ( 'ttt', 'uuu' );
         $DNA_F = "ttt";       #  Lowercase DNA Phe codon  
         $M     = "m";         #  Lowercase initiator  
         $N     = "n";         #  Lowercase ambiguous nuc  
         $X     = "x";         #  Lowercase ambiguous aa  
1635      }      }
1636      else {      elsif ( $gc->{aaa} )
1637          die "User-supplied genetic code does not have aaa or AAA\n";      {
1638            ( $TTT, $UUU ) = ( 'ttt', 'uuu' );
1639        }
1640        else
1641        {
1642            print STDERR "User-supplied genetic code does not have aaa or AAA\n";
1643            return undef;
1644      }      }
1645    
1646      #  Test the type of code supplied:  UUU versus TTT      #  Test code support for U vs T:
   
     my ($ambigs);  
1647    
1648      if ($gc->{ $RNA_F }) {     #  Looks like RNA code table      my $ambigs;
1649          $seq =~ tr/Tt/Uu/;      if    ( $gc->{$UUU} && ! $gc->{$TTT} )  # RNA only code table
1650        {
1651            $seq = tr/Tt/Uu/;
1652          $ambigs = \%RNA_letter_can_be;          $ambigs = \%RNA_letter_can_be;
1653      }      }
1654      elsif ($gc->{ $DNA_F }) {  #  Looks like DNA code table      elsif ( $gc->{$TTT} && ! $gc->{$UUU} )  # DNA only code table
1655          $seq =~ tr/Uu/Tt/;      {
1656            $seq = tr/Uu/Tt/;
1657          $ambigs = \%DNA_letter_can_be;          $ambigs = \%DNA_letter_can_be;
1658      }      }
1659      else {      else
1660          die "User-supplied genetic code does not have $RNA_F or $DNA_F\n";      {
1661            my $t = $seq =~ tr/Tt//;
1662            my $u = $seq =~ tr/Uu//;
1663            $ambigs = ( $t > $u ) ? \%DNA_letter_can_be : \%RNA_letter_can_be;
1664      }      }
1665    
1666      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      #  We can now do the codon-by-codon translation:
1667    
1668      my $met = shift;     #  a third argument that is true      my @codons = $seq =~ m/(...?)/g;  #  will try to translate last 2 nt
1669                           #  forces first amino acid to be Met  
1670                           #  (note: undef is false)      #  A third argument that is true forces first amino acid to be Met
     my $pep  = ($met && ($imax >= 0)) ? $M : "";  
     my $aa;  
1671    
1672      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      my @met;
1673          $pep .= translate_codon_with_user_code( substr($seq,$i,3), $gc, $N, $X, $ambigs );      if ( ( shift @_ ) && ( my $codon1 = shift @codons ) )
1674        {
1675            push @met, ( $codon1 =~ /[a-z]/ ? 'm' : 'M' );
1676      }      }
1677    
1678      return $pep;      join( '', @met, map { translate_codon_with_user_code( $_, $gc, $ambigs ) } @codons )
1679  }  }
1680    
1681    
1682  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1683  #  Translate with user-supplied genetic code hash.  For speed, no error  #  Translate with user-supplied genetic code hash.  No error check on the code.
1684  #  check on the hash.  Calling programs should check for the hash at a  #  Should only be called through translate_seq_with_user_code.
 #  higher level.  
1685  #  #
1686  #  Should only be called through translate_seq_with_user_code  #     $aa = translate_codon_with_user_code( $triplet, \%code, \%ambig_table )
 #  
 #   translate_codon_with_user_code( $triplet, \%code, $N, $X, $ambig_table )  
1687  #  #
1688  #  $triplet      speaks for itself  #  $triplet      speaks for itself
1689  #  $code         ref to the hash with the codon translations  #  \%code         ref to the hash with the codon translations
1690  #  $N            character to use for ambiguous nucleotide  #  \%ambig_table  ref to hash with lists of nucleotides for each ambig code
 #  $X            character to use for ambiguous amino acid  
 #  $ambig_table  ref to hash with lists of nucleotides for each ambig code  
1691  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1692    
1693    sub translate_codon_with_user_code
1694  sub translate_codon_with_user_code {  {
1695      my $codon = shift;      my ( $codon, $gc, $ambigs ) = @_;
     my $gc    = shift;  
     my $aa;  
1696    
1697      #  Try a simple lookup:      #  Try a simple lookup:
1698    
1699        my $aa;
1700      if ( $aa = $gc->{ $codon } ) { return $aa }      if ( $aa = $gc->{ $codon } ) { return $aa }
1701    
1702      #  Test that codon is valid and might have unambiguous aa:      #  Attempt to recover from mixed-case codons:
1703    
1704        $codon = ( $codon =~ /[a-z]/ ) ? lc $codon : uc $codon;
1705        if ( $aa = $genetic_code{ $codon } ) { return $aa }
1706    
1707      my ($N, $X, $ambigs) = @_;      #  Test that codon is valid for an unambiguous aa:
     if ( $codon =~ m/^[ACGTUMY][ACGTU]$/i ) { $codon .= $N }  
     if ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) { return $X }  
     #                          ^^  
     #                          |+- for leucine YTR  
     #                          +-- for arginine MGR  
1708    
1709      #  Expand all ambiguous nucleotides to see if they all yield same aa.      my $X = ( $codon =~ /[a-z]/ ) ? 'x' : 'X';
     #  Loop order tries to fail quickly with first position change.  
1710    
1711      $aa = "";      if ( $codon =~ m/^[ACGTU][ACGTU]$/i )  # Add N?
1712      for my $n2 ( @{ $ambigs->{ substr($codon,1,1) } } ) {      {
1713          for my $n3 ( @{ $ambigs->{ substr($codon,2,1) } } ) {          $codon .= ( $codon =~ /[a-z]/ ) ? 'n' : 'N';
1714              for my $n1 ( @{ $ambigs->{ substr($codon,0,1) } } ) {      }
1715                  #  set the first value of $aa      #  This makes assumptions about the user code, but tranlating ambiguous
1716                  if ($aa eq "") { $aa = $gc->{ $n1 . $n2 . $n3 } }      #  codons is really a bit off the wall to start with:
1717                  #  break out if any other amino acid is detected      elsif ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) # Valid?
1718                  elsif ($aa ne $gc->{ $n1 . $n2 . $n3 } ) { return "X" }      {
1719              }          return $X;
1720          }          }
1721    
1722        #  Expand all ambiguous nucleotides to see if they all yield same aa.
1723    
1724        my @n1 = @{ $ambigs->{ substr( $codon, 0, 1 ) } };
1725        my $n2 =               substr( $codon, 1, 1 );
1726        my @n3 = @{ $ambigs->{ substr( $codon, 2, 1 ) } };
1727        my @triples = map { my $n12 = $_ . $n2; map { $n12 . $_ } @n3 } @n1;
1728    
1729        my $triple = shift @triples;
1730        $aa = $gc->{ $triple } || $gc->{ lc $triple } || $gc->{ uc $triple };
1731        $aa or return $X;
1732    
1733        foreach $triple ( @triples )
1734        {
1735            return $X if $aa ne ( $gc->{$triple} || $gc->{lc $triple} || $gc->{uc $triple} );
1736      }      }
1737    
1738      return $aa || $X;      return $aa;
1739  }  }
1740    
1741    
# Line 796  Line 1743 
1743  #  Read a list of intervals from a file.  #  Read a list of intervals from a file.
1744  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1745  #  #
1746  #     @intervals = read_intervals(*FILEHANDLE)  #     @intervals = read_intervals( \*FILEHANDLE )
1747  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1748  sub read_intervals {  sub read_intervals {
1749      my $fh = shift;      my $fh = shift;
# Line 819  Line 1766 
1766    
1767    
1768  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1769    #  Convert a list of intervals to read [ id, left_end, right_end ].
1770    #
1771    #     @intervals = standardize_intervals( @interval_refs )
1772    #-----------------------------------------------------------------------------
1773    sub standardize_intervals {
1774        map { ( $_->[1] < $_->[2] ) ? $_ : [ $_->[0], $_->[2], $_->[1] ] } @_;
1775    }
1776    
1777    
1778    #-----------------------------------------------------------------------------
1779  #  Take the union of a list of intervals  #  Take the union of a list of intervals
1780  #  #
1781  #     @joined = join_intervals( @interval_refs )  #     @joined = join_intervals( @interval_refs )
# Line 858  Line 1815 
1815      return @joined;      return @joined;
1816  }  }
1817    
1818    
1819    #-----------------------------------------------------------------------------
1820    #  Split location strings to oriented intervals.
1821    #
1822    #     @intervals = locations_2_intervals( @locations )
1823    #     $interval  = locations_2_intervals( $location  )
1824    #-----------------------------------------------------------------------------
1825    sub locations_2_intervals {
1826        my @intervals = map { /^(\S+)_(\d+)_(\d+)(\s.*)?$/
1827                           || /^(\S+)_(\d+)-(\d+)(\s.*)?$/
1828                           || /^(\S+)=(\d+)=(\d+)(\s.*)?$/
1829                           || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/
1830                            ? [ $1, $2+0, $3+0 ]
1831                            : ()
1832                            } @_;
1833    
1834        return wantarray ? @intervals : $intervals[0];
1835    }
1836    
1837    
1838    #-----------------------------------------------------------------------------
1839    #  Read a list of oriented intervals from a file.
1840    #  Allow id_start_end, or id \s start \s end formats
1841    #
1842    #     @intervals = read_oriented_intervals( \*FILEHANDLE )
1843    #-----------------------------------------------------------------------------
1844    sub read_oriented_intervals {
1845        my $fh = shift;
1846        my @intervals = ();
1847    
1848        while (<$fh>) {
1849            chomp;
1850               /^(\S+)_(\d+)_(\d+)(\s.*)?$/        #  id_start_end       WIT2
1851            || /^(\S+)_(\d+)-(\d+)(\s.*)?$/        #  id_start-end       ???
1852            || /^(\S+)=(\d+)=(\d+)(\s.*)?$/        #  id=start=end       Badger
1853            || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/    #  id \s start \s end
1854            || next;
1855    
1856            #  Matched a pattern.  Store reference to (id, start, end):
1857            push @intervals, [ $1, $2+0, $3+0 ];
1858        }
1859        return @intervals;
1860    }
1861    
1862    
1863    #-----------------------------------------------------------------------------
1864    #  Reverse the orientation of a list of intervals
1865    #
1866    #     @reversed = reverse_intervals( @interval_refs )
1867    #-----------------------------------------------------------------------------
1868    sub reverse_intervals {
1869        map { [ $_->[0], $_->[2], $_->[1] ] } @_;
1870    }
1871    
1872    
1873    #-----------------------------------------------------------------------------
1874    #  Convert GenBank locations to SEED locations
1875    #
1876    #     @seed_locs = gb_location_2_seed( $contig, @gb_locs )
1877    #-----------------------------------------------------------------------------
1878    sub gb_location_2_seed
1879    {
1880        my $contig = shift @_;
1881        $contig or die "First arg of gb_location_2_seed must be contig_id\n";
1882    
1883        map { join( ',', gb_loc_2_seed_2( $contig, $_ ) ) || undef } @_
1884    }
1885    
1886    sub gb_loc_2_seed_2
1887    {
1888        my ( $contig, $loc ) = @_;
1889    
1890        if ( $loc =~ /^(\d+)\.\.(\d+)$/ )
1891        {
1892            join( '_', $contig, $1, $2 )
1893        }
1894    
1895        elsif ( $loc =~ /^join\((.*)\)$/ )
1896        {
1897            $loc = $1;
1898            my $lvl = 0;
1899            for ( my $i = length( $loc )-1; $i >= 0; $i-- )
1900            {
1901                for ( substr( $loc, $i, 1 ) )
1902                {
1903                    /,/ && ! $lvl and substr( $loc, $i, 1 ) = "\t";
1904                    /\(/          and $lvl--;
1905                    /\)/          and $lvl++;
1906                }
1907            }
1908            $lvl == 0 or print STDERR "Paren matching error: $loc\n" and die;
1909            map { gb_loc_2_seed_2( $contig, $_ ) } split /\t/, $loc
1910        }
1911    
1912        elsif ( $loc =~ /^complement\((.*)\)$/ )
1913        {
1914            map { s/_(\d+)_(\d+)$/_$2_$1/; $_ }
1915            reverse
1916            gb_loc_2_seed_2( $contig, $1 )
1917        }
1918    
1919        else
1920        {
1921            ()
1922        }
1923    }
1924    
1925    
1926    #-----------------------------------------------------------------------------
1927    #  Read qual.
1928    #
1929    #  Save the contents in a list of refs to arrays: [ $id, $descript, \@qual ]
1930    #
1931    #     @seq_entries = read_qual( )               #  STDIN
1932    #    \@seq_entries = read_qual( )               #  STDIN
1933    #     @seq_entries = read_qual( \*FILEHANDLE )
1934    #    \@seq_entries = read_qual( \*FILEHANDLE )
1935    #     @seq_entries = read_qual(  $filename )
1936    #    \@seq_entries = read_qual(  $filename )
1937    #-----------------------------------------------------------------------------
1938    sub read_qual {
1939        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
1940        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_qual\n";
1941    
1942        my @quals = ();
1943        my ($id, $desc, $qual) = ("", "", []);
1944    
1945        while ( <$fh> ) {
1946            chomp;
1947            if (/^>\s*(\S+)(\s+(.*))?$/) {        #  new id
1948                if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1949                ($id, $desc, $qual) = ($1, $3 ? $3 : "", []);
1950            }
1951            else {
1952                push @$qual, split;
1953            }
1954        }
1955        close( $fh ) if $close;
1956    
1957        if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1958        return wantarray ? @quals : \@quals;
1959    }
1960    
1961    
1962    #-------------------------------------------------------------------------------
1963    #  Fraction difference for an alignment of two nucleotide sequences in terms of
1964    #  number of differing residues, number of gaps, and number of gap opennings.
1965    #
1966    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, \%options )
1967    #
1968    #  or
1969    #
1970    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2 )
1971    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, $gap_wgt )
1972    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, $open_wgt, $extend_wgt )
1973    #
1974    #  Options:
1975    #
1976    #      gap      => $gap_wgt          # Gap open and extend weight (D = 0.5)
1977    #      open     => $open_wgt         # Gap openning weight (D = gap_wgt)
1978    #      extend   => $extend_wgt       # Gap extension weight (D = open_wgt)
1979    #      t_gap    => $term_gap_wgt     # Terminal open and extend weight
1980    #      t_open   => $term_open_wgt    # Terminal gap open weight (D = open_wgt)
1981    #      t_extend => $term_extend_wgt  # Terminal gap extend weight (D = extend_wgt)
1982    #
1983    #  Default gap open and gap extend weights are 1/2.  Beware that
1984    #
1985    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, 1 )
1986    #
1987    #  and
1988    #
1989    #     $fraction_diff = fraction_nt_diff( $seq1, $seq2, 1, 0 )
1990    #
1991    #  are different.  The first has equal openning and extension weights, whereas
1992    #  the second has an openning weight of 1, and and extension weight of 0 (it
1993    #  only penalizes the number of runs of gaps).
1994    #-------------------------------------------------------------------------------
1995    sub fraction_nt_diff
1996    {
1997        my ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( @_[0,1] );
1998    
1999        my $diff_scr;
2000        if ( ref( $_[2] ) eq 'HASH' )
2001        {
2002            my $opts = $_[2];
2003            my $gap_open    = defined $opts->{ open }     ? $opts->{ open }
2004                            : defined $opts->{ gap }      ? $opts->{ gap }
2005                            : 0.5;
2006            my $gap_extend  = defined $opts->{ extend }   ? $opts->{ extend }
2007                            : $gap_open;
2008            my $term_open   = defined $opts->{ t_open }   ? $opts->{ t_open }
2009                            : defined $opts->{ t_gap }    ? $opts->{ t_gap }
2010                            : $gap_open;
2011            my $term_extend = defined $opts->{ t_extend } ? $opts->{ t_extend }
2012                            : defined $opts->{ t_gap }    ? $opts->{ t_gap }
2013                            : $gap_extend;
2014    
2015            $nopen -= $topen;
2016            $ngap  -= $tgap;
2017            $diff_scr = $ndif + $gap_open  * $nopen + $gap_extend  * ($ngap-$nopen)
2018                              + $term_open * $topen + $term_extend * ($tgap-$topen);
2019        }
2020        else
2021        {
2022            my $gap_open   = defined( $_[2] ) ? $_[2] : 0.5;
2023            my $gap_extend = defined( $_[3] ) ? $_[3] : $gap_open;
2024            $diff_scr = $ndif + $gap_open * $nopen + $gap_extend * ($ngap-$nopen);
2025        }
2026        my $ttl_scr = $nid + $diff_scr;
2027    
2028        $ttl_scr ? $diff_scr / $ttl_scr : undef
2029    }
2030    
2031    
2032    #-------------------------------------------------------------------------------
2033    #  Interpret an alignment of two nucleotide sequences in terms of: useful
2034    #  aligned positions (unambiguous, and not a common gap), number of identical
2035    #  residues, number of differing residues, number of gaps, and number of gap
2036    #  opennings.
2037    #
2038    #     ( $npos, $nid, $ndif, $ngap, $nopen, $tgap, $topen ) = interpret_nt_align( $seq1, $seq2 )
2039    #
2040    #  $npos  = total aligned positons (= $nid + $ndif + $ngap)
2041    #  $nid   = number of positions with identical nucleotides (ignoring case)
2042    #  $ndif  = number of positions with differing nucleotides
2043    #  $ngap  = number of positions with gap in one sequence but not the other
2044    #  $nopen = number of runs of gaps
2045    #  $tgap  = number of gaps in runs adjacent to a terminus
2046    #  $topen = number of alignment ends with gaps
2047    #
2048    #  Some of the methods might seem overly complex, but are necessary for cases
2049    #  in which the gaps switch strands in the alignment:
2050    #
2051    #     seq1  ---ACGTGAC--TTGCAGAG
2052    #     seq2  TTT---TGACGG--GCAGGG
2053    #     mask  00000011110000111111
2054    #
2055    #     npos  = 20
2056    #     nid   =  9
2057    #     ndif  =  1
2058    #     ngap  = 10
2059    #     nopen =  4
2060    #     tgap  =  3
2061    #     topen =  1
2062    #
2063    #  Although there are 4 gap opennings, there are only 2 runs in the mask,
2064    #  and the terminal run is length 6, not 3.  (Why handle these?  Because
2065    #  pairs of sequences from a multiple sequence alignment can look like this.)
2066    #-------------------------------------------------------------------------------
2067    sub interpret_nt_align
2068    {
2069        #  Remove alignment columns that are not informative:
2070        my ( $s1, $s2 ) = useful_nt_align( @_[0,1] );
2071        my $nmat = length( $s1 );          # Useful alignment length
2072    
2073        my $m1 = $s1;
2074        $m1 =~ tr/ACGT/\377/;              # Nucleotides to all 1 bits
2075        $m1 =~ tr/\377/\000/c;             # Others (gaps) to null byte
2076        my $m2 = $s2;
2077        $m2 =~ tr/ACGT/\377/;              # Nucleotides to all 1 bits
2078        $m2 =~ tr/\377/\000/c;             # Others (gaps) to null byte
2079        $m1 &= $m2;                        # Gap in either sequence becomes null
2080        $s1 &= $m1;                        # Apply mask to sequence 1
2081        $s2 &= $m1;                        # Apply mask to sequence 2
2082        my $nopen = @{[ $s1 =~ /\000+/g ]}   # Gap opens in sequence 1
2083                  + @{[ $s2 =~ /\000+/g ]};  # Gap opens in sequence 2
2084        my ( $tgap, $topen ) = ( 0, 0 );
2085        if ( $s1 =~ /^(\000+)/ || $s2 =~ /^(\000+)/ ) { $tgap += length( $1 ); $topen++ }
2086        if ( $s1 =~ /(\000+)$/ || $s2 =~ /(\000+)$/ ) { $tgap += length( $1 ); $topen++ }
2087        $s1 =~ tr/\000//d;                 # Remove nulls (former gaps)
2088        $s2 =~ tr/\000//d;                 # Remove nulls (former gaps)
2089        my $ngap = $nmat - length( $s1 );  # Total gaps
2090    
2091        my $xor = $s1 ^ $s2;               # xor of identical residues is null byte
2092        my $nid = ( $xor =~ tr/\000//d );  # Count the nulls (identical residues)
2093        my $ndif = $nmat - $nid - $ngap;
2094    
2095        ( $nmat, $nid, $ndif, $ngap, $nopen, $tgap, $topen )
2096    }
2097    
2098    
2099    sub useful_nt_align
2100    {
2101        my ( $s1, $s2 ) = map { uc $_ } @_;
2102        $s1 =~ tr/U/T/;         # Convert U to T
2103        my $m1 = $s1;
2104        $m1 =~ tr/ACGT-/\377/;  # Allowed symbols to hex FF byte
2105        $m1 =~ tr/\377/\000/c;  # All else to null byte
2106        $s2 =~ tr/U/T/;         # Convert U to T
2107        my $m2 = $s2;
2108        $m2 =~ tr/ACGT-/\377/;  # Allowed symbols to hex FF byte
2109        $m2 =~ tr/\377/\000/c;  # All else to null byte
2110        $m1 &= $m2;             # Invalid in either sequence becomes null
2111        $s1 &= $m1;             # Apply mask to sequence 1
2112        $s1 =~ tr/\000//d;      # Delete nulls in sequence 1
2113        $s2 &= $m1;             # Apply mask to sequence 2
2114        $s2 =~ tr/\000//d;      # Delete nulls in sequence 2
2115        ( $s1, $s2 )
2116    }
2117    
2118    
2119  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.12

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3