[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Mon Dec 1 16:54:26 2003 UTC revision 1.9, Thu Nov 1 20:07:42 2007 UTC
# Line 1  Line 1 
1  package gjoseqlib;  package gjoseqlib;
2    
3  #  read_fasta_seqs( *FILEHANDLE )  #  A sequence entry is ( $id, $def, $seq )
4  #  read_next_fasta_seq( *FILEHANDLE )  #  A list of entries is a list of references
5  #  parse_fasta_title( $title )  #
6  #  split_fasta_title( $title )  #  @seq_entry   = read_next_fasta_seq( \*FILEHANDLE )
7  #  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list )  #  @seq_entries = read_fasta_seqs( \*FILEHANDLE )   # Original form
8  #  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq )  #  @seq_entries = read_fasta( )                     # STDIN
9  #  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #  @seq_entries = read_fasta( \*FILEHANDLE )
10    #  @seq_entries = read_fasta(  $filename )
11  #  index_seq_list( @seq_entry_list )  #  @seq_entries = read_clustal( )                   # STDIN
12  #  seq_entry_by_id( \%seq_index, $seq_id )  #  @seq_entries = read_clustal( \*FILEHANDLE )
13  #  seq_desc_by_id( \%seq_index, $seq_id )  #  @seq_entries = read_clustal(  $filename )
14  #  seq_data_by_id( \%seq_index, $seq_id )  #  @seq_entries = read_clustal_file(  $filename )
15    #
16  #  subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] )  #  $seq_ind   = index_seq_list( @seq_entries );   # hash from ids to entries
17  #  subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] )  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
18  #  complement_DNA_entry( @seq_entry [, $fix_id] )  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
19  #  complement_RNA_entry( @seq_entry [, $fix_id] )  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );
20  #  complement_DNA_seq( $NA_seq )  #
21  #  complement_RNA_seq( $NA_seq )  #  ( $id, $def ) = parse_fasta_title( $title )
22  #  to_DNA_seq( $NA_seq )  #  ( $id, $def ) = split_fasta_title( $title )
23  #  to_RNA_seq( $NA_seq )  #
24  #  clean_ae_sequence( $seq )  #  print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );  # Original form
25    #  print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
26  #  translate_seq( $seq [, $met_start] )  #  print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
27  #  translate_codon( $triplet )  #  print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
28  #  translate_seq_with_user_code( $seq, \%gen_code [, $met_start] )  #  print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
29    #  print_alignment_as_fasta(  $filename,    @seq_entry_list );
30    #  print_alignment_as_fasta(  $filename,   \@seq_entry_list );
31    #  print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
32    #  print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
33    #  print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
34    #  print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
35    #  print_alignment_as_phylip(  $filename,    @seq_entry_list );
36    #  print_alignment_as_phylip(  $filename,   \@seq_entry_list );
37    #  print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
38    #  print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
39    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
40    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
41    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
42    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
43    #  print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq) ;
44    #  print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
45    #  print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq );
46    #
47    #   @packed_seqs = pack_alignment(  @seqs )
48    #   @packed_seqs = pack_alignment( \@seqs )
49    #  \@packed_seqs = pack_alignment(  @seqs )
50    #  \@packed_seqs = pack_alignment( \@seqs )
51    #
52    #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
53    #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
54    #  $DNAseq = DNA_subseq(  $seq, $from, $to );
55    #  $DNAseq = DNA_subseq( \$seq, $from, $to );
56    #  $RNAseq = RNA_subseq(  $seq, $from, $to );
57    #  $RNAseq = RNA_subseq( \$seq, $from, $to );
58    #  @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );
59    #  @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );
60    #  $DNAseq = complement_DNA_seq( $NA_seq );
61    #  $RNAseq = complement_RNA_seq( $NA_seq );
62    #  $DNAseq = to_DNA_seq( $NA_seq );
63    #  $RNAseq = to_RNA_seq( $NA_seq );
64    #  $seq    = pack_seq( $sequence )
65    #  $seq    = clean_ae_sequence( $seq )
66    #
67    #  $seq = translate_seq( $seq [, $met_start] )
68    #  $aa  = translate_codon( $triplet );
69    #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );
70    #
71    #     User-supplied genetic code must be upper case index and match the
72    #     DNA versus RNA type of sequence
73    #
74    #  $seq = translate_seq_with_user_code( $seq, $gen_code_hash [, $met_start] )
75    #
76    #     Locations (= oriented intervals) are ( id, start, end )
77    #     Intervals are ( id, left, right )
78    #
79    #  @intervals = read_intervals( \*FILEHANDLE )
80    #  @intervals = read_oriented_intervals( \*FILEHANDLE )
81    #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)
82    #  @joined    = join_intervals( @interval_refs )
83    #  @intervals = locations_2_intervals( @locations )
84    #  $interval  = locations_2_intervals( $location  )
85    #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)
86    #
87    #  Convert GenBank locations to SEED locations
88    #
89    #  @seed_locs = gb_location_2_seed( $contig, @gb_locs )
90    #
91    #  Read quality scores from a fasta-like file:
92    #
93    #  @seq_entries = read_qual( )               #  STDIN
94    # \@seq_entries = read_qual( )               #  STDIN
95    #  @seq_entries = read_qual( \*FILEHANDLE )
96    # \@seq_entries = read_qual( \*FILEHANDLE )
97    #  @seq_entries = read_qual(  $filename )
98    # \@seq_entries = read_qual(  $filename )
99    #
100    
101  #  read_intervals( *FILEHANDLE )  use strict;
102  #  join_intervals( @interval_refs )  use Carp;
103    
104  use gjolib qw(wrap_text);  #  Exported global variables:
105    
106    our @aa_1_letter_order;  # Alpha by 1 letter
107    our @aa_3_letter_order;  # PAM matrix order
108    our @aa_n_codon_order;
109    our %genetic_code;
110    our %genetic_code_with_U;
111    our %amino_acid_codons_DNA;
112    our %amino_acid_codons_RNA;
113    our %n_codon_for_aa;
114    our %reverse_genetic_code_DNA;
115    our %reverse_genetic_code_RNA;
116    our %DNA_letter_can_be;
117    our %RNA_letter_can_be;
118    our %one_letter_to_three_letter_aa;
119    our %three_letter_to_one_letter_aa;
120    
121  require Exporter;  require Exporter;
122    
123  our @ISA = qw(Exporter);  our @ISA = qw(Exporter);
124  our @EXPORT = qw(  our @EXPORT = qw(
125          read_fasta_seqs          read_fasta_seqs
126            read_fasta
127          read_next_fasta_seq          read_next_fasta_seq
128            read_clustal_file
129            read_clustal
130          parse_fasta_title          parse_fasta_title
131          split_fasta_title          split_fasta_title
132          print_seq_list_as_fasta          print_seq_list_as_fasta
133            print_alignment_as_fasta
134            print_alignment_as_phylip
135            print_alignment_as_nexus
136          print_seq_as_fasta          print_seq_as_fasta
137          print_gb_locus          print_gb_locus
138    
# Line 49  Line 141 
141          seq_desc_by_id          seq_desc_by_id
142          seq_data_by_id          seq_data_by_id
143    
144            pack_alignment
145    
146          subseq_DNA_entry          subseq_DNA_entry
147          subseq_RNA_entry          subseq_RNA_entry
148            DNA_subseq
149            RNA_subseq
150          complement_DNA_entry          complement_DNA_entry
151          complement_RNA_entry          complement_RNA_entry
152          complement_DNA_seq          complement_DNA_seq
153          complement_RNA_seq          complement_RNA_seq
154          to_DNA_seq          to_DNA_seq
155          to_RNA_seq          to_RNA_seq
156            pack_seq
157          clean_ae_sequence          clean_ae_sequence
158    
159          translate_seq          translate_seq
# Line 64  Line 161 
161          translate_seq_with_user_code          translate_seq_with_user_code
162    
163          read_intervals          read_intervals
164            standardize_intervals
165          join_intervals          join_intervals
166            locations_2_intervals
167            read_oriented_intervals
168            reverse_intervals
169    
170            gb_location_2_seed
171    
172            read_qual
173          );          );
174    
175  use strict;  our @EXPORT_OK = qw(
176            @aa_1_letter_order
177            @aa_3_letter_order
178            @aa_n_codon_order
179            %genetic_code
180            %genetic_code_with_U
181            %amino_acid_codons_DNA
182            %amino_acid_codons_RNA
183            %n_codon_for_aa
184            %reverse_genetic_code_DNA
185            %reverse_genetic_code_RNA
186            %DNA_letter_can_be
187            %RNA_letter_can_be
188            %one_letter_to_three_letter_aa
189            %three_letter_to_one_letter_aa
190            );
191    
192    
193  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
194  #  Read fasta sequences from a file.  #  Helper function for defining an input filehandle:
195    #     filehandle is passed through
196    #     string is taken as file name to be openend
197    #     undef or "" defaults to STDOUT
198    #
199    #    ( \*FH, $name, $close [, $file] ) = input_filehandle( $file );
200    #
201    #-----------------------------------------------------------------------------
202    sub input_filehandle
203    {
204        my $file = shift;
205    
206        #  FILEHANDLE
207    
208        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
209    
210        #  Null string or undef
211    
212        return ( \*STDIN, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
213    
214        #  File name
215    
216        if ( ! ref( $file ) )
217        {
218            my $fh;
219            -f $file or die "Could not find input file \"$file\"\n";
220            open( $fh, "<$file" ) || die "Could not open \"$file\" for input\n";
221            return ( $fh, $file, 1 );
222        }
223    
224        #  Some other kind of reference; return the unused value
225    
226        return ( \*STDIN, undef, 0, $file );
227    }
228    
229    
230    #-----------------------------------------------------------------------------
231    #  Read fasta sequences from a filehandle (legacy interface; use read_fasta)
232  #  Save the contents in a list of refs to arrays:  (id, description, seq)  #  Save the contents in a list of refs to arrays:  (id, description, seq)
233  #  #
234  #     @seqs = read_fasta_seqs(*FILEHANDLE)  #     @seq_entries = read_fasta_seqs( \*FILEHANDLE )
235  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
236  sub read_fasta_seqs {  sub read_fasta_seqs { read_fasta( @_ ) }
237      my $fh = shift;  
238      wantarray || die "read_fasta_seqs requires list context\n";  
239    #-----------------------------------------------------------------------------
240    #  Read fasta sequences.
241    #  Save the contents in a list of refs to arrays:  (id, description, seq)
242    #
243    #     @seq_entries = read_fasta( )               #  STDIN
244    #    \@seq_entries = read_fasta( )               #  STDIN
245    #     @seq_entries = read_fasta( \*FILEHANDLE )
246    #    \@seq_entries = read_fasta( \*FILEHANDLE )
247    #     @seq_entries = read_fasta(  $filename )
248    #    \@seq_entries = read_fasta(  $filename )
249    #-----------------------------------------------------------------------------
250    sub read_fasta {
251        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
252        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n";
253    
254      my @seqs = ();      my @seqs = ();
255      my ($id, $desc, $seq) = ("", "", "");      my ($id, $desc, $seq) = ("", "", "");
# Line 90  Line 261 
261              ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");              ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");
262          }          }
263          else {          else {
264              tr/\t 0-9//d;              tr/     0-9//d;
265              $seq .= $_ ;              $seq .= $_ ;
266          }          }
267      }      }
268        close( $fh ) if $close;
269    
270      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }
271      return @seqs;      return wantarray ? @seqs : \@seqs;
272  }  }
273    
274    
# Line 104  Line 276 
276  #  Read one fasta sequence at a time from a file.  #  Read one fasta sequence at a time from a file.
277  #  Return the contents as an array:  (id, description, seq)  #  Return the contents as an array:  (id, description, seq)
278  #  #
279  #     @seq_entry = read_next_fasta_seq(*FILEHANDLE)  #     @seq_entry = read_next_fasta_seq( \*FILEHANDLE )
280  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
281  #  Reading always overshoots, so save next id and description  #  Reading always overshoots, so save next id and description
282  #  (these should really be hashes indexed by file handle):  
283  #  {   #  Use bare block to scope the header hash
284  my $next_fasta_seq_header = "";  
285        my %next_header;
286    
287  sub read_next_fasta_seq {  sub read_next_fasta_seq {
288      my $fh = shift;      my $fh = shift;
289      wantarray || die "read_next_fasta_seq requires list context\n";          my ( $id, $desc );
290    
291      my ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );          if ( defined( $next_header{$fh} ) ) {
292                ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
293            }
294            else {
295                $next_header{$fh} = "";
296                ( $id, $desc ) = ( undef, "" );
297            }
298      my $seq = "";      my $seq = "";
299    
300      while ( <$fh> ) {      while ( <$fh> ) {
301          chomp;          chomp;
302          if ( /^>/ ) {        #  new id          if ( /^>/ ) {        #  new id
303              $next_fasta_seq_header = $_;                  $next_header{$fh} = $_;
304              if ( $id && $seq ) { return ($id, $desc, $seq) }                  if ( defined($id) && $seq )
305              ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );                  {
306                        return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq]
307                    }
308                    ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
309              $seq = "";              $seq = "";
310          }          }
311          else {          else {
312              tr/\t 0-9//d;                  tr/     0-9//d;
313              $seq .= $_ ;              $seq .= $_ ;
314          }          }
315      }      }
316    
317      #  Done with file, clear "next header"          #  Done with file, delete "next header"
318    
319            delete $next_header{$fh};
320            return ( defined($id) && $seq ) ? ( wantarray ? ($id, $desc, $seq)
321                                                          : [$id, $desc, $seq]
322                                              )
323                                            : () ;
324        }
325    }
326    
327    
328    #-----------------------------------------------------------------------------
329    #  Read a clustal alignment from a file.
330    #  Save the contents in a list of refs to arrays:  (id, description, seq)
331    #
332    #     @seq_entries = read_clustal_file( $filename )
333    #-----------------------------------------------------------------------------
334    sub read_clustal_file { read_clustal( @_ ) }
335    
336    
337    #-----------------------------------------------------------------------------
338    #  Read a clustal alignment.
339    #  Save the contents in a list of refs to arrays:  (id, description, seq)
340    #
341    #     @seq_entries = read_clustal( )              # STDIN
342    #     @seq_entries = read_clustal( \*FILEHANDLE )
343    #     @seq_entries = read_clustal(  $filename )
344    #-----------------------------------------------------------------------------
345    sub read_clustal {
346        my ( $fh, undef, $close, $unused ) = input_filehandle( shift );
347        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_clustal_file\n";
348    
349        my ( %seq, @ids, $line );
350        while ( defined( $line = <$fh> ) )
351        {
352            ( $line =~ /^[A-Za-z0-9]/ ) or next;
353            chomp $line;
354            my @flds = split /\s+/, $line;
355            if ( @flds == 2 )
356            {
357                $seq{ $flds[0] } or push @ids, $flds[0];
358                push @{ $seq{ $flds[0] } }, $flds[1];
359            }
360        }
361        close( $fh ) if $close;
362    
363      $next_fasta_seq_header = "";      map { [ $_, "", join( "", @{$seq{$_}} ) ] } @ids;
     return ($id && $seq) ? ($id, $desc, $seq) : () ;  
364  }  }
365    
366    
# Line 151  Line 376 
376      if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {      if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {
377          return ($1, $3 ? $3 : "");          return ($1, $3 ? $3 : "");
378      }      }
379      else {      elsif ($title =~ /^>/) {
380          return ("", "");          return ("", "");
381      }      }
382        else {
383            return (undef, "");
384        }
385  }  }
386    
387  sub split_fasta_title {  sub split_fasta_title {
# Line 162  Line 390 
390    
391    
392  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
393  #  Print list of sequence entries in fasta format  #  Helper function for defining an output filehandle:
394    #     filehandle is passed through
395    #     string is taken as file name to be openend
396    #     undef or "" defaults to STDOUT
397    #
398    #    ( \*FH, $name, $close [, $file] ) = output_filehandle( $file );
399  #  #
 #     print_seq_list_as_fasta(*FILEHANDLE, @seq_entry_list);  
400  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
401  sub print_seq_list_as_fasta {  sub output_filehandle
402      my $fh = shift;  {
403      my @seq_list = @_;      my $file = shift;
404    
405        #  FILEHANDLE
406    
407        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
408    
409        #  Null string or undef
410    
411        return ( \*STDOUT, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
412    
413        #  File name
414    
415      foreach my $seq_ptr (@seq_list) {      if ( ! ref( $file ) )
416          print_seq_as_fasta($fh, @$seq_ptr);      {
417            my $fh;
418            open( $fh, ">$file" ) || die "Could not open output $file\n";
419            return ( $fh, $file, 1 );
420      }      }
421    
422        #  Some other kind of reference; return the unused value
423    
424        return ( \*STDOUT, undef, 0, $file );
425    }
426    
427    
428    #-----------------------------------------------------------------------------
429    #  Legacy function for printing fasta sequence set:
430    #
431    #     print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );
432    #-----------------------------------------------------------------------------
433    sub print_seq_list_as_fasta { print_alignment_as_fasta( @_ ) }
434    
435    
436    #-----------------------------------------------------------------------------
437    #  Print list of sequence entries in fasta format.
438    #  Missing, undef or "" filename defaults to STDOUT.
439    #
440    #     print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
441    #     print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
442    #     print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
443    #     print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
444    #     print_alignment_as_fasta(  $filename,    @seq_entry_list );
445    #     print_alignment_as_fasta(  $filename,   \@seq_entry_list );
446    #-----------------------------------------------------------------------------
447    sub print_alignment_as_fasta {
448        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
449        ( unshift @_, $unused ) if $unused;
450    
451        ( ref( $_[0] ) eq "ARRAY" ) or confess "Bad sequence entry passed to print_alignment_as_fasta\n";
452    
453        #  Expand the sequence entry list if necessary:
454    
455        if ( ref( $_[0]->[0] ) eq "ARRAY" ) { @_ = @{ $_[0] } }
456    
457        foreach my $seq_ptr ( @_ ) { print_seq_as_fasta( $fh, @$seq_ptr ) }
458    
459        close( $fh ) if $close;
460    }
461    
462    
463    #-----------------------------------------------------------------------------
464    #  Print list of sequence entries in phylip format.
465    #  Missing, undef or "" filename defaults to STDOUT.
466    #
467    #     print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
468    #     print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
469    #     print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
470    #     print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
471    #     print_alignment_as_phylip(  $filename,    @seq_entry_list );
472    #     print_alignment_as_phylip(  $filename,   \@seq_entry_list );
473    #-----------------------------------------------------------------------------
474    sub print_alignment_as_phylip {
475        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
476        ( unshift @_, $unused ) if $unused;
477    
478        ( ref( $_[0] ) eq "ARRAY" ) or die die "Bad sequence entry passed to print_alignment_as_phylip\n";
479    
480        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
481    
482        my ( %id2, %used );
483        my $maxlen = 0;
484        foreach ( @seq_list )
485        {
486            my ( $id, undef, $seq ) = @$_;
487    
488            #  Need a name that is unique within 10 characters
489    
490            my $id2 = substr( $id, 0, 10 );
491            $id2 =~ s/_/ /g;  # PHYLIP sequence files accept spaces
492            my $n = "0";
493            while ( $used{ $id2 } )
494            {
495                $n++;
496                $id2 = substr( $id, 0, 10 - length( $n ) ) . $n;
497            }
498            $used{ $id2 } = 1;
499            $id2{ $id } = $id2;
500    
501                    #  Prepare to pad sequences (should not be necessary, but ...)
502    
503            my $len = length( $seq );
504            $maxlen = $len if ( $len > $maxlen );
505        }
506    
507        my $nseq = @seq_list;
508        print $fh "$nseq  $maxlen\n";
509        foreach ( @seq_list )
510        {
511            my ( $id, undef, $seq ) = @$_;
512            my $len = length( $seq );
513            printf $fh "%-10s  %s%s\n", $id2{ $id },
514                                        $seq,
515                                        $len<$maxlen ? ("?" x ($maxlen-$len)) : "";
516        }
517    
518        close( $fh ) if $close;
519    }
520    
521    
522    #-----------------------------------------------------------------------------
523    #  Print list of sequence entries in nexus format.
524    #  Missing, undef or "" filename defaults to STDOUT.
525    #
526    #     print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
527    #     print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
528    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
529    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
530    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
531    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
532    #-----------------------------------------------------------------------------
533    sub print_alignment_as_nexus {
534        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
535        ( unshift @_, $unused ) if $unused;
536    
537        my $lbls = ( ref( $_[0] ) eq "HASH" ) ? shift : undef;
538    
539        ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_nexus\n";
540    
541        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
542    
543        my %id2;
544        my ( $maxidlen, $maxseqlen ) = ( 0, 0 );
545        my ( $n1, $n2, $nt, $nu ) = ( 0, 0, 0, 0 );
546        foreach ( @seq_list )
547        {
548            my ( $id, undef, $seq ) = @$_;
549            my $id2 = $lbls ? ( $lbls->{ $id } || $id ) : $id;
550            if ( $id2 !~ /^[-+.0-9A-Za-z~_|]+$/ )
551            {
552                    $id2 =~ s/'/''/g;
553                    $id2 = qq('$id2');
554                }
555            $id2{ $id } = $id2;
556            my $idlen = length( $id2 );
557            $maxidlen = $idlen if ( $idlen > $maxidlen );
558    
559            my $seqlen = length( $seq );
560            $maxseqlen = $seqlen if ( $seqlen > $maxseqlen );
561    
562            $nt += $seq =~ tr/Tt//d;
563            $nu += $seq =~ tr/Uu//d;
564            $n1 += $seq =~ tr/ACGNacgn//d;
565            $n2 += $seq =~ tr/A-Za-z//d;
566        }
567    
568        my $nseq = @seq_list;
569        my $type = ( $n1 < 2 * $n2 ) ?  'protein' : ($nt>$nu) ? 'DNA' : 'RNA';
570    
571        print $fh <<"END_HEAD";
572    #NEXUS
573    
574    BEGIN Data;
575        Dimensions
576            NTax=$nseq
577            NChar=$maxseqlen
578            ;
579        Format
580            DataType=$type
581            Gap=-
582            Missing=?
583            ;
584        Matrix
585    
586    END_HEAD
587    
588        foreach ( @seq_list )
589        {
590            my ( $id, undef, $seq ) = @$_;
591            my $len = length( $seq );
592            printf  $fh  "%-${maxidlen}s  %s%s\n",
593                         $id2{ $id },
594                         $seq,
595                         $len<$maxseqlen ? ("?" x ($maxseqlen-$len)) : "";
596        }
597    
598        print $fh <<"END_TAIL";
599    ;
600    END;
601    END_TAIL
602    
603        close( $fh ) if $close;
604  }  }
605    
606    
607  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
608  #  Print one sequence in fasta format to an open file  #  Print one sequence in fasta format to an open file
609  #  #
610  #     print_seq_as_fasta(*FILEHANDLE, $id, $desc, $seq);  #     print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq );
611  #     print_seq_as_fasta(*FILEHANDLE, @seq_entry);  #     print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
612  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
613  sub print_seq_as_fasta {  sub print_seq_as_fasta {
614      my $fh = shift;      my $fh = shift;
# Line 197  Line 625 
625  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
626  #  Print one sequence in GenBank flat file format:  #  Print one sequence in GenBank flat file format:
627  #  #
628  #     print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #     print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq )
629  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
630  sub print_gb_locus {  sub print_gb_locus {
631      my ($fh, $loc, $def, $acc, $seq) = @_;      my ($fh, $loc, $def, $acc, $seq) = @_;
# Line 223  Line 651 
651    
652    
653  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
654    #  Return a string with text wrapped to defined line lengths:
655    #
656    #     $wrapped_text = wrap_text( $str )                  # default len   =  80
657    #     $wrapped_text = wrap_text( $str, $len )            # default ind   =   0
658    #     $wrapped_text = wrap_text( $str, $len, $indent )   # default ind_n = ind
659    #     $wrapped_text = wrap_text( $str, $len, $indent_1, $indent_n )
660    #-----------------------------------------------------------------------------
661    sub wrap_text {
662        my ($str, $len, $ind, $indn) = @_;
663    
664        defined($str)  || die "wrap_text called without a string\n";
665        defined($len)  || ($len  =   80);
666        defined($ind)  || ($ind  =    0);
667        ($ind  < $len) || die "wrap error: indent greater than line length\n";
668        defined($indn) || ($indn = $ind);
669        ($indn < $len) || die "wrap error: indent_n greater than line length\n";
670    
671        $str =~ s/\s+$//;
672        $str =~ s/^\s+//;
673        my ($maxchr, $maxchr1);
674        my (@lines) = ();
675    
676        while ($str) {
677            $maxchr1 = ($maxchr = $len - $ind) - 1;
678            if ($maxchr >= length($str)) {
679                push @lines, (" " x $ind) . $str;
680                last;
681            }
682            elsif ($str =~ /^(.{0,$maxchr1}\S)\s+(\S.*)$/) { # no expr in {}
683                push @lines, (" " x $ind) . $1;
684                $str = $2;
685            }
686            elsif ($str =~ /^(.{0,$maxchr1}-)(.*)$/) {
687                push @lines, (" " x $ind) . $1;
688                $str = $2;
689            }
690            else {
691                push @lines, (" " x $ind) . substr($str, 0, $maxchr);
692                $str = substr($str, $maxchr);
693            }
694            $ind = $indn;
695        }
696    
697        return join("\n", @lines);
698    }
699    
700    
701    #-----------------------------------------------------------------------------
702  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
703  #  #
704  #     my %seq_ind  = index_seq_list(@seq_list);  #     my \%seq_ind  = index_seq_list(  @seq_list );
705    #     my \%seq_ind  = index_seq_list( \@seq_list );
706  #  #
707  #  Usage example:  #  Usage example:
708  #  #
709  #  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries  #  my  @seq_list   = read_fasta_seqs(\*STDIN);  # list of pointers to entries
710  #  my %seq_ind    = index_seq_list(@seq_list); # hash from names to pointers  #  my \%seq_ind    = index_seq_list(@seq_list); # hash from names to pointers
711  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry
712  #  #
713  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
714  sub index_seq_list {  sub index_seq_list {
715      my %seq_index = map { @{$_}[0] => $_ } @_;      ( ref( $_[0] )      ne 'ARRAY' ) ? {}
716      return \%seq_index;    : ( ref( $_[0]->[0] ) ne 'ARRAY' ) ? { map { $_->[0] => $_ } @_ }
717      :                                    { map { $_->[0] => $_ } @{ $_[0] } }
718  }  }
719    
720    
721  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
722  #  Three routines to access all or part of sequence entry by id:  #  Three routines to access all or part of sequence entry by id:
723  #  #
724  #     my @seq_entry  = seq_entry_by_id( \%seq_index, $seq_id );  #     @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
725  #     my $seq_desc   = seq_desc_by_id(  \%seq_index, $seq_id );  #    \@seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
726  #     my $seq        = seq_data_by_id(  \%seq_index, $seq_id );  #     $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
727    #     $seq       = seq_data_by_id(  \%seq_index, $seq_id );
728  #  #
729  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
730  sub seq_entry_by_id {  sub seq_entry_by_id {
731      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";
732      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";
733      wantarray || die "entry_by_id requires list context\n";      return wantarray ? @{ $ind_ref->{$id} } : $ind_ref->{$id};
     return @{ $ind_ref->{$id} };  
734  }  }
735    
736    
# Line 269  Line 747 
747      return ${ $ind_ref->{$id} }[2];      return ${ $ind_ref->{$id} }[2];
748  }  }
749    
750    #-----------------------------------------------------------------------------
751    #  Remove columns of alignment gaps from sequences:
752    #
753    #   @packed_seqs = pack_alignment(  @seqs )
754    #   @packed_seqs = pack_alignment( \@seqs )
755    #  \@packed_seqs = pack_alignment(  @seqs )
756    #  \@packed_seqs = pack_alignment( \@seqs )
757    #
758    #-----------------------------------------------------------------------------
759    
760    sub pack_alignment
761    {
762        my @seqs = ( ref( $_[0] ) eq 'ARRAY' and ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
763        @seqs or return wantarray ? () : [];
764    
765        my $mask  = pack_mask( $seqs[0]->[2] );
766        foreach ( @seqs[ 1 .. (@seqs-1) ] )
767        {
768            $mask |= pack_mask( $_->[2] );
769        }
770    
771        my $seq;
772        my @seqs2 = map { $seq = $_->[2] & $mask;
773                          $seq =~ tr/\000//d;
774                          [ $_->[0], $_->[1], $seq ]
775                        }
776                    @seqs;
777    
778        return wantarray ? @seqs2 : \@seqs2;
779    }
780    
781    sub pack_mask
782    {
783        my $mask = shift;
784        $mask =~ tr/-/\000/;
785        $mask =~ tr/\000/\377/c;
786        return $mask;
787    }
788    
789  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
790  #  Some simple sequence manipulations:  #  Some simple sequence manipulations:
# Line 332  Line 848 
848      }      }
849    
850      if ( $fix_id ) {      if ( $fix_id ) {
851          if ( ( $id =~ s/_([0-9]+)_([0-9]+)$// )          if ( ( $id =~ s/_(\d+)_(\d+)$// )
852            && ( abs($2-$1)+1 == $len ) ) {            && ( abs($2-$1)+1 == $len ) ) {
853              if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }              if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }
854              else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }              else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }
# Line 344  Line 860 
860  }  }
861    
862    
863    sub DNA_subseq
864    {
865        my ( $seq, $from, $to ) = @_;
866    
867        my $len = ref( $seq ) eq 'SCALAR' ? length( $$seq )
868                                          : length(  $seq );
869        if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len }
870        if ( ( $to   eq '$' ) || ( ! $to       ) ) { $to   = $len }
871    
872        my $left  = ( $from < $to ) ? $from : $to;
873        my $right = ( $from < $to ) ? $to   : $from;
874        if ( ( $right < 1 ) || ( $left > $len ) ) { return "" }
875        if ( $right > $len ) { $right = $len }
876        if ( $left  < 1    ) { $left  =    1 }
877    
878        my $subseq = ref( $seq ) eq 'SCALAR' ? substr( $$seq, $left-1, $right-$left+1 )
879                                             : substr(  $seq, $left-1, $right-$left+1 );
880    
881        if ( $from > $to )
882        {
883            $subseq = reverse $subseq;
884            $subseq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
885                         [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
886        }
887    
888        $subseq
889    }
890    
891    
892    sub RNA_subseq
893    {
894        my ( $seq, $from, $to ) = @_;
895    
896        my $len = ref( $seq ) eq 'SCALAR' ? length( $$seq )
897                                          : length(  $seq );
898        if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len }
899        if ( ( $to   eq '$' ) || ( ! $to       ) ) { $to   = $len }
900    
901        my $left  = ( $from < $to ) ? $from : $to;
902        my $right = ( $from < $to ) ? $to   : $from;
903        if ( ( $right < 1 ) || ( $left > $len ) ) { return "" }
904        if ( $right > $len ) { $right = $len }
905        if ( $left  < 1    ) { $left  =    1 }
906    
907        my $subseq = ref( $seq ) eq 'SCALAR' ? substr( $$seq, $left-1, $right-$left+1 )
908                                             : substr(  $seq, $left-1, $right-$left+1 );
909    
910        if ( $from > $to )
911        {
912            $subseq = reverse $subseq;
913            $subseq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
914                         [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
915        }
916    
917        $subseq
918    }
919    
920    
921  sub complement_DNA_entry {  sub complement_DNA_entry {
922      my ($id, $desc, $seq, $fix_id) = @_;      my ($id, $desc, $seq, $fix_id) = @_;
923      $fix_id ||= 0;     #  fix undef values      $fix_id ||= 0;     #  fix undef values
# Line 353  Line 927 
927      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
928                [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];                [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
929      if ($fix_id) {      if ($fix_id) {
930          if ($id =~ s/_([0-9]+)_([0-9]+)$//) {          if ($id =~ s/_(\d+)_(\d+)$//) {
931              $id .= "_" . $2 . "_" . $1;              $id .= "_" . $2 . "_" . $1;
932          }          }
933          else {          else {
# Line 374  Line 948 
948      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
949                [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];                [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
950      if ($fix_id) {      if ($fix_id) {
951          if ($id =~ s/_([0-9]+)_([0-9]+)$//) {          if ($id =~ s/_(\d+)_(\d+)$//) {
952              $id .= "_" . $2 . "_" . $1;              $id .= "_" . $2 . "_" . $1;
953          }          }
954          else {          else {
# Line 416  Line 990 
990  }  }
991    
992    
993    sub pack_seq {
994        my $seq = shift;
995        $seq =~ tr/A-Za-z//cd;
996        return $seq;
997    }
998    
999    
1000  sub clean_ae_sequence {  sub clean_ae_sequence {
1001      $_ = shift;      $_ = shift;
1002      $_ = to7bit($_);      $_ = to7bit($_);
# Line 464  Line 1045 
1045  #  #
1046  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1047    
1048  our %genetic_code = (   # work as DNA  @aa_1_letter_order = qw( A C D E F G H I K L M N P Q R S T V W Y );  # Alpha by 1 letter
1049    @aa_3_letter_order = qw( A R N D C Q E G H I L K M F P S T W Y V );  # PAM matrix order
1050    @aa_n_codon_order  = qw( L R S A G P T V I C D E F H K N Q Y M W );
1051    
1052    %genetic_code = (
1053    
1054        # DNA version
1055    
1056      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",
1057      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",
1058      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",
# Line 482  Line 1070 
1070      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",
1071      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",
1072    
1073        # RNA suppliment
1074    
1075        UUU => "F",  UCU => "S",  UAU => "Y",  UGU => "C",
1076        UUC => "F",  UCC => "S",  UAC => "Y",  UGC => "C",
1077        UUA => "L",  UCA => "S",  UAA => "*",  UGA => "*",
1078        UUG => "L",  UCG => "S",  UAG => "*",  UGG => "W",
1079        CUU => "L",  CCU => "P",  CAU => "H",  CGU => "R",
1080        CUC => "L",
1081        CUA => "L",
1082        CUG => "L",
1083        AUU => "I",  ACU => "T",  AAU => "N",  AGU => "S",
1084        AUC => "I",
1085        AUA => "I",
1086        AUG => "M",
1087        GUU => "V",  GCU => "A",  GAU => "D",  GGU => "G",
1088        GUC => "V",
1089        GUA => "V",
1090        GUG => "V",
1091    
1092      #  The following ambiguous encodings are not necessary,  but      #  The following ambiguous encodings are not necessary,  but
1093      #  speed up the processing of some common ambiguous triplets:      #  speed up the processing of some ambiguous triplets:
1094    
1095      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",
1096      TTR => "L",  TCR => "S",  TAR => "*",      TTR => "L",  TCR => "S",  TAR => "*",
# Line 496  Line 1103 
1103                   ACN => "T",                   ACN => "T",
1104      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",
1105      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",
1106      GTN => "V",  GCN => "A",               GGN => "G"      GTN => "V",  GCN => "A",               GGN => "G",
1107    
1108        UUY => "F",  UCY => "S",  UAY => "Y",  UGY => "C",
1109        UUR => "L",  UCR => "S",  UAR => "*",
1110                     UCN => "S",
1111        CUY => "L",
1112        CUR => "L",
1113        CUN => "L",
1114        AUY => "I",
1115        GUY => "V",
1116        GUR => "V",
1117        GUN => "V"
1118  );  );
1119    
1120    
1121  my %DNA_letter_can_be = (  #  Add lower case by construction:
1122    
1123    foreach ( keys %genetic_code ) {
1124        $genetic_code{ lc $_ } = lc $genetic_code{ $_ }
1125    }
1126    
1127    
1128    #  Construct the genetic code with selenocysteine by difference:
1129    
1130    %genetic_code_with_U = map { $_ => $genetic_code{ $_ } } keys %genetic_code;
1131    $genetic_code_with_U{ TGA } = "U";
1132    $genetic_code_with_U{ tga } = "u";
1133    $genetic_code_with_U{ UGA } = "U";
1134    $genetic_code_with_U{ uga } = "u";
1135    
1136    
1137    %amino_acid_codons_DNA = (
1138             L  => [ qw( TTA TTG CTA CTG CTT CTC ) ],
1139             R  => [ qw( AGA AGG CGA CGG CGT CGC ) ],
1140             S  => [ qw( AGT AGC TCA TCG TCT TCC ) ],
1141             A  => [ qw( GCA GCG GCT GCC ) ],
1142             G  => [ qw( GGA GGG GGT GGC ) ],
1143             P  => [ qw( CCA CCG CCT CCC ) ],
1144             T  => [ qw( ACA ACG ACT ACC ) ],
1145             V  => [ qw( GTA GTG GTT GTC ) ],
1146             I  => [ qw( ATA ATT ATC ) ],
1147             C  => [ qw( TGT TGC ) ],
1148             D  => [ qw( GAT GAC ) ],
1149             E  => [ qw( GAA GAG ) ],
1150             F  => [ qw( TTT TTC ) ],
1151             H  => [ qw( CAT CAC ) ],
1152             K  => [ qw( AAA AAG ) ],
1153             N  => [ qw( AAT AAC ) ],
1154             Q  => [ qw( CAA CAG ) ],
1155             Y  => [ qw( TAT TAC ) ],
1156             M  => [ qw( ATG ) ],
1157             U  => [ qw( TGA ) ],
1158             W  => [ qw( TGG ) ],
1159    
1160             l  => [ qw( tta ttg cta ctg ctt ctc ) ],
1161             r  => [ qw( aga agg cga cgg cgt cgc ) ],
1162             s  => [ qw( agt agc tca tcg tct tcc ) ],
1163             a  => [ qw( gca gcg gct gcc ) ],
1164             g  => [ qw( gga ggg ggt ggc ) ],
1165             p  => [ qw( cca ccg cct ccc ) ],
1166             t  => [ qw( aca acg act acc ) ],
1167             v  => [ qw( gta gtg gtt gtc ) ],
1168             i  => [ qw( ata att atc ) ],
1169             c  => [ qw( tgt tgc ) ],
1170             d  => [ qw( gat gac ) ],
1171             e  => [ qw( gaa gag ) ],
1172             f  => [ qw( ttt ttc ) ],
1173             h  => [ qw( cat cac ) ],
1174             k  => [ qw( aaa aag ) ],
1175             n  => [ qw( aat aac ) ],
1176             q  => [ qw( caa cag ) ],
1177             y  => [ qw( tat tac ) ],
1178             m  => [ qw( atg ) ],
1179             u  => [ qw( tga ) ],
1180             w  => [ qw( tgg ) ],
1181    
1182            '*' => [ qw( TAA TAG TGA ) ]
1183    );
1184    
1185    
1186    
1187    %amino_acid_codons_RNA = (
1188             L  => [ qw( UUA UUG CUA CUG CUU CUC ) ],
1189             R  => [ qw( AGA AGG CGA CGG CGU CGC ) ],
1190             S  => [ qw( AGU AGC UCA UCG UCU UCC ) ],
1191             A  => [ qw( GCA GCG GCU GCC ) ],
1192             G  => [ qw( GGA GGG GGU GGC ) ],
1193             P  => [ qw( CCA CCG CCU CCC ) ],
1194             T  => [ qw( ACA ACG ACU ACC ) ],
1195             V  => [ qw( GUA GUG GUU GUC ) ],
1196             B  => [ qw( GAU GAC AAU AAC ) ],
1197             Z  => [ qw( GAA GAG CAA CAG ) ],
1198             I  => [ qw( AUA AUU AUC ) ],
1199             C  => [ qw( UGU UGC ) ],
1200             D  => [ qw( GAU GAC ) ],
1201             E  => [ qw( GAA GAG ) ],
1202             F  => [ qw( UUU UUC ) ],
1203             H  => [ qw( CAU CAC ) ],
1204             K  => [ qw( AAA AAG ) ],
1205             N  => [ qw( AAU AAC ) ],
1206             Q  => [ qw( CAA CAG ) ],
1207             Y  => [ qw( UAU UAC ) ],
1208             M  => [ qw( AUG ) ],
1209             U  => [ qw( UGA ) ],
1210             W  => [ qw( UGG ) ],
1211    
1212             l  => [ qw( uua uug cua cug cuu cuc ) ],
1213             r  => [ qw( aga agg cga cgg cgu cgc ) ],
1214             s  => [ qw( agu agc uca ucg ucu ucc ) ],
1215             a  => [ qw( gca gcg gcu gcc ) ],
1216             g  => [ qw( gga ggg ggu ggc ) ],
1217             p  => [ qw( cca ccg ccu ccc ) ],
1218             t  => [ qw( aca acg acu acc ) ],
1219             v  => [ qw( gua gug guu guc ) ],
1220             b  => [ qw( gau gac aau aac ) ],
1221             z  => [ qw( gaa gag caa cag ) ],
1222             i  => [ qw( aua auu auc ) ],
1223             c  => [ qw( ugu ugc ) ],
1224             d  => [ qw( gau gac ) ],
1225             e  => [ qw( gaa gag ) ],
1226             f  => [ qw( uuu uuc ) ],
1227             h  => [ qw( cau cac ) ],
1228             k  => [ qw( aaa aag ) ],
1229             n  => [ qw( aau aac ) ],
1230             q  => [ qw( caa cag ) ],
1231             y  => [ qw( uau uac ) ],
1232             m  => [ qw( aug ) ],
1233             u  => [ qw( uga ) ],
1234             w  => [ qw( ugg ) ],
1235    
1236            '*' => [ qw( UAA UAG UGA ) ]
1237    );
1238    
1239    
1240    %n_codon_for_aa = map {
1241        $_ => scalar @{ $amino_acid_codons_DNA{ $_ } }
1242        } keys %amino_acid_codons_DNA;
1243    
1244    
1245    %reverse_genetic_code_DNA = (
1246             A  => "GCN",  a  => "gcn",
1247             C  => "TGY",  c  => "tgy",
1248             D  => "GAY",  d  => "gay",
1249             E  => "GAR",  e  => "gar",
1250             F  => "TTY",  f  => "tty",
1251             G  => "GGN",  g  => "ggn",
1252             H  => "CAY",  h  => "cay",
1253             I  => "ATH",  i  => "ath",
1254             K  => "AAR",  k  => "aar",
1255             L  => "YTN",  l  => "ytn",
1256             M  => "ATG",  m  => "atg",
1257             N  => "AAY",  n  => "aay",
1258             P  => "CCN",  p  => "ccn",
1259             Q  => "CAR",  q  => "car",
1260             R  => "MGN",  r  => "mgn",
1261             S  => "WSN",  s  => "wsn",
1262             T  => "ACN",  t  => "acn",
1263             U  => "TGA",  u  => "tga",
1264             V  => "GTN",  v  => "gtn",
1265             W  => "TGG",  w  => "tgg",
1266             X  => "NNN",  x  => "nnn",
1267             Y  => "TAY",  y  => "tay",
1268            '*' => "TRR"
1269    );
1270    
1271    %reverse_genetic_code_RNA = (
1272             A  => "GCN",  a  => "gcn",
1273             C  => "UGY",  c  => "ugy",
1274             D  => "GAY",  d  => "gay",
1275             E  => "GAR",  e  => "gar",
1276             F  => "UUY",  f  => "uuy",
1277             G  => "GGN",  g  => "ggn",
1278             H  => "CAY",  h  => "cay",
1279             I  => "AUH",  i  => "auh",
1280             K  => "AAR",  k  => "aar",
1281             L  => "YUN",  l  => "yun",
1282             M  => "AUG",  m  => "aug",
1283             N  => "AAY",  n  => "aay",
1284             P  => "CCN",  p  => "ccn",
1285             Q  => "CAR",  q  => "car",
1286             R  => "MGN",  r  => "mgn",
1287             S  => "WSN",  s  => "wsn",
1288             T  => "ACN",  t  => "acn",
1289             U  => "UGA",  u  => "uga",
1290             V  => "GUN",  v  => "gun",
1291             W  => "UGG",  w  => "ugg",
1292             X  => "NNN",  x  => "nnn",
1293             Y  => "UAY",  y  => "uay",
1294            '*' => "URR"
1295    );
1296    
1297    
1298    %DNA_letter_can_be = (
1299      A => ["A"],                 a => ["a"],      A => ["A"],                 a => ["a"],
1300      B => ["C", "G", "T"],       b => ["c", "g", "t"],      B => ["C", "G", "T"],       b => ["c", "g", "t"],
1301      C => ["C"],                 c => ["c"],      C => ["C"],                 c => ["c"],
# Line 520  Line 1315 
1315  );  );
1316    
1317    
1318  my %RNA_letter_can_be = (  %RNA_letter_can_be = (
1319      A => ["A"],                 a => ["a"],      A => ["A"],                 a => ["a"],
1320      B => ["C", "G", "U"],       b => ["c", "g", "u"],      B => ["C", "G", "U"],       b => ["c", "g", "u"],
1321      C => ["C"],                 c => ["c"],      C => ["C"],                 c => ["c"],
# Line 540  Line 1335 
1335  );  );
1336    
1337    
1338  my %one_letter_to_three_letter_aa = (  %one_letter_to_three_letter_aa = (
1339       A  => "Ala",           A  => "Ala", a  => "Ala",
1340       B  => "Asx",           B  => "Asx", b  => "Asx",
1341       C  => "Cys",           C  => "Cys", c  => "Cys",
1342       D  => "Asp",           D  => "Asp", d  => "Asp",
1343       E  => "Glu",           E  => "Glu", e  => "Glu",
1344       F  => "Phe",           F  => "Phe", f  => "Phe",
1345       G  => "Gly",           G  => "Gly", g  => "Gly",
1346       H  => "His",           H  => "His", h  => "His",
1347       I  => "Ile",           I  => "Ile", i  => "Ile",
1348       K  => "Lys",           K  => "Lys", k  => "Lys",
1349       L  => "Leu",           L  => "Leu", l  => "Leu",
1350       M  => "Met",           M  => "Met", m  => "Met",
1351       N  => "Asn",           N  => "Asn", n  => "Asn",
1352       P  => "Pro",           P  => "Pro", p  => "Pro",
1353       Q  => "Gln",           Q  => "Gln", q  => "Gln",
1354       R  => "Arg",           R  => "Arg", r  => "Arg",
1355       S  => "Ser",           S  => "Ser", s  => "Ser",
1356       T  => "Thr",           T  => "Thr", t  => "Thr",
1357       U  => "Sec",           U  => "Sec", u  => "Sec",
1358       V  => "Val",           V  => "Val", v  => "Val",
1359       W  => "Trp",           W  => "Trp", w  => "Trp",
1360       X  => "Xxx",           X  => "Xxx", x  => "Xxx",
1361       Y  => "Tyr",           Y  => "Tyr", y  => "Tyr",
1362       Z  => "Glx",           Z  => "Glx", z  => "Glx",
1363      "*" => "***"          '*' => "***"
1364            );
1365    
1366    
1367    %three_letter_to_one_letter_aa = (
1368         ALA  => "A",   Ala  => "A",   ala  => "a",
1369         ARG  => "R",   Arg  => "R",   arg  => "r",
1370         ASN  => "N",   Asn  => "N",   asn  => "n",
1371         ASP  => "D",   Asp  => "D",   asp  => "d",
1372         ASX  => "B",   Asx  => "B",   asx  => "b",
1373         CYS  => "C",   Cys  => "C",   cys  => "c",
1374         GLN  => "Q",   Gln  => "Q",   gln  => "q",
1375         GLU  => "E",   Glu  => "E",   glu  => "e",
1376         GLX  => "Z",   Glx  => "Z",   glx  => "z",
1377         GLY  => "G",   Gly  => "G",   gly  => "g",
1378         HIS  => "H",   His  => "H",   his  => "h",
1379         ILE  => "I",   Ile  => "I",   ile  => "i",
1380         LEU  => "L",   Leu  => "L",   leu  => "l",
1381         LYS  => "K",   Lys  => "K",   lys  => "k",
1382         MET  => "M",   Met  => "M",   met  => "m",
1383         PHE  => "F",   Phe  => "F",   phe  => "f",
1384         PRO  => "P",   Pro  => "P",   pro  => "p",
1385         SEC  => "U",   Sec  => "U",   sec  => "u",
1386         SER  => "S",   Ser  => "S",   ser  => "s",
1387         THR  => "T",   Thr  => "T",   thr  => "t",
1388         TRP  => "W",   Trp  => "W",   trp  => "w",
1389         TYR  => "Y",   Tyr  => "Y",   tyr  => "y",
1390         VAL  => "V",   Val  => "V",   val  => "v",
1391         XAA  => "X",   Xaa  => "X",   xaa  => "x",
1392         XXX  => "X",   Xxx  => "X",   xxx  => "x",
1393        '***' => "*"
1394  );  );
1395    
1396    
# Line 586  Line 1411 
1411                              #  (note: undef is false)                              #  (note: undef is false)
1412    
1413      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!
1414      my $pep  = ($met && ($imax >= 0)) ? "M" : "";      my $pep = ( ($met && ($imax >= 0)) ? "M" : "" );
     my $aa;  
1415      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {
1416          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );
1417      }      }
# Line 796  Line 1620 
1620  #  Read a list of intervals from a file.  #  Read a list of intervals from a file.
1621  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1622  #  #
1623  #     @intervals = read_intervals(*FILEHANDLE)  #     @intervals = read_intervals( \*FILEHANDLE )
1624  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1625  sub read_intervals {  sub read_intervals {
1626      my $fh = shift;      my $fh = shift;
# Line 819  Line 1643 
1643    
1644    
1645  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1646    #  Convert a list of intervals to read [ id, left_end, right_end ].
1647    #
1648    #     @intervals = standardize_intervals( @interval_refs )
1649    #-----------------------------------------------------------------------------
1650    sub standardize_intervals {
1651        map { ( $_->[1] < $_->[2] ) ? $_ : [ $_->[0], $_->[2], $_->[1] ] } @_;
1652    }
1653    
1654    
1655    #-----------------------------------------------------------------------------
1656  #  Take the union of a list of intervals  #  Take the union of a list of intervals
1657  #  #
1658  #     @joined = join_intervals( @interval_refs )  #     @joined = join_intervals( @interval_refs )
# Line 858  Line 1692 
1692      return @joined;      return @joined;
1693  }  }
1694    
1695    
1696    #-----------------------------------------------------------------------------
1697    #  Split location strings to oriented intervals.
1698    #
1699    #     @intervals = locations_2_intervals( @locations )
1700    #     $interval  = locations_2_intervals( $location  )
1701    #-----------------------------------------------------------------------------
1702    sub locations_2_intervals {
1703        my @intervals = map { /^(\S+)_(\d+)_(\d+)(\s.*)?$/
1704                           || /^(\S+)_(\d+)-(\d+)(\s.*)?$/
1705                           || /^(\S+)=(\d+)=(\d+)(\s.*)?$/
1706                           || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/
1707                            ? [ $1, $2+0, $3+0 ]
1708                            : ()
1709                            } @_;
1710    
1711        return wantarray ? @intervals : $intervals[0];
1712    }
1713    
1714    
1715    #-----------------------------------------------------------------------------
1716    #  Read a list of oriented intervals from a file.
1717    #  Allow id_start_end, or id \s start \s end formats
1718    #
1719    #     @intervals = read_oriented_intervals( \*FILEHANDLE )
1720    #-----------------------------------------------------------------------------
1721    sub read_oriented_intervals {
1722        my $fh = shift;
1723        my @intervals = ();
1724    
1725        while (<$fh>) {
1726            chomp;
1727               /^(\S+)_(\d+)_(\d+)(\s.*)?$/        #  id_start_end       WIT2
1728            || /^(\S+)_(\d+)-(\d+)(\s.*)?$/        #  id_start-end       ???
1729            || /^(\S+)=(\d+)=(\d+)(\s.*)?$/        #  id=start=end       Badger
1730            || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/    #  id \s start \s end
1731            || next;
1732    
1733            #  Matched a pattern.  Store reference to (id, start, end):
1734            push @intervals, [ $1, $2+0, $3+0 ];
1735        }
1736        return @intervals;
1737    }
1738    
1739    
1740    #-----------------------------------------------------------------------------
1741    #  Reverse the orientation of a list of intervals
1742    #
1743    #     @reversed = reverse_intervals( @interval_refs )
1744    #-----------------------------------------------------------------------------
1745    sub reverse_intervals {
1746        map { [ $_->[0], $_->[2], $_->[1] ] } @_;
1747    }
1748    
1749    
1750    #-----------------------------------------------------------------------------
1751    #  Convert GenBank locations to SEED locations
1752    #
1753    #     @seed_locs = gb_location_2_seed( $contig, @gb_locs )
1754    #-----------------------------------------------------------------------------
1755    sub gb_location_2_seed
1756    {
1757        my $contig = shift @_;
1758        $contig or die "First arg of gb_location_2_seed must be contig_id\n";
1759    
1760        map { join( ',', gb_loc_2_seed_2( $contig, $_ ) ) || undef } @_
1761    }
1762    
1763    sub gb_loc_2_seed_2
1764    {
1765        my ( $contig, $loc ) = @_;
1766    
1767        if ( $loc =~ /^(\d+)\.\.(\d+)$/ )
1768        {
1769            join( '_', $contig, $1, $2 )
1770        }
1771    
1772        elsif ( $loc =~ /^join\((.*)\)$/ )
1773        {
1774            $loc = $1;
1775            my $lvl = 0;
1776            for ( my $i = length( $loc )-1; $i >= 0; $i-- )
1777            {
1778                for ( substr( $loc, $i, 1 ) )
1779                {
1780                    /,/ && ! $lvl and substr( $loc, $i, 1 ) = "\t";
1781                    /\(/          and $lvl--;
1782                    /\)/          and $lvl++;
1783                }
1784            }
1785            $lvl == 0 or print STDERR "Paren matching error: $loc\n" and die;
1786            map { gb_loc_2_seed_2( $contig, $_ ) } split /\t/, $loc
1787        }
1788    
1789        elsif ( $loc =~ /^complement\((.*)\)$/ )
1790        {
1791            map { s/_(\d+)_(\d+)$/_$2_$1/; $_ }
1792            reverse
1793            gb_loc_2_seed_2( $contig, $1 )
1794        }
1795    
1796        else
1797        {
1798            ()
1799        }
1800    }
1801    
1802    
1803    #-----------------------------------------------------------------------------
1804    #  Read qual.
1805    #
1806    #  Save the contents in a list of refs to arrays: [ $id, $descript, \@qual ]
1807    #
1808    #     @seq_entries = read_qual( )               #  STDIN
1809    #    \@seq_entries = read_qual( )               #  STDIN
1810    #     @seq_entries = read_qual( \*FILEHANDLE )
1811    #    \@seq_entries = read_qual( \*FILEHANDLE )
1812    #     @seq_entries = read_qual(  $filename )
1813    #    \@seq_entries = read_qual(  $filename )
1814    #-----------------------------------------------------------------------------
1815    sub read_qual {
1816        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
1817        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_qual\n";
1818    
1819        my @quals = ();
1820        my ($id, $desc, $qual) = ("", "", []);
1821    
1822        while ( <$fh> ) {
1823            chomp;
1824            if (/^>\s*(\S+)(\s+(.*))?$/) {        #  new id
1825                if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1826                ($id, $desc, $qual) = ($1, $3 ? $3 : "", []);
1827            }
1828            else {
1829                push @$qual, split;
1830            }
1831        }
1832        close( $fh ) if $close;
1833    
1834        if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1835        return wantarray ? @quals : \@quals;
1836    }
1837    
1838    
1839  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.9

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3