[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Mon Dec 5 19:06:30 2005 UTC revision 1.10, Fri Nov 23 00:33:31 2007 UTC
# Line 1  Line 1 
 #  
 # Copyright (c) 2003-2006 University of Chicago and Fellowship  
 # for Interpretations of Genomes. All Rights Reserved.  
 #  
 # This file is part of the SEED Toolkit.  
 #  
 # The SEED Toolkit is free software. You can redistribute  
 # it and/or modify it under the terms of the SEED Toolkit  
 # Public License.  
 #  
 # You should have received a copy of the SEED Toolkit Public License  
 # along with this program; if not write to the University of Chicago  
 # at info@ci.uchicago.edu or the Fellowship for Interpretation of  
 # Genomes at veronika@thefig.info or download a copy from  
 # http://www.theseed.org/LICENSE.TXT.  
 #  
   
1  package gjoseqlib;  package gjoseqlib;
2    
3  #  A sequence entry is ( $id, $def, $seq )  #  A sequence entry is ( $id, $def, $seq )
4  #  A list of entries is a list of references  #  A list of entries is a list of references
5  #  #
6  #  @seq_entry = read_next_fasta_seq( *FILEHANDLE )  #  @seq_entry   = read_next_fasta_seq( \*FILEHANDLE )
7  #  @seq_entries = read_fasta_seqs( *FILEHANDLE )  #  @seq_entries = read_fasta_seqs( \*FILEHANDLE )   # Original form
8  #  $seq_ind = index_seq_list( @seq_entries ); # hash from ids to entry refs  #  @seq_entries = read_fasta( )                     # STDIN
9    #  @seq_entries = read_fasta( \*FILEHANDLE )
10    #  @seq_entries = read_fasta(  $filename )
11    #  @seq_entries = read_clustal( )                   # STDIN
12    #  @seq_entries = read_clustal( \*FILEHANDLE )
13    #  @seq_entries = read_clustal(  $filename )
14    #  @seq_entries = read_clustal_file(  $filename )
15  #  #
16    #  $seq_ind   = index_seq_list( @seq_entries );   # hash from ids to entries
17  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
18  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
19  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );
# Line 31  Line 21 
21  #  ( $id, $def ) = parse_fasta_title( $title )  #  ( $id, $def ) = parse_fasta_title( $title )
22  #  ( $id, $def ) = split_fasta_title( $title )  #  ( $id, $def ) = split_fasta_title( $title )
23  #  #
24  #  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list );  #  print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );  # Original form
25  #  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq) ;  #  print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
26  #  print_seq_as_fasta( *FILEHANDLE, @seq_entry );  #  print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
27  #  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #  print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
28    #  print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
29    #  print_alignment_as_fasta(  $filename,    @seq_entry_list );
30    #  print_alignment_as_fasta(  $filename,   \@seq_entry_list );
31    #  print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
32    #  print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
33    #  print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
34    #  print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
35    #  print_alignment_as_phylip(  $filename,    @seq_entry_list );
36    #  print_alignment_as_phylip(  $filename,   \@seq_entry_list );
37    #  print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
38    #  print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
39    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
40    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
41    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
42    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
43    #  print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq) ;
44    #  print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
45    #  print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq );
46    #
47    #   @packed_seqs = pack_alignment(  @seqs )
48    #   @packed_seqs = pack_alignment( \@seqs )
49    #  \@packed_seqs = pack_alignment(  @seqs )
50    #  \@packed_seqs = pack_alignment( \@seqs )
51  #  #
52  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
53  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
54    #  $DNAseq = DNA_subseq(  $seq, $from, $to );
55    #  $DNAseq = DNA_subseq( \$seq, $from, $to );
56    #  $RNAseq = RNA_subseq(  $seq, $from, $to );
57    #  $RNAseq = RNA_subseq( \$seq, $from, $to );
58  #  @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );  #  @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );
59  #  @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );  #  @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );
60  #  $DNAseq = complement_DNA_seq( $NA_seq );  #  $DNAseq = complement_DNA_seq( $NA_seq );
# Line 47  Line 64 
64  #  $seq    = pack_seq( $sequence )  #  $seq    = pack_seq( $sequence )
65  #  $seq    = clean_ae_sequence( $seq )  #  $seq    = clean_ae_sequence( $seq )
66  #  #
67  #  $seq = translate_seq( $seq [, $met_start] )  #  $aa = translate_seq( $nt, $met_start )
68    #  $aa = translate_seq( $nt )
69  #  $aa  = translate_codon( $triplet );  #  $aa  = translate_codon( $triplet );
 #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );  
70  #  #
71  #  User-supplied genetic code must be upper case index and match the  #  User-supplied genetic code.  The supplied code needs to be complete in
72  #  DNA versus RNA type of sequence  #  RNA and/or DNA, and upper and/or lower case.  The program guesses based
73    #  on lysine and phenylalanine codons.
74    #
75    #  $aa = translate_seq_with_user_code( $nt, $gen_code_hash, $met_start )
76    #  $aa = translate_seq_with_user_code( $nt, $gen_code_hash )
77  #  #
78  #  Locations (= oriented intervals) are ( id, start, end )  #  Locations (= oriented intervals) are ( id, start, end )
79  #  Intervals are ( id, left, right )  #  Intervals are ( id, left, right )
80  #  #
81  #  @intervals = read_intervals( *FILEHANDLE )  #  @intervals = read_intervals( \*FILEHANDLE )
82  #  @intervals = read_oriented_intervals( *FILEHANDLE )  #  @intervals = read_oriented_intervals( \*FILEHANDLE )
83  #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)  #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)
84  #  @joined    = join_intervals( @interval_refs )  #  @joined    = join_intervals( @interval_refs )
85  #  @intervals = locations_2_intervals( @locations )  #  @intervals = locations_2_intervals( @locations )
86  #  $interval  = locations_2_intervals( $location  )  #  $interval  = locations_2_intervals( $location  )
87  #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)  #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)
88    #
89    #  Convert GenBank locations to SEED locations
90    #
91    #  @seed_locs = gb_location_2_seed( $contig, @gb_locs )
92    #
93    #  Read quality scores from a fasta-like file:
94    #
95    #  @seq_entries = read_qual( )               #  STDIN
96    # \@seq_entries = read_qual( )               #  STDIN
97    #  @seq_entries = read_qual( \*FILEHANDLE )
98    # \@seq_entries = read_qual( \*FILEHANDLE )
99    #  @seq_entries = read_qual(  $filename )
100    # \@seq_entries = read_qual(  $filename )
101    #
102    
103  use strict;  use strict;
104    use Carp;
 use gjolib qw( wrap_text );  
105    
106  #  Exported global variables:  #  Exported global variables:
107    
# Line 92  Line 125 
125  our @ISA = qw(Exporter);  our @ISA = qw(Exporter);
126  our @EXPORT = qw(  our @EXPORT = qw(
127          read_fasta_seqs          read_fasta_seqs
128            read_fasta
129          read_next_fasta_seq          read_next_fasta_seq
130            read_clustal_file
131            read_clustal
132          parse_fasta_title          parse_fasta_title
133          split_fasta_title          split_fasta_title
134          print_seq_list_as_fasta          print_seq_list_as_fasta
135            print_alignment_as_fasta
136            print_alignment_as_phylip
137            print_alignment_as_nexus
138          print_seq_as_fasta          print_seq_as_fasta
139          print_gb_locus          print_gb_locus
140    
# Line 104  Line 143 
143          seq_desc_by_id          seq_desc_by_id
144          seq_data_by_id          seq_data_by_id
145    
146            pack_alignment
147    
148          subseq_DNA_entry          subseq_DNA_entry
149          subseq_RNA_entry          subseq_RNA_entry
150            DNA_subseq
151            RNA_subseq
152          complement_DNA_entry          complement_DNA_entry
153          complement_RNA_entry          complement_RNA_entry
154          complement_DNA_seq          complement_DNA_seq
# Line 125  Line 168 
168          locations_2_intervals          locations_2_intervals
169          read_oriented_intervals          read_oriented_intervals
170          reverse_intervals          reverse_intervals
171    
172            gb_location_2_seed
173    
174            read_qual
175          );          );
176    
177  our @EXPORT_OK = qw(  our @EXPORT_OK = qw(
# Line 146  Line 193 
193    
194    
195  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
196  #  Read fasta sequences from a file.  #  Helper function for defining an input filehandle:
197    #     filehandle is passed through
198    #     string is taken as file name to be openend
199    #     undef or "" defaults to STDOUT
200    #
201    #    ( \*FH, $name, $close [, $file] ) = input_filehandle( $file );
202    #
203    #-----------------------------------------------------------------------------
204    sub input_filehandle
205    {
206        my $file = shift;
207    
208        #  FILEHANDLE
209    
210        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
211    
212        #  Null string or undef
213    
214        return ( \*STDIN, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
215    
216        #  File name
217    
218        if ( ! ref( $file ) )
219        {
220            my $fh;
221            -f $file or die "Could not find input file \"$file\"\n";
222            open( $fh, "<$file" ) || die "Could not open \"$file\" for input\n";
223            return ( $fh, $file, 1 );
224        }
225    
226        #  Some other kind of reference; return the unused value
227    
228        return ( \*STDIN, undef, 0, $file );
229    }
230    
231    
232    #-----------------------------------------------------------------------------
233    #  Read fasta sequences from a filehandle (legacy interface; use read_fasta)
234  #  Save the contents in a list of refs to arrays:  (id, description, seq)  #  Save the contents in a list of refs to arrays:  (id, description, seq)
235  #  #
236  #     @seqs = read_fasta_seqs(*FILEHANDLE)  #     @seq_entries = read_fasta_seqs( \*FILEHANDLE )
237  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
238  sub read_fasta_seqs {  sub read_fasta_seqs { read_fasta( @_ ) }
239      my $fh = shift;  
240      wantarray || die "read_fasta_seqs requires list context\n";  
241    #-----------------------------------------------------------------------------
242    #  Read fasta sequences.
243    #  Save the contents in a list of refs to arrays:  (id, description, seq)
244    #
245    #     @seq_entries = read_fasta( )               #  STDIN
246    #    \@seq_entries = read_fasta( )               #  STDIN
247    #     @seq_entries = read_fasta( \*FILEHANDLE )
248    #    \@seq_entries = read_fasta( \*FILEHANDLE )
249    #     @seq_entries = read_fasta(  $filename )
250    #    \@seq_entries = read_fasta(  $filename )
251    #-----------------------------------------------------------------------------
252    sub read_fasta {
253        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
254        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n";
255    
256      my @seqs = ();      my @seqs = ();
257      my ($id, $desc, $seq) = ("", "", "");      my ($id, $desc, $seq) = ("", "", "");
# Line 169  Line 267 
267              $seq .= $_ ;              $seq .= $_ ;
268          }          }
269      }      }
270        close( $fh ) if $close;
271    
272      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }
273      return @seqs;      return wantarray ? @seqs : \@seqs;
274  }  }
275    
276    
# Line 179  Line 278 
278  #  Read one fasta sequence at a time from a file.  #  Read one fasta sequence at a time from a file.
279  #  Return the contents as an array:  (id, description, seq)  #  Return the contents as an array:  (id, description, seq)
280  #  #
281  #     @seq_entry = read_next_fasta_seq(*FILEHANDLE)  #     @seq_entry = read_next_fasta_seq( \*FILEHANDLE )
282  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
283  #  Reading always overshoots, so save next id and description  #  Reading always overshoots, so save next id and description
284    
# Line 188  Line 287 
287      my %next_header;      my %next_header;
288    
289      sub read_next_fasta_seq {      sub read_next_fasta_seq {
         wantarray || die "read_next_fasta_seq requires list context\n";  
   
290          my $fh = shift;          my $fh = shift;
291          my ( $id, $desc );          my ( $id, $desc );
292    
# Line 206  Line 303 
303              chomp;              chomp;
304              if ( /^>/ ) {        #  new id              if ( /^>/ ) {        #  new id
305                  $next_header{$fh} = $_;                  $next_header{$fh} = $_;
306                  if ( defined($id) && $seq ) { return ($id, $desc, $seq) }                  if ( defined($id) && $seq )
307                    {
308                        return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq]
309                    }
310                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
311                  $seq = "";                  $seq = "";
312              }              }
# Line 219  Line 319 
319          #  Done with file, delete "next header"          #  Done with file, delete "next header"
320    
321          delete $next_header{$fh};          delete $next_header{$fh};
322          return (defined($id) && $seq) ? ($id, $desc, $seq) : () ;          return ( defined($id) && $seq ) ? ( wantarray ? ($id, $desc, $seq)
323                                                          : [$id, $desc, $seq]
324                                              )
325                                            : () ;
326        }
327    }
328    
329    
330    #-----------------------------------------------------------------------------
331    #  Read a clustal alignment from a file.
332    #  Save the contents in a list of refs to arrays:  (id, description, seq)
333    #
334    #     @seq_entries = read_clustal_file( $filename )
335    #-----------------------------------------------------------------------------
336    sub read_clustal_file { read_clustal( @_ ) }
337    
338    
339    #-----------------------------------------------------------------------------
340    #  Read a clustal alignment.
341    #  Save the contents in a list of refs to arrays:  (id, description, seq)
342    #
343    #     @seq_entries = read_clustal( )              # STDIN
344    #     @seq_entries = read_clustal( \*FILEHANDLE )
345    #     @seq_entries = read_clustal(  $filename )
346    #-----------------------------------------------------------------------------
347    sub read_clustal {
348        my ( $fh, undef, $close, $unused ) = input_filehandle( shift );
349        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_clustal_file\n";
350    
351        my ( %seq, @ids, $line );
352        while ( defined( $line = <$fh> ) )
353        {
354            ( $line =~ /^[A-Za-z0-9]/ ) or next;
355            chomp $line;
356            my @flds = split /\s+/, $line;
357            if ( @flds == 2 )
358            {
359                $seq{ $flds[0] } or push @ids, $flds[0];
360                push @{ $seq{ $flds[0] } }, $flds[1];
361      }      }
362  }  }
363        close( $fh ) if $close;
364    
365        map { [ $_, "", join( "", @{$seq{$_}} ) ] } @ids;
366    }
367    
368    
369  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
# Line 250  Line 392 
392    
393    
394  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
395  #  Print list of sequence entries in fasta format  #  Helper function for defining an output filehandle:
396    #     filehandle is passed through
397    #     string is taken as file name to be openend
398    #     undef or "" defaults to STDOUT
399    #
400    #    ( \*FH, $name, $close [, $file] ) = output_filehandle( $file );
401  #  #
 #     print_seq_list_as_fasta(*FILEHANDLE, @seq_entry_list);  
402  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
403  sub print_seq_list_as_fasta {  sub output_filehandle
404      my $fh = shift;  {
405      my @seq_list = @_;      my $file = shift;
406    
407      foreach my $seq_ptr (@seq_list) {      #  FILEHANDLE
408          print_seq_as_fasta($fh, @$seq_ptr);  
409        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
410    
411        #  Null string or undef
412    
413        return ( \*STDOUT, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
414    
415        #  File name
416    
417        if ( ! ref( $file ) )
418        {
419            my $fh;
420            open( $fh, ">$file" ) || die "Could not open output $file\n";
421            return ( $fh, $file, 1 );
422      }      }
423    
424        #  Some other kind of reference; return the unused value
425    
426        return ( \*STDOUT, undef, 0, $file );
427    }
428    
429    
430    #-----------------------------------------------------------------------------
431    #  Legacy function for printing fasta sequence set:
432    #
433    #     print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );
434    #-----------------------------------------------------------------------------
435    sub print_seq_list_as_fasta { print_alignment_as_fasta( @_ ) }
436    
437    
438    #-----------------------------------------------------------------------------
439    #  Print list of sequence entries in fasta format.
440    #  Missing, undef or "" filename defaults to STDOUT.
441    #
442    #     print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
443    #     print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
444    #     print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
445    #     print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
446    #     print_alignment_as_fasta(  $filename,    @seq_entry_list );
447    #     print_alignment_as_fasta(  $filename,   \@seq_entry_list );
448    #-----------------------------------------------------------------------------
449    sub print_alignment_as_fasta {
450        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
451        ( unshift @_, $unused ) if $unused;
452    
453        ( ref( $_[0] ) eq "ARRAY" ) or confess "Bad sequence entry passed to print_alignment_as_fasta\n";
454    
455        #  Expand the sequence entry list if necessary:
456    
457        if ( ref( $_[0]->[0] ) eq "ARRAY" ) { @_ = @{ $_[0] } }
458    
459        foreach my $seq_ptr ( @_ ) { print_seq_as_fasta( $fh, @$seq_ptr ) }
460    
461        close( $fh ) if $close;
462    }
463    
464    
465    #-----------------------------------------------------------------------------
466    #  Print list of sequence entries in phylip format.
467    #  Missing, undef or "" filename defaults to STDOUT.
468    #
469    #     print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
470    #     print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
471    #     print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
472    #     print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
473    #     print_alignment_as_phylip(  $filename,    @seq_entry_list );
474    #     print_alignment_as_phylip(  $filename,   \@seq_entry_list );
475    #-----------------------------------------------------------------------------
476    sub print_alignment_as_phylip {
477        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
478        ( unshift @_, $unused ) if $unused;
479    
480        ( ref( $_[0] ) eq "ARRAY" ) or die die "Bad sequence entry passed to print_alignment_as_phylip\n";
481    
482        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
483    
484        my ( %id2, %used );
485        my $maxlen = 0;
486        foreach ( @seq_list )
487        {
488            my ( $id, undef, $seq ) = @$_;
489    
490            #  Need a name that is unique within 10 characters
491    
492            my $id2 = substr( $id, 0, 10 );
493            $id2 =~ s/_/ /g;  # PHYLIP sequence files accept spaces
494            my $n = "0";
495            while ( $used{ $id2 } )
496            {
497                $n++;
498                $id2 = substr( $id, 0, 10 - length( $n ) ) . $n;
499            }
500            $used{ $id2 } = 1;
501            $id2{ $id } = $id2;
502    
503                    #  Prepare to pad sequences (should not be necessary, but ...)
504    
505            my $len = length( $seq );
506            $maxlen = $len if ( $len > $maxlen );
507        }
508    
509        my $nseq = @seq_list;
510        print $fh "$nseq  $maxlen\n";
511        foreach ( @seq_list )
512        {
513            my ( $id, undef, $seq ) = @$_;
514            my $len = length( $seq );
515            printf $fh "%-10s  %s%s\n", $id2{ $id },
516                                        $seq,
517                                        $len<$maxlen ? ("?" x ($maxlen-$len)) : "";
518        }
519    
520        close( $fh ) if $close;
521    }
522    
523    
524    #-----------------------------------------------------------------------------
525    #  Print list of sequence entries in nexus format.
526    #  Missing, undef or "" filename defaults to STDOUT.
527    #
528    #     print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
529    #     print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
530    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
531    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
532    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
533    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
534    #-----------------------------------------------------------------------------
535    sub print_alignment_as_nexus {
536        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
537        ( unshift @_, $unused ) if $unused;
538    
539        my $lbls = ( ref( $_[0] ) eq "HASH" ) ? shift : undef;
540    
541        ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_nexus\n";
542    
543        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
544    
545        my %id2;
546        my ( $maxidlen, $maxseqlen ) = ( 0, 0 );
547        my ( $n1, $n2, $nt, $nu ) = ( 0, 0, 0, 0 );
548        foreach ( @seq_list )
549        {
550            my ( $id, undef, $seq ) = @$_;
551            my $id2 = $lbls ? ( $lbls->{ $id } || $id ) : $id;
552            if ( $id2 !~ /^[-+.0-9A-Za-z~_|]+$/ )
553            {
554                    $id2 =~ s/'/''/g;
555                    $id2 = qq('$id2');
556                }
557            $id2{ $id } = $id2;
558            my $idlen = length( $id2 );
559            $maxidlen = $idlen if ( $idlen > $maxidlen );
560    
561            my $seqlen = length( $seq );
562            $maxseqlen = $seqlen if ( $seqlen > $maxseqlen );
563    
564            $nt += $seq =~ tr/Tt//d;
565            $nu += $seq =~ tr/Uu//d;
566            $n1 += $seq =~ tr/ACGNacgn//d;
567            $n2 += $seq =~ tr/A-Za-z//d;
568        }
569    
570        my $nseq = @seq_list;
571        my $type = ( $n1 < 2 * $n2 ) ?  'protein' : ($nt>$nu) ? 'DNA' : 'RNA';
572    
573        print $fh <<"END_HEAD";
574    #NEXUS
575    
576    BEGIN Data;
577        Dimensions
578            NTax=$nseq
579            NChar=$maxseqlen
580            ;
581        Format
582            DataType=$type
583            Gap=-
584            Missing=?
585            ;
586        Matrix
587    
588    END_HEAD
589    
590        foreach ( @seq_list )
591        {
592            my ( $id, undef, $seq ) = @$_;
593            my $len = length( $seq );
594            printf  $fh  "%-${maxidlen}s  %s%s\n",
595                         $id2{ $id },
596                         $seq,
597                         $len<$maxseqlen ? ("?" x ($maxseqlen-$len)) : "";
598        }
599    
600        print $fh <<"END_TAIL";
601    ;
602    END;
603    END_TAIL
604    
605        close( $fh ) if $close;
606  }  }
607    
608    
609  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
610  #  Print one sequence in fasta format to an open file  #  Print one sequence in fasta format to an open file
611  #  #
612  #     print_seq_as_fasta(*FILEHANDLE, $id, $desc, $seq);  #     print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq );
613  #     print_seq_as_fasta(*FILEHANDLE, @seq_entry);  #     print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
614  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
615  sub print_seq_as_fasta {  sub print_seq_as_fasta {
616      my $fh = shift;      my $fh = shift;
# Line 285  Line 627 
627  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
628  #  Print one sequence in GenBank flat file format:  #  Print one sequence in GenBank flat file format:
629  #  #
630  #     print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #     print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq )
631  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
632  sub print_gb_locus {  sub print_gb_locus {
633      my ($fh, $loc, $def, $acc, $seq) = @_;      my ($fh, $loc, $def, $acc, $seq) = @_;
# Line 311  Line 653 
653    
654    
655  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
656    #  Return a string with text wrapped to defined line lengths:
657    #
658    #     $wrapped_text = wrap_text( $str )                  # default len   =  80
659    #     $wrapped_text = wrap_text( $str, $len )            # default ind   =   0
660    #     $wrapped_text = wrap_text( $str, $len, $indent )   # default ind_n = ind
661    #     $wrapped_text = wrap_text( $str, $len, $indent_1, $indent_n )
662    #-----------------------------------------------------------------------------
663    sub wrap_text {
664        my ($str, $len, $ind, $indn) = @_;
665    
666        defined($str)  || die "wrap_text called without a string\n";
667        defined($len)  || ($len  =   80);
668        defined($ind)  || ($ind  =    0);
669        ($ind  < $len) || die "wrap error: indent greater than line length\n";
670        defined($indn) || ($indn = $ind);
671        ($indn < $len) || die "wrap error: indent_n greater than line length\n";
672    
673        $str =~ s/\s+$//;
674        $str =~ s/^\s+//;
675        my ($maxchr, $maxchr1);
676        my (@lines) = ();
677    
678        while ($str) {
679            $maxchr1 = ($maxchr = $len - $ind) - 1;
680            if ($maxchr >= length($str)) {
681                push @lines, (" " x $ind) . $str;
682                last;
683            }
684            elsif ($str =~ /^(.{0,$maxchr1}\S)\s+(\S.*)$/) { # no expr in {}
685                push @lines, (" " x $ind) . $1;
686                $str = $2;
687            }
688            elsif ($str =~ /^(.{0,$maxchr1}-)(.*)$/) {
689                push @lines, (" " x $ind) . $1;
690                $str = $2;
691            }
692            else {
693                push @lines, (" " x $ind) . substr($str, 0, $maxchr);
694                $str = substr($str, $maxchr);
695            }
696            $ind = $indn;
697        }
698    
699        return join("\n", @lines);
700    }
701    
702    
703    #-----------------------------------------------------------------------------
704  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
705  #  #
706  #     my \%seq_ind  = index_seq_list(@seq_list);  #     my \%seq_ind  = index_seq_list(@seq_list);
707    #     my \%seq_ind  = index_seq_list( \@seq_list );
708  #  #
709  #  Usage example:  #  Usage example:
710  #  #
711  #  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries  #  my  @seq_list   = read_fasta_seqs(\*STDIN);  # list of pointers to entries
712  #  my \%seq_ind   = index_seq_list(@seq_list); # hash from names to pointers  #  my \%seq_ind   = index_seq_list(@seq_list); # hash from names to pointers
713  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry
714  #  #
715  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
716  sub index_seq_list {  sub index_seq_list {
717      my %seq_index = map { @{$_}[0] => $_ } @_;      ( ref( $_[0] )      ne 'ARRAY' ) ? {}
718      return \%seq_index;    : ( ref( $_[0]->[0] ) ne 'ARRAY' ) ? { map { $_->[0] => $_ } @_ }
719      :                                    { map { $_->[0] => $_ } @{ $_[0] } }
720  }  }
721    
722    
723  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
724  #  Three routines to access all or part of sequence entry by id:  #  Three routines to access all or part of sequence entry by id:
725  #  #
726  #     my @seq_entry  = seq_entry_by_id( \%seq_index, $seq_id );  #     @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
727  #     my $seq_desc   = seq_desc_by_id(  \%seq_index, $seq_id );  #    \@seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
728  #     my $seq        = seq_data_by_id(  \%seq_index, $seq_id );  #     $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
729    #     $seq       = seq_data_by_id(  \%seq_index, $seq_id );
730  #  #
731  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
732  sub seq_entry_by_id {  sub seq_entry_by_id {
733      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";
734      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";
735      wantarray || die "entry_by_id requires list context\n";      return wantarray ? @{ $ind_ref->{$id} } : $ind_ref->{$id};
     return @{ $ind_ref->{$id} };  
736  }  }
737    
738    
# Line 357  Line 749 
749      return ${ $ind_ref->{$id} }[2];      return ${ $ind_ref->{$id} }[2];
750  }  }
751    
752    #-----------------------------------------------------------------------------
753    #  Remove columns of alignment gaps from sequences:
754    #
755    #   @packed_seqs = pack_alignment(  @seqs )
756    #   @packed_seqs = pack_alignment( \@seqs )
757    #  \@packed_seqs = pack_alignment(  @seqs )
758    #  \@packed_seqs = pack_alignment( \@seqs )
759    #
760    #-----------------------------------------------------------------------------
761    
762    sub pack_alignment
763    {
764        my @seqs = ( ref( $_[0] ) eq 'ARRAY' and ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
765        @seqs or return wantarray ? () : [];
766    
767        my $mask  = pack_mask( $seqs[0]->[2] );
768        foreach ( @seqs[ 1 .. (@seqs-1) ] )
769        {
770            $mask |= pack_mask( $_->[2] );
771        }
772    
773        my $seq;
774        my @seqs2 = map { $seq = $_->[2] & $mask;
775                          $seq =~ tr/\000//d;
776                          [ $_->[0], $_->[1], $seq ]
777                        }
778                    @seqs;
779    
780        return wantarray ? @seqs2 : \@seqs2;
781    }
782    
783    sub pack_mask
784    {
785        my $mask = shift;
786        $mask =~ tr/-/\000/;
787        $mask =~ tr/\000/\377/c;
788        return $mask;
789    }
790    
791  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
792  #  Some simple sequence manipulations:  #  Some simple sequence manipulations:
# Line 432  Line 862 
862  }  }
863    
864    
865    sub DNA_subseq
866    {
867        my ( $seq, $from, $to ) = @_;
868    
869        my $len = ref( $seq ) eq 'SCALAR' ? length( $$seq )
870                                          : length(  $seq );
871        if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len }
872        if ( ( $to   eq '$' ) || ( ! $to       ) ) { $to   = $len }
873    
874        my $left  = ( $from < $to ) ? $from : $to;
875        my $right = ( $from < $to ) ? $to   : $from;
876        if ( ( $right < 1 ) || ( $left > $len ) ) { return "" }
877        if ( $right > $len ) { $right = $len }
878        if ( $left  < 1    ) { $left  =    1 }
879    
880        my $subseq = ref( $seq ) eq 'SCALAR' ? substr( $$seq, $left-1, $right-$left+1 )
881                                             : substr(  $seq, $left-1, $right-$left+1 );
882    
883        if ( $from > $to )
884        {
885            $subseq = reverse $subseq;
886            $subseq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
887                         [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
888        }
889    
890        $subseq
891    }
892    
893    
894    sub RNA_subseq
895    {
896        my ( $seq, $from, $to ) = @_;
897    
898        my $len = ref( $seq ) eq 'SCALAR' ? length( $$seq )
899                                          : length(  $seq );
900        if ( ( $from eq '$' ) || ( $from eq "" ) ) { $from = $len }
901        if ( ( $to   eq '$' ) || ( ! $to       ) ) { $to   = $len }
902    
903        my $left  = ( $from < $to ) ? $from : $to;
904        my $right = ( $from < $to ) ? $to   : $from;
905        if ( ( $right < 1 ) || ( $left > $len ) ) { return "" }
906        if ( $right > $len ) { $right = $len }
907        if ( $left  < 1    ) { $left  =    1 }
908    
909        my $subseq = ref( $seq ) eq 'SCALAR' ? substr( $$seq, $left-1, $right-$left+1 )
910                                             : substr(  $seq, $left-1, $right-$left+1 );
911    
912        if ( $from > $to )
913        {
914            $subseq = reverse $subseq;
915            $subseq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
916                         [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
917        }
918    
919        $subseq
920    }
921    
922    
923  sub complement_DNA_entry {  sub complement_DNA_entry {
924      my ($id, $desc, $seq, $fix_id) = @_;      my ($id, $desc, $seq, $fix_id) = @_;
925      $fix_id ||= 0;     #  fix undef values      $fix_id ||= 0;     #  fix undef values
# Line 549  Line 1037 
1037  #  Translate nucleotides to one letter protein:  #  Translate nucleotides to one letter protein:
1038  #  #
1039  #  $seq = translate_seq( $seq [, $met_start] )  #  $seq = translate_seq( $seq [, $met_start] )
1040  #  $aa  = translate_codon( $triplet );  #     $aa  = translate_codon( $triplet )
1041  #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );  #     $aa  = translate_DNA_codon( $triplet )     # Does not rely on DNA
1042    #     $aa  = translate_uc_DNA_codon( $triplet )  # Does not rely on uc or DNA
1043  #  #
1044  #  User-supplied genetic code must be upper case index and match the  #  User-supplied genetic code must be upper case index and match the
1045  #  DNA versus RNA type of sequence  #  DNA versus RNA type of sequence
# Line 567  Line 1056 
1056    
1057      # DNA version      # DNA version
1058    
1059      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",      TTT => 'F',  TCT => 'S',  TAT => 'Y',  TGT => 'C',
1060      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",      TTC => 'F',  TCC => 'S',  TAC => 'Y',  TGC => 'C',
1061      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",      TTA => 'L',  TCA => 'S',  TAA => '*',  TGA => '*',
1062      TTG => "L",  TCG => "S",  TAG => "*",  TGG => "W",      TTG => 'L',  TCG => 'S',  TAG => '*',  TGG => 'W',
1063      CTT => "L",  CCT => "P",  CAT => "H",  CGT => "R",      CTT => 'L',  CCT => 'P',  CAT => 'H',  CGT => 'R',
1064      CTC => "L",  CCC => "P",  CAC => "H",  CGC => "R",      CTC => 'L',  CCC => 'P',  CAC => 'H',  CGC => 'R',
1065      CTA => "L",  CCA => "P",  CAA => "Q",  CGA => "R",      CTA => 'L',  CCA => 'P',  CAA => 'Q',  CGA => 'R',
1066      CTG => "L",  CCG => "P",  CAG => "Q",  CGG => "R",      CTG => 'L',  CCG => 'P',  CAG => 'Q',  CGG => 'R',
1067      ATT => "I",  ACT => "T",  AAT => "N",  AGT => "S",      ATT => 'I',  ACT => 'T',  AAT => 'N',  AGT => 'S',
1068      ATC => "I",  ACC => "T",  AAC => "N",  AGC => "S",      ATC => 'I',  ACC => 'T',  AAC => 'N',  AGC => 'S',
1069      ATA => "I",  ACA => "T",  AAA => "K",  AGA => "R",      ATA => 'I',  ACA => 'T',  AAA => 'K',  AGA => 'R',
1070      ATG => "M",  ACG => "T",  AAG => "K",  AGG => "R",      ATG => 'M',  ACG => 'T',  AAG => 'K',  AGG => 'R',
1071      GTT => "V",  GCT => "A",  GAT => "D",  GGT => "G",      GTT => 'V',  GCT => 'A',  GAT => 'D',  GGT => 'G',
1072      GTC => "V",  GCC => "A",  GAC => "D",  GGC => "G",      GTC => 'V',  GCC => 'A',  GAC => 'D',  GGC => 'G',
1073      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",      GTA => 'V',  GCA => 'A',  GAA => 'E',  GGA => 'G',
1074      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",      GTG => 'V',  GCG => 'A',  GAG => 'E',  GGG => 'G',
   
     # RNA suppliment  
   
     UUU => "F",  UCU => "S",  UAU => "Y",  UGU => "C",  
     UUC => "F",  UCC => "S",  UAC => "Y",  UGC => "C",  
     UUA => "L",  UCA => "S",  UAA => "*",  UGA => "*",  
     UUG => "L",  UCG => "S",  UAG => "*",  UGG => "W",  
     CUU => "L",  CCU => "P",  CAU => "H",  CGU => "R",  
     CUC => "L",  
     CUA => "L",  
     CUG => "L",  
     AUU => "I",  ACU => "T",  AAU => "N",  AGU => "S",  
     AUC => "I",  
     AUA => "I",  
     AUG => "M",  
     GUU => "V",  GCU => "A",  GAU => "D",  GGU => "G",  
     GUC => "V",  
     GUA => "V",  
     GUG => "V",  
1075    
1076      #  The following ambiguous encodings are not necessary,  but      #  The following ambiguous encodings are not necessary,  but
1077      #  speed up the processing of some ambiguous triplets:      #  speed up the processing of some ambiguous triplets:
1078    
1079      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",      TTY => 'F',  TCY => 'S',  TAY => 'Y',  TGY => 'C',
1080      TTR => "L",  TCR => "S",  TAR => "*",      TTR => 'L',  TCR => 'S',  TAR => '*',
1081                   TCN => "S",                   TCN => 'S',
1082      CTY => "L",  CCY => "P",  CAY => "H",  CGY => "R",      CTY => 'L',  CCY => 'P',  CAY => 'H',  CGY => 'R',
1083      CTR => "L",  CCR => "P",  CAR => "Q",  CGR => "R",      CTR => 'L',  CCR => 'P',  CAR => 'Q',  CGR => 'R',
1084      CTN => "L",  CCN => "P",               CGN => "R",      CTN => 'L',  CCN => 'P',               CGN => 'R',
1085      ATY => "I",  ACY => "T",  AAY => "N",  AGY => "S",      ATY => 'I',  ACY => 'T',  AAY => 'N',  AGY => 'S',
1086                   ACR => "T",  AAR => "K",  AGR => "R",                   ACR => 'T',  AAR => 'K',  AGR => 'R',
1087                   ACN => "T",                   ACN => 'T',
1088      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",      GTY => 'V',  GCY => 'A',  GAY => 'D',  GGY => 'G',
1089      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",      GTR => 'V',  GCR => 'A',  GAR => 'E',  GGR => 'G',
1090      GTN => "V",  GCN => "A",               GGN => "G",      GTN => 'V',  GCN => 'A',               GGN => 'G'
   
     UUY => "F",  UCY => "S",  UAY => "Y",  UGY => "C",  
     UUR => "L",  UCR => "S",  UAR => "*",  
                  UCN => "S",  
     CUY => "L",  
     CUR => "L",  
     CUN => "L",  
     AUY => "I",  
     GUY => "V",  
     GUR => "V",  
     GUN => "V"  
1091  );  );
1092    
1093    #  Add RNA by construction:
1094    
1095    foreach ( grep { /T/ } keys %genetic_code )
1096    {
1097        my $codon = $_;
1098        $codon =~ s/T/U/g;
1099        $genetic_code{ $codon } = lc $genetic_code{ $_ }
1100    }
1101    
1102  #  Add lower case by construction:  #  Add lower case by construction:
1103    
1104  foreach ( keys %genetic_code ) {  foreach ( keys %genetic_code )
1105    {
1106      $genetic_code{ lc $_ } = lc $genetic_code{ $_ }      $genetic_code{ lc $_ } = lc $genetic_code{ $_ }
1107  }  }
1108    
1109    
1110  #  Construct the genetic code with selanocysteine by difference:  #  Construct the genetic code with selenocysteine by difference:
1111    
1112  %genetic_code_with_U = map { $_ => $genetic_code{ $_ } } keys %genetic_code;  %genetic_code_with_U = %genetic_code;
1113  $genetic_code_with_U{ TGA } = "U";  $genetic_code_with_U{ TGA } = 'U';
1114  $genetic_code_with_U{ tga } = "u";  $genetic_code_with_U{ tga } = 'u';
1115  $genetic_code_with_U{ UGA } = "U";  $genetic_code_with_U{ UGA } = 'U';
1116  $genetic_code_with_U{ uga } = "u";  $genetic_code_with_U{ uga } = 'u';
1117    
1118    
1119  %amino_acid_codons_DNA = (  %amino_acid_codons_DNA = (
# Line 849  Line 1317 
1317  );  );
1318    
1319    
1320  %one_letter_to_three_letter_aa = {  %one_letter_to_three_letter_aa = (
1321           A  => "Ala", a  => "Ala",           A  => "Ala", a  => "Ala",
1322           B  => "Asx", b  => "Asx",           B  => "Asx", b  => "Asx",
1323           C  => "Cys", c  => "Cys",           C  => "Cys", c  => "Cys",
# Line 875  Line 1343 
1343           Y  => "Tyr", y  => "Tyr",           Y  => "Tyr", y  => "Tyr",
1344           Z  => "Glx", z  => "Glx",           Z  => "Glx", z  => "Glx",
1345          '*' => "***"          '*' => "***"
1346          };          );
1347    
1348    
1349  %three_letter_to_one_letter_aa = (  %three_letter_to_one_letter_aa = (
# Line 909  Line 1377 
1377    
1378    
1379  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1380  #  Translate nucleotides to one letter protein:  #  Translate nucleotides to one letter protein.  Respects case of the
1381    #  nucleotide sequence.
1382  #  #
1383  #      $seq = translate_seq( $seq [, $met_start] )  #      $aa = translate_seq( $nt, $met_start )
1384    #      $aa = translate_seq( $nt )
1385  #  #
1386  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1387    
1388  sub translate_seq {  sub translate_seq
1389      my $seq = uc shift;  {
1390      $seq =~ tr/UX/TN/;      #  make it DNA, and allow X      my $seq = shift;
1391      $seq =~ tr/-//d;        #  remove gaps      $seq =~ tr/-//d;        #  remove gaps
1392    
1393      my $met = shift || 0;   #  a second argument that is true      my @codons = $seq =~ m/(...?)/g;  #  Will try to translate last 2 nt
1394                              #  forces first amino acid to be Met  
1395                              #  (note: undef is false)      #  A second argument that is true forces first amino acid to be Met
1396    
1397      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      my @met;
1398      my $pep = ( ($met && ($imax >= 0)) ? "M" : "" );      if ( ( shift @_ ) && ( my $codon1 = shift @codons ) )
1399      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      {
1400          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );          push @met, ( $codon1 =~ /[a-z]/ ? 'm' : 'M' );
1401      }      }
1402    
1403      return $pep;      join( '', @met, map { translate_codon( $_ ) } @codons )
1404  }  }
1405    
1406    
1407  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1408  #  Translate a single triplet with "universal" genetic code  #  Translate a single triplet with "universal" genetic code.
 #  Uppercase and DNA are performed, then translate_uc_DNA_codon  
 #  is called.  
1409  #  #
1410  #      $aa = translate_codon( $triplet )  #      $aa = translate_codon( $triplet )
1411    #      $aa = translate_DNA_codon( $triplet )
1412    #      $aa = translate_uc_DNA_codon( $triplet )
1413  #  #
1414  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1415    
1416  sub translate_codon {  sub translate_DNA_codon { translate_codon( @_ ) }
     my $codon = uc shift;  #  Make it uppercase  
     $codon =~ tr/UX/TN/;   #  Make it DNA, and allow X  
     return translate_uc_DNA_codon($codon);  
 }  
   
1417    
1418  #-----------------------------------------------------------------------------  sub translate_uc_DNA_codon { translate_codon( uc $_[0] ) }
 #  Translate a single triplet with "universal" genetic code  
 #  Uppercase and DNA assumed  
 #  Intended for private use by translate_codon and translate_seq  
 #  
 #      $aa = translate_uc_DNA_codon( $triplet )  
 #  
 #-----------------------------------------------------------------------------  
1419    
1420  sub translate_uc_DNA_codon {  sub translate_codon
1421    {
1422      my $codon = shift;      my $codon = shift;
1423      my $aa;      $codon =~ tr/Uu/Tt/;     #  Make it DNA
1424    
1425      #  Try a simple lookup:      #  Try a simple lookup:
1426    
1427        my $aa;
1428        if ( $aa = $genetic_code{ $codon } ) { return $aa }
1429    
1430        #  Attempt to recover from mixed-case codons:
1431    
1432        $codon = ( $codon =~ /[a-z]/ ) ? lc $codon : uc $codon;
1433      if ( $aa = $genetic_code{ $codon } ) { return $aa }      if ( $aa = $genetic_code{ $codon } ) { return $aa }
1434    
1435      #  With the expanded code defined above, this catches simple N, R      #  The code defined above catches simple N, R and Y ambiguities in the
1436      #  and Y ambiguities in the third position.  Other codes like      #  third position.  Other codons (e.g., GG[KMSWBDHV], or even GG) might
1437      #  GG[KMSWBDHV], or even GG, might be unambiguously translated by      #  be unambiguously translated by converting the last position to N and
1438      #  converting the last position to N and seeing if this is in the      #  seeing if this is in the code table:
1439      #  (expanded) code table:  
1440        my $N = ( $codon =~ /[a-z]/ ) ? 'n' : 'N';
1441      if ( $aa = $genetic_code{ substr($codon,0,2) . "N" } ) { return $aa }      if ( $aa = $genetic_code{ substr($codon,0,2) . $N } ) { return $aa }
1442    
1443      #  Test that codon is valid and might have unambiguous aa:      #  Test that codon is valid for an unambiguous aa:
1444    
1445      if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/ ) { return "X" }      my $X = ( $codon =~ /[a-z]/ ) ? 'x' : 'X';
1446      #                     ^^      if ( $codon !~ m/^[ACGTMY][ACGT][ACGTKMRSWYBDHVN]$/i
1447      #                     |+- for leucine YTR        && $codon !~ m/^YT[AGR]$/i     #  Leu YTR
1448      #                     +-- for arginine MGR        && $codon !~ m/^MG[AGR]$/i     #  Arg MGR
1449           )
1450        {
1451            return $X;
1452        }
1453    
1454      #  Expand all ambiguous nucleotides to see if they all yield same aa.      #  Expand all ambiguous nucleotides to see if they all yield same aa.
     #  Loop order tries to fail quickly with first position change.  
1455    
1456      $aa = "";      my @n1 = @{ $DNA_letter_can_be{ substr( $codon, 0, 1 ) } };
1457      for my $n2 ( @{ $DNA_letter_can_be{ substr($codon,1,1) } } ) {      my $n2 =                        substr( $codon, 1, 1 );
1458          for my $n3 ( @{ $DNA_letter_can_be{ substr($codon,2,1) } } ) {      my @n3 = @{ $DNA_letter_can_be{ substr( $codon, 2, 1 ) } };
1459              for my $n1 ( @{ $DNA_letter_can_be{ substr($codon,0,1) } } ) {      my @triples = map { my $n12 = $_ . $n2; map { $n12 . $_ } @n3 } @n1;
1460                  #  set the first value of $aa  
1461                  if ($aa eq "") { $aa = $genetic_code{ $n1 . $n2 . $n3 } }      my $triple = shift @triples;
1462                  #  or break out if any other amino acid is detected      $aa = $genetic_code{ $triple };
1463                  elsif ($aa ne $genetic_code{ $n1 . $n2 . $n3 } ) { return "X" }      $aa or return $X;
             }  
         }  
     }  
1464    
1465      return $aa || "X";      foreach $triple ( @triples ) { return $X if $aa ne $genetic_code{$triple} }
1466    
1467        return $aa;
1468  }  }
1469    
1470    
# Line 1006  Line 1473 
1473  #  Diagnose the use of upper versus lower, and T versus U in the supplied  #  Diagnose the use of upper versus lower, and T versus U in the supplied
1474  #  code, and transform the supplied nucleotide sequence to match.  #  code, and transform the supplied nucleotide sequence to match.
1475  #  #
1476  #  translate_seq_with_user_code($seq, \%gen_code [, $start_with_met] )  #     $aa = translate_seq_with_user_code( $nt, \%gen_code )
1477    #     $aa = translate_seq_with_user_code( $nt, \%gen_code, $start_with_met )
1478  #  #
1479    #  Modified 2007-11-22 to be less intrusive in these diagnoses by sensing
1480    #  the presence of both versions in the user code.
1481  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1482    
1483  sub translate_seq_with_user_code {  sub translate_seq_with_user_code
1484    {
1485      my $seq = shift;      my $seq = shift;
1486      $seq =~ tr/-//d;     #  remove gaps  ***  Why?      $seq =~ tr/-//d;     #  remove gaps  ***  Why?
     $seq =~ tr/Xx/Nn/;   #  allow X  
1487    
1488      my $gc = shift;      #  Reference to hash of DNA alphabet code      my $gc = shift;      #  Reference to hash of code
1489      if (! $gc || ref($gc) ne "HASH") {      if (! $gc || ref($gc) ne "HASH")
1490          die "translate_seq_with_user_code needs genetic code hash as secondargument.";      {
1491            print STDERR "translate_seq_with_user_code needs genetic code hash as second argument.";
1492            return undef;
1493      }      }
1494    
1495      #  Test the type of code supplied: uppercase versus lowercase      #  Test code support for upper vs lower case:
1496    
1497      my ($RNA_F, $DNA_F, $M, $N, $X);      my ( $TTT, $UUU );
1498        if    ( $gc->{AAA} && ! $gc->{aaa} )   #  Uppercase only code table
1499      if ($gc->{ "AAA" }) {     #  Looks like uppercase code table      {
1500          $seq   = uc $seq;     #  Uppercase sequence          $seq   = uc $seq;     #  Uppercase sequence
1501          $RNA_F = "UUU";       #  Uppercase RNA Phe codon          ( $TTT, $UUU ) = ( 'TTT', 'UUU' );
         $DNA_F = "TTT";       #  Uppercase DNA Phe codon  
         $M     = "M";         #  Uppercase initiator  
         $N     = "N";         #  Uppercase ambiguous nuc  
         $X     = "X";         #  Uppercase ambiguous aa  
1502      }      }
1503      elsif ($gc->{ "aaa" }) {  #  Looks like lowercase code table      elsif ( $gc->{aaa} && ! $gc->{AAA} )   #  Lowercase only code table
1504        {
1505          $seq   = lc $seq;     #  Lowercase sequence          $seq   = lc $seq;     #  Lowercase sequence
1506          $RNA_F = "uuu";       #  Lowercase RNA Phe codon          ( $TTT, $UUU ) = ( 'ttt', 'uuu' );
         $DNA_F = "ttt";       #  Lowercase DNA Phe codon  
         $M     = "m";         #  Lowercase initiator  
         $N     = "n";         #  Lowercase ambiguous nuc  
         $X     = "x";         #  Lowercase ambiguous aa  
1507      }      }
1508      else {      elsif ( $gc->{aaa} )
1509          die "User-supplied genetic code does not have aaa or AAA\n";      {
1510            ( $TTT, $UUU ) = ( 'ttt', 'uuu' );
1511        }
1512        else
1513        {
1514            print STDERR "User-supplied genetic code does not have aaa or AAA\n";
1515            return undef;
1516      }      }
1517    
1518      #  Test the type of code supplied:  UUU versus TTT      #  Test code support for U vs T:
   
     my ($ambigs);  
1519    
1520      if ($gc->{ $RNA_F }) {     #  Looks like RNA code table      my $ambigs;
1521          $seq =~ tr/Tt/Uu/;      if    ( $gc->{$UUU} && ! $gc->{$TTT} )  # RNA only code table
1522        {
1523            $seq = tr/Tt/Uu/;
1524          $ambigs = \%RNA_letter_can_be;          $ambigs = \%RNA_letter_can_be;
1525      }      }
1526      elsif ($gc->{ $DNA_F }) {  #  Looks like DNA code table      elsif ( $gc->{$TTT} && ! $gc->{$UUU} )  # DNA only code table
1527          $seq =~ tr/Uu/Tt/;      {
1528            $seq = tr/Uu/Tt/;
1529          $ambigs = \%DNA_letter_can_be;          $ambigs = \%DNA_letter_can_be;
1530      }      }
1531      else {      else
1532          die "User-supplied genetic code does not have $RNA_F or $DNA_F\n";      {
1533            my $t = $seq =~ tr/Tt//;
1534            my $u = $seq =~ tr/Uu//;
1535            $ambigs = ( $t > $u ) ? \%DNA_letter_can_be : \%RNA_letter_can_be;
1536      }      }
1537    
1538      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      #  We can now do the codon-by-codon translation:
1539    
1540      my $met = shift;     #  a third argument that is true      my @codons = $seq =~ m/(...?)/g;  #  will try to translate last 2 nt
                          #  forces first amino acid to be Met  
                          #  (note: undef is false)  
     my $pep  = ($met && ($imax >= 0)) ? $M : "";  
     my $aa;  
1541    
1542      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      #  A third argument that is true forces first amino acid to be Met
1543          $pep .= translate_codon_with_user_code( substr($seq,$i,3), $gc, $N, $X, $ambigs );  
1544        my @met;
1545        if ( ( shift @_ ) && ( my $codon1 = shift @codons ) )
1546        {
1547            push @met, ( $codon1 =~ /[a-z]/ ? 'm' : 'M' );
1548      }      }
1549    
1550      return $pep;      join( '', @met, map { translate_codon_with_user_code( $_, $gc, $ambigs ) } @codons )
1551  }  }
1552    
1553    
1554  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1555  #  Translate with user-supplied genetic code hash.  For speed, no error  #  Translate with user-supplied genetic code hash.  No error check on the code.
1556  #  check on the hash.  Calling programs should check for the hash at a  #  Should only be called through translate_seq_with_user_code.
 #  higher level.  
 #  
 #  Should only be called through translate_seq_with_user_code  
1557  #  #
1558  #   translate_codon_with_user_code( $triplet, \%code, $N, $X, $ambig_table )  #     $aa = translate_codon_with_user_code( $triplet, \%code, $ambig_table )
1559  #  #
1560  #  $triplet      speaks for itself  #  $triplet      speaks for itself
1561  #  $code         ref to the hash with the codon translations  #  $code         ref to the hash with the codon translations
 #  $N            character to use for ambiguous nucleotide  
 #  $X            character to use for ambiguous amino acid  
1562  #  $ambig_table  ref to hash with lists of nucleotides for each ambig code  #  $ambig_table  ref to hash with lists of nucleotides for each ambig code
1563  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1564    
1565    sub translate_codon_with_user_code
1566  sub translate_codon_with_user_code {  {
1567      my $codon = shift;      my ( $codon, $gc, $N, $X, $ambigs ) = @_;
     my $gc    = shift;  
     my $aa;  
1568    
1569      #  Try a simple lookup:      #  Try a simple lookup:
1570    
1571        my $aa;
1572      if ( $aa = $gc->{ $codon } ) { return $aa }      if ( $aa = $gc->{ $codon } ) { return $aa }
1573    
1574      #  Test that codon is valid and might have unambiguous aa:      #  Attempt to recover from mixed-case codons:
1575    
1576        $codon = ( $codon =~ /[a-z]/ ) ? lc $codon : uc $codon;
1577        if ( $aa = $genetic_code{ $codon } ) { return $aa }
1578    
1579      my ($N, $X, $ambigs) = @_;      #  Test that codon is valid for an unambiguous aa:
     if ( $codon =~ m/^[ACGTUMY][ACGTU]$/i ) { $codon .= $N }  
     if ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) { return $X }  
     #                          ^^  
     #                          |+- for leucine YTR  
     #                          +-- for arginine MGR  
1580    
1581      #  Expand all ambiguous nucleotides to see if they all yield same aa.      my $X = ( $codon =~ /[a-z]/ ) ? 'x' : 'X';
     #  Loop order tries to fail quickly with first position change.  
1582    
1583      $aa = "";      if ( $codon =~ m/^[ACGTU][ACGTU]$/i )  # Add N?
1584      for my $n2 ( @{ $ambigs->{ substr($codon,1,1) } } ) {      {
1585          for my $n3 ( @{ $ambigs->{ substr($codon,2,1) } } ) {          $codon .= ( $codon =~ /[a-z]/ ) ? 'n' : 'N';
             for my $n1 ( @{ $ambigs->{ substr($codon,0,1) } } ) {  
                 #  set the first value of $aa  
                 if ($aa eq "") { $aa = $gc->{ $n1 . $n2 . $n3 } }  
                 #  break out if any other amino acid is detected  
                 elsif ($aa ne $gc->{ $n1 . $n2 . $n3 } ) { return "X" }  
1586              }              }
1587        elsif ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) # Valid?
1588        {
1589            return $X;
1590          }          }
1591    
1592        #  Expand all ambiguous nucleotides to see if they all yield same aa.
1593    
1594        my @n1 = @{ $ambigs->{ substr( $codon, 0, 1 ) } };
1595        my $n2 =               substr( $codon, 1, 1 );
1596        my @n3 = @{ $ambigs->{ substr( $codon, 2, 1 ) } };
1597        my @triples = map { my $n12 = $_ . $n2; map { $n12 . $_ } @n3 } @n1;
1598    
1599        my $triple = shift @triples;
1600        $aa = $gc->{ $triple } || $gc->{ lc $triple } || $gc->{ uc $triple };
1601        $aa or return $X;
1602    
1603        foreach $triple ( @triples )
1604        {
1605            return $X if $aa ne ( $gc->{$triple} || $gc->{lc $triple} || $gc->{uc $triple} );
1606      }      }
1607    
1608      return $aa || $X;      return $aa;
1609  }  }
1610    
1611    
# Line 1134  Line 1613 
1613  #  Read a list of intervals from a file.  #  Read a list of intervals from a file.
1614  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1615  #  #
1616  #     @intervals = read_intervals( *FILEHANDLE )  #     @intervals = read_intervals( \*FILEHANDLE )
1617  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1618  sub read_intervals {  sub read_intervals {
1619      my $fh = shift;      my $fh = shift;
# Line 1230  Line 1709 
1709  #  Read a list of oriented intervals from a file.  #  Read a list of oriented intervals from a file.
1710  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1711  #  #
1712  #     @intervals = read_oriented_intervals( *FILEHANDLE )  #     @intervals = read_oriented_intervals( \*FILEHANDLE )
1713  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1714  sub read_oriented_intervals {  sub read_oriented_intervals {
1715      my $fh = shift;      my $fh = shift;
# Line 1261  Line 1740 
1740  }  }
1741    
1742    
1743    #-----------------------------------------------------------------------------
1744    #  Convert GenBank locations to SEED locations
1745    #
1746    #     @seed_locs = gb_location_2_seed( $contig, @gb_locs )
1747    #-----------------------------------------------------------------------------
1748    sub gb_location_2_seed
1749    {
1750        my $contig = shift @_;
1751        $contig or die "First arg of gb_location_2_seed must be contig_id\n";
1752    
1753        map { join( ',', gb_loc_2_seed_2( $contig, $_ ) ) || undef } @_
1754    }
1755    
1756    sub gb_loc_2_seed_2
1757    {
1758        my ( $contig, $loc ) = @_;
1759    
1760        if ( $loc =~ /^(\d+)\.\.(\d+)$/ )
1761        {
1762            join( '_', $contig, $1, $2 )
1763        }
1764    
1765        elsif ( $loc =~ /^join\((.*)\)$/ )
1766        {
1767            $loc = $1;
1768            my $lvl = 0;
1769            for ( my $i = length( $loc )-1; $i >= 0; $i-- )
1770            {
1771                for ( substr( $loc, $i, 1 ) )
1772                {
1773                    /,/ && ! $lvl and substr( $loc, $i, 1 ) = "\t";
1774                    /\(/          and $lvl--;
1775                    /\)/          and $lvl++;
1776                }
1777            }
1778            $lvl == 0 or print STDERR "Paren matching error: $loc\n" and die;
1779            map { gb_loc_2_seed_2( $contig, $_ ) } split /\t/, $loc
1780        }
1781    
1782        elsif ( $loc =~ /^complement\((.*)\)$/ )
1783        {
1784            map { s/_(\d+)_(\d+)$/_$2_$1/; $_ }
1785            reverse
1786            gb_loc_2_seed_2( $contig, $1 )
1787        }
1788    
1789        else
1790        {
1791            ()
1792        }
1793    }
1794    
1795    
1796    #-----------------------------------------------------------------------------
1797    #  Read qual.
1798    #
1799    #  Save the contents in a list of refs to arrays: [ $id, $descript, \@qual ]
1800    #
1801    #     @seq_entries = read_qual( )               #  STDIN
1802    #    \@seq_entries = read_qual( )               #  STDIN
1803    #     @seq_entries = read_qual( \*FILEHANDLE )
1804    #    \@seq_entries = read_qual( \*FILEHANDLE )
1805    #     @seq_entries = read_qual(  $filename )
1806    #    \@seq_entries = read_qual(  $filename )
1807    #-----------------------------------------------------------------------------
1808    sub read_qual {
1809        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
1810        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_qual\n";
1811    
1812        my @quals = ();
1813        my ($id, $desc, $qual) = ("", "", []);
1814    
1815        while ( <$fh> ) {
1816            chomp;
1817            if (/^>\s*(\S+)(\s+(.*))?$/) {        #  new id
1818                if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1819                ($id, $desc, $qual) = ($1, $3 ? $3 : "", []);
1820            }
1821            else {
1822                push @$qual, split;
1823            }
1824        }
1825        close( $fh ) if $close;
1826    
1827        if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1828        return wantarray ? @quals : \@quals;
1829    }
1830    
1831    
1832  1;  1;

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.10

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3