[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Mon Dec 1 16:54:26 2003 UTC revision 1.8, Tue Jun 26 15:20:16 2007 UTC
# Line 1  Line 1 
1  package gjoseqlib;  package gjoseqlib;
2    use Carp;
3    
4  #  read_fasta_seqs( *FILEHANDLE )  #  A sequence entry is ( $id, $def, $seq )
5  #  read_next_fasta_seq( *FILEHANDLE )  #  A list of entries is a list of references
6  #  parse_fasta_title( $title )  #
7  #  split_fasta_title( $title )  #  @seq_entry   = read_next_fasta_seq( \*FILEHANDLE )
8  #  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list )  #  @seq_entries = read_fasta_seqs( \*FILEHANDLE )   # Original form
9  #  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq )  #  @seq_entries = read_fasta( )                     # STDIN
10  #  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #  @seq_entries = read_fasta( \*FILEHANDLE )
11    #  @seq_entries = read_fasta(  $filename )
12  #  index_seq_list( @seq_entry_list )  #  @seq_entries = read_clustal( )                   # STDIN
13  #  seq_entry_by_id( \%seq_index, $seq_id )  #  @seq_entries = read_clustal( \*FILEHANDLE )
14  #  seq_desc_by_id( \%seq_index, $seq_id )  #  @seq_entries = read_clustal(  $filename )
15  #  seq_data_by_id( \%seq_index, $seq_id )  #  @seq_entries = read_clustal_file(  $filename )
16    #
17  #  subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] )  #  $seq_ind   = index_seq_list( @seq_entries );   # hash from ids to entries
18  #  subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] )  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
19  #  complement_DNA_entry( @seq_entry [, $fix_id] )  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
20  #  complement_RNA_entry( @seq_entry [, $fix_id] )  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );
21  #  complement_DNA_seq( $NA_seq )  #
22  #  complement_RNA_seq( $NA_seq )  #  ( $id, $def ) = parse_fasta_title( $title )
23  #  to_DNA_seq( $NA_seq )  #  ( $id, $def ) = split_fasta_title( $title )
24  #  to_RNA_seq( $NA_seq )  #
25  #  clean_ae_sequence( $seq )  #  print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );  # Original form
26    #  print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
27  #  translate_seq( $seq [, $met_start] )  #  print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
28  #  translate_codon( $triplet )  #  print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
29  #  translate_seq_with_user_code( $seq, \%gen_code [, $met_start] )  #  print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
30    #  print_alignment_as_fasta(  $filename,    @seq_entry_list );
31    #  print_alignment_as_fasta(  $filename,   \@seq_entry_list );
32    #  print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
33    #  print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
34    #  print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
35    #  print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
36    #  print_alignment_as_phylip(  $filename,    @seq_entry_list );
37    #  print_alignment_as_phylip(  $filename,   \@seq_entry_list );
38    #  print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
39    #  print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
40    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
41    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
42    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
43    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
44    #  print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq) ;
45    #  print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
46    #  print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq );
47    #
48    #   @packed_seqs = pack_alignment(  @seqs )
49    #   @packed_seqs = pack_alignment( \@seqs )
50    #  \@packed_seqs = pack_alignment(  @seqs )
51    #  \@packed_seqs = pack_alignment( \@seqs )
52    #
53    #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
54    #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
55    #  @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );
56    #  @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );
57    #  $DNAseq = complement_DNA_seq( $NA_seq );
58    #  $RNAseq = complement_RNA_seq( $NA_seq );
59    #  $DNAseq = to_DNA_seq( $NA_seq );
60    #  $RNAseq = to_RNA_seq( $NA_seq );
61    #  $seq    = pack_seq( $sequence )
62    #  $seq    = clean_ae_sequence( $seq )
63    #
64    #  $seq = translate_seq( $seq [, $met_start] )
65    #  $aa  = translate_codon( $triplet );
66    #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );
67    #
68    #  User-supplied genetic code must be upper case index and match the
69    #  DNA versus RNA type of sequence
70    #
71    #  Locations (= oriented intervals) are ( id, start, end )
72    #  Intervals are ( id, left, right )
73    #
74    #  @intervals = read_intervals( \*FILEHANDLE )
75    #  @intervals = read_oriented_intervals( \*FILEHANDLE )
76    #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)
77    #  @joined    = join_intervals( @interval_refs )
78    #  @intervals = locations_2_intervals( @locations )
79    #  $interval  = locations_2_intervals( $location  )
80    #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)
81    #
82    #  Convert GenBank locations to SEED locations
83    #
84    #  @seed_locs = gb_location_2_seed( $contig, @gb_locs )
85    #
86    #  Read quality scores from a fasta-like file:
87    #
88    #  @seq_entries = read_qual( )               #  STDIN
89    # \@seq_entries = read_qual( )               #  STDIN
90    #  @seq_entries = read_qual( \*FILEHANDLE )
91    # \@seq_entries = read_qual( \*FILEHANDLE )
92    #  @seq_entries = read_qual(  $filename )
93    # \@seq_entries = read_qual(  $filename )
94    #
95    
96    use strict;
97    
98  #  read_intervals( *FILEHANDLE )  #  Exported global variables:
 #  join_intervals( @interval_refs )  
99    
100  use gjolib qw(wrap_text);  our @aa_1_letter_order;  # Alpha by 1 letter
101    our @aa_3_letter_order;  # PAM matrix order
102    our @aa_n_codon_order;
103    our %genetic_code;
104    our %genetic_code_with_U;
105    our %amino_acid_codons_DNA;
106    our %amino_acid_codons_RNA;
107    our %n_codon_for_aa;
108    our %reverse_genetic_code_DNA;
109    our %reverse_genetic_code_RNA;
110    our %DNA_letter_can_be;
111    our %RNA_letter_can_be;
112    our %one_letter_to_three_letter_aa;
113    our %three_letter_to_one_letter_aa;
114    
115  require Exporter;  require Exporter;
116    
117  our @ISA = qw(Exporter);  our @ISA = qw(Exporter);
118  our @EXPORT = qw(  our @EXPORT = qw(
119          read_fasta_seqs          read_fasta_seqs
120            read_fasta
121          read_next_fasta_seq          read_next_fasta_seq
122            read_clustal_file
123            read_clustal
124          parse_fasta_title          parse_fasta_title
125          split_fasta_title          split_fasta_title
126          print_seq_list_as_fasta          print_seq_list_as_fasta
127            print_alignment_as_fasta
128            print_alignment_as_phylip
129            print_alignment_as_nexus
130          print_seq_as_fasta          print_seq_as_fasta
131          print_gb_locus          print_gb_locus
132    
# Line 49  Line 135 
135          seq_desc_by_id          seq_desc_by_id
136          seq_data_by_id          seq_data_by_id
137    
138            pack_alignment
139    
140          subseq_DNA_entry          subseq_DNA_entry
141          subseq_RNA_entry          subseq_RNA_entry
142          complement_DNA_entry          complement_DNA_entry
# Line 57  Line 145 
145          complement_RNA_seq          complement_RNA_seq
146          to_DNA_seq          to_DNA_seq
147          to_RNA_seq          to_RNA_seq
148            pack_seq
149          clean_ae_sequence          clean_ae_sequence
150    
151          translate_seq          translate_seq
# Line 64  Line 153 
153          translate_seq_with_user_code          translate_seq_with_user_code
154    
155          read_intervals          read_intervals
156            standardize_intervals
157          join_intervals          join_intervals
158            locations_2_intervals
159            read_oriented_intervals
160            reverse_intervals
161    
162            gb_location_2_seed
163    
164            read_qual
165          );          );
166    
167  use strict;  our @EXPORT_OK = qw(
168            @aa_1_letter_order
169            @aa_3_letter_order
170            @aa_n_codon_order
171            %genetic_code
172            %genetic_code_with_U
173            %amino_acid_codons_DNA
174            %amino_acid_codons_RNA
175            %n_codon_for_aa
176            %reverse_genetic_code_DNA
177            %reverse_genetic_code_RNA
178            %DNA_letter_can_be
179            %RNA_letter_can_be
180            %one_letter_to_three_letter_aa
181            %three_letter_to_one_letter_aa
182            );
183    
184    
185  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
186  #  Read fasta sequences from a file.  #  Helper function for defining an input filehandle:
187    #     filehandle is passed through
188    #     string is taken as file name to be openend
189    #     undef or "" defaults to STDOUT
190    #
191    #    ( \*FH, $name, $close [, $file] ) = input_filehandle( $file );
192    #
193    #-----------------------------------------------------------------------------
194    sub input_filehandle
195    {
196        my $file = shift;
197    
198        #  FILEHANDLE
199    
200        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
201    
202        #  Null string or undef
203    
204        return ( \*STDIN, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
205    
206        #  File name
207    
208        if ( ! ref( $file ) )
209        {
210            my $fh;
211            -f $file or die "Could not find input file \"$file\"\n";
212            open( $fh, "<$file" ) || die "Could not open \"$file\" for input\n";
213            return ( $fh, $file, 1 );
214        }
215    
216        #  Some other kind of reference; return the unused value
217    
218        return ( \*STDIN, undef, 0, $file );
219    }
220    
221    
222    #-----------------------------------------------------------------------------
223    #  Read fasta sequences from a filehandle (legacy interface; use read_fasta)
224  #  Save the contents in a list of refs to arrays:  (id, description, seq)  #  Save the contents in a list of refs to arrays:  (id, description, seq)
225  #  #
226  #     @seqs = read_fasta_seqs(*FILEHANDLE)  #     @seq_entries = read_fasta_seqs( \*FILEHANDLE )
227  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
228  sub read_fasta_seqs {  sub read_fasta_seqs { read_fasta( @_ ) }
229      my $fh = shift;  
230      wantarray || die "read_fasta_seqs requires list context\n";  
231    #-----------------------------------------------------------------------------
232    #  Read fasta sequences.
233    #  Save the contents in a list of refs to arrays:  (id, description, seq)
234    #
235    #     @seq_entries = read_fasta( )               #  STDIN
236    #    \@seq_entries = read_fasta( )               #  STDIN
237    #     @seq_entries = read_fasta( \*FILEHANDLE )
238    #    \@seq_entries = read_fasta( \*FILEHANDLE )
239    #     @seq_entries = read_fasta(  $filename )
240    #    \@seq_entries = read_fasta(  $filename )
241    #-----------------------------------------------------------------------------
242    sub read_fasta {
243        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
244        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n";
245    
246      my @seqs = ();      my @seqs = ();
247      my ($id, $desc, $seq) = ("", "", "");      my ($id, $desc, $seq) = ("", "", "");
# Line 90  Line 253 
253              ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");              ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");
254          }          }
255          else {          else {
256              tr/\t 0-9//d;              tr/     0-9//d;
257              $seq .= $_ ;              $seq .= $_ ;
258          }          }
259      }      }
260        close( $fh ) if $close;
261    
262      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }
263      return @seqs;      return wantarray ? @seqs : \@seqs;
264  }  }
265    
266    
# Line 104  Line 268 
268  #  Read one fasta sequence at a time from a file.  #  Read one fasta sequence at a time from a file.
269  #  Return the contents as an array:  (id, description, seq)  #  Return the contents as an array:  (id, description, seq)
270  #  #
271  #     @seq_entry = read_next_fasta_seq(*FILEHANDLE)  #     @seq_entry = read_next_fasta_seq( \*FILEHANDLE )
272  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
273  #  Reading always overshoots, so save next id and description  #  Reading always overshoots, so save next id and description
274  #  (these should really be hashes indexed by file handle):  
275  #  {   #  Use bare block to scope the header hash
276  my $next_fasta_seq_header = "";  
277        my %next_header;
278    
279  sub read_next_fasta_seq {  sub read_next_fasta_seq {
280      my $fh = shift;      my $fh = shift;
281      wantarray || die "read_next_fasta_seq requires list context\n";          my ( $id, $desc );
282    
283      my ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );          if ( defined( $next_header{$fh} ) ) {
284                ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
285            }
286            else {
287                $next_header{$fh} = "";
288                ( $id, $desc ) = ( undef, "" );
289            }
290      my $seq = "";      my $seq = "";
291    
292      while ( <$fh> ) {      while ( <$fh> ) {
293          chomp;          chomp;
294          if ( /^>/ ) {        #  new id          if ( /^>/ ) {        #  new id
295              $next_fasta_seq_header = $_;                  $next_header{$fh} = $_;
296              if ( $id && $seq ) { return ($id, $desc, $seq) }                  if ( defined($id) && $seq )
297              ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );                  {
298                        return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq]
299                    }
300                    ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
301              $seq = "";              $seq = "";
302          }          }
303          else {          else {
304              tr/\t 0-9//d;                  tr/     0-9//d;
305              $seq .= $_ ;              $seq .= $_ ;
306          }          }
307      }      }
308    
309      #  Done with file, clear "next header"          #  Done with file, delete "next header"
310    
311            delete $next_header{$fh};
312            return ( defined($id) && $seq ) ? ( wantarray ? ($id, $desc, $seq)
313                                                          : [$id, $desc, $seq]
314                                              )
315                                            : () ;
316        }
317    }
318    
319    
320    #-----------------------------------------------------------------------------
321    #  Read a clustal alignment from a file.
322    #  Save the contents in a list of refs to arrays:  (id, description, seq)
323    #
324    #     @seq_entries = read_clustal_file( $filename )
325    #-----------------------------------------------------------------------------
326    sub read_clustal_file { read_clustal( @_ ) }
327    
328    
329      $next_fasta_seq_header = "";  #-----------------------------------------------------------------------------
330      return ($id && $seq) ? ($id, $desc, $seq) : () ;  #  Read a clustal alignment.
331    #  Save the contents in a list of refs to arrays:  (id, description, seq)
332    #
333    #     @seq_entries = read_clustal( )              # STDIN
334    #     @seq_entries = read_clustal( \*FILEHANDLE )
335    #     @seq_entries = read_clustal(  $filename )
336    #-----------------------------------------------------------------------------
337    sub read_clustal {
338        my ( $fh, undef, $close, $unused ) = input_filehandle( shift );
339        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_clustal_file\n";
340    
341        my ( %seq, @ids, $line );
342        while ( defined( $line = <$fh> ) )
343        {
344            ( $line =~ /^[A-Za-z0-9]/ ) or next;
345            chomp $line;
346            my @flds = split /\s+/, $line;
347            if ( @flds == 2 )
348            {
349                $seq{ $flds[0] } or push @ids, $flds[0];
350                push @{ $seq{ $flds[0] } }, $flds[1];
351            }
352        }
353        close( $fh ) if $close;
354    
355        map { [ $_, "", join( "", @{$seq{$_}} ) ] } @ids;
356  }  }
357    
358    
# Line 151  Line 368 
368      if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {      if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {
369          return ($1, $3 ? $3 : "");          return ($1, $3 ? $3 : "");
370      }      }
371      else {      elsif ($title =~ /^>/) {
372          return ("", "");          return ("", "");
373      }      }
374        else {
375            return (undef, "");
376        }
377  }  }
378    
379  sub split_fasta_title {  sub split_fasta_title {
# Line 162  Line 382 
382    
383    
384  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
385  #  Print list of sequence entries in fasta format  #  Helper function for defining an output filehandle:
386    #     filehandle is passed through
387    #     string is taken as file name to be openend
388    #     undef or "" defaults to STDOUT
389    #
390    #    ( \*FH, $name, $close [, $file] ) = output_filehandle( $file );
391  #  #
 #     print_seq_list_as_fasta(*FILEHANDLE, @seq_entry_list);  
392  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
393  sub print_seq_list_as_fasta {  sub output_filehandle
394      my $fh = shift;  {
395      my @seq_list = @_;      my $file = shift;
396    
397        #  FILEHANDLE
398    
399        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
400    
401        #  Null string or undef
402    
403        return ( \*STDOUT, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
404    
405        #  File name
406    
407        if ( ! ref( $file ) )
408        {
409            my $fh;
410            open( $fh, ">$file" ) || die "Could not open output $file\n";
411            return ( $fh, $file, 1 );
412        }
413    
414      foreach my $seq_ptr (@seq_list) {      #  Some other kind of reference; return the unused value
415          print_seq_as_fasta($fh, @$seq_ptr);  
416        return ( \*STDOUT, undef, 0, $file );
417      }      }
418    
419    
420    #-----------------------------------------------------------------------------
421    #  Legacy function for printing fasta sequence set:
422    #
423    #     print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );
424    #-----------------------------------------------------------------------------
425    sub print_seq_list_as_fasta { print_alignment_as_fasta( @_ ) }
426    
427    
428    #-----------------------------------------------------------------------------
429    #  Print list of sequence entries in fasta format.
430    #  Missing, undef or "" filename defaults to STDOUT.
431    #
432    #     print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
433    #     print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
434    #     print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
435    #     print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
436    #     print_alignment_as_fasta(  $filename,    @seq_entry_list );
437    #     print_alignment_as_fasta(  $filename,   \@seq_entry_list );
438    #-----------------------------------------------------------------------------
439    sub print_alignment_as_fasta {
440        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
441        ( unshift @_, $unused ) if $unused;
442    
443        ( ref( $_[0] ) eq "ARRAY" ) or confess "Bad sequence entry passed to print_alignment_as_fasta\n";
444    
445        #  Expand the sequence entry list if necessary:
446    
447        if ( ref( $_[0]->[0] ) eq "ARRAY" ) { @_ = @{ $_[0] } }
448    
449        foreach my $seq_ptr ( @_ ) { print_seq_as_fasta( $fh, @$seq_ptr ) }
450    
451        close( $fh ) if $close;
452    }
453    
454    
455    #-----------------------------------------------------------------------------
456    #  Print list of sequence entries in phylip format.
457    #  Missing, undef or "" filename defaults to STDOUT.
458    #
459    #     print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
460    #     print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
461    #     print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
462    #     print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
463    #     print_alignment_as_phylip(  $filename,    @seq_entry_list );
464    #     print_alignment_as_phylip(  $filename,   \@seq_entry_list );
465    #-----------------------------------------------------------------------------
466    sub print_alignment_as_phylip {
467        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
468        ( unshift @_, $unused ) if $unused;
469    
470        ( ref( $_[0] ) eq "ARRAY" ) or die die "Bad sequence entry passed to print_alignment_as_phylip\n";
471    
472        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
473    
474        my ( %id2, %used );
475        my $maxlen = 0;
476        foreach ( @seq_list )
477        {
478            my ( $id, undef, $seq ) = @$_;
479    
480            #  Need a name that is unique within 10 characters
481    
482            my $id2 = substr( $id, 0, 10 );
483            $id2 =~ s/_/ /g;  # PHYLIP sequence files accept spaces
484            my $n = "0";
485            while ( $used{ $id2 } )
486            {
487                $n++;
488                $id2 = substr( $id, 0, 10 - length( $n ) ) . $n;
489            }
490            $used{ $id2 } = 1;
491            $id2{ $id } = $id2;
492    
493                    #  Prepare to pad sequences (should not be necessary, but ...)
494    
495            my $len = length( $seq );
496            $maxlen = $len if ( $len > $maxlen );
497        }
498    
499        my $nseq = @seq_list;
500        print $fh "$nseq  $maxlen\n";
501        foreach ( @seq_list )
502        {
503            my ( $id, undef, $seq ) = @$_;
504            my $len = length( $seq );
505            printf $fh "%-10s  %s%s\n", $id2{ $id },
506                                        $seq,
507                                        $len<$maxlen ? ("?" x ($maxlen-$len)) : "";
508        }
509    
510        close( $fh ) if $close;
511    }
512    
513    
514    #-----------------------------------------------------------------------------
515    #  Print list of sequence entries in nexus format.
516    #  Missing, undef or "" filename defaults to STDOUT.
517    #
518    #     print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
519    #     print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
520    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
521    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
522    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
523    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
524    #-----------------------------------------------------------------------------
525    sub print_alignment_as_nexus {
526        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
527        ( unshift @_, $unused ) if $unused;
528    
529        my $lbls = ( ref( $_[0] ) eq "HASH" ) ? shift : undef;
530    
531        ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_nexus\n";
532    
533        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
534    
535        my %id2;
536        my ( $maxidlen, $maxseqlen ) = ( 0, 0 );
537        my ( $n1, $n2, $nt, $nu ) = ( 0, 0, 0, 0 );
538        foreach ( @seq_list )
539        {
540            my ( $id, undef, $seq ) = @$_;
541            my $id2 = $lbls ? ( $lbls->{ $id } || $id ) : $id;
542            if ( $id2 !~ /^[-+.0-9A-Za-z~_|]+$/ )
543            {
544                    $id2 =~ s/'/''/g;
545                    $id2 = qq('$id2');
546                }
547            $id2{ $id } = $id2;
548            my $idlen = length( $id2 );
549            $maxidlen = $idlen if ( $idlen > $maxidlen );
550    
551            my $seqlen = length( $seq );
552            $maxseqlen = $seqlen if ( $seqlen > $maxseqlen );
553    
554            $nt += $seq =~ tr/Tt//d;
555            $nu += $seq =~ tr/Uu//d;
556            $n1 += $seq =~ tr/ACGNacgn//d;
557            $n2 += $seq =~ tr/A-Za-z//d;
558        }
559    
560        my $nseq = @seq_list;
561        my $type = ( $n1 < 2 * $n2 ) ?  'protein' : ($nt>$nu) ? 'DNA' : 'RNA';
562    
563        print $fh <<"END_HEAD";
564    #NEXUS
565    
566    BEGIN Data;
567        Dimensions
568            NTax=$nseq
569            NChar=$maxseqlen
570            ;
571        Format
572            DataType=$type
573            Gap=-
574            Missing=?
575            ;
576        Matrix
577    
578    END_HEAD
579    
580        foreach ( @seq_list )
581        {
582            my ( $id, undef, $seq ) = @$_;
583            my $len = length( $seq );
584            printf  $fh  "%-${maxidlen}s  %s%s\n",
585                         $id2{ $id },
586                         $seq,
587                         $len<$maxseqlen ? ("?" x ($maxseqlen-$len)) : "";
588        }
589    
590        print $fh <<"END_TAIL";
591    ;
592    END;
593    END_TAIL
594    
595        close( $fh ) if $close;
596  }  }
597    
598    
599  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
600  #  Print one sequence in fasta format to an open file  #  Print one sequence in fasta format to an open file
601  #  #
602  #     print_seq_as_fasta(*FILEHANDLE, $id, $desc, $seq);  #     print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq );
603  #     print_seq_as_fasta(*FILEHANDLE, @seq_entry);  #     print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
604  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
605  sub print_seq_as_fasta {  sub print_seq_as_fasta {
606      my $fh = shift;      my $fh = shift;
# Line 197  Line 617 
617  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
618  #  Print one sequence in GenBank flat file format:  #  Print one sequence in GenBank flat file format:
619  #  #
620  #     print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #     print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq )
621  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
622  sub print_gb_locus {  sub print_gb_locus {
623      my ($fh, $loc, $def, $acc, $seq) = @_;      my ($fh, $loc, $def, $acc, $seq) = @_;
# Line 223  Line 643 
643    
644    
645  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
646    #  Return a string with text wrapped to defined line lengths:
647    #
648    #     $wrapped_text = wrap_text( $str )                  # default len   =  80
649    #     $wrapped_text = wrap_text( $str, $len )            # default ind   =   0
650    #     $wrapped_text = wrap_text( $str, $len, $indent )   # default ind_n = ind
651    #     $wrapped_text = wrap_text( $str, $len, $indent_1, $indent_n )
652    #-----------------------------------------------------------------------------
653    sub wrap_text {
654        my ($str, $len, $ind, $indn) = @_;
655    
656        defined($str)  || die "wrap_text called without a string\n";
657        defined($len)  || ($len  =   80);
658        defined($ind)  || ($ind  =    0);
659        ($ind  < $len) || die "wrap error: indent greater than line length\n";
660        defined($indn) || ($indn = $ind);
661        ($indn < $len) || die "wrap error: indent_n greater than line length\n";
662    
663        $str =~ s/\s+$//;
664        $str =~ s/^\s+//;
665        my ($maxchr, $maxchr1);
666        my (@lines) = ();
667    
668        while ($str) {
669            $maxchr1 = ($maxchr = $len - $ind) - 1;
670            if ($maxchr >= length($str)) {
671                push @lines, (" " x $ind) . $str;
672                last;
673            }
674            elsif ($str =~ /^(.{0,$maxchr1}\S)\s+(\S.*)$/) { # no expr in {}
675                push @lines, (" " x $ind) . $1;
676                $str = $2;
677            }
678            elsif ($str =~ /^(.{0,$maxchr1}-)(.*)$/) {
679                push @lines, (" " x $ind) . $1;
680                $str = $2;
681            }
682            else {
683                push @lines, (" " x $ind) . substr($str, 0, $maxchr);
684                $str = substr($str, $maxchr);
685            }
686            $ind = $indn;
687        }
688    
689        return join("\n", @lines);
690    }
691    
692    
693    #-----------------------------------------------------------------------------
694  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
695  #  #
696  #     my %seq_ind  = index_seq_list(@seq_list);  #     my \%seq_ind  = index_seq_list(  @seq_list );
697    #     my \%seq_ind  = index_seq_list( \@seq_list );
698  #  #
699  #  Usage example:  #  Usage example:
700  #  #
701  #  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries  #  my  @seq_list   = read_fasta_seqs(\*STDIN);  # list of pointers to entries
702  #  my %seq_ind    = index_seq_list(@seq_list); # hash from names to pointers  #  my \%seq_ind    = index_seq_list(@seq_list); # hash from names to pointers
703  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry
704  #  #
705  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
706  sub index_seq_list {  sub index_seq_list {
707      my %seq_index = map { @{$_}[0] => $_ } @_;      ( ref( $_[0] )      ne 'ARRAY' ) ? {}
708      return \%seq_index;    : ( ref( $_[0]->[0] ) ne 'ARRAY' ) ? { map { $_->[0] => $_ } @_ }
709      :                                    { map { $_->[0] => $_ } @{ $_[0] } }
710  }  }
711    
712    
713  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
714  #  Three routines to access all or part of sequence entry by id:  #  Three routines to access all or part of sequence entry by id:
715  #  #
716  #     my @seq_entry  = seq_entry_by_id( \%seq_index, $seq_id );  #     @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
717  #     my $seq_desc   = seq_desc_by_id(  \%seq_index, $seq_id );  #    \@seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
718  #     my $seq        = seq_data_by_id(  \%seq_index, $seq_id );  #     $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
719    #     $seq       = seq_data_by_id(  \%seq_index, $seq_id );
720  #  #
721  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
722  sub seq_entry_by_id {  sub seq_entry_by_id {
723      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";
724      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";
725      wantarray || die "entry_by_id requires list context\n";      return wantarray ? @{ $ind_ref->{$id} } : $ind_ref->{$id};
     return @{ $ind_ref->{$id} };  
726  }  }
727    
728    
# Line 269  Line 739 
739      return ${ $ind_ref->{$id} }[2];      return ${ $ind_ref->{$id} }[2];
740  }  }
741    
742    #-----------------------------------------------------------------------------
743    #  Remove columns of alignment gaps from sequences:
744    #
745    #   @packed_seqs = pack_alignment(  @seqs )
746    #   @packed_seqs = pack_alignment( \@seqs )
747    #  \@packed_seqs = pack_alignment(  @seqs )
748    #  \@packed_seqs = pack_alignment( \@seqs )
749    #
750    #-----------------------------------------------------------------------------
751    
752    sub pack_alignment
753    {
754        my @seqs = ( ref( $_[0] ) eq 'ARRAY' and ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
755        @seqs or return wantarray ? () : [];
756    
757        my $mask  = pack_mask( $seqs[0]->[2] );
758        foreach ( @seqs[ 1 .. (@seqs-1) ] )
759        {
760            $mask |= pack_mask( $_->[2] );
761        }
762    
763        my $seq;
764        my @seqs2 = map { $seq = $_->[2] & $mask;
765                          $seq =~ tr/\000//d;
766                          [ $_->[0], $_->[1], $seq ]
767                        }
768                    @seqs;
769    
770        return wantarray ? @seqs2 : \@seqs2;
771    }
772    
773    sub pack_mask
774    {
775        my $mask = shift;
776        $mask =~ tr/-/\000/;
777        $mask =~ tr/\000/\377/c;
778        return $mask;
779    }
780    
781  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
782  #  Some simple sequence manipulations:  #  Some simple sequence manipulations:
# Line 332  Line 840 
840      }      }
841    
842      if ( $fix_id ) {      if ( $fix_id ) {
843          if ( ( $id =~ s/_([0-9]+)_([0-9]+)$// )          if ( ( $id =~ s/_(\d+)_(\d+)$// )
844            && ( abs($2-$1)+1 == $len ) ) {            && ( abs($2-$1)+1 == $len ) ) {
845              if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }              if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }
846              else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }              else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }
# Line 353  Line 861 
861      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
862                [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];                [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
863      if ($fix_id) {      if ($fix_id) {
864          if ($id =~ s/_([0-9]+)_([0-9]+)$//) {          if ($id =~ s/_(\d+)_(\d+)$//) {
865              $id .= "_" . $2 . "_" . $1;              $id .= "_" . $2 . "_" . $1;
866          }          }
867          else {          else {
# Line 374  Line 882 
882      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
883                [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];                [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
884      if ($fix_id) {      if ($fix_id) {
885          if ($id =~ s/_([0-9]+)_([0-9]+)$//) {          if ($id =~ s/_(\d+)_(\d+)$//) {
886              $id .= "_" . $2 . "_" . $1;              $id .= "_" . $2 . "_" . $1;
887          }          }
888          else {          else {
# Line 416  Line 924 
924  }  }
925    
926    
927    sub pack_seq {
928        my $seq = shift;
929        $seq =~ tr/A-Za-z//cd;
930        return $seq;
931    }
932    
933    
934  sub clean_ae_sequence {  sub clean_ae_sequence {
935      $_ = shift;      $_ = shift;
936      $_ = to7bit($_);      $_ = to7bit($_);
# Line 464  Line 979 
979  #  #
980  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
981    
982  our %genetic_code = (   # work as DNA  @aa_1_letter_order = qw( A C D E F G H I K L M N P Q R S T V W Y );  # Alpha by 1 letter
983    @aa_3_letter_order = qw( A R N D C Q E G H I L K M F P S T W Y V );  # PAM matrix order
984    @aa_n_codon_order  = qw( L R S A G P T V I C D E F H K N Q Y M W );
985    
986    %genetic_code = (
987    
988        # DNA version
989    
990      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",
991      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",
992      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",
# Line 482  Line 1004 
1004      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",
1005      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",
1006    
1007        # RNA suppliment
1008    
1009        UUU => "F",  UCU => "S",  UAU => "Y",  UGU => "C",
1010        UUC => "F",  UCC => "S",  UAC => "Y",  UGC => "C",
1011        UUA => "L",  UCA => "S",  UAA => "*",  UGA => "*",
1012        UUG => "L",  UCG => "S",  UAG => "*",  UGG => "W",
1013        CUU => "L",  CCU => "P",  CAU => "H",  CGU => "R",
1014        CUC => "L",
1015        CUA => "L",
1016        CUG => "L",
1017        AUU => "I",  ACU => "T",  AAU => "N",  AGU => "S",
1018        AUC => "I",
1019        AUA => "I",
1020        AUG => "M",
1021        GUU => "V",  GCU => "A",  GAU => "D",  GGU => "G",
1022        GUC => "V",
1023        GUA => "V",
1024        GUG => "V",
1025    
1026      #  The following ambiguous encodings are not necessary,  but      #  The following ambiguous encodings are not necessary,  but
1027      #  speed up the processing of some common ambiguous triplets:      #  speed up the processing of some ambiguous triplets:
1028    
1029      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",
1030      TTR => "L",  TCR => "S",  TAR => "*",      TTR => "L",  TCR => "S",  TAR => "*",
# Line 496  Line 1037 
1037                   ACN => "T",                   ACN => "T",
1038      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",
1039      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",
1040      GTN => "V",  GCN => "A",               GGN => "G"      GTN => "V",  GCN => "A",               GGN => "G",
1041    
1042        UUY => "F",  UCY => "S",  UAY => "Y",  UGY => "C",
1043        UUR => "L",  UCR => "S",  UAR => "*",
1044                     UCN => "S",
1045        CUY => "L",
1046        CUR => "L",
1047        CUN => "L",
1048        AUY => "I",
1049        GUY => "V",
1050        GUR => "V",
1051        GUN => "V"
1052    );
1053    
1054    
1055    #  Add lower case by construction:
1056    
1057    foreach ( keys %genetic_code ) {
1058        $genetic_code{ lc $_ } = lc $genetic_code{ $_ }
1059    }
1060    
1061    
1062    #  Construct the genetic code with selanocysteine by difference:
1063    
1064    %genetic_code_with_U = map { $_ => $genetic_code{ $_ } } keys %genetic_code;
1065    $genetic_code_with_U{ TGA } = "U";
1066    $genetic_code_with_U{ tga } = "u";
1067    $genetic_code_with_U{ UGA } = "U";
1068    $genetic_code_with_U{ uga } = "u";
1069    
1070    
1071    %amino_acid_codons_DNA = (
1072             L  => [ qw( TTA TTG CTA CTG CTT CTC ) ],
1073             R  => [ qw( AGA AGG CGA CGG CGT CGC ) ],
1074             S  => [ qw( AGT AGC TCA TCG TCT TCC ) ],
1075             A  => [ qw( GCA GCG GCT GCC ) ],
1076             G  => [ qw( GGA GGG GGT GGC ) ],
1077             P  => [ qw( CCA CCG CCT CCC ) ],
1078             T  => [ qw( ACA ACG ACT ACC ) ],
1079             V  => [ qw( GTA GTG GTT GTC ) ],
1080             I  => [ qw( ATA ATT ATC ) ],
1081             C  => [ qw( TGT TGC ) ],
1082             D  => [ qw( GAT GAC ) ],
1083             E  => [ qw( GAA GAG ) ],
1084             F  => [ qw( TTT TTC ) ],
1085             H  => [ qw( CAT CAC ) ],
1086             K  => [ qw( AAA AAG ) ],
1087             N  => [ qw( AAT AAC ) ],
1088             Q  => [ qw( CAA CAG ) ],
1089             Y  => [ qw( TAT TAC ) ],
1090             M  => [ qw( ATG ) ],
1091             U  => [ qw( TGA ) ],
1092             W  => [ qw( TGG ) ],
1093    
1094             l  => [ qw( tta ttg cta ctg ctt ctc ) ],
1095             r  => [ qw( aga agg cga cgg cgt cgc ) ],
1096             s  => [ qw( agt agc tca tcg tct tcc ) ],
1097             a  => [ qw( gca gcg gct gcc ) ],
1098             g  => [ qw( gga ggg ggt ggc ) ],
1099             p  => [ qw( cca ccg cct ccc ) ],
1100             t  => [ qw( aca acg act acc ) ],
1101             v  => [ qw( gta gtg gtt gtc ) ],
1102             i  => [ qw( ata att atc ) ],
1103             c  => [ qw( tgt tgc ) ],
1104             d  => [ qw( gat gac ) ],
1105             e  => [ qw( gaa gag ) ],
1106             f  => [ qw( ttt ttc ) ],
1107             h  => [ qw( cat cac ) ],
1108             k  => [ qw( aaa aag ) ],
1109             n  => [ qw( aat aac ) ],
1110             q  => [ qw( caa cag ) ],
1111             y  => [ qw( tat tac ) ],
1112             m  => [ qw( atg ) ],
1113             u  => [ qw( tga ) ],
1114             w  => [ qw( tgg ) ],
1115    
1116            '*' => [ qw( TAA TAG TGA ) ]
1117  );  );
1118    
1119    
1120  my %DNA_letter_can_be = (  
1121    %amino_acid_codons_RNA = (
1122             L  => [ qw( UUA UUG CUA CUG CUU CUC ) ],
1123             R  => [ qw( AGA AGG CGA CGG CGU CGC ) ],
1124             S  => [ qw( AGU AGC UCA UCG UCU UCC ) ],
1125             A  => [ qw( GCA GCG GCU GCC ) ],
1126             G  => [ qw( GGA GGG GGU GGC ) ],
1127             P  => [ qw( CCA CCG CCU CCC ) ],
1128             T  => [ qw( ACA ACG ACU ACC ) ],
1129             V  => [ qw( GUA GUG GUU GUC ) ],
1130             B  => [ qw( GAU GAC AAU AAC ) ],
1131             Z  => [ qw( GAA GAG CAA CAG ) ],
1132             I  => [ qw( AUA AUU AUC ) ],
1133             C  => [ qw( UGU UGC ) ],
1134             D  => [ qw( GAU GAC ) ],
1135             E  => [ qw( GAA GAG ) ],
1136             F  => [ qw( UUU UUC ) ],
1137             H  => [ qw( CAU CAC ) ],
1138             K  => [ qw( AAA AAG ) ],
1139             N  => [ qw( AAU AAC ) ],
1140             Q  => [ qw( CAA CAG ) ],
1141             Y  => [ qw( UAU UAC ) ],
1142             M  => [ qw( AUG ) ],
1143             U  => [ qw( UGA ) ],
1144             W  => [ qw( UGG ) ],
1145    
1146             l  => [ qw( uua uug cua cug cuu cuc ) ],
1147             r  => [ qw( aga agg cga cgg cgu cgc ) ],
1148             s  => [ qw( agu agc uca ucg ucu ucc ) ],
1149             a  => [ qw( gca gcg gcu gcc ) ],
1150             g  => [ qw( gga ggg ggu ggc ) ],
1151             p  => [ qw( cca ccg ccu ccc ) ],
1152             t  => [ qw( aca acg acu acc ) ],
1153             v  => [ qw( gua gug guu guc ) ],
1154             b  => [ qw( gau gac aau aac ) ],
1155             z  => [ qw( gaa gag caa cag ) ],
1156             i  => [ qw( aua auu auc ) ],
1157             c  => [ qw( ugu ugc ) ],
1158             d  => [ qw( gau gac ) ],
1159             e  => [ qw( gaa gag ) ],
1160             f  => [ qw( uuu uuc ) ],
1161             h  => [ qw( cau cac ) ],
1162             k  => [ qw( aaa aag ) ],
1163             n  => [ qw( aau aac ) ],
1164             q  => [ qw( caa cag ) ],
1165             y  => [ qw( uau uac ) ],
1166             m  => [ qw( aug ) ],
1167             u  => [ qw( uga ) ],
1168             w  => [ qw( ugg ) ],
1169    
1170            '*' => [ qw( UAA UAG UGA ) ]
1171    );
1172    
1173    
1174    %n_codon_for_aa = map {
1175        $_ => scalar @{ $amino_acid_codons_DNA{ $_ } }
1176        } keys %amino_acid_codons_DNA;
1177    
1178    
1179    %reverse_genetic_code_DNA = (
1180             A  => "GCN",  a  => "gcn",
1181             C  => "TGY",  c  => "tgy",
1182             D  => "GAY",  d  => "gay",
1183             E  => "GAR",  e  => "gar",
1184             F  => "TTY",  f  => "tty",
1185             G  => "GGN",  g  => "ggn",
1186             H  => "CAY",  h  => "cay",
1187             I  => "ATH",  i  => "ath",
1188             K  => "AAR",  k  => "aar",
1189             L  => "YTN",  l  => "ytn",
1190             M  => "ATG",  m  => "atg",
1191             N  => "AAY",  n  => "aay",
1192             P  => "CCN",  p  => "ccn",
1193             Q  => "CAR",  q  => "car",
1194             R  => "MGN",  r  => "mgn",
1195             S  => "WSN",  s  => "wsn",
1196             T  => "ACN",  t  => "acn",
1197             U  => "TGA",  u  => "tga",
1198             V  => "GTN",  v  => "gtn",
1199             W  => "TGG",  w  => "tgg",
1200             X  => "NNN",  x  => "nnn",
1201             Y  => "TAY",  y  => "tay",
1202            '*' => "TRR"
1203    );
1204    
1205    %reverse_genetic_code_RNA = (
1206             A  => "GCN",  a  => "gcn",
1207             C  => "UGY",  c  => "ugy",
1208             D  => "GAY",  d  => "gay",
1209             E  => "GAR",  e  => "gar",
1210             F  => "UUY",  f  => "uuy",
1211             G  => "GGN",  g  => "ggn",
1212             H  => "CAY",  h  => "cay",
1213             I  => "AUH",  i  => "auh",
1214             K  => "AAR",  k  => "aar",
1215             L  => "YUN",  l  => "yun",
1216             M  => "AUG",  m  => "aug",
1217             N  => "AAY",  n  => "aay",
1218             P  => "CCN",  p  => "ccn",
1219             Q  => "CAR",  q  => "car",
1220             R  => "MGN",  r  => "mgn",
1221             S  => "WSN",  s  => "wsn",
1222             T  => "ACN",  t  => "acn",
1223             U  => "UGA",  u  => "uga",
1224             V  => "GUN",  v  => "gun",
1225             W  => "UGG",  w  => "ugg",
1226             X  => "NNN",  x  => "nnn",
1227             Y  => "UAY",  y  => "uay",
1228            '*' => "URR"
1229    );
1230    
1231    
1232    %DNA_letter_can_be = (
1233      A => ["A"],                 a => ["a"],      A => ["A"],                 a => ["a"],
1234      B => ["C", "G", "T"],       b => ["c", "g", "t"],      B => ["C", "G", "T"],       b => ["c", "g", "t"],
1235      C => ["C"],                 c => ["c"],      C => ["C"],                 c => ["c"],
# Line 520  Line 1249 
1249  );  );
1250    
1251    
1252  my %RNA_letter_can_be = (  %RNA_letter_can_be = (
1253      A => ["A"],                 a => ["a"],      A => ["A"],                 a => ["a"],
1254      B => ["C", "G", "U"],       b => ["c", "g", "u"],      B => ["C", "G", "U"],       b => ["c", "g", "u"],
1255      C => ["C"],                 c => ["c"],      C => ["C"],                 c => ["c"],
# Line 540  Line 1269 
1269  );  );
1270    
1271    
1272  my %one_letter_to_three_letter_aa = (  %one_letter_to_three_letter_aa = (
1273       A  => "Ala",           A  => "Ala", a  => "Ala",
1274       B  => "Asx",           B  => "Asx", b  => "Asx",
1275       C  => "Cys",           C  => "Cys", c  => "Cys",
1276       D  => "Asp",           D  => "Asp", d  => "Asp",
1277       E  => "Glu",           E  => "Glu", e  => "Glu",
1278       F  => "Phe",           F  => "Phe", f  => "Phe",
1279       G  => "Gly",           G  => "Gly", g  => "Gly",
1280       H  => "His",           H  => "His", h  => "His",
1281       I  => "Ile",           I  => "Ile", i  => "Ile",
1282       K  => "Lys",           K  => "Lys", k  => "Lys",
1283       L  => "Leu",           L  => "Leu", l  => "Leu",
1284       M  => "Met",           M  => "Met", m  => "Met",
1285       N  => "Asn",           N  => "Asn", n  => "Asn",
1286       P  => "Pro",           P  => "Pro", p  => "Pro",
1287       Q  => "Gln",           Q  => "Gln", q  => "Gln",
1288       R  => "Arg",           R  => "Arg", r  => "Arg",
1289       S  => "Ser",           S  => "Ser", s  => "Ser",
1290       T  => "Thr",           T  => "Thr", t  => "Thr",
1291       U  => "Sec",           U  => "Sec", u  => "Sec",
1292       V  => "Val",           V  => "Val", v  => "Val",
1293       W  => "Trp",           W  => "Trp", w  => "Trp",
1294       X  => "Xxx",           X  => "Xxx", x  => "Xxx",
1295       Y  => "Tyr",           Y  => "Tyr", y  => "Tyr",
1296       Z  => "Glx",           Z  => "Glx", z  => "Glx",
1297      "*" => "***"          '*' => "***"
1298            );
1299    
1300    
1301    %three_letter_to_one_letter_aa = (
1302         ALA  => "A",   Ala  => "A",   ala  => "a",
1303         ARG  => "R",   Arg  => "R",   arg  => "r",
1304         ASN  => "N",   Asn  => "N",   asn  => "n",
1305         ASP  => "D",   Asp  => "D",   asp  => "d",
1306         ASX  => "B",   Asx  => "B",   asx  => "b",
1307         CYS  => "C",   Cys  => "C",   cys  => "c",
1308         GLN  => "Q",   Gln  => "Q",   gln  => "q",
1309         GLU  => "E",   Glu  => "E",   glu  => "e",
1310         GLX  => "Z",   Glx  => "Z",   glx  => "z",
1311         GLY  => "G",   Gly  => "G",   gly  => "g",
1312         HIS  => "H",   His  => "H",   his  => "h",
1313         ILE  => "I",   Ile  => "I",   ile  => "i",
1314         LEU  => "L",   Leu  => "L",   leu  => "l",
1315         LYS  => "K",   Lys  => "K",   lys  => "k",
1316         MET  => "M",   Met  => "M",   met  => "m",
1317         PHE  => "F",   Phe  => "F",   phe  => "f",
1318         PRO  => "P",   Pro  => "P",   pro  => "p",
1319         SEC  => "U",   Sec  => "U",   sec  => "u",
1320         SER  => "S",   Ser  => "S",   ser  => "s",
1321         THR  => "T",   Thr  => "T",   thr  => "t",
1322         TRP  => "W",   Trp  => "W",   trp  => "w",
1323         TYR  => "Y",   Tyr  => "Y",   tyr  => "y",
1324         VAL  => "V",   Val  => "V",   val  => "v",
1325         XAA  => "X",   Xaa  => "X",   xaa  => "x",
1326         XXX  => "X",   Xxx  => "X",   xxx  => "x",
1327        '***' => "*"
1328  );  );
1329    
1330    
# Line 586  Line 1345 
1345                              #  (note: undef is false)                              #  (note: undef is false)
1346    
1347      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!
1348      my $pep  = ($met && ($imax >= 0)) ? "M" : "";      my $pep = ( ($met && ($imax >= 0)) ? "M" : "" );
     my $aa;  
1349      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {
1350          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );
1351      }      }
# Line 796  Line 1554 
1554  #  Read a list of intervals from a file.  #  Read a list of intervals from a file.
1555  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1556  #  #
1557  #     @intervals = read_intervals(*FILEHANDLE)  #     @intervals = read_intervals( \*FILEHANDLE )
1558  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1559  sub read_intervals {  sub read_intervals {
1560      my $fh = shift;      my $fh = shift;
# Line 819  Line 1577 
1577    
1578    
1579  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1580    #  Convert a list of intervals to read [ id, left_end, right_end ].
1581    #
1582    #     @intervals = standardize_intervals( @interval_refs )
1583    #-----------------------------------------------------------------------------
1584    sub standardize_intervals {
1585        map { ( $_->[1] < $_->[2] ) ? $_ : [ $_->[0], $_->[2], $_->[1] ] } @_;
1586    }
1587    
1588    
1589    #-----------------------------------------------------------------------------
1590  #  Take the union of a list of intervals  #  Take the union of a list of intervals
1591  #  #
1592  #     @joined = join_intervals( @interval_refs )  #     @joined = join_intervals( @interval_refs )
# Line 858  Line 1626 
1626      return @joined;      return @joined;
1627  }  }
1628    
1629    
1630    #-----------------------------------------------------------------------------
1631    #  Split location strings to oriented intervals.
1632    #
1633    #     @intervals = locations_2_intervals( @locations )
1634    #     $interval  = locations_2_intervals( $location  )
1635    #-----------------------------------------------------------------------------
1636    sub locations_2_intervals {
1637        my @intervals = map { /^(\S+)_(\d+)_(\d+)(\s.*)?$/
1638                           || /^(\S+)_(\d+)-(\d+)(\s.*)?$/
1639                           || /^(\S+)=(\d+)=(\d+)(\s.*)?$/
1640                           || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/
1641                            ? [ $1, $2+0, $3+0 ]
1642                            : ()
1643                            } @_;
1644    
1645        return wantarray ? @intervals : $intervals[0];
1646    }
1647    
1648    
1649    #-----------------------------------------------------------------------------
1650    #  Read a list of oriented intervals from a file.
1651    #  Allow id_start_end, or id \s start \s end formats
1652    #
1653    #     @intervals = read_oriented_intervals( \*FILEHANDLE )
1654    #-----------------------------------------------------------------------------
1655    sub read_oriented_intervals {
1656        my $fh = shift;
1657        my @intervals = ();
1658    
1659        while (<$fh>) {
1660            chomp;
1661               /^(\S+)_(\d+)_(\d+)(\s.*)?$/        #  id_start_end       WIT2
1662            || /^(\S+)_(\d+)-(\d+)(\s.*)?$/        #  id_start-end       ???
1663            || /^(\S+)=(\d+)=(\d+)(\s.*)?$/        #  id=start=end       Badger
1664            || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/    #  id \s start \s end
1665            || next;
1666    
1667            #  Matched a pattern.  Store reference to (id, start, end):
1668            push @intervals, [ $1, $2+0, $3+0 ];
1669        }
1670        return @intervals;
1671    }
1672    
1673    
1674    #-----------------------------------------------------------------------------
1675    #  Reverse the orientation of a list of intervals
1676    #
1677    #     @reversed = reverse_intervals( @interval_refs )
1678    #-----------------------------------------------------------------------------
1679    sub reverse_intervals {
1680        map { [ $_->[0], $_->[2], $_->[1] ] } @_;
1681    }
1682    
1683    
1684    #-----------------------------------------------------------------------------
1685    #  Convert GenBank locations to SEED locations
1686    #
1687    #     @seed_locs = gb_location_2_seed( $contig, @gb_locs )
1688    #-----------------------------------------------------------------------------
1689    sub gb_location_2_seed
1690    {
1691        my $contig = shift @_;
1692        $contig or die "First arg of gb_location_2_seed must be contig_id\n";
1693    
1694        map { join( ',', gb_loc_2_seed_2( $contig, $_ ) ) || undef } @_
1695    }
1696    
1697    sub gb_loc_2_seed_2
1698    {
1699        my ( $contig, $loc ) = @_;
1700    
1701        if ( $loc =~ /^(\d+)\.\.(\d+)$/ )
1702        {
1703            join( '_', $contig, $1, $2 )
1704        }
1705    
1706        elsif ( $loc =~ /^join\((.*)\)$/ )
1707        {
1708            $loc = $1;
1709            my $lvl = 0;
1710            for ( my $i = length( $loc )-1; $i >= 0; $i-- )
1711            {
1712                for ( substr( $loc, $i, 1 ) )
1713                {
1714                    /,/ && ! $lvl and substr( $loc, $i, 1 ) = "\t";
1715                    /\(/          and $lvl--;
1716                    /\)/          and $lvl++;
1717                }
1718            }
1719            $lvl == 0 or print STDERR "Paren matching error: $loc\n" and die;
1720            map { gb_loc_2_seed_2( $contig, $_ ) } split /\t/, $loc
1721        }
1722    
1723        elsif ( $loc =~ /^complement\((.*)\)$/ )
1724        {
1725            map { s/_(\d+)_(\d+)$/_$2_$1/; $_ }
1726            reverse
1727            gb_loc_2_seed_2( $contig, $1 )
1728        }
1729    
1730        else
1731        {
1732            ()
1733        }
1734    }
1735    
1736    
1737    #-----------------------------------------------------------------------------
1738    #  Read qual.
1739    #
1740    #  Save the contents in a list of refs to arrays: [ $id, $descript, \@qual ]
1741    #
1742    #     @seq_entries = read_qual( )               #  STDIN
1743    #    \@seq_entries = read_qual( )               #  STDIN
1744    #     @seq_entries = read_qual( \*FILEHANDLE )
1745    #    \@seq_entries = read_qual( \*FILEHANDLE )
1746    #     @seq_entries = read_qual(  $filename )
1747    #    \@seq_entries = read_qual(  $filename )
1748    #-----------------------------------------------------------------------------
1749    sub read_qual {
1750        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
1751        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_qual\n";
1752    
1753        my @quals = ();
1754        my ($id, $desc, $qual) = ("", "", []);
1755    
1756        while ( <$fh> ) {
1757            chomp;
1758            if (/^>\s*(\S+)(\s+(.*))?$/) {        #  new id
1759                if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1760                ($id, $desc, $qual) = ($1, $3 ? $3 : "", []);
1761            }
1762            else {
1763                push @$qual, split;
1764            }
1765        }
1766        close( $fh ) if $close;
1767    
1768        if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1769        return wantarray ? @quals : \@quals;
1770    }
1771    
1772    
1773  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.8

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3