[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Mon Dec 1 16:54:26 2003 UTC revision 1.7, Sun Jun 10 17:34:54 2007 UTC
# Line 1  Line 1 
1  package gjoseqlib;  package gjoseqlib;
2    
3  #  read_fasta_seqs( *FILEHANDLE )  #  A sequence entry is ( $id, $def, $seq )
4  #  read_next_fasta_seq( *FILEHANDLE )  #  A list of entries is a list of references
5  #  parse_fasta_title( $title )  #
6  #  split_fasta_title( $title )  #  @seq_entry   = read_next_fasta_seq( \*FILEHANDLE )
7  #  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list )  #  @seq_entries = read_fasta_seqs( \*FILEHANDLE )   # Original form
8  #  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq )  #  @seq_entries = read_fasta( )                     # STDIN
9  #  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #  @seq_entries = read_fasta( \*FILEHANDLE )
10    #  @seq_entries = read_fasta(  $filename )
11  #  index_seq_list( @seq_entry_list )  #  @seq_entries = read_clustal( )                   # STDIN
12  #  seq_entry_by_id( \%seq_index, $seq_id )  #  @seq_entries = read_clustal( \*FILEHANDLE )
13  #  seq_desc_by_id( \%seq_index, $seq_id )  #  @seq_entries = read_clustal(  $filename )
14  #  seq_data_by_id( \%seq_index, $seq_id )  #  @seq_entries = read_clustal_file(  $filename )
15    #
16  #  subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] )  #  $seq_ind   = index_seq_list( @seq_entries );   # hash from ids to entries
17  #  subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] )  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
18  #  complement_DNA_entry( @seq_entry [, $fix_id] )  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
19  #  complement_RNA_entry( @seq_entry [, $fix_id] )  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );
20  #  complement_DNA_seq( $NA_seq )  #
21  #  complement_RNA_seq( $NA_seq )  #  ( $id, $def ) = parse_fasta_title( $title )
22  #  to_DNA_seq( $NA_seq )  #  ( $id, $def ) = split_fasta_title( $title )
23  #  to_RNA_seq( $NA_seq )  #
24  #  clean_ae_sequence( $seq )  #  print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );  # Original form
25    #  print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
26  #  translate_seq( $seq [, $met_start] )  #  print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
27  #  translate_codon( $triplet )  #  print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
28  #  translate_seq_with_user_code( $seq, \%gen_code [, $met_start] )  #  print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
29    #  print_alignment_as_fasta(  $filename,    @seq_entry_list );
30    #  print_alignment_as_fasta(  $filename,   \@seq_entry_list );
31    #  print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
32    #  print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
33    #  print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
34    #  print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
35    #  print_alignment_as_phylip(  $filename,    @seq_entry_list );
36    #  print_alignment_as_phylip(  $filename,   \@seq_entry_list );
37    #  print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
38    #  print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
39    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
40    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
41    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
42    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
43    #  print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq) ;
44    #  print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
45    #  print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq );
46    #
47    #   @packed_seqs = pack_alignment(  @seqs )
48    #   @packed_seqs = pack_alignment( \@seqs )
49    #  \@packed_seqs = pack_alignment(  @seqs )
50    #  \@packed_seqs = pack_alignment( \@seqs )
51    #
52    #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
53    #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
54    #  @entry  = complement_DNA_entry( @seq_entry [, $fix_id] );
55    #  @entry  = complement_RNA_entry( @seq_entry [, $fix_id] );
56    #  $DNAseq = complement_DNA_seq( $NA_seq );
57    #  $RNAseq = complement_RNA_seq( $NA_seq );
58    #  $DNAseq = to_DNA_seq( $NA_seq );
59    #  $RNAseq = to_RNA_seq( $NA_seq );
60    #  $seq    = pack_seq( $sequence )
61    #  $seq    = clean_ae_sequence( $seq )
62    #
63    #  $seq = translate_seq( $seq [, $met_start] )
64    #  $aa  = translate_codon( $triplet );
65    #  $aa  = translate_uc_DNA_codon( $upcase_DNA_triplet );
66    #
67    #  User-supplied genetic code must be upper case index and match the
68    #  DNA versus RNA type of sequence
69    #
70    #  Locations (= oriented intervals) are ( id, start, end )
71    #  Intervals are ( id, left, right )
72    #
73    #  @intervals = read_intervals( \*FILEHANDLE )
74    #  @intervals = read_oriented_intervals( \*FILEHANDLE )
75    #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)
76    #  @joined    = join_intervals( @interval_refs )
77    #  @intervals = locations_2_intervals( @locations )
78    #  $interval  = locations_2_intervals( $location  )
79    #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)
80    #
81    #  Convert GenBank locations to SEED locations
82    #
83    #  @seed_locs = gb_location_2_seed( $contig, @gb_locs )
84    #
85    #  Read quality scores from a fasta-like file:
86    #
87    #  @seq_entries = read_qual( )               #  STDIN
88    # \@seq_entries = read_qual( )               #  STDIN
89    #  @seq_entries = read_qual( \*FILEHANDLE )
90    # \@seq_entries = read_qual( \*FILEHANDLE )
91    #  @seq_entries = read_qual(  $filename )
92    # \@seq_entries = read_qual(  $filename )
93    #
94    
95    use strict;
96    
97  #  read_intervals( *FILEHANDLE )  #  Exported global variables:
 #  join_intervals( @interval_refs )  
98    
99  use gjolib qw(wrap_text);  our @aa_1_letter_order;  # Alpha by 1 letter
100    our @aa_3_letter_order;  # PAM matrix order
101    our @aa_n_codon_order;
102    our %genetic_code;
103    our %genetic_code_with_U;
104    our %amino_acid_codons_DNA;
105    our %amino_acid_codons_RNA;
106    our %n_codon_for_aa;
107    our %reverse_genetic_code_DNA;
108    our %reverse_genetic_code_RNA;
109    our %DNA_letter_can_be;
110    our %RNA_letter_can_be;
111    our %one_letter_to_three_letter_aa;
112    our %three_letter_to_one_letter_aa;
113    
114  require Exporter;  require Exporter;
115    
116  our @ISA = qw(Exporter);  our @ISA = qw(Exporter);
117  our @EXPORT = qw(  our @EXPORT = qw(
118          read_fasta_seqs          read_fasta_seqs
119            read_fasta
120          read_next_fasta_seq          read_next_fasta_seq
121            read_clustal_file
122            read_clustal
123          parse_fasta_title          parse_fasta_title
124          split_fasta_title          split_fasta_title
125          print_seq_list_as_fasta          print_seq_list_as_fasta
126            print_alignment_as_fasta
127            print_alignment_as_phylip
128            print_alignment_as_nexus
129          print_seq_as_fasta          print_seq_as_fasta
130          print_gb_locus          print_gb_locus
131    
# Line 49  Line 134 
134          seq_desc_by_id          seq_desc_by_id
135          seq_data_by_id          seq_data_by_id
136    
137            pack_alignment
138    
139          subseq_DNA_entry          subseq_DNA_entry
140          subseq_RNA_entry          subseq_RNA_entry
141          complement_DNA_entry          complement_DNA_entry
# Line 57  Line 144 
144          complement_RNA_seq          complement_RNA_seq
145          to_DNA_seq          to_DNA_seq
146          to_RNA_seq          to_RNA_seq
147            pack_seq
148          clean_ae_sequence          clean_ae_sequence
149    
150          translate_seq          translate_seq
# Line 64  Line 152 
152          translate_seq_with_user_code          translate_seq_with_user_code
153    
154          read_intervals          read_intervals
155            standardize_intervals
156          join_intervals          join_intervals
157            locations_2_intervals
158            read_oriented_intervals
159            reverse_intervals
160    
161            gb_location_2_seed
162    
163            read_qual
164          );          );
165    
166  use strict;  our @EXPORT_OK = qw(
167            @aa_1_letter_order
168            @aa_3_letter_order
169            @aa_n_codon_order
170            %genetic_code
171            %genetic_code_with_U
172            %amino_acid_codons_DNA
173            %amino_acid_codons_RNA
174            %n_codon_for_aa
175            %reverse_genetic_code_DNA
176            %reverse_genetic_code_RNA
177            %DNA_letter_can_be
178            %RNA_letter_can_be
179            %one_letter_to_three_letter_aa
180            %three_letter_to_one_letter_aa
181            );
182    
183    
184  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
185  #  Read fasta sequences from a file.  #  Helper function for defining an input filehandle:
186    #     filehandle is passed through
187    #     string is taken as file name to be openend
188    #     undef or "" defaults to STDOUT
189    #
190    #    ( \*FH, $name, $close [, $file] ) = input_filehandle( $file );
191    #
192    #-----------------------------------------------------------------------------
193    sub input_filehandle
194    {
195        my $file = shift;
196    
197        #  FILEHANDLE
198    
199        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
200    
201        #  Null string or undef
202    
203        return ( \*STDIN, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
204    
205        #  File name
206    
207        if ( ! ref( $file ) )
208        {
209            my $fh;
210            -f $file or die "Could not find input file \"$file\"\n";
211            open( $fh, "<$file" ) || die "Could not open \"$file\" for input\n";
212            return ( $fh, $file, 1 );
213        }
214    
215        #  Some other kind of reference; return the unused value
216    
217        return ( \*STDIN, undef, 0, $file );
218    }
219    
220    
221    #-----------------------------------------------------------------------------
222    #  Read fasta sequences from a filehandle (legacy interface; use read_fasta)
223  #  Save the contents in a list of refs to arrays:  (id, description, seq)  #  Save the contents in a list of refs to arrays:  (id, description, seq)
224  #  #
225  #     @seqs = read_fasta_seqs(*FILEHANDLE)  #     @seq_entries = read_fasta_seqs( \*FILEHANDLE )
226  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
227  sub read_fasta_seqs {  sub read_fasta_seqs { read_fasta( @_ ) }
228      my $fh = shift;  
229      wantarray || die "read_fasta_seqs requires list context\n";  
230    #-----------------------------------------------------------------------------
231    #  Read fasta sequences.
232    #  Save the contents in a list of refs to arrays:  (id, description, seq)
233    #
234    #     @seq_entries = read_fasta( )               #  STDIN
235    #    \@seq_entries = read_fasta( )               #  STDIN
236    #     @seq_entries = read_fasta( \*FILEHANDLE )
237    #    \@seq_entries = read_fasta( \*FILEHANDLE )
238    #     @seq_entries = read_fasta(  $filename )
239    #    \@seq_entries = read_fasta(  $filename )
240    #-----------------------------------------------------------------------------
241    sub read_fasta {
242        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
243        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n";
244    
245      my @seqs = ();      my @seqs = ();
246      my ($id, $desc, $seq) = ("", "", "");      my ($id, $desc, $seq) = ("", "", "");
# Line 90  Line 252 
252              ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");              ($id, $desc, $seq) = ($1, $3 ? $3 : "", "");
253          }          }
254          else {          else {
255              tr/\t 0-9//d;              tr/     0-9//d;
256              $seq .= $_ ;              $seq .= $_ ;
257          }          }
258      }      }
259        close( $fh ) if $close;
260    
261      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }
262      return @seqs;      return wantarray ? @seqs : \@seqs;
263  }  }
264    
265    
# Line 104  Line 267 
267  #  Read one fasta sequence at a time from a file.  #  Read one fasta sequence at a time from a file.
268  #  Return the contents as an array:  (id, description, seq)  #  Return the contents as an array:  (id, description, seq)
269  #  #
270  #     @seq_entry = read_next_fasta_seq(*FILEHANDLE)  #     @seq_entry = read_next_fasta_seq( \*FILEHANDLE )
271  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
272  #  Reading always overshoots, so save next id and description  #  Reading always overshoots, so save next id and description
273  #  (these should really be hashes indexed by file handle):  
274  #  {   #  Use bare block to scope the header hash
275  my $next_fasta_seq_header = "";  
276        my %next_header;
277    
278  sub read_next_fasta_seq {  sub read_next_fasta_seq {
279      my $fh = shift;      my $fh = shift;
280      wantarray || die "read_next_fasta_seq requires list context\n";          my ( $id, $desc );
281    
282      my ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );          if ( defined( $next_header{$fh} ) ) {
283                ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
284            }
285            else {
286                $next_header{$fh} = "";
287                ( $id, $desc ) = ( undef, "" );
288            }
289      my $seq = "";      my $seq = "";
290    
291      while ( <$fh> ) {      while ( <$fh> ) {
292          chomp;          chomp;
293          if ( /^>/ ) {        #  new id          if ( /^>/ ) {        #  new id
294              $next_fasta_seq_header = $_;                  $next_header{$fh} = $_;
295              if ( $id && $seq ) { return ($id, $desc, $seq) }                  if ( defined($id) && $seq )
296              ( $id, $desc ) = parse_fasta_title( $next_fasta_seq_header );                  {
297                        return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq]
298                    }
299                    ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
300              $seq = "";              $seq = "";
301          }          }
302          else {          else {
303              tr/\t 0-9//d;                  tr/     0-9//d;
304              $seq .= $_ ;              $seq .= $_ ;
305          }          }
306      }      }
307    
308      #  Done with file, clear "next header"          #  Done with file, delete "next header"
309    
310            delete $next_header{$fh};
311            return ( defined($id) && $seq ) ? ( wantarray ? ($id, $desc, $seq)
312                                                          : [$id, $desc, $seq]
313                                              )
314                                            : () ;
315        }
316    }
317    
318    
319    #-----------------------------------------------------------------------------
320    #  Read a clustal alignment from a file.
321    #  Save the contents in a list of refs to arrays:  (id, description, seq)
322    #
323    #     @seq_entries = read_clustal_file( $filename )
324    #-----------------------------------------------------------------------------
325    sub read_clustal_file { read_clustal( @_ ) }
326    
327    
328      $next_fasta_seq_header = "";  #-----------------------------------------------------------------------------
329      return ($id && $seq) ? ($id, $desc, $seq) : () ;  #  Read a clustal alignment.
330    #  Save the contents in a list of refs to arrays:  (id, description, seq)
331    #
332    #     @seq_entries = read_clustal( )              # STDIN
333    #     @seq_entries = read_clustal( \*FILEHANDLE )
334    #     @seq_entries = read_clustal(  $filename )
335    #-----------------------------------------------------------------------------
336    sub read_clustal {
337        my ( $fh, undef, $close, $unused ) = input_filehandle( shift );
338        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_clustal_file\n";
339    
340        my ( %seq, @ids, $line );
341        while ( defined( $line = <$fh> ) )
342        {
343            ( $line =~ /^[A-Za-z0-9]/ ) or next;
344            chomp $line;
345            my @flds = split /\s+/, $line;
346            if ( @flds == 2 )
347            {
348                $seq{ $flds[0] } or push @ids, $flds[0];
349                push @{ $seq{ $flds[0] } }, $flds[1];
350            }
351        }
352        close( $fh ) if $close;
353    
354        map { [ $_, "", join( "", @{$seq{$_}} ) ] } @ids;
355  }  }
356    
357    
# Line 151  Line 367 
367      if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {      if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {
368          return ($1, $3 ? $3 : "");          return ($1, $3 ? $3 : "");
369      }      }
370      else {      elsif ($title =~ /^>/) {
371          return ("", "");          return ("", "");
372      }      }
373        else {
374            return (undef, "");
375        }
376  }  }
377    
378  sub split_fasta_title {  sub split_fasta_title {
# Line 162  Line 381 
381    
382    
383  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
384  #  Print list of sequence entries in fasta format  #  Helper function for defining an output filehandle:
385    #     filehandle is passed through
386    #     string is taken as file name to be openend
387    #     undef or "" defaults to STDOUT
388    #
389    #    ( \*FH, $name, $close [, $file] ) = output_filehandle( $file );
390  #  #
 #     print_seq_list_as_fasta(*FILEHANDLE, @seq_entry_list);  
391  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
392  sub print_seq_list_as_fasta {  sub output_filehandle
393      my $fh = shift;  {
394      my @seq_list = @_;      my $file = shift;
395    
396        #  FILEHANDLE
397    
398        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
399    
400        #  Null string or undef
401    
402        return ( \*STDOUT, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
403    
404        #  File name
405    
406        if ( ! ref( $file ) )
407        {
408            my $fh;
409            open( $fh, ">$file" ) || die "Could not open output $file\n";
410            return ( $fh, $file, 1 );
411        }
412    
413      foreach my $seq_ptr (@seq_list) {      #  Some other kind of reference; return the unused value
414          print_seq_as_fasta($fh, @$seq_ptr);  
415        return ( \*STDOUT, undef, 0, $file );
416      }      }
417    
418    
419    #-----------------------------------------------------------------------------
420    #  Legacy function for printing fasta sequence set:
421    #
422    #     print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );
423    #-----------------------------------------------------------------------------
424    sub print_seq_list_as_fasta { print_alignment_as_fasta( @_ ) }
425    
426    
427    #-----------------------------------------------------------------------------
428    #  Print list of sequence entries in fasta format.
429    #  Missing, undef or "" filename defaults to STDOUT.
430    #
431    #     print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
432    #     print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
433    #     print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
434    #     print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
435    #     print_alignment_as_fasta(  $filename,    @seq_entry_list );
436    #     print_alignment_as_fasta(  $filename,   \@seq_entry_list );
437    #-----------------------------------------------------------------------------
438    sub print_alignment_as_fasta {
439        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
440        ( unshift @_, $unused ) if $unused;
441    
442        ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_fasta\n";
443    
444        #  Expand the sequence entry list if necessary:
445    
446        if ( ref( $_[0]->[0] ) eq "ARRAY" ) { @_ = @{ $_[0] } }
447    
448        foreach my $seq_ptr ( @_ ) { print_seq_as_fasta( $fh, @$seq_ptr ) }
449    
450        close( $fh ) if $close;
451    }
452    
453    
454    #-----------------------------------------------------------------------------
455    #  Print list of sequence entries in phylip format.
456    #  Missing, undef or "" filename defaults to STDOUT.
457    #
458    #     print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
459    #     print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
460    #     print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
461    #     print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
462    #     print_alignment_as_phylip(  $filename,    @seq_entry_list );
463    #     print_alignment_as_phylip(  $filename,   \@seq_entry_list );
464    #-----------------------------------------------------------------------------
465    sub print_alignment_as_phylip {
466        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
467        ( unshift @_, $unused ) if $unused;
468    
469        ( ref( $_[0] ) eq "ARRAY" ) or die die "Bad sequence entry passed to print_alignment_as_phylip\n";
470    
471        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
472    
473        my ( %id2, %used );
474        my $maxlen = 0;
475        foreach ( @seq_list )
476        {
477            my ( $id, undef, $seq ) = @$_;
478    
479            #  Need a name that is unique within 10 characters
480    
481            my $id2 = substr( $id, 0, 10 );
482            $id2 =~ s/_/ /g;  # PHYLIP sequence files accept spaces
483            my $n = "0";
484            while ( $used{ $id2 } )
485            {
486                $n++;
487                $id2 = substr( $id, 0, 10 - length( $n ) ) . $n;
488            }
489            $used{ $id2 } = 1;
490            $id2{ $id } = $id2;
491    
492                    #  Prepare to pad sequences (should not be necessary, but ...)
493    
494            my $len = length( $seq );
495            $maxlen = $len if ( $len > $maxlen );
496        }
497    
498        my $nseq = @seq_list;
499        print $fh "$nseq  $maxlen\n";
500        foreach ( @seq_list )
501        {
502            my ( $id, undef, $seq ) = @$_;
503            my $len = length( $seq );
504            printf $fh "%-10s  %s%s\n", $id2{ $id },
505                                        $seq,
506                                        $len<$maxlen ? ("?" x ($maxlen-$len)) : "";
507        }
508    
509        close( $fh ) if $close;
510    }
511    
512    
513    #-----------------------------------------------------------------------------
514    #  Print list of sequence entries in nexus format.
515    #  Missing, undef or "" filename defaults to STDOUT.
516    #
517    #     print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
518    #     print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
519    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
520    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
521    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
522    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
523    #-----------------------------------------------------------------------------
524    sub print_alignment_as_nexus {
525        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
526        ( unshift @_, $unused ) if $unused;
527    
528        my $lbls = ( ref( $_[0] ) eq "HASH" ) ? shift : undef;
529    
530        ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_nexus\n";
531    
532        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
533    
534        my %id2;
535        my ( $maxidlen, $maxseqlen ) = ( 0, 0 );
536        my ( $n1, $n2, $nt, $nu ) = ( 0, 0, 0, 0 );
537        foreach ( @seq_list )
538        {
539            my ( $id, undef, $seq ) = @$_;
540            my $id2 = $lbls ? ( $lbls->{ $id } || $id ) : $id;
541            if ( $id2 !~ /^[-+.0-9A-Za-z~_|]+$/ )
542            {
543                    $id2 =~ s/'/''/g;
544                    $id2 = qq('$id2');
545                }
546            $id2{ $id } = $id2;
547            my $idlen = length( $id2 );
548            $maxidlen = $idlen if ( $idlen > $maxidlen );
549    
550            my $seqlen = length( $seq );
551            $maxseqlen = $seqlen if ( $seqlen > $maxseqlen );
552    
553            $nt += $seq =~ tr/Tt//d;
554            $nu += $seq =~ tr/Uu//d;
555            $n1 += $seq =~ tr/ACGNacgn//d;
556            $n2 += $seq =~ tr/A-Za-z//d;
557        }
558    
559        my $nseq = @seq_list;
560        my $type = ( $n1 < 2 * $n2 ) ?  'protein' : ($nt>$nu) ? 'DNA' : 'RNA';
561    
562        print $fh <<"END_HEAD";
563    #NEXUS
564    
565    BEGIN Data;
566        Dimensions
567            NTax=$nseq
568            NChar=$maxseqlen
569            ;
570        Format
571            DataType=$type
572            Gap=-
573            Missing=?
574            ;
575        Matrix
576    
577    END_HEAD
578    
579        foreach ( @seq_list )
580        {
581            my ( $id, undef, $seq ) = @$_;
582            my $len = length( $seq );
583            printf  $fh  "%-${maxidlen}s  %s%s\n",
584                         $id2{ $id },
585                         $seq,
586                         $len<$maxseqlen ? ("?" x ($maxseqlen-$len)) : "";
587        }
588    
589        print $fh <<"END_TAIL";
590    ;
591    END;
592    END_TAIL
593    
594        close( $fh ) if $close;
595  }  }
596    
597    
598  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
599  #  Print one sequence in fasta format to an open file  #  Print one sequence in fasta format to an open file
600  #  #
601  #     print_seq_as_fasta(*FILEHANDLE, $id, $desc, $seq);  #     print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq );
602  #     print_seq_as_fasta(*FILEHANDLE, @seq_entry);  #     print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
603  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
604  sub print_seq_as_fasta {  sub print_seq_as_fasta {
605      my $fh = shift;      my $fh = shift;
# Line 197  Line 616 
616  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
617  #  Print one sequence in GenBank flat file format:  #  Print one sequence in GenBank flat file format:
618  #  #
619  #     print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #     print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq )
620  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
621  sub print_gb_locus {  sub print_gb_locus {
622      my ($fh, $loc, $def, $acc, $seq) = @_;      my ($fh, $loc, $def, $acc, $seq) = @_;
# Line 223  Line 642 
642    
643    
644  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
645    #  Return a string with text wrapped to defined line lengths:
646    #
647    #     $wrapped_text = wrap_text( $str )                  # default len   =  80
648    #     $wrapped_text = wrap_text( $str, $len )            # default ind   =   0
649    #     $wrapped_text = wrap_text( $str, $len, $indent )   # default ind_n = ind
650    #     $wrapped_text = wrap_text( $str, $len, $indent_1, $indent_n )
651    #-----------------------------------------------------------------------------
652    sub wrap_text {
653        my ($str, $len, $ind, $indn) = @_;
654    
655        defined($str)  || die "wrap_text called without a string\n";
656        defined($len)  || ($len  =   80);
657        defined($ind)  || ($ind  =    0);
658        ($ind  < $len) || die "wrap error: indent greater than line length\n";
659        defined($indn) || ($indn = $ind);
660        ($indn < $len) || die "wrap error: indent_n greater than line length\n";
661    
662        $str =~ s/\s+$//;
663        $str =~ s/^\s+//;
664        my ($maxchr, $maxchr1);
665        my (@lines) = ();
666    
667        while ($str) {
668            $maxchr1 = ($maxchr = $len - $ind) - 1;
669            if ($maxchr >= length($str)) {
670                push @lines, (" " x $ind) . $str;
671                last;
672            }
673            elsif ($str =~ /^(.{0,$maxchr1}\S)\s+(\S.*)$/) { # no expr in {}
674                push @lines, (" " x $ind) . $1;
675                $str = $2;
676            }
677            elsif ($str =~ /^(.{0,$maxchr1}-)(.*)$/) {
678                push @lines, (" " x $ind) . $1;
679                $str = $2;
680            }
681            else {
682                push @lines, (" " x $ind) . substr($str, 0, $maxchr);
683                $str = substr($str, $maxchr);
684            }
685            $ind = $indn;
686        }
687    
688        return join("\n", @lines);
689    }
690    
691    
692    #-----------------------------------------------------------------------------
693  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
694  #  #
695  #     my %seq_ind  = index_seq_list(@seq_list);  #     my \%seq_ind  = index_seq_list(  @seq_list );
696    #     my \%seq_ind  = index_seq_list( \@seq_list );
697  #  #
698  #  Usage example:  #  Usage example:
699  #  #
700  #  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries  #  my  @seq_list   = read_fasta_seqs(\*STDIN);  # list of pointers to entries
701  #  my %seq_ind    = index_seq_list(@seq_list); # hash from names to pointers  #  my \%seq_ind    = index_seq_list(@seq_list); # hash from names to pointers
702  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry
703  #  #
704  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
705  sub index_seq_list {  sub index_seq_list {
706      my %seq_index = map { @{$_}[0] => $_ } @_;      ( ref( $_[0] )      ne 'ARRAY' ) ? {}
707      return \%seq_index;    : ( ref( $_[0]->[0] ) ne 'ARRAY' ) ? { map { $_->[0] => $_ } @_ }
708      :                                    { map { $_->[0] => $_ } @{ $_[0] } }
709  }  }
710    
711    
712  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
713  #  Three routines to access all or part of sequence entry by id:  #  Three routines to access all or part of sequence entry by id:
714  #  #
715  #     my @seq_entry  = seq_entry_by_id( \%seq_index, $seq_id );  #     @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
716  #     my $seq_desc   = seq_desc_by_id(  \%seq_index, $seq_id );  #    \@seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
717  #     my $seq        = seq_data_by_id(  \%seq_index, $seq_id );  #     $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
718    #     $seq       = seq_data_by_id(  \%seq_index, $seq_id );
719  #  #
720  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
721  sub seq_entry_by_id {  sub seq_entry_by_id {
722      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";
723      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";
724      wantarray || die "entry_by_id requires list context\n";      return wantarray ? @{ $ind_ref->{$id} } : $ind_ref->{$id};
     return @{ $ind_ref->{$id} };  
725  }  }
726    
727    
# Line 269  Line 738 
738      return ${ $ind_ref->{$id} }[2];      return ${ $ind_ref->{$id} }[2];
739  }  }
740    
741    #-----------------------------------------------------------------------------
742    #  Remove columns of alignment gaps from sequences:
743    #
744    #   @packed_seqs = pack_alignment(  @seqs )
745    #   @packed_seqs = pack_alignment( \@seqs )
746    #  \@packed_seqs = pack_alignment(  @seqs )
747    #  \@packed_seqs = pack_alignment( \@seqs )
748    #
749    #-----------------------------------------------------------------------------
750    
751    sub pack_alignment
752    {
753        my @seqs = ( ref( $_[0] ) eq 'ARRAY' and ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
754        @seqs or return wantarray ? () : [];
755    
756        my $mask  = pack_mask( $seqs[0]->[2] );
757        foreach ( @seqs[ 1 .. (@seqs-1) ] )
758        {
759            $mask |= pack_mask( $_->[2] );
760        }
761    
762        my $seq;
763        my @seqs2 = map { $seq = $_->[2] & $mask;
764                          $seq =~ tr/\000//d;
765                          [ $_->[0], $_->[1], $seq ]
766                        }
767                    @seqs;
768    
769        return wantarray ? @seqs2 : \@seqs2;
770    }
771    
772    sub pack_mask
773    {
774        my $mask = shift;
775        $mask =~ tr/-/\000/;
776        $mask =~ tr/\000/\377/c;
777        return $mask;
778    }
779    
780  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
781  #  Some simple sequence manipulations:  #  Some simple sequence manipulations:
# Line 332  Line 839 
839      }      }
840    
841      if ( $fix_id ) {      if ( $fix_id ) {
842          if ( ( $id =~ s/_([0-9]+)_([0-9]+)$// )          if ( ( $id =~ s/_(\d+)_(\d+)$// )
843            && ( abs($2-$1)+1 == $len ) ) {            && ( abs($2-$1)+1 == $len ) ) {
844              if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }              if ( $1 <= $2 ) { $from += $1 - 1;         $to += $1 - 1 }
845              else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }              else            { $from  = $1 + 1 - $from; $to  = $1 + 1 - $to }
# Line 353  Line 860 
860      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
861                [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];                [TGCAAMKYSWRVHDBNtgcaamkyswrvhdbn];
862      if ($fix_id) {      if ($fix_id) {
863          if ($id =~ s/_([0-9]+)_([0-9]+)$//) {          if ($id =~ s/_(\d+)_(\d+)$//) {
864              $id .= "_" . $2 . "_" . $1;              $id .= "_" . $2 . "_" . $1;
865          }          }
866          else {          else {
# Line 374  Line 881 
881      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]      $seq =~ tr[ACGTUKMRSWYBDHVNacgtukmrswybdhvn]
882                [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];                [UGCAAMKYSWRVHDBNugcaamkyswrvhdbn];
883      if ($fix_id) {      if ($fix_id) {
884          if ($id =~ s/_([0-9]+)_([0-9]+)$//) {          if ($id =~ s/_(\d+)_(\d+)$//) {
885              $id .= "_" . $2 . "_" . $1;              $id .= "_" . $2 . "_" . $1;
886          }          }
887          else {          else {
# Line 416  Line 923 
923  }  }
924    
925    
926    sub pack_seq {
927        my $seq = shift;
928        $seq =~ tr/A-Za-z//cd;
929        return $seq;
930    }
931    
932    
933  sub clean_ae_sequence {  sub clean_ae_sequence {
934      $_ = shift;      $_ = shift;
935      $_ = to7bit($_);      $_ = to7bit($_);
# Line 464  Line 978 
978  #  #
979  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
980    
981  our %genetic_code = (   # work as DNA  @aa_1_letter_order = qw( A C D E F G H I K L M N P Q R S T V W Y );  # Alpha by 1 letter
982    @aa_3_letter_order = qw( A R N D C Q E G H I L K M F P S T W Y V );  # PAM matrix order
983    @aa_n_codon_order  = qw( L R S A G P T V I C D E F H K N Q Y M W );
984    
985    %genetic_code = (
986    
987        # DNA version
988    
989      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",      TTT => "F",  TCT => "S",  TAT => "Y",  TGT => "C",
990      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",      TTC => "F",  TCC => "S",  TAC => "Y",  TGC => "C",
991      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",      TTA => "L",  TCA => "S",  TAA => "*",  TGA => "*",
# Line 482  Line 1003 
1003      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",      GTA => "V",  GCA => "A",  GAA => "E",  GGA => "G",
1004      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",      GTG => "V",  GCG => "A",  GAG => "E",  GGG => "G",
1005    
1006        # RNA suppliment
1007    
1008        UUU => "F",  UCU => "S",  UAU => "Y",  UGU => "C",
1009        UUC => "F",  UCC => "S",  UAC => "Y",  UGC => "C",
1010        UUA => "L",  UCA => "S",  UAA => "*",  UGA => "*",
1011        UUG => "L",  UCG => "S",  UAG => "*",  UGG => "W",
1012        CUU => "L",  CCU => "P",  CAU => "H",  CGU => "R",
1013        CUC => "L",
1014        CUA => "L",
1015        CUG => "L",
1016        AUU => "I",  ACU => "T",  AAU => "N",  AGU => "S",
1017        AUC => "I",
1018        AUA => "I",
1019        AUG => "M",
1020        GUU => "V",  GCU => "A",  GAU => "D",  GGU => "G",
1021        GUC => "V",
1022        GUA => "V",
1023        GUG => "V",
1024    
1025      #  The following ambiguous encodings are not necessary,  but      #  The following ambiguous encodings are not necessary,  but
1026      #  speed up the processing of some common ambiguous triplets:      #  speed up the processing of some ambiguous triplets:
1027    
1028      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",      TTY => "F",  TCY => "S",  TAY => "Y",  TGY => "C",
1029      TTR => "L",  TCR => "S",  TAR => "*",      TTR => "L",  TCR => "S",  TAR => "*",
# Line 496  Line 1036 
1036                   ACN => "T",                   ACN => "T",
1037      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",      GTY => "V",  GCY => "A",  GAY => "D",  GGY => "G",
1038      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",      GTR => "V",  GCR => "A",  GAR => "E",  GGR => "G",
1039      GTN => "V",  GCN => "A",               GGN => "G"      GTN => "V",  GCN => "A",               GGN => "G",
1040    
1041        UUY => "F",  UCY => "S",  UAY => "Y",  UGY => "C",
1042        UUR => "L",  UCR => "S",  UAR => "*",
1043                     UCN => "S",
1044        CUY => "L",
1045        CUR => "L",
1046        CUN => "L",
1047        AUY => "I",
1048        GUY => "V",
1049        GUR => "V",
1050        GUN => "V"
1051    );
1052    
1053    
1054    #  Add lower case by construction:
1055    
1056    foreach ( keys %genetic_code ) {
1057        $genetic_code{ lc $_ } = lc $genetic_code{ $_ }
1058    }
1059    
1060    
1061    #  Construct the genetic code with selanocysteine by difference:
1062    
1063    %genetic_code_with_U = map { $_ => $genetic_code{ $_ } } keys %genetic_code;
1064    $genetic_code_with_U{ TGA } = "U";
1065    $genetic_code_with_U{ tga } = "u";
1066    $genetic_code_with_U{ UGA } = "U";
1067    $genetic_code_with_U{ uga } = "u";
1068    
1069    
1070    %amino_acid_codons_DNA = (
1071             L  => [ qw( TTA TTG CTA CTG CTT CTC ) ],
1072             R  => [ qw( AGA AGG CGA CGG CGT CGC ) ],
1073             S  => [ qw( AGT AGC TCA TCG TCT TCC ) ],
1074             A  => [ qw( GCA GCG GCT GCC ) ],
1075             G  => [ qw( GGA GGG GGT GGC ) ],
1076             P  => [ qw( CCA CCG CCT CCC ) ],
1077             T  => [ qw( ACA ACG ACT ACC ) ],
1078             V  => [ qw( GTA GTG GTT GTC ) ],
1079             I  => [ qw( ATA ATT ATC ) ],
1080             C  => [ qw( TGT TGC ) ],
1081             D  => [ qw( GAT GAC ) ],
1082             E  => [ qw( GAA GAG ) ],
1083             F  => [ qw( TTT TTC ) ],
1084             H  => [ qw( CAT CAC ) ],
1085             K  => [ qw( AAA AAG ) ],
1086             N  => [ qw( AAT AAC ) ],
1087             Q  => [ qw( CAA CAG ) ],
1088             Y  => [ qw( TAT TAC ) ],
1089             M  => [ qw( ATG ) ],
1090             U  => [ qw( TGA ) ],
1091             W  => [ qw( TGG ) ],
1092    
1093             l  => [ qw( tta ttg cta ctg ctt ctc ) ],
1094             r  => [ qw( aga agg cga cgg cgt cgc ) ],
1095             s  => [ qw( agt agc tca tcg tct tcc ) ],
1096             a  => [ qw( gca gcg gct gcc ) ],
1097             g  => [ qw( gga ggg ggt ggc ) ],
1098             p  => [ qw( cca ccg cct ccc ) ],
1099             t  => [ qw( aca acg act acc ) ],
1100             v  => [ qw( gta gtg gtt gtc ) ],
1101             i  => [ qw( ata att atc ) ],
1102             c  => [ qw( tgt tgc ) ],
1103             d  => [ qw( gat gac ) ],
1104             e  => [ qw( gaa gag ) ],
1105             f  => [ qw( ttt ttc ) ],
1106             h  => [ qw( cat cac ) ],
1107             k  => [ qw( aaa aag ) ],
1108             n  => [ qw( aat aac ) ],
1109             q  => [ qw( caa cag ) ],
1110             y  => [ qw( tat tac ) ],
1111             m  => [ qw( atg ) ],
1112             u  => [ qw( tga ) ],
1113             w  => [ qw( tgg ) ],
1114    
1115            '*' => [ qw( TAA TAG TGA ) ]
1116  );  );
1117    
1118    
1119  my %DNA_letter_can_be = (  
1120    %amino_acid_codons_RNA = (
1121             L  => [ qw( UUA UUG CUA CUG CUU CUC ) ],
1122             R  => [ qw( AGA AGG CGA CGG CGU CGC ) ],
1123             S  => [ qw( AGU AGC UCA UCG UCU UCC ) ],
1124             A  => [ qw( GCA GCG GCU GCC ) ],
1125             G  => [ qw( GGA GGG GGU GGC ) ],
1126             P  => [ qw( CCA CCG CCU CCC ) ],
1127             T  => [ qw( ACA ACG ACU ACC ) ],
1128             V  => [ qw( GUA GUG GUU GUC ) ],
1129             B  => [ qw( GAU GAC AAU AAC ) ],
1130             Z  => [ qw( GAA GAG CAA CAG ) ],
1131             I  => [ qw( AUA AUU AUC ) ],
1132             C  => [ qw( UGU UGC ) ],
1133             D  => [ qw( GAU GAC ) ],
1134             E  => [ qw( GAA GAG ) ],
1135             F  => [ qw( UUU UUC ) ],
1136             H  => [ qw( CAU CAC ) ],
1137             K  => [ qw( AAA AAG ) ],
1138             N  => [ qw( AAU AAC ) ],
1139             Q  => [ qw( CAA CAG ) ],
1140             Y  => [ qw( UAU UAC ) ],
1141             M  => [ qw( AUG ) ],
1142             U  => [ qw( UGA ) ],
1143             W  => [ qw( UGG ) ],
1144    
1145             l  => [ qw( uua uug cua cug cuu cuc ) ],
1146             r  => [ qw( aga agg cga cgg cgu cgc ) ],
1147             s  => [ qw( agu agc uca ucg ucu ucc ) ],
1148             a  => [ qw( gca gcg gcu gcc ) ],
1149             g  => [ qw( gga ggg ggu ggc ) ],
1150             p  => [ qw( cca ccg ccu ccc ) ],
1151             t  => [ qw( aca acg acu acc ) ],
1152             v  => [ qw( gua gug guu guc ) ],
1153             b  => [ qw( gau gac aau aac ) ],
1154             z  => [ qw( gaa gag caa cag ) ],
1155             i  => [ qw( aua auu auc ) ],
1156             c  => [ qw( ugu ugc ) ],
1157             d  => [ qw( gau gac ) ],
1158             e  => [ qw( gaa gag ) ],
1159             f  => [ qw( uuu uuc ) ],
1160             h  => [ qw( cau cac ) ],
1161             k  => [ qw( aaa aag ) ],
1162             n  => [ qw( aau aac ) ],
1163             q  => [ qw( caa cag ) ],
1164             y  => [ qw( uau uac ) ],
1165             m  => [ qw( aug ) ],
1166             u  => [ qw( uga ) ],
1167             w  => [ qw( ugg ) ],
1168    
1169            '*' => [ qw( UAA UAG UGA ) ]
1170    );
1171    
1172    
1173    %n_codon_for_aa = map {
1174        $_ => scalar @{ $amino_acid_codons_DNA{ $_ } }
1175        } keys %amino_acid_codons_DNA;
1176    
1177    
1178    %reverse_genetic_code_DNA = (
1179             A  => "GCN",  a  => "gcn",
1180             C  => "TGY",  c  => "tgy",
1181             D  => "GAY",  d  => "gay",
1182             E  => "GAR",  e  => "gar",
1183             F  => "TTY",  f  => "tty",
1184             G  => "GGN",  g  => "ggn",
1185             H  => "CAY",  h  => "cay",
1186             I  => "ATH",  i  => "ath",
1187             K  => "AAR",  k  => "aar",
1188             L  => "YTN",  l  => "ytn",
1189             M  => "ATG",  m  => "atg",
1190             N  => "AAY",  n  => "aay",
1191             P  => "CCN",  p  => "ccn",
1192             Q  => "CAR",  q  => "car",
1193             R  => "MGN",  r  => "mgn",
1194             S  => "WSN",  s  => "wsn",
1195             T  => "ACN",  t  => "acn",
1196             U  => "TGA",  u  => "tga",
1197             V  => "GTN",  v  => "gtn",
1198             W  => "TGG",  w  => "tgg",
1199             X  => "NNN",  x  => "nnn",
1200             Y  => "TAY",  y  => "tay",
1201            '*' => "TRR"
1202    );
1203    
1204    %reverse_genetic_code_RNA = (
1205             A  => "GCN",  a  => "gcn",
1206             C  => "UGY",  c  => "ugy",
1207             D  => "GAY",  d  => "gay",
1208             E  => "GAR",  e  => "gar",
1209             F  => "UUY",  f  => "uuy",
1210             G  => "GGN",  g  => "ggn",
1211             H  => "CAY",  h  => "cay",
1212             I  => "AUH",  i  => "auh",
1213             K  => "AAR",  k  => "aar",
1214             L  => "YUN",  l  => "yun",
1215             M  => "AUG",  m  => "aug",
1216             N  => "AAY",  n  => "aay",
1217             P  => "CCN",  p  => "ccn",
1218             Q  => "CAR",  q  => "car",
1219             R  => "MGN",  r  => "mgn",
1220             S  => "WSN",  s  => "wsn",
1221             T  => "ACN",  t  => "acn",
1222             U  => "UGA",  u  => "uga",
1223             V  => "GUN",  v  => "gun",
1224             W  => "UGG",  w  => "ugg",
1225             X  => "NNN",  x  => "nnn",
1226             Y  => "UAY",  y  => "uay",
1227            '*' => "URR"
1228    );
1229    
1230    
1231    %DNA_letter_can_be = (
1232      A => ["A"],                 a => ["a"],      A => ["A"],                 a => ["a"],
1233      B => ["C", "G", "T"],       b => ["c", "g", "t"],      B => ["C", "G", "T"],       b => ["c", "g", "t"],
1234      C => ["C"],                 c => ["c"],      C => ["C"],                 c => ["c"],
# Line 520  Line 1248 
1248  );  );
1249    
1250    
1251  my %RNA_letter_can_be = (  %RNA_letter_can_be = (
1252      A => ["A"],                 a => ["a"],      A => ["A"],                 a => ["a"],
1253      B => ["C", "G", "U"],       b => ["c", "g", "u"],      B => ["C", "G", "U"],       b => ["c", "g", "u"],
1254      C => ["C"],                 c => ["c"],      C => ["C"],                 c => ["c"],
# Line 540  Line 1268 
1268  );  );
1269    
1270    
1271  my %one_letter_to_three_letter_aa = (  %one_letter_to_three_letter_aa = (
1272       A  => "Ala",           A  => "Ala", a  => "Ala",
1273       B  => "Asx",           B  => "Asx", b  => "Asx",
1274       C  => "Cys",           C  => "Cys", c  => "Cys",
1275       D  => "Asp",           D  => "Asp", d  => "Asp",
1276       E  => "Glu",           E  => "Glu", e  => "Glu",
1277       F  => "Phe",           F  => "Phe", f  => "Phe",
1278       G  => "Gly",           G  => "Gly", g  => "Gly",
1279       H  => "His",           H  => "His", h  => "His",
1280       I  => "Ile",           I  => "Ile", i  => "Ile",
1281       K  => "Lys",           K  => "Lys", k  => "Lys",
1282       L  => "Leu",           L  => "Leu", l  => "Leu",
1283       M  => "Met",           M  => "Met", m  => "Met",
1284       N  => "Asn",           N  => "Asn", n  => "Asn",
1285       P  => "Pro",           P  => "Pro", p  => "Pro",
1286       Q  => "Gln",           Q  => "Gln", q  => "Gln",
1287       R  => "Arg",           R  => "Arg", r  => "Arg",
1288       S  => "Ser",           S  => "Ser", s  => "Ser",
1289       T  => "Thr",           T  => "Thr", t  => "Thr",
1290       U  => "Sec",           U  => "Sec", u  => "Sec",
1291       V  => "Val",           V  => "Val", v  => "Val",
1292       W  => "Trp",           W  => "Trp", w  => "Trp",
1293       X  => "Xxx",           X  => "Xxx", x  => "Xxx",
1294       Y  => "Tyr",           Y  => "Tyr", y  => "Tyr",
1295       Z  => "Glx",           Z  => "Glx", z  => "Glx",
1296      "*" => "***"          '*' => "***"
1297            );
1298    
1299    
1300    %three_letter_to_one_letter_aa = (
1301         ALA  => "A",   Ala  => "A",   ala  => "a",
1302         ARG  => "R",   Arg  => "R",   arg  => "r",
1303         ASN  => "N",   Asn  => "N",   asn  => "n",
1304         ASP  => "D",   Asp  => "D",   asp  => "d",
1305         ASX  => "B",   Asx  => "B",   asx  => "b",
1306         CYS  => "C",   Cys  => "C",   cys  => "c",
1307         GLN  => "Q",   Gln  => "Q",   gln  => "q",
1308         GLU  => "E",   Glu  => "E",   glu  => "e",
1309         GLX  => "Z",   Glx  => "Z",   glx  => "z",
1310         GLY  => "G",   Gly  => "G",   gly  => "g",
1311         HIS  => "H",   His  => "H",   his  => "h",
1312         ILE  => "I",   Ile  => "I",   ile  => "i",
1313         LEU  => "L",   Leu  => "L",   leu  => "l",
1314         LYS  => "K",   Lys  => "K",   lys  => "k",
1315         MET  => "M",   Met  => "M",   met  => "m",
1316         PHE  => "F",   Phe  => "F",   phe  => "f",
1317         PRO  => "P",   Pro  => "P",   pro  => "p",
1318         SEC  => "U",   Sec  => "U",   sec  => "u",
1319         SER  => "S",   Ser  => "S",   ser  => "s",
1320         THR  => "T",   Thr  => "T",   thr  => "t",
1321         TRP  => "W",   Trp  => "W",   trp  => "w",
1322         TYR  => "Y",   Tyr  => "Y",   tyr  => "y",
1323         VAL  => "V",   Val  => "V",   val  => "v",
1324         XAA  => "X",   Xaa  => "X",   xaa  => "x",
1325         XXX  => "X",   Xxx  => "X",   xxx  => "x",
1326        '***' => "*"
1327  );  );
1328    
1329    
# Line 586  Line 1344 
1344                              #  (note: undef is false)                              #  (note: undef is false)
1345    
1346      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!      my $imax = length($seq) - 2;  # will try to translate 2 nucleotides!
1347      my $pep  = ($met && ($imax >= 0)) ? "M" : "";      my $pep = ( ($met && ($imax >= 0)) ? "M" : "" );
     my $aa;  
1348      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {      for (my $i = $met ? 3 : 0; $i <= $imax; $i += 3) {
1349          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );          $pep .= translate_uc_DNA_codon( substr($seq,$i,3) );
1350      }      }
# Line 796  Line 1553 
1553  #  Read a list of intervals from a file.  #  Read a list of intervals from a file.
1554  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1555  #  #
1556  #     @intervals = read_intervals(*FILEHANDLE)  #     @intervals = read_intervals( \*FILEHANDLE )
1557  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1558  sub read_intervals {  sub read_intervals {
1559      my $fh = shift;      my $fh = shift;
# Line 819  Line 1576 
1576    
1577    
1578  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1579    #  Convert a list of intervals to read [ id, left_end, right_end ].
1580    #
1581    #     @intervals = standardize_intervals( @interval_refs )
1582    #-----------------------------------------------------------------------------
1583    sub standardize_intervals {
1584        map { ( $_->[1] < $_->[2] ) ? $_ : [ $_->[0], $_->[2], $_->[1] ] } @_;
1585    }
1586    
1587    
1588    #-----------------------------------------------------------------------------
1589  #  Take the union of a list of intervals  #  Take the union of a list of intervals
1590  #  #
1591  #     @joined = join_intervals( @interval_refs )  #     @joined = join_intervals( @interval_refs )
# Line 858  Line 1625 
1625      return @joined;      return @joined;
1626  }  }
1627    
1628    
1629    #-----------------------------------------------------------------------------
1630    #  Split location strings to oriented intervals.
1631    #
1632    #     @intervals = locations_2_intervals( @locations )
1633    #     $interval  = locations_2_intervals( $location  )
1634    #-----------------------------------------------------------------------------
1635    sub locations_2_intervals {
1636        my @intervals = map { /^(\S+)_(\d+)_(\d+)(\s.*)?$/
1637                           || /^(\S+)_(\d+)-(\d+)(\s.*)?$/
1638                           || /^(\S+)=(\d+)=(\d+)(\s.*)?$/
1639                           || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/
1640                            ? [ $1, $2+0, $3+0 ]
1641                            : ()
1642                            } @_;
1643    
1644        return wantarray ? @intervals : $intervals[0];
1645    }
1646    
1647    
1648    #-----------------------------------------------------------------------------
1649    #  Read a list of oriented intervals from a file.
1650    #  Allow id_start_end, or id \s start \s end formats
1651    #
1652    #     @intervals = read_oriented_intervals( \*FILEHANDLE )
1653    #-----------------------------------------------------------------------------
1654    sub read_oriented_intervals {
1655        my $fh = shift;
1656        my @intervals = ();
1657    
1658        while (<$fh>) {
1659            chomp;
1660               /^(\S+)_(\d+)_(\d+)(\s.*)?$/        #  id_start_end       WIT2
1661            || /^(\S+)_(\d+)-(\d+)(\s.*)?$/        #  id_start-end       ???
1662            || /^(\S+)=(\d+)=(\d+)(\s.*)?$/        #  id=start=end       Badger
1663            || /^(\S+)\s+(\d+)\s+(\d+)(\s.*)?$/    #  id \s start \s end
1664            || next;
1665    
1666            #  Matched a pattern.  Store reference to (id, start, end):
1667            push @intervals, [ $1, $2+0, $3+0 ];
1668        }
1669        return @intervals;
1670    }
1671    
1672    
1673    #-----------------------------------------------------------------------------
1674    #  Reverse the orientation of a list of intervals
1675    #
1676    #     @reversed = reverse_intervals( @interval_refs )
1677    #-----------------------------------------------------------------------------
1678    sub reverse_intervals {
1679        map { [ $_->[0], $_->[2], $_->[1] ] } @_;
1680    }
1681    
1682    
1683    #-----------------------------------------------------------------------------
1684    #  Convert GenBank locations to SEED locations
1685    #
1686    #     @seed_locs = gb_location_2_seed( $contig, @gb_locs )
1687    #-----------------------------------------------------------------------------
1688    sub gb_location_2_seed
1689    {
1690        my $contig = shift @_;
1691        $contig or die "First arg of gb_location_2_seed must be contig_id\n";
1692    
1693        map { join( ',', gb_loc_2_seed_2( $contig, $_ ) ) || undef } @_
1694    }
1695    
1696    sub gb_loc_2_seed_2
1697    {
1698        my ( $contig, $loc ) = @_;
1699    
1700        if ( $loc =~ /^(\d+)\.\.(\d+)$/ )
1701        {
1702            join( '_', $contig, $1, $2 )
1703        }
1704    
1705        elsif ( $loc =~ /^join\((.*)\)$/ )
1706        {
1707            $loc = $1;
1708            my $lvl = 0;
1709            for ( my $i = length( $loc )-1; $i >= 0; $i-- )
1710            {
1711                for ( substr( $loc, $i, 1 ) )
1712                {
1713                    /,/ && ! $lvl and substr( $loc, $i, 1 ) = "\t";
1714                    /\(/          and $lvl--;
1715                    /\)/          and $lvl++;
1716                }
1717            }
1718            $lvl == 0 or print STDERR "Paren matching error: $loc\n" and die;
1719            map { gb_loc_2_seed_2( $contig, $_ ) } split /\t/, $loc
1720        }
1721    
1722        elsif ( $loc =~ /^complement\((.*)\)$/ )
1723        {
1724            map { s/_(\d+)_(\d+)$/_$2_$1/; $_ }
1725            reverse
1726            gb_loc_2_seed_2( $contig, $1 )
1727        }
1728    
1729        else
1730        {
1731            ()
1732        }
1733    }
1734    
1735    
1736    #-----------------------------------------------------------------------------
1737    #  Read qual.
1738    #
1739    #  Save the contents in a list of refs to arrays: [ $id, $descript, \@qual ]
1740    #
1741    #     @seq_entries = read_qual( )               #  STDIN
1742    #    \@seq_entries = read_qual( )               #  STDIN
1743    #     @seq_entries = read_qual( \*FILEHANDLE )
1744    #    \@seq_entries = read_qual( \*FILEHANDLE )
1745    #     @seq_entries = read_qual(  $filename )
1746    #    \@seq_entries = read_qual(  $filename )
1747    #-----------------------------------------------------------------------------
1748    sub read_qual {
1749        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
1750        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_qual\n";
1751    
1752        my @quals = ();
1753        my ($id, $desc, $qual) = ("", "", []);
1754    
1755        while ( <$fh> ) {
1756            chomp;
1757            if (/^>\s*(\S+)(\s+(.*))?$/) {        #  new id
1758                if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1759                ($id, $desc, $qual) = ($1, $3 ? $3 : "", []);
1760            }
1761            else {
1762                push @$qual, split;
1763            }
1764        }
1765        close( $fh ) if $close;
1766    
1767        if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1768        return wantarray ? @quals : \@quals;
1769    }
1770    
1771    
1772  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.7

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3