[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Mon Dec 5 19:06:30 2005 UTC revision 1.7, Sun Jun 10 17:34:54 2007 UTC
# Line 1  Line 1 
 #  
 # Copyright (c) 2003-2006 University of Chicago and Fellowship  
 # for Interpretations of Genomes. All Rights Reserved.  
 #  
 # This file is part of the SEED Toolkit.  
 #  
 # The SEED Toolkit is free software. You can redistribute  
 # it and/or modify it under the terms of the SEED Toolkit  
 # Public License.  
 #  
 # You should have received a copy of the SEED Toolkit Public License  
 # along with this program; if not write to the University of Chicago  
 # at info@ci.uchicago.edu or the Fellowship for Interpretation of  
 # Genomes at veronika@thefig.info or download a copy from  
 # http://www.theseed.org/LICENSE.TXT.  
 #  
   
1  package gjoseqlib;  package gjoseqlib;
2    
3  #  A sequence entry is ( $id, $def, $seq )  #  A sequence entry is ( $id, $def, $seq )
4  #  A list of entries is a list of references  #  A list of entries is a list of references
5  #  #
6  #  @seq_entry = read_next_fasta_seq( *FILEHANDLE )  #  @seq_entry   = read_next_fasta_seq( \*FILEHANDLE )
7  #  @seq_entries = read_fasta_seqs( *FILEHANDLE )  #  @seq_entries = read_fasta_seqs( \*FILEHANDLE )   # Original form
8  #  $seq_ind = index_seq_list( @seq_entries ); # hash from ids to entry refs  #  @seq_entries = read_fasta( )                     # STDIN
9    #  @seq_entries = read_fasta( \*FILEHANDLE )
10    #  @seq_entries = read_fasta(  $filename )
11    #  @seq_entries = read_clustal( )                   # STDIN
12    #  @seq_entries = read_clustal( \*FILEHANDLE )
13    #  @seq_entries = read_clustal(  $filename )
14    #  @seq_entries = read_clustal_file(  $filename )
15  #  #
16    #  $seq_ind   = index_seq_list( @seq_entries );   # hash from ids to entries
17  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
18  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
19  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );
# Line 31  Line 21 
21  #  ( $id, $def ) = parse_fasta_title( $title )  #  ( $id, $def ) = parse_fasta_title( $title )
22  #  ( $id, $def ) = split_fasta_title( $title )  #  ( $id, $def ) = split_fasta_title( $title )
23  #  #
24  #  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list );  #  print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );  # Original form
25  #  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq) ;  #  print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
26  #  print_seq_as_fasta( *FILEHANDLE, @seq_entry );  #  print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
27  #  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #  print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
28    #  print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
29    #  print_alignment_as_fasta(  $filename,    @seq_entry_list );
30    #  print_alignment_as_fasta(  $filename,   \@seq_entry_list );
31    #  print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
32    #  print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
33    #  print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
34    #  print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
35    #  print_alignment_as_phylip(  $filename,    @seq_entry_list );
36    #  print_alignment_as_phylip(  $filename,   \@seq_entry_list );
37    #  print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
38    #  print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
39    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
40    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
41    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
42    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
43    #  print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq) ;
44    #  print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
45    #  print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq );
46    #
47    #   @packed_seqs = pack_alignment(  @seqs )
48    #   @packed_seqs = pack_alignment( \@seqs )
49    #  \@packed_seqs = pack_alignment(  @seqs )
50    #  \@packed_seqs = pack_alignment( \@seqs )
51  #  #
52  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
53  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
# Line 57  Line 70 
70  #  Locations (= oriented intervals) are ( id, start, end )  #  Locations (= oriented intervals) are ( id, start, end )
71  #  Intervals are ( id, left, right )  #  Intervals are ( id, left, right )
72  #  #
73  #  @intervals = read_intervals( *FILEHANDLE )  #  @intervals = read_intervals( \*FILEHANDLE )
74  #  @intervals = read_oriented_intervals( *FILEHANDLE )  #  @intervals = read_oriented_intervals( \*FILEHANDLE )
75  #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)  #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)
76  #  @joined    = join_intervals( @interval_refs )  #  @joined    = join_intervals( @interval_refs )
77  #  @intervals = locations_2_intervals( @locations )  #  @intervals = locations_2_intervals( @locations )
78  #  $interval  = locations_2_intervals( $location  )  #  $interval  = locations_2_intervals( $location  )
79  #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)  #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)
80    #
81    #  Convert GenBank locations to SEED locations
82    #
83    #  @seed_locs = gb_location_2_seed( $contig, @gb_locs )
84    #
85    #  Read quality scores from a fasta-like file:
86    #
87    #  @seq_entries = read_qual( )               #  STDIN
88    # \@seq_entries = read_qual( )               #  STDIN
89    #  @seq_entries = read_qual( \*FILEHANDLE )
90    # \@seq_entries = read_qual( \*FILEHANDLE )
91    #  @seq_entries = read_qual(  $filename )
92    # \@seq_entries = read_qual(  $filename )
93    #
94    
95  use strict;  use strict;
96    
 use gjolib qw( wrap_text );  
   
97  #  Exported global variables:  #  Exported global variables:
98    
99  our @aa_1_letter_order;  # Alpha by 1 letter  our @aa_1_letter_order;  # Alpha by 1 letter
# Line 92  Line 116 
116  our @ISA = qw(Exporter);  our @ISA = qw(Exporter);
117  our @EXPORT = qw(  our @EXPORT = qw(
118          read_fasta_seqs          read_fasta_seqs
119            read_fasta
120          read_next_fasta_seq          read_next_fasta_seq
121            read_clustal_file
122            read_clustal
123          parse_fasta_title          parse_fasta_title
124          split_fasta_title          split_fasta_title
125          print_seq_list_as_fasta          print_seq_list_as_fasta
126            print_alignment_as_fasta
127            print_alignment_as_phylip
128            print_alignment_as_nexus
129          print_seq_as_fasta          print_seq_as_fasta
130          print_gb_locus          print_gb_locus
131    
# Line 104  Line 134 
134          seq_desc_by_id          seq_desc_by_id
135          seq_data_by_id          seq_data_by_id
136    
137            pack_alignment
138    
139          subseq_DNA_entry          subseq_DNA_entry
140          subseq_RNA_entry          subseq_RNA_entry
141          complement_DNA_entry          complement_DNA_entry
# Line 125  Line 157 
157          locations_2_intervals          locations_2_intervals
158          read_oriented_intervals          read_oriented_intervals
159          reverse_intervals          reverse_intervals
160    
161            gb_location_2_seed
162    
163            read_qual
164          );          );
165    
166  our @EXPORT_OK = qw(  our @EXPORT_OK = qw(
# Line 146  Line 182 
182    
183    
184  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
185  #  Read fasta sequences from a file.  #  Helper function for defining an input filehandle:
186    #     filehandle is passed through
187    #     string is taken as file name to be openend
188    #     undef or "" defaults to STDOUT
189    #
190    #    ( \*FH, $name, $close [, $file] ) = input_filehandle( $file );
191    #
192    #-----------------------------------------------------------------------------
193    sub input_filehandle
194    {
195        my $file = shift;
196    
197        #  FILEHANDLE
198    
199        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
200    
201        #  Null string or undef
202    
203        return ( \*STDIN, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
204    
205        #  File name
206    
207        if ( ! ref( $file ) )
208        {
209            my $fh;
210            -f $file or die "Could not find input file \"$file\"\n";
211            open( $fh, "<$file" ) || die "Could not open \"$file\" for input\n";
212            return ( $fh, $file, 1 );
213        }
214    
215        #  Some other kind of reference; return the unused value
216    
217        return ( \*STDIN, undef, 0, $file );
218    }
219    
220    
221    #-----------------------------------------------------------------------------
222    #  Read fasta sequences from a filehandle (legacy interface; use read_fasta)
223  #  Save the contents in a list of refs to arrays:  (id, description, seq)  #  Save the contents in a list of refs to arrays:  (id, description, seq)
224  #  #
225  #     @seqs = read_fasta_seqs(*FILEHANDLE)  #     @seq_entries = read_fasta_seqs( \*FILEHANDLE )
226  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
227  sub read_fasta_seqs {  sub read_fasta_seqs { read_fasta( @_ ) }
228      my $fh = shift;  
229      wantarray || die "read_fasta_seqs requires list context\n";  
230    #-----------------------------------------------------------------------------
231    #  Read fasta sequences.
232    #  Save the contents in a list of refs to arrays:  (id, description, seq)
233    #
234    #     @seq_entries = read_fasta( )               #  STDIN
235    #    \@seq_entries = read_fasta( )               #  STDIN
236    #     @seq_entries = read_fasta( \*FILEHANDLE )
237    #    \@seq_entries = read_fasta( \*FILEHANDLE )
238    #     @seq_entries = read_fasta(  $filename )
239    #    \@seq_entries = read_fasta(  $filename )
240    #-----------------------------------------------------------------------------
241    sub read_fasta {
242        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
243        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n";
244    
245      my @seqs = ();      my @seqs = ();
246      my ($id, $desc, $seq) = ("", "", "");      my ($id, $desc, $seq) = ("", "", "");
# Line 169  Line 256 
256              $seq .= $_ ;              $seq .= $_ ;
257          }          }
258      }      }
259        close( $fh ) if $close;
260    
261      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }
262      return @seqs;      return wantarray ? @seqs : \@seqs;
263  }  }
264    
265    
# Line 179  Line 267 
267  #  Read one fasta sequence at a time from a file.  #  Read one fasta sequence at a time from a file.
268  #  Return the contents as an array:  (id, description, seq)  #  Return the contents as an array:  (id, description, seq)
269  #  #
270  #     @seq_entry = read_next_fasta_seq(*FILEHANDLE)  #     @seq_entry = read_next_fasta_seq( \*FILEHANDLE )
271  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
272  #  Reading always overshoots, so save next id and description  #  Reading always overshoots, so save next id and description
273    
# Line 188  Line 276 
276      my %next_header;      my %next_header;
277    
278      sub read_next_fasta_seq {      sub read_next_fasta_seq {
         wantarray || die "read_next_fasta_seq requires list context\n";  
   
279          my $fh = shift;          my $fh = shift;
280          my ( $id, $desc );          my ( $id, $desc );
281    
# Line 206  Line 292 
292              chomp;              chomp;
293              if ( /^>/ ) {        #  new id              if ( /^>/ ) {        #  new id
294                  $next_header{$fh} = $_;                  $next_header{$fh} = $_;
295                  if ( defined($id) && $seq ) { return ($id, $desc, $seq) }                  if ( defined($id) && $seq )
296                    {
297                        return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq]
298                    }
299                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
300                  $seq = "";                  $seq = "";
301              }              }
# Line 219  Line 308 
308          #  Done with file, delete "next header"          #  Done with file, delete "next header"
309    
310          delete $next_header{$fh};          delete $next_header{$fh};
311          return (defined($id) && $seq) ? ($id, $desc, $seq) : () ;          return ( defined($id) && $seq ) ? ( wantarray ? ($id, $desc, $seq)
312                                                          : [$id, $desc, $seq]
313                                              )
314                                            : () ;
315      }      }
316  }  }
317    
318    
319  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
320    #  Read a clustal alignment from a file.
321    #  Save the contents in a list of refs to arrays:  (id, description, seq)
322    #
323    #     @seq_entries = read_clustal_file( $filename )
324    #-----------------------------------------------------------------------------
325    sub read_clustal_file { read_clustal( @_ ) }
326    
327    
328    #-----------------------------------------------------------------------------
329    #  Read a clustal alignment.
330    #  Save the contents in a list of refs to arrays:  (id, description, seq)
331    #
332    #     @seq_entries = read_clustal( )              # STDIN
333    #     @seq_entries = read_clustal( \*FILEHANDLE )
334    #     @seq_entries = read_clustal(  $filename )
335    #-----------------------------------------------------------------------------
336    sub read_clustal {
337        my ( $fh, undef, $close, $unused ) = input_filehandle( shift );
338        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_clustal_file\n";
339    
340        my ( %seq, @ids, $line );
341        while ( defined( $line = <$fh> ) )
342        {
343            ( $line =~ /^[A-Za-z0-9]/ ) or next;
344            chomp $line;
345            my @flds = split /\s+/, $line;
346            if ( @flds == 2 )
347            {
348                $seq{ $flds[0] } or push @ids, $flds[0];
349                push @{ $seq{ $flds[0] } }, $flds[1];
350            }
351        }
352        close( $fh ) if $close;
353    
354        map { [ $_, "", join( "", @{$seq{$_}} ) ] } @ids;
355    }
356    
357    
358    #-----------------------------------------------------------------------------
359  #  Parse a fasta file header to id and definition parts  #  Parse a fasta file header to id and definition parts
360  #  #
361  #     ($id, $def) = parse_fasta_title( $title )  #     ($id, $def) = parse_fasta_title( $title )
# Line 250  Line 381 
381    
382    
383  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
384  #  Print list of sequence entries in fasta format  #  Helper function for defining an output filehandle:
385    #     filehandle is passed through
386    #     string is taken as file name to be openend
387    #     undef or "" defaults to STDOUT
388    #
389    #    ( \*FH, $name, $close [, $file] ) = output_filehandle( $file );
390  #  #
 #     print_seq_list_as_fasta(*FILEHANDLE, @seq_entry_list);  
391  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
392  sub print_seq_list_as_fasta {  sub output_filehandle
393      my $fh = shift;  {
394      my @seq_list = @_;      my $file = shift;
395    
396        #  FILEHANDLE
397    
398        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
399    
400        #  Null string or undef
401    
402        return ( \*STDOUT, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
403    
404        #  File name
405    
406        if ( ! ref( $file ) )
407        {
408            my $fh;
409            open( $fh, ">$file" ) || die "Could not open output $file\n";
410            return ( $fh, $file, 1 );
411        }
412    
413        #  Some other kind of reference; return the unused value
414    
415        return ( \*STDOUT, undef, 0, $file );
416    }
417    
418    
419    #-----------------------------------------------------------------------------
420    #  Legacy function for printing fasta sequence set:
421    #
422    #     print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );
423    #-----------------------------------------------------------------------------
424    sub print_seq_list_as_fasta { print_alignment_as_fasta( @_ ) }
425    
426    
427    #-----------------------------------------------------------------------------
428    #  Print list of sequence entries in fasta format.
429    #  Missing, undef or "" filename defaults to STDOUT.
430    #
431    #     print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
432    #     print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
433    #     print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
434    #     print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
435    #     print_alignment_as_fasta(  $filename,    @seq_entry_list );
436    #     print_alignment_as_fasta(  $filename,   \@seq_entry_list );
437    #-----------------------------------------------------------------------------
438    sub print_alignment_as_fasta {
439        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
440        ( unshift @_, $unused ) if $unused;
441    
442        ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_fasta\n";
443    
444        #  Expand the sequence entry list if necessary:
445    
446        if ( ref( $_[0]->[0] ) eq "ARRAY" ) { @_ = @{ $_[0] } }
447    
448        foreach my $seq_ptr ( @_ ) { print_seq_as_fasta( $fh, @$seq_ptr ) }
449    
450        close( $fh ) if $close;
451    }
452    
453    
454    #-----------------------------------------------------------------------------
455    #  Print list of sequence entries in phylip format.
456    #  Missing, undef or "" filename defaults to STDOUT.
457    #
458    #     print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
459    #     print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
460    #     print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
461    #     print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
462    #     print_alignment_as_phylip(  $filename,    @seq_entry_list );
463    #     print_alignment_as_phylip(  $filename,   \@seq_entry_list );
464    #-----------------------------------------------------------------------------
465    sub print_alignment_as_phylip {
466        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
467        ( unshift @_, $unused ) if $unused;
468    
469        ( ref( $_[0] ) eq "ARRAY" ) or die die "Bad sequence entry passed to print_alignment_as_phylip\n";
470    
471        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
472    
473        my ( %id2, %used );
474        my $maxlen = 0;
475        foreach ( @seq_list )
476        {
477            my ( $id, undef, $seq ) = @$_;
478    
479            #  Need a name that is unique within 10 characters
480    
481            my $id2 = substr( $id, 0, 10 );
482            $id2 =~ s/_/ /g;  # PHYLIP sequence files accept spaces
483            my $n = "0";
484            while ( $used{ $id2 } )
485            {
486                $n++;
487                $id2 = substr( $id, 0, 10 - length( $n ) ) . $n;
488            }
489            $used{ $id2 } = 1;
490            $id2{ $id } = $id2;
491    
492      foreach my $seq_ptr (@seq_list) {                  #  Prepare to pad sequences (should not be necessary, but ...)
493          print_seq_as_fasta($fh, @$seq_ptr);  
494            my $len = length( $seq );
495            $maxlen = $len if ( $len > $maxlen );
496        }
497    
498        my $nseq = @seq_list;
499        print $fh "$nseq  $maxlen\n";
500        foreach ( @seq_list )
501        {
502            my ( $id, undef, $seq ) = @$_;
503            my $len = length( $seq );
504            printf $fh "%-10s  %s%s\n", $id2{ $id },
505                                        $seq,
506                                        $len<$maxlen ? ("?" x ($maxlen-$len)) : "";
507      }      }
508    
509        close( $fh ) if $close;
510    }
511    
512    
513    #-----------------------------------------------------------------------------
514    #  Print list of sequence entries in nexus format.
515    #  Missing, undef or "" filename defaults to STDOUT.
516    #
517    #     print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
518    #     print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
519    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
520    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
521    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
522    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
523    #-----------------------------------------------------------------------------
524    sub print_alignment_as_nexus {
525        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
526        ( unshift @_, $unused ) if $unused;
527    
528        my $lbls = ( ref( $_[0] ) eq "HASH" ) ? shift : undef;
529    
530        ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_nexus\n";
531    
532        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
533    
534        my %id2;
535        my ( $maxidlen, $maxseqlen ) = ( 0, 0 );
536        my ( $n1, $n2, $nt, $nu ) = ( 0, 0, 0, 0 );
537        foreach ( @seq_list )
538        {
539            my ( $id, undef, $seq ) = @$_;
540            my $id2 = $lbls ? ( $lbls->{ $id } || $id ) : $id;
541            if ( $id2 !~ /^[-+.0-9A-Za-z~_|]+$/ )
542            {
543                    $id2 =~ s/'/''/g;
544                    $id2 = qq('$id2');
545                }
546            $id2{ $id } = $id2;
547            my $idlen = length( $id2 );
548            $maxidlen = $idlen if ( $idlen > $maxidlen );
549    
550            my $seqlen = length( $seq );
551            $maxseqlen = $seqlen if ( $seqlen > $maxseqlen );
552    
553            $nt += $seq =~ tr/Tt//d;
554            $nu += $seq =~ tr/Uu//d;
555            $n1 += $seq =~ tr/ACGNacgn//d;
556            $n2 += $seq =~ tr/A-Za-z//d;
557        }
558    
559        my $nseq = @seq_list;
560        my $type = ( $n1 < 2 * $n2 ) ?  'protein' : ($nt>$nu) ? 'DNA' : 'RNA';
561    
562        print $fh <<"END_HEAD";
563    #NEXUS
564    
565    BEGIN Data;
566        Dimensions
567            NTax=$nseq
568            NChar=$maxseqlen
569            ;
570        Format
571            DataType=$type
572            Gap=-
573            Missing=?
574            ;
575        Matrix
576    
577    END_HEAD
578    
579        foreach ( @seq_list )
580        {
581            my ( $id, undef, $seq ) = @$_;
582            my $len = length( $seq );
583            printf  $fh  "%-${maxidlen}s  %s%s\n",
584                         $id2{ $id },
585                         $seq,
586                         $len<$maxseqlen ? ("?" x ($maxseqlen-$len)) : "";
587        }
588    
589        print $fh <<"END_TAIL";
590    ;
591    END;
592    END_TAIL
593    
594        close( $fh ) if $close;
595  }  }
596    
597    
598  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
599  #  Print one sequence in fasta format to an open file  #  Print one sequence in fasta format to an open file
600  #  #
601  #     print_seq_as_fasta(*FILEHANDLE, $id, $desc, $seq);  #     print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq );
602  #     print_seq_as_fasta(*FILEHANDLE, @seq_entry);  #     print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
603  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
604  sub print_seq_as_fasta {  sub print_seq_as_fasta {
605      my $fh = shift;      my $fh = shift;
# Line 285  Line 616 
616  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
617  #  Print one sequence in GenBank flat file format:  #  Print one sequence in GenBank flat file format:
618  #  #
619  #     print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #     print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq )
620  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
621  sub print_gb_locus {  sub print_gb_locus {
622      my ($fh, $loc, $def, $acc, $seq) = @_;      my ($fh, $loc, $def, $acc, $seq) = @_;
# Line 311  Line 642 
642    
643    
644  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
645    #  Return a string with text wrapped to defined line lengths:
646    #
647    #     $wrapped_text = wrap_text( $str )                  # default len   =  80
648    #     $wrapped_text = wrap_text( $str, $len )            # default ind   =   0
649    #     $wrapped_text = wrap_text( $str, $len, $indent )   # default ind_n = ind
650    #     $wrapped_text = wrap_text( $str, $len, $indent_1, $indent_n )
651    #-----------------------------------------------------------------------------
652    sub wrap_text {
653        my ($str, $len, $ind, $indn) = @_;
654    
655        defined($str)  || die "wrap_text called without a string\n";
656        defined($len)  || ($len  =   80);
657        defined($ind)  || ($ind  =    0);
658        ($ind  < $len) || die "wrap error: indent greater than line length\n";
659        defined($indn) || ($indn = $ind);
660        ($indn < $len) || die "wrap error: indent_n greater than line length\n";
661    
662        $str =~ s/\s+$//;
663        $str =~ s/^\s+//;
664        my ($maxchr, $maxchr1);
665        my (@lines) = ();
666    
667        while ($str) {
668            $maxchr1 = ($maxchr = $len - $ind) - 1;
669            if ($maxchr >= length($str)) {
670                push @lines, (" " x $ind) . $str;
671                last;
672            }
673            elsif ($str =~ /^(.{0,$maxchr1}\S)\s+(\S.*)$/) { # no expr in {}
674                push @lines, (" " x $ind) . $1;
675                $str = $2;
676            }
677            elsif ($str =~ /^(.{0,$maxchr1}-)(.*)$/) {
678                push @lines, (" " x $ind) . $1;
679                $str = $2;
680            }
681            else {
682                push @lines, (" " x $ind) . substr($str, 0, $maxchr);
683                $str = substr($str, $maxchr);
684            }
685            $ind = $indn;
686        }
687    
688        return join("\n", @lines);
689    }
690    
691    
692    #-----------------------------------------------------------------------------
693  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
694  #  #
695  #     my \%seq_ind  = index_seq_list(@seq_list);  #     my \%seq_ind  = index_seq_list(@seq_list);
696    #     my \%seq_ind  = index_seq_list( \@seq_list );
697  #  #
698  #  Usage example:  #  Usage example:
699  #  #
700  #  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries  #  my  @seq_list   = read_fasta_seqs(\*STDIN);  # list of pointers to entries
701  #  my \%seq_ind   = index_seq_list(@seq_list); # hash from names to pointers  #  my \%seq_ind   = index_seq_list(@seq_list); # hash from names to pointers
702  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry
703  #  #
704  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
705  sub index_seq_list {  sub index_seq_list {
706      my %seq_index = map { @{$_}[0] => $_ } @_;      ( ref( $_[0] )      ne 'ARRAY' ) ? {}
707      return \%seq_index;    : ( ref( $_[0]->[0] ) ne 'ARRAY' ) ? { map { $_->[0] => $_ } @_ }
708      :                                    { map { $_->[0] => $_ } @{ $_[0] } }
709  }  }
710    
711    
712  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
713  #  Three routines to access all or part of sequence entry by id:  #  Three routines to access all or part of sequence entry by id:
714  #  #
715  #     my @seq_entry  = seq_entry_by_id( \%seq_index, $seq_id );  #     @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
716  #     my $seq_desc   = seq_desc_by_id(  \%seq_index, $seq_id );  #    \@seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
717  #     my $seq        = seq_data_by_id(  \%seq_index, $seq_id );  #     $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
718    #     $seq       = seq_data_by_id(  \%seq_index, $seq_id );
719  #  #
720  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
721  sub seq_entry_by_id {  sub seq_entry_by_id {
722      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";
723      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";
724      wantarray || die "entry_by_id requires list context\n";      return wantarray ? @{ $ind_ref->{$id} } : $ind_ref->{$id};
     return @{ $ind_ref->{$id} };  
725  }  }
726    
727    
# Line 357  Line 738 
738      return ${ $ind_ref->{$id} }[2];      return ${ $ind_ref->{$id} }[2];
739  }  }
740    
741    #-----------------------------------------------------------------------------
742    #  Remove columns of alignment gaps from sequences:
743    #
744    #   @packed_seqs = pack_alignment(  @seqs )
745    #   @packed_seqs = pack_alignment( \@seqs )
746    #  \@packed_seqs = pack_alignment(  @seqs )
747    #  \@packed_seqs = pack_alignment( \@seqs )
748    #
749    #-----------------------------------------------------------------------------
750    
751    sub pack_alignment
752    {
753        my @seqs = ( ref( $_[0] ) eq 'ARRAY' and ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
754        @seqs or return wantarray ? () : [];
755    
756        my $mask  = pack_mask( $seqs[0]->[2] );
757        foreach ( @seqs[ 1 .. (@seqs-1) ] )
758        {
759            $mask |= pack_mask( $_->[2] );
760        }
761    
762        my $seq;
763        my @seqs2 = map { $seq = $_->[2] & $mask;
764                          $seq =~ tr/\000//d;
765                          [ $_->[0], $_->[1], $seq ]
766                        }
767                    @seqs;
768    
769        return wantarray ? @seqs2 : \@seqs2;
770    }
771    
772    sub pack_mask
773    {
774        my $mask = shift;
775        $mask =~ tr/-/\000/;
776        $mask =~ tr/\000/\377/c;
777        return $mask;
778    }
779    
780  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
781  #  Some simple sequence manipulations:  #  Some simple sequence manipulations:
# Line 849  Line 1268 
1268  );  );
1269    
1270    
1271  %one_letter_to_three_letter_aa = {  %one_letter_to_three_letter_aa = (
1272           A  => "Ala", a  => "Ala",           A  => "Ala", a  => "Ala",
1273           B  => "Asx", b  => "Asx",           B  => "Asx", b  => "Asx",
1274           C  => "Cys", c  => "Cys",           C  => "Cys", c  => "Cys",
# Line 875  Line 1294 
1294           Y  => "Tyr", y  => "Tyr",           Y  => "Tyr", y  => "Tyr",
1295           Z  => "Glx", z  => "Glx",           Z  => "Glx", z  => "Glx",
1296          '*' => "***"          '*' => "***"
1297          };          );
1298    
1299    
1300  %three_letter_to_one_letter_aa = (  %three_letter_to_one_letter_aa = (
# Line 1134  Line 1553 
1553  #  Read a list of intervals from a file.  #  Read a list of intervals from a file.
1554  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1555  #  #
1556  #     @intervals = read_intervals( *FILEHANDLE )  #     @intervals = read_intervals( \*FILEHANDLE )
1557  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1558  sub read_intervals {  sub read_intervals {
1559      my $fh = shift;      my $fh = shift;
# Line 1230  Line 1649 
1649  #  Read a list of oriented intervals from a file.  #  Read a list of oriented intervals from a file.
1650  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1651  #  #
1652  #     @intervals = read_oriented_intervals( *FILEHANDLE )  #     @intervals = read_oriented_intervals( \*FILEHANDLE )
1653  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1654  sub read_oriented_intervals {  sub read_oriented_intervals {
1655      my $fh = shift;      my $fh = shift;
# Line 1261  Line 1680 
1680  }  }
1681    
1682    
1683    #-----------------------------------------------------------------------------
1684    #  Convert GenBank locations to SEED locations
1685    #
1686    #     @seed_locs = gb_location_2_seed( $contig, @gb_locs )
1687    #-----------------------------------------------------------------------------
1688    sub gb_location_2_seed
1689    {
1690        my $contig = shift @_;
1691        $contig or die "First arg of gb_location_2_seed must be contig_id\n";
1692    
1693        map { join( ',', gb_loc_2_seed_2( $contig, $_ ) ) || undef } @_
1694    }
1695    
1696    sub gb_loc_2_seed_2
1697    {
1698        my ( $contig, $loc ) = @_;
1699    
1700        if ( $loc =~ /^(\d+)\.\.(\d+)$/ )
1701        {
1702            join( '_', $contig, $1, $2 )
1703        }
1704    
1705        elsif ( $loc =~ /^join\((.*)\)$/ )
1706        {
1707            $loc = $1;
1708            my $lvl = 0;
1709            for ( my $i = length( $loc )-1; $i >= 0; $i-- )
1710            {
1711                for ( substr( $loc, $i, 1 ) )
1712                {
1713                    /,/ && ! $lvl and substr( $loc, $i, 1 ) = "\t";
1714                    /\(/          and $lvl--;
1715                    /\)/          and $lvl++;
1716                }
1717            }
1718            $lvl == 0 or print STDERR "Paren matching error: $loc\n" and die;
1719            map { gb_loc_2_seed_2( $contig, $_ ) } split /\t/, $loc
1720        }
1721    
1722        elsif ( $loc =~ /^complement\((.*)\)$/ )
1723        {
1724            map { s/_(\d+)_(\d+)$/_$2_$1/; $_ }
1725            reverse
1726            gb_loc_2_seed_2( $contig, $1 )
1727        }
1728    
1729        else
1730        {
1731            ()
1732        }
1733    }
1734    
1735    
1736    #-----------------------------------------------------------------------------
1737    #  Read qual.
1738    #
1739    #  Save the contents in a list of refs to arrays: [ $id, $descript, \@qual ]
1740    #
1741    #     @seq_entries = read_qual( )               #  STDIN
1742    #    \@seq_entries = read_qual( )               #  STDIN
1743    #     @seq_entries = read_qual( \*FILEHANDLE )
1744    #    \@seq_entries = read_qual( \*FILEHANDLE )
1745    #     @seq_entries = read_qual(  $filename )
1746    #    \@seq_entries = read_qual(  $filename )
1747    #-----------------------------------------------------------------------------
1748    sub read_qual {
1749        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
1750        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_qual\n";
1751    
1752        my @quals = ();
1753        my ($id, $desc, $qual) = ("", "", []);
1754    
1755        while ( <$fh> ) {
1756            chomp;
1757            if (/^>\s*(\S+)(\s+(.*))?$/) {        #  new id
1758                if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1759                ($id, $desc, $qual) = ($1, $3 ? $3 : "", []);
1760            }
1761            else {
1762                push @$qual, split;
1763            }
1764        }
1765        close( $fh ) if $close;
1766    
1767        if ($id && @$qual) { push @quals, [ $id, $desc, $qual ] }
1768        return wantarray ? @quals : \@quals;
1769    }
1770    
1771    
1772  1;  1;

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.7

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3