[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Mon Dec 5 19:06:30 2005 UTC revision 1.6, Sun Jun 10 17:24:38 2007 UTC
# Line 1  Line 1 
 #  
 # Copyright (c) 2003-2006 University of Chicago and Fellowship  
 # for Interpretations of Genomes. All Rights Reserved.  
 #  
 # This file is part of the SEED Toolkit.  
 #  
 # The SEED Toolkit is free software. You can redistribute  
 # it and/or modify it under the terms of the SEED Toolkit  
 # Public License.  
 #  
 # You should have received a copy of the SEED Toolkit Public License  
 # along with this program; if not write to the University of Chicago  
 # at info@ci.uchicago.edu or the Fellowship for Interpretation of  
 # Genomes at veronika@thefig.info or download a copy from  
 # http://www.theseed.org/LICENSE.TXT.  
 #  
   
1  package gjoseqlib;  package gjoseqlib;
2    
3  #  A sequence entry is ( $id, $def, $seq )  #  A sequence entry is ( $id, $def, $seq )
4  #  A list of entries is a list of references  #  A list of entries is a list of references
5  #  #
6  #  @seq_entry = read_next_fasta_seq( *FILEHANDLE )  #  @seq_entry   = read_next_fasta_seq( \*FILEHANDLE )
7  #  @seq_entries = read_fasta_seqs( *FILEHANDLE )  #  @seq_entries = read_fasta_seqs( \*FILEHANDLE )   # Original form
8  #  $seq_ind = index_seq_list( @seq_entries ); # hash from ids to entry refs  #  @seq_entries = read_fasta( )                     # STDIN
9    #  @seq_entries = read_fasta( \*FILEHANDLE )
10    #  @seq_entries = read_fasta(  $filename )
11    #  @seq_entries = read_clustal( )                   # STDIN
12    #  @seq_entries = read_clustal( \*FILEHANDLE )
13    #  @seq_entries = read_clustal(  $filename )
14    #  @seq_entries = read_clustal_file(  $filename )
15  #  #
16    #  $seq_ind   = index_seq_list( @seq_entries );   # hash from ids to entries
17  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
18  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
19  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );
# Line 31  Line 21 
21  #  ( $id, $def ) = parse_fasta_title( $title )  #  ( $id, $def ) = parse_fasta_title( $title )
22  #  ( $id, $def ) = split_fasta_title( $title )  #  ( $id, $def ) = split_fasta_title( $title )
23  #  #
24  #  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list );  #  print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );  # Original form
25  #  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq) ;  #  print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
26  #  print_seq_as_fasta( *FILEHANDLE, @seq_entry );  #  print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
27  #  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #  print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
28    #  print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
29    #  print_alignment_as_fasta(  $filename,    @seq_entry_list );
30    #  print_alignment_as_fasta(  $filename,   \@seq_entry_list );
31    #  print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
32    #  print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
33    #  print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
34    #  print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
35    #  print_alignment_as_phylip(  $filename,    @seq_entry_list );
36    #  print_alignment_as_phylip(  $filename,   \@seq_entry_list );
37    #  print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
38    #  print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
39    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
40    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
41    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
42    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
43    #  print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq) ;
44    #  print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
45    #  print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq );
46    #
47    #   @packed_seqs = pack_alignment(  @seqs )
48    #   @packed_seqs = pack_alignment( \@seqs )
49    #  \@packed_seqs = pack_alignment(  @seqs )
50    #  \@packed_seqs = pack_alignment( \@seqs )
51  #  #
52  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
53  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
# Line 57  Line 70 
70  #  Locations (= oriented intervals) are ( id, start, end )  #  Locations (= oriented intervals) are ( id, start, end )
71  #  Intervals are ( id, left, right )  #  Intervals are ( id, left, right )
72  #  #
73  #  @intervals = read_intervals( *FILEHANDLE )  #  @intervals = read_intervals( \*FILEHANDLE )
74  #  @intervals = read_oriented_intervals( *FILEHANDLE )  #  @intervals = read_oriented_intervals( \*FILEHANDLE )
75  #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)  #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)
76  #  @joined    = join_intervals( @interval_refs )  #  @joined    = join_intervals( @interval_refs )
77  #  @intervals = locations_2_intervals( @locations )  #  @intervals = locations_2_intervals( @locations )
78  #  $interval  = locations_2_intervals( $location  )  #  $interval  = locations_2_intervals( $location  )
79  #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)  #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)
80    #
81    #  Convert GenBank locations to SEED locations
82    #
83    #  @seed_locs = gb_location_2_seed( $contig, @gb_locs )
84    
85    
86  use strict;  use strict;
# Line 92  Line 109 
109  our @ISA = qw(Exporter);  our @ISA = qw(Exporter);
110  our @EXPORT = qw(  our @EXPORT = qw(
111          read_fasta_seqs          read_fasta_seqs
112            read_fasta
113          read_next_fasta_seq          read_next_fasta_seq
114            read_clustal_file
115            read_clustal
116          parse_fasta_title          parse_fasta_title
117          split_fasta_title          split_fasta_title
118          print_seq_list_as_fasta          print_seq_list_as_fasta
119            print_alignment_as_fasta
120            print_alignment_as_phylip
121            print_alignment_as_nexus
122          print_seq_as_fasta          print_seq_as_fasta
123          print_gb_locus          print_gb_locus
124    
# Line 104  Line 127 
127          seq_desc_by_id          seq_desc_by_id
128          seq_data_by_id          seq_data_by_id
129    
130            pack_alignment
131    
132          subseq_DNA_entry          subseq_DNA_entry
133          subseq_RNA_entry          subseq_RNA_entry
134          complement_DNA_entry          complement_DNA_entry
# Line 125  Line 150 
150          locations_2_intervals          locations_2_intervals
151          read_oriented_intervals          read_oriented_intervals
152          reverse_intervals          reverse_intervals
153    
154            gb_location_2_seed
155          );          );
156    
157  our @EXPORT_OK = qw(  our @EXPORT_OK = qw(
# Line 146  Line 173 
173    
174    
175  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
176  #  Read fasta sequences from a file.  #  Helper function for defining an input filehandle:
177    #     filehandle is passed through
178    #     string is taken as file name to be openend
179    #     undef or "" defaults to STDOUT
180    #
181    #    ( \*FH, $name, $close [, $file] ) = input_filehandle( $file );
182    #
183    #-----------------------------------------------------------------------------
184    sub input_filehandle
185    {
186        my $file = shift;
187    
188        #  FILEHANDLE
189    
190        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
191    
192        #  Null string or undef
193    
194        return ( \*STDIN, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
195    
196        #  File name
197    
198        if ( ! ref( $file ) )
199        {
200            my $fh;
201            -f $file or die "Could not find input file \"$file\"\n";
202            open( $fh, "<$file" ) || die "Could not open \"$file\" for input\n";
203            return ( $fh, $file, 1 );
204        }
205    
206        #  Some other kind of reference; return the unused value
207    
208        return ( \*STDIN, undef, 0, $file );
209    }
210    
211    
212    #-----------------------------------------------------------------------------
213    #  Read fasta sequences from a filehandle (legacy interface; use read_fasta)
214  #  Save the contents in a list of refs to arrays:  (id, description, seq)  #  Save the contents in a list of refs to arrays:  (id, description, seq)
215  #  #
216  #     @seqs = read_fasta_seqs(*FILEHANDLE)  #     @seq_entries = read_fasta_seqs( \*FILEHANDLE )
217  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
218  sub read_fasta_seqs {  sub read_fasta_seqs { read_fasta( @_ ) }
219      my $fh = shift;  
220      wantarray || die "read_fasta_seqs requires list context\n";  
221    #-----------------------------------------------------------------------------
222    #  Read fasta sequences.
223    #  Save the contents in a list of refs to arrays:  (id, description, seq)
224    #
225    #     @seq_entries = read_fasta( )               #  STDIN
226    #    \@seq_entries = read_fasta( )               #  STDIN
227    #     @seq_entries = read_fasta( \*FILEHANDLE )
228    #    \@seq_entries = read_fasta( \*FILEHANDLE )
229    #     @seq_entries = read_fasta(  $filename )
230    #    \@seq_entries = read_fasta(  $filename )
231    #-----------------------------------------------------------------------------
232    sub read_fasta {
233        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
234        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n";
235    
236      my @seqs = ();      my @seqs = ();
237      my ($id, $desc, $seq) = ("", "", "");      my ($id, $desc, $seq) = ("", "", "");
# Line 169  Line 247 
247              $seq .= $_ ;              $seq .= $_ ;
248          }          }
249      }      }
250        close( $fh ) if $close;
251    
252      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }
253      return @seqs;      return wantarray ? @seqs : \@seqs;
254  }  }
255    
256    
# Line 179  Line 258 
258  #  Read one fasta sequence at a time from a file.  #  Read one fasta sequence at a time from a file.
259  #  Return the contents as an array:  (id, description, seq)  #  Return the contents as an array:  (id, description, seq)
260  #  #
261  #     @seq_entry = read_next_fasta_seq(*FILEHANDLE)  #     @seq_entry = read_next_fasta_seq( \*FILEHANDLE )
262  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
263  #  Reading always overshoots, so save next id and description  #  Reading always overshoots, so save next id and description
264    
# Line 188  Line 267 
267      my %next_header;      my %next_header;
268    
269      sub read_next_fasta_seq {      sub read_next_fasta_seq {
         wantarray || die "read_next_fasta_seq requires list context\n";  
   
270          my $fh = shift;          my $fh = shift;
271          my ( $id, $desc );          my ( $id, $desc );
272    
# Line 206  Line 283 
283              chomp;              chomp;
284              if ( /^>/ ) {        #  new id              if ( /^>/ ) {        #  new id
285                  $next_header{$fh} = $_;                  $next_header{$fh} = $_;
286                  if ( defined($id) && $seq ) { return ($id, $desc, $seq) }                  if ( defined($id) && $seq )
287                    {
288                        return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq]
289                    }
290                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
291                  $seq = "";                  $seq = "";
292              }              }
# Line 219  Line 299 
299          #  Done with file, delete "next header"          #  Done with file, delete "next header"
300    
301          delete $next_header{$fh};          delete $next_header{$fh};
302          return (defined($id) && $seq) ? ($id, $desc, $seq) : () ;          return ( defined($id) && $seq ) ? ( wantarray ? ($id, $desc, $seq)
303                                                          : [$id, $desc, $seq]
304                                              )
305                                            : () ;
306      }      }
307  }  }
308    
309    
310  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
311    #  Read a clustal alignment from a file.
312    #  Save the contents in a list of refs to arrays:  (id, description, seq)
313    #
314    #     @seq_entries = read_clustal_file( $filename )
315    #-----------------------------------------------------------------------------
316    sub read_clustal_file { read_clustal( @_ ) }
317    
318    
319    #-----------------------------------------------------------------------------
320    #  Read a clustal alignment.
321    #  Save the contents in a list of refs to arrays:  (id, description, seq)
322    #
323    #     @seq_entries = read_clustal( )              # STDIN
324    #     @seq_entries = read_clustal( \*FILEHANDLE )
325    #     @seq_entries = read_clustal(  $filename )
326    #-----------------------------------------------------------------------------
327    sub read_clustal {
328        my ( $fh, undef, $close, $unused ) = input_filehandle( shift );
329        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_clustal_file\n";
330    
331        my ( %seq, @ids, $line );
332        while ( defined( $line = <$fh> ) )
333        {
334            ( $line =~ /^[A-Za-z0-9]/ ) or next;
335            chomp $line;
336            my @flds = split /\s+/, $line;
337            if ( @flds == 2 )
338            {
339                $seq{ $flds[0] } or push @ids, $flds[0];
340                push @{ $seq{ $flds[0] } }, $flds[1];
341            }
342        }
343        close( $fh ) if $close;
344    
345        map { [ $_, "", join( "", @{$seq{$_}} ) ] } @ids;
346    }
347    
348    
349    #-----------------------------------------------------------------------------
350  #  Parse a fasta file header to id and definition parts  #  Parse a fasta file header to id and definition parts
351  #  #
352  #     ($id, $def) = parse_fasta_title( $title )  #     ($id, $def) = parse_fasta_title( $title )
# Line 250  Line 372 
372    
373    
374  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
375  #  Print list of sequence entries in fasta format  #  Helper function for defining an output filehandle:
376    #     filehandle is passed through
377    #     string is taken as file name to be openend
378    #     undef or "" defaults to STDOUT
379    #
380    #    ( \*FH, $name, $close [, $file] ) = output_filehandle( $file );
381  #  #
 #     print_seq_list_as_fasta(*FILEHANDLE, @seq_entry_list);  
382  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
383  sub print_seq_list_as_fasta {  sub output_filehandle
384      my $fh = shift;  {
385      my @seq_list = @_;      my $file = shift;
386    
387        #  FILEHANDLE
388    
389        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
390    
391        #  Null string or undef
392    
393        return ( \*STDOUT, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
394    
395        #  File name
396    
397        if ( ! ref( $file ) )
398        {
399            my $fh;
400            open( $fh, ">$file" ) || die "Could not open output $file\n";
401            return ( $fh, $file, 1 );
402        }
403    
404        #  Some other kind of reference; return the unused value
405    
406      foreach my $seq_ptr (@seq_list) {      return ( \*STDOUT, undef, 0, $file );
         print_seq_as_fasta($fh, @$seq_ptr);  
407      }      }
408    
409    
410    #-----------------------------------------------------------------------------
411    #  Legacy function for printing fasta sequence set:
412    #
413    #     print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );
414    #-----------------------------------------------------------------------------
415    sub print_seq_list_as_fasta { print_alignment_as_fasta( @_ ) }
416    
417    
418    #-----------------------------------------------------------------------------
419    #  Print list of sequence entries in fasta format.
420    #  Missing, undef or "" filename defaults to STDOUT.
421    #
422    #     print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
423    #     print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
424    #     print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
425    #     print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
426    #     print_alignment_as_fasta(  $filename,    @seq_entry_list );
427    #     print_alignment_as_fasta(  $filename,   \@seq_entry_list );
428    #-----------------------------------------------------------------------------
429    sub print_alignment_as_fasta {
430        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
431        ( unshift @_, $unused ) if $unused;
432    
433        ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_fasta\n";
434    
435        #  Expand the sequence entry list if necessary:
436    
437        if ( ref( $_[0]->[0] ) eq "ARRAY" ) { @_ = @{ $_[0] } }
438    
439        foreach my $seq_ptr ( @_ ) { print_seq_as_fasta( $fh, @$seq_ptr ) }
440    
441        close( $fh ) if $close;
442    }
443    
444    
445    #-----------------------------------------------------------------------------
446    #  Print list of sequence entries in phylip format.
447    #  Missing, undef or "" filename defaults to STDOUT.
448    #
449    #     print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
450    #     print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
451    #     print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
452    #     print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
453    #     print_alignment_as_phylip(  $filename,    @seq_entry_list );
454    #     print_alignment_as_phylip(  $filename,   \@seq_entry_list );
455    #-----------------------------------------------------------------------------
456    sub print_alignment_as_phylip {
457        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
458        ( unshift @_, $unused ) if $unused;
459    
460        ( ref( $_[0] ) eq "ARRAY" ) or die die "Bad sequence entry passed to print_alignment_as_phylip\n";
461    
462        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
463    
464        my ( %id2, %used );
465        my $maxlen = 0;
466        foreach ( @seq_list )
467        {
468            my ( $id, undef, $seq ) = @$_;
469    
470            #  Need a name that is unique within 10 characters
471    
472            my $id2 = substr( $id, 0, 10 );
473            $id2 =~ s/_/ /g;  # PHYLIP sequence files accept spaces
474            my $n = "0";
475            while ( $used{ $id2 } )
476            {
477                $n++;
478                $id2 = substr( $id, 0, 10 - length( $n ) ) . $n;
479            }
480            $used{ $id2 } = 1;
481            $id2{ $id } = $id2;
482    
483                    #  Prepare to pad sequences (should not be necessary, but ...)
484    
485            my $len = length( $seq );
486            $maxlen = $len if ( $len > $maxlen );
487        }
488    
489        my $nseq = @seq_list;
490        print $fh "$nseq  $maxlen\n";
491        foreach ( @seq_list )
492        {
493            my ( $id, undef, $seq ) = @$_;
494            my $len = length( $seq );
495            printf $fh "%-10s  %s%s\n", $id2{ $id },
496                                        $seq,
497                                        $len<$maxlen ? ("?" x ($maxlen-$len)) : "";
498        }
499    
500        close( $fh ) if $close;
501    }
502    
503    
504    #-----------------------------------------------------------------------------
505    #  Print list of sequence entries in nexus format.
506    #  Missing, undef or "" filename defaults to STDOUT.
507    #
508    #     print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
509    #     print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
510    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
511    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
512    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
513    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
514    #-----------------------------------------------------------------------------
515    sub print_alignment_as_nexus {
516        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
517        ( unshift @_, $unused ) if $unused;
518    
519        my $lbls = ( ref( $_[0] ) eq "HASH" ) ? shift : undef;
520    
521        ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_nexus\n";
522    
523        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
524    
525        my %id2;
526        my ( $maxidlen, $maxseqlen ) = ( 0, 0 );
527        my ( $n1, $n2, $nt, $nu ) = ( 0, 0, 0, 0 );
528        foreach ( @seq_list )
529        {
530            my ( $id, undef, $seq ) = @$_;
531            my $id2 = $lbls ? ( $lbls->{ $id } || $id ) : $id;
532            if ( $id2 !~ /^[-+.0-9A-Za-z~_|]+$/ )
533            {
534                    $id2 =~ s/'/''/g;
535                    $id2 = qq('$id2');
536                }
537            $id2{ $id } = $id2;
538            my $idlen = length( $id2 );
539            $maxidlen = $idlen if ( $idlen > $maxidlen );
540    
541            my $seqlen = length( $seq );
542            $maxseqlen = $seqlen if ( $seqlen > $maxseqlen );
543    
544            $nt += $seq =~ tr/Tt//d;
545            $nu += $seq =~ tr/Uu//d;
546            $n1 += $seq =~ tr/ACGNacgn//d;
547            $n2 += $seq =~ tr/A-Za-z//d;
548        }
549    
550        my $nseq = @seq_list;
551        my $type = ( $n1 < 2 * $n2 ) ?  'protein' : ($nt>$nu) ? 'DNA' : 'RNA';
552    
553        print $fh <<"END_HEAD";
554    #NEXUS
555    
556    BEGIN Data;
557        Dimensions
558            NTax=$nseq
559            NChar=$maxseqlen
560            ;
561        Format
562            DataType=$type
563            Gap=-
564            Missing=?
565            ;
566        Matrix
567    
568    END_HEAD
569    
570        foreach ( @seq_list )
571        {
572            my ( $id, undef, $seq ) = @$_;
573            my $len = length( $seq );
574            printf  $fh  "%-${maxidlen}s  %s%s\n",
575                         $id2{ $id },
576                         $seq,
577                         $len<$maxseqlen ? ("?" x ($maxseqlen-$len)) : "";
578        }
579    
580        print $fh <<"END_TAIL";
581    ;
582    END;
583    END_TAIL
584    
585        close( $fh ) if $close;
586  }  }
587    
588    
589  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
590  #  Print one sequence in fasta format to an open file  #  Print one sequence in fasta format to an open file
591  #  #
592  #     print_seq_as_fasta(*FILEHANDLE, $id, $desc, $seq);  #     print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq );
593  #     print_seq_as_fasta(*FILEHANDLE, @seq_entry);  #     print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
594  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
595  sub print_seq_as_fasta {  sub print_seq_as_fasta {
596      my $fh = shift;      my $fh = shift;
# Line 285  Line 607 
607  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
608  #  Print one sequence in GenBank flat file format:  #  Print one sequence in GenBank flat file format:
609  #  #
610  #     print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #     print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq )
611  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
612  sub print_gb_locus {  sub print_gb_locus {
613      my ($fh, $loc, $def, $acc, $seq) = @_;      my ($fh, $loc, $def, $acc, $seq) = @_;
# Line 314  Line 636 
636  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
637  #  #
638  #     my \%seq_ind  = index_seq_list(@seq_list);  #     my \%seq_ind  = index_seq_list(@seq_list);
639    #     my \%seq_ind  = index_seq_list( \@seq_list );
640  #  #
641  #  Usage example:  #  Usage example:
642  #  #
643  #  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries  #  my  @seq_list   = read_fasta_seqs(\*STDIN);  # list of pointers to entries
644  #  my \%seq_ind   = index_seq_list(@seq_list); # hash from names to pointers  #  my \%seq_ind   = index_seq_list(@seq_list); # hash from names to pointers
645  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry
646  #  #
647  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
648  sub index_seq_list {  sub index_seq_list {
649      my %seq_index = map { @{$_}[0] => $_ } @_;      ( ref( $_[0] )      ne 'ARRAY' ) ? {}
650      return \%seq_index;    : ( ref( $_[0]->[0] ) ne 'ARRAY' ) ? { map { $_->[0] => $_ } @_ }
651      :                                    { map { $_->[0] => $_ } @{ $_[0] } }
652  }  }
653    
654    
655  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
656  #  Three routines to access all or part of sequence entry by id:  #  Three routines to access all or part of sequence entry by id:
657  #  #
658  #     my @seq_entry  = seq_entry_by_id( \%seq_index, $seq_id );  #     @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
659  #     my $seq_desc   = seq_desc_by_id(  \%seq_index, $seq_id );  #    \@seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
660  #     my $seq        = seq_data_by_id(  \%seq_index, $seq_id );  #     $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
661    #     $seq       = seq_data_by_id(  \%seq_index, $seq_id );
662  #  #
663  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
664  sub seq_entry_by_id {  sub seq_entry_by_id {
665      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";
666      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";
667      wantarray || die "entry_by_id requires list context\n";      return wantarray ? @{ $ind_ref->{$id} } : $ind_ref->{$id};
     return @{ $ind_ref->{$id} };  
668  }  }
669    
670    
# Line 357  Line 681 
681      return ${ $ind_ref->{$id} }[2];      return ${ $ind_ref->{$id} }[2];
682  }  }
683    
684    #-----------------------------------------------------------------------------
685    #  Remove columns of alignment gaps from sequences:
686    #
687    #   @packed_seqs = pack_alignment(  @seqs )
688    #   @packed_seqs = pack_alignment( \@seqs )
689    #  \@packed_seqs = pack_alignment(  @seqs )
690    #  \@packed_seqs = pack_alignment( \@seqs )
691    #
692    #-----------------------------------------------------------------------------
693    
694    sub pack_alignment
695    {
696        my @seqs = ( ref( $_[0] ) eq 'ARRAY' and ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
697        @seqs or return wantarray ? () : [];
698    
699        my $mask  = pack_mask( $seqs[0]->[2] );
700        foreach ( @seqs[ 1 .. (@seqs-1) ] )
701        {
702            $mask |= pack_mask( $_->[2] );
703        }
704    
705        my $seq;
706        my @seqs2 = map { $seq = $_->[2] & $mask;
707                          $seq =~ tr/\000//d;
708                          [ $_->[0], $_->[1], $seq ]
709                        }
710                    @seqs;
711    
712        return wantarray ? @seqs2 : \@seqs2;
713    }
714    
715    sub pack_mask
716    {
717        my $mask = shift;
718        $mask =~ tr/-/\000/;
719        $mask =~ tr/\000/\377/c;
720        return $mask;
721    }
722    
723  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
724  #  Some simple sequence manipulations:  #  Some simple sequence manipulations:
# Line 849  Line 1211 
1211  );  );
1212    
1213    
1214  %one_letter_to_three_letter_aa = {  %one_letter_to_three_letter_aa = (
1215           A  => "Ala", a  => "Ala",           A  => "Ala", a  => "Ala",
1216           B  => "Asx", b  => "Asx",           B  => "Asx", b  => "Asx",
1217           C  => "Cys", c  => "Cys",           C  => "Cys", c  => "Cys",
# Line 875  Line 1237 
1237           Y  => "Tyr", y  => "Tyr",           Y  => "Tyr", y  => "Tyr",
1238           Z  => "Glx", z  => "Glx",           Z  => "Glx", z  => "Glx",
1239          '*' => "***"          '*' => "***"
1240          };          );
1241    
1242    
1243  %three_letter_to_one_letter_aa = (  %three_letter_to_one_letter_aa = (
# Line 1134  Line 1496 
1496  #  Read a list of intervals from a file.  #  Read a list of intervals from a file.
1497  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1498  #  #
1499  #     @intervals = read_intervals( *FILEHANDLE )  #     @intervals = read_intervals( \*FILEHANDLE )
1500  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1501  sub read_intervals {  sub read_intervals {
1502      my $fh = shift;      my $fh = shift;
# Line 1230  Line 1592 
1592  #  Read a list of oriented intervals from a file.  #  Read a list of oriented intervals from a file.
1593  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1594  #  #
1595  #     @intervals = read_oriented_intervals( *FILEHANDLE )  #     @intervals = read_oriented_intervals( \*FILEHANDLE )
1596  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1597  sub read_oriented_intervals {  sub read_oriented_intervals {
1598      my $fh = shift;      my $fh = shift;
# Line 1261  Line 1623 
1623  }  }
1624    
1625    
1626    #-----------------------------------------------------------------------------
1627    #  Convert GenBank locations to SEED locations
1628    #
1629    #     @seed_locs = gb_location_2_seed( $contig, @gb_locs )
1630    #-----------------------------------------------------------------------------
1631    sub gb_location_2_seed
1632    {
1633        my $contig = shift @_;
1634        $contig or die "First arg of gb_location_2_seed must be contig_id\n";
1635    
1636        map { join( ',', gb_loc_2_seed_2( $contig, $_ ) ) || undef } @_
1637    }
1638    
1639    sub gb_loc_2_seed_2
1640    {
1641        my ( $contig, $loc ) = @_;
1642    
1643        if ( $loc =~ /^(\d+)\.\.(\d+)$/ )
1644        {
1645            join( '_', $contig, $1, $2 )
1646        }
1647    
1648        elsif ( $loc =~ /^join\((.*)\)$/ )
1649        {
1650            $loc = $1;
1651            my $lvl = 0;
1652            for ( my $i = length( $loc )-1; $i >= 0; $i-- )
1653            {
1654                for ( substr( $loc, $i, 1 ) )
1655                {
1656                    /,/ && ! $lvl and substr( $loc, $i, 1 ) = "\t";
1657                    /\(/          and $lvl--;
1658                    /\)/          and $lvl++;
1659                }
1660            }
1661            $lvl == 0 or print STDERR "Paren matching error: $loc\n" and die;
1662            map { gb_loc_2_seed_2( $contig, $_ ) } split /\t/, $loc
1663        }
1664    
1665        elsif ( $loc =~ /^complement\((.*)\)$/ )
1666        {
1667            map { s/_(\d+)_(\d+)$/_$2_$1/; $_ }
1668            reverse
1669            gb_loc_2_seed_2( $contig, $1 )
1670        }
1671    
1672        else
1673        {
1674            ()
1675        }
1676    }
1677    
1678    
1679  1;  1;

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.6

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3