[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Mon Dec 5 19:06:30 2005 UTC revision 1.4, Fri Dec 15 00:09:52 2006 UTC
# Line 1  Line 1 
 #  
 # Copyright (c) 2003-2006 University of Chicago and Fellowship  
 # for Interpretations of Genomes. All Rights Reserved.  
 #  
 # This file is part of the SEED Toolkit.  
 #  
 # The SEED Toolkit is free software. You can redistribute  
 # it and/or modify it under the terms of the SEED Toolkit  
 # Public License.  
 #  
 # You should have received a copy of the SEED Toolkit Public License  
 # along with this program; if not write to the University of Chicago  
 # at info@ci.uchicago.edu or the Fellowship for Interpretation of  
 # Genomes at veronika@thefig.info or download a copy from  
 # http://www.theseed.org/LICENSE.TXT.  
 #  
   
1  package gjoseqlib;  package gjoseqlib;
2    
3  #  A sequence entry is ( $id, $def, $seq )  #  A sequence entry is ( $id, $def, $seq )
4  #  A list of entries is a list of references  #  A list of entries is a list of references
5  #  #
6  #  @seq_entry = read_next_fasta_seq( *FILEHANDLE )  #  @seq_entry   = read_next_fasta_seq( \*FILEHANDLE )
7  #  @seq_entries = read_fasta_seqs( *FILEHANDLE )  #  @seq_entries = read_fasta_seqs( \*FILEHANDLE )   # Original form
8  #  $seq_ind = index_seq_list( @seq_entries ); # hash from ids to entry refs  #  @seq_entries = read_fasta( )                     # STDIN
9    #  @seq_entries = read_fasta( \*FILEHANDLE )
10    #  @seq_entries = read_fasta(  $filename )
11    #  @seq_entries = read_clustal( )                   # STDIN
12    #  @seq_entries = read_clustal( \*FILEHANDLE )
13    #  @seq_entries = read_clustal(  $filename )
14    #  @seq_entries = read_clustal_file(  $filename )
15  #  #
16    #  $seq_ind   = index_seq_list( @seq_entries );   # hash from ids to entries
17  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
18  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
19  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );
# Line 31  Line 21 
21  #  ( $id, $def ) = parse_fasta_title( $title )  #  ( $id, $def ) = parse_fasta_title( $title )
22  #  ( $id, $def ) = split_fasta_title( $title )  #  ( $id, $def ) = split_fasta_title( $title )
23  #  #
24  #  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list );  #  print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );  # Original form
25  #  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq) ;  #  print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
26  #  print_seq_as_fasta( *FILEHANDLE, @seq_entry );  #  print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
27  #  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #  print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
28    #  print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
29    #  print_alignment_as_fasta(  $filename,    @seq_entry_list );
30    #  print_alignment_as_fasta(  $filename,   \@seq_entry_list );
31    #  print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
32    #  print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
33    #  print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
34    #  print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
35    #  print_alignment_as_phylip(  $filename,    @seq_entry_list );
36    #  print_alignment_as_phylip(  $filename,   \@seq_entry_list );
37    #  print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
38    #  print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
39    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
40    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
41    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
42    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
43    #  print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq) ;
44    #  print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
45    #  print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq );
46  #  #
47  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
48  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
# Line 57  Line 65 
65  #  Locations (= oriented intervals) are ( id, start, end )  #  Locations (= oriented intervals) are ( id, start, end )
66  #  Intervals are ( id, left, right )  #  Intervals are ( id, left, right )
67  #  #
68  #  @intervals = read_intervals( *FILEHANDLE )  #  @intervals = read_intervals( \*FILEHANDLE )
69  #  @intervals = read_oriented_intervals( *FILEHANDLE )  #  @intervals = read_oriented_intervals( \*FILEHANDLE )
70  #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)  #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)
71  #  @joined    = join_intervals( @interval_refs )  #  @joined    = join_intervals( @interval_refs )
72  #  @intervals = locations_2_intervals( @locations )  #  @intervals = locations_2_intervals( @locations )
73  #  $interval  = locations_2_intervals( $location  )  #  $interval  = locations_2_intervals( $location  )
74  #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)  #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)
75    #
76    #  Convert GenBank locations to SEED locations
77    #
78    #  @seed_locs = gb_location_2_seed( $contig, @gb_locs )
79    
80    
81  use strict;  use strict;
# Line 92  Line 104 
104  our @ISA = qw(Exporter);  our @ISA = qw(Exporter);
105  our @EXPORT = qw(  our @EXPORT = qw(
106          read_fasta_seqs          read_fasta_seqs
107            read_fasta
108          read_next_fasta_seq          read_next_fasta_seq
109            read_clustal_file
110            read_clustal
111          parse_fasta_title          parse_fasta_title
112          split_fasta_title          split_fasta_title
113          print_seq_list_as_fasta          print_seq_list_as_fasta
114            print_alignment_as_fasta
115            print_alignment_as_phylip
116            print_alignment_as_nexus
117          print_seq_as_fasta          print_seq_as_fasta
118          print_gb_locus          print_gb_locus
119    
# Line 125  Line 143 
143          locations_2_intervals          locations_2_intervals
144          read_oriented_intervals          read_oriented_intervals
145          reverse_intervals          reverse_intervals
146    
147            gb_location_2_seed
148          );          );
149    
150  our @EXPORT_OK = qw(  our @EXPORT_OK = qw(
# Line 146  Line 166 
166    
167    
168  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
169  #  Read fasta sequences from a file.  #  Helper function for defining an input filehandle:
170    #     filehandle is passed through
171    #     string is taken as file name to be openend
172    #     undef or "" defaults to STDOUT
173    #
174    #    ( \*FH, $name, $close [, $file] ) = input_filehandle( $file );
175    #
176    #-----------------------------------------------------------------------------
177    sub input_filehandle
178    {
179        my $file = shift;
180    
181        #  FILEHANDLE
182    
183        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
184    
185        #  Null string or undef
186    
187        return ( \*STDIN, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
188    
189        #  File name
190    
191        if ( ! ref( $file ) )
192        {
193            my $fh;
194            -f $file or die "Could not find input file \"$file\"\n";
195            open( $fh, "<$file" ) || die "Could not open \"$file\" for input\n";
196            return ( $fh, $file, 1 );
197        }
198    
199        #  Some other kind of reference; return the unused value
200    
201        return ( \*STDIN, undef, 0, $file );
202    }
203    
204    
205    #-----------------------------------------------------------------------------
206    #  Read fasta sequences from a filehandle (legacy interface; use read_fasta)
207  #  Save the contents in a list of refs to arrays:  (id, description, seq)  #  Save the contents in a list of refs to arrays:  (id, description, seq)
208  #  #
209  #     @seqs = read_fasta_seqs(*FILEHANDLE)  #     @seq_entries = read_fasta_seqs( \*FILEHANDLE )
210  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
211  sub read_fasta_seqs {  sub read_fasta_seqs { read_fasta( @_ ) }
212      my $fh = shift;  
213      wantarray || die "read_fasta_seqs requires list context\n";  
214    #-----------------------------------------------------------------------------
215    #  Read fasta sequences.
216    #  Save the contents in a list of refs to arrays:  (id, description, seq)
217    #
218    #     @seq_entries = read_fasta( )               #  STDIN
219    #    \@seq_entries = read_fasta( )               #  STDIN
220    #     @seq_entries = read_fasta( \*FILEHANDLE )
221    #    \@seq_entries = read_fasta( \*FILEHANDLE )
222    #     @seq_entries = read_fasta(  $filename )
223    #    \@seq_entries = read_fasta(  $filename )
224    #-----------------------------------------------------------------------------
225    sub read_fasta {
226        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
227        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n";
228    
229      my @seqs = ();      my @seqs = ();
230      my ($id, $desc, $seq) = ("", "", "");      my ($id, $desc, $seq) = ("", "", "");
# Line 169  Line 240 
240              $seq .= $_ ;              $seq .= $_ ;
241          }          }
242      }      }
243        close( $fh ) if $close;
244    
245      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }
246      return @seqs;      return wantarray ? @seqs : \@seqs;
247  }  }
248    
249    
# Line 179  Line 251 
251  #  Read one fasta sequence at a time from a file.  #  Read one fasta sequence at a time from a file.
252  #  Return the contents as an array:  (id, description, seq)  #  Return the contents as an array:  (id, description, seq)
253  #  #
254  #     @seq_entry = read_next_fasta_seq(*FILEHANDLE)  #     @seq_entry = read_next_fasta_seq( \*FILEHANDLE )
255  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
256  #  Reading always overshoots, so save next id and description  #  Reading always overshoots, so save next id and description
257    
# Line 188  Line 260 
260      my %next_header;      my %next_header;
261    
262      sub read_next_fasta_seq {      sub read_next_fasta_seq {
         wantarray || die "read_next_fasta_seq requires list context\n";  
   
263          my $fh = shift;          my $fh = shift;
264          my ( $id, $desc );          my ( $id, $desc );
265    
# Line 206  Line 276 
276              chomp;              chomp;
277              if ( /^>/ ) {        #  new id              if ( /^>/ ) {        #  new id
278                  $next_header{$fh} = $_;                  $next_header{$fh} = $_;
279                  if ( defined($id) && $seq ) { return ($id, $desc, $seq) }                  if ( defined($id) && $seq )
280                    {
281                        return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq]
282                    }
283                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
284                  $seq = "";                  $seq = "";
285              }              }
# Line 219  Line 292 
292          #  Done with file, delete "next header"          #  Done with file, delete "next header"
293    
294          delete $next_header{$fh};          delete $next_header{$fh};
295          return (defined($id) && $seq) ? ($id, $desc, $seq) : () ;          return ( defined($id) && $seq ) ? ( wantarray ? ($id, $desc, $seq)
296                                                          : [$id, $desc, $seq]
297                                              )
298                                            : () ;
299      }      }
300  }  }
301    
302    
303  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
304    #  Read a clustal alignment from a file.
305    #  Save the contents in a list of refs to arrays:  (id, description, seq)
306    #
307    #     @seq_entries = read_clustal_file( $filename )
308    #-----------------------------------------------------------------------------
309    sub read_clustal_file { read_clustal( @_ ) }
310    
311    
312    #-----------------------------------------------------------------------------
313    #  Read a clustal alignment.
314    #  Save the contents in a list of refs to arrays:  (id, description, seq)
315    #
316    #     @seq_entries = read_clustal( )              # STDIN
317    #     @seq_entries = read_clustal( \*FILEHANDLE )
318    #     @seq_entries = read_clustal(  $filename )
319    #-----------------------------------------------------------------------------
320    sub read_clustal {
321        my ( $fh, undef, $close, $unused ) = input_filehandle( shift );
322        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_clustal_file\n";
323    
324        my ( %seq, @ids, $line );
325        while ( defined( $line = <$fh> ) )
326        {
327            ( $line =~ /^[A-Za-z0-9]/ ) or next;
328            chomp $line;
329            my @flds = split /\s+/, $line;
330            if ( @flds == 2 )
331            {
332                $seq{ $flds[0] } or push @ids, $flds[0];
333                push @{ $seq{ $flds[0] } }, $flds[1];
334            }
335        }
336        close( $fh ) if $close;
337    
338        map { [ $_, "", join( "", @{$seq{$_}} ) ] } @ids;
339    }
340    
341    
342    #-----------------------------------------------------------------------------
343  #  Parse a fasta file header to id and definition parts  #  Parse a fasta file header to id and definition parts
344  #  #
345  #     ($id, $def) = parse_fasta_title( $title )  #     ($id, $def) = parse_fasta_title( $title )
# Line 250  Line 365 
365    
366    
367  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
368  #  Print list of sequence entries in fasta format  #  Helper function for defining an output filehandle:
369    #     filehandle is passed through
370    #     string is taken as file name to be openend
371    #     undef or "" defaults to STDOUT
372    #
373    #    ( \*FH, $name, $close [, $file] ) = output_filehandle( $file );
374  #  #
 #     print_seq_list_as_fasta(*FILEHANDLE, @seq_entry_list);  
375  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
376  sub print_seq_list_as_fasta {  sub output_filehandle
377      my $fh = shift;  {
378      my @seq_list = @_;      my $file = shift;
379    
380        #  FILEHANDLE
381    
382        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
383    
384        #  Null string or undef
385    
386        return ( \*STDOUT, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
387    
388        #  File name
389    
390        if ( ! ref( $file ) )
391        {
392            my $fh;
393            open( $fh, ">$file" ) || die "Could not open output $file\n";
394            return ( $fh, $file, 1 );
395        }
396    
397        #  Some other kind of reference; return the unused value
398    
399        return ( \*STDOUT, undef, 0, $file );
400    }
401    
402    
403    #-----------------------------------------------------------------------------
404    #  Legacy function for printing fasta sequence set:
405    #
406    #     print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );
407    #-----------------------------------------------------------------------------
408    sub print_seq_list_as_fasta { print_alignment_as_fasta( @_ ) }
409    
410    
411    #-----------------------------------------------------------------------------
412    #  Print list of sequence entries in fasta format.
413    #  Missing, undef or "" filename defaults to STDOUT.
414    #
415    #     print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
416    #     print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
417    #     print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
418    #     print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
419    #     print_alignment_as_fasta(  $filename,    @seq_entry_list );
420    #     print_alignment_as_fasta(  $filename,   \@seq_entry_list );
421    #-----------------------------------------------------------------------------
422    sub print_alignment_as_fasta {
423        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
424        ( unshift @_, $unused ) if $unused;
425    
426      foreach my $seq_ptr (@seq_list) {      ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_fasta\n";
427          print_seq_as_fasta($fh, @$seq_ptr);  
428        #  Expand the sequence entry list if necessary:
429    
430        if ( ref( $_[0]->[0] ) eq "ARRAY" ) { @_ = @{ $_[0] } }
431    
432        foreach my $seq_ptr ( @_ ) { print_seq_as_fasta( $fh, @$seq_ptr ) }
433    
434        close( $fh ) if $close;
435      }      }
436    
437    
438    #-----------------------------------------------------------------------------
439    #  Print list of sequence entries in phylip format.
440    #  Missing, undef or "" filename defaults to STDOUT.
441    #
442    #     print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
443    #     print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
444    #     print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
445    #     print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
446    #     print_alignment_as_phylip(  $filename,    @seq_entry_list );
447    #     print_alignment_as_phylip(  $filename,   \@seq_entry_list );
448    #-----------------------------------------------------------------------------
449    sub print_alignment_as_phylip {
450        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
451        ( unshift @_, $unused ) if $unused;
452    
453        ( ref( $_[0] ) eq "ARRAY" ) or die die "Bad sequence entry passed to print_alignment_as_phylip\n";
454    
455        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
456    
457        my ( %id2, %used );
458        my $maxlen = 0;
459        foreach ( @seq_list )
460        {
461            my ( $id, undef, $seq ) = @$_;
462    
463            #  Need a name that is unique within 10 characters
464    
465            my $id2 = substr( $id, 0, 10 );
466            $id2 =~ s/_/ /g;  # PHYLIP sequence files accept spaces
467            my $n = "0";
468            while ( $used{ $id2 } )
469            {
470                $n++;
471                $id2 = substr( $id, 0, 10 - length( $n ) ) . $n;
472            }
473            $used{ $id2 } = 1;
474            $id2{ $id } = $id2;
475    
476                    #  Prepare to pad sequences (should not be necessary, but ...)
477    
478            my $len = length( $seq );
479            $maxlen = $len if ( $len > $maxlen );
480        }
481    
482        my $nseq = @seq_list;
483        print $fh "$nseq  $maxlen\n";
484        foreach ( @seq_list )
485        {
486            my ( $id, undef, $seq ) = @$_;
487            my $len = length( $seq );
488            printf $fh "%-10s  %s%s\n", $id2{ $id },
489                                        $seq,
490                                        $len<$maxlen ? ("?" x ($maxlen-$len)) : "";
491        }
492    
493        close( $fh ) if $close;
494    }
495    
496    
497    #-----------------------------------------------------------------------------
498    #  Print list of sequence entries in nexus format.
499    #  Missing, undef or "" filename defaults to STDOUT.
500    #
501    #     print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
502    #     print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
503    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
504    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
505    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
506    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
507    #-----------------------------------------------------------------------------
508    sub print_alignment_as_nexus {
509        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
510        ( unshift @_, $unused ) if $unused;
511    
512        my $lbls = ( ref( $_[0] ) eq "HASH" ) ? shift : undef;
513    
514        ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_nexus\n";
515    
516        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
517    
518        my %id2;
519        my ( $maxidlen, $maxseqlen ) = ( 0, 0 );
520        my ( $n1, $n2, $nt, $nu ) = ( 0, 0, 0, 0 );
521        foreach ( @seq_list )
522        {
523            my ( $id, undef, $seq ) = @$_;
524            my $id2 = $lbls ? ( $lbls->{ $id } || $id ) : $id;
525            if ( $id2 !~ /^[-+.0-9A-Za-z~_|]+$/ )
526            {
527                    $id2 =~ s/'/''/g;
528                    $id2 = qq('$id2');
529                }
530            $id2{ $id } = $id2;
531            my $idlen = length( $id2 );
532            $maxidlen = $idlen if ( $idlen > $maxidlen );
533    
534            my $seqlen = length( $seq );
535            $maxseqlen = $seqlen if ( $seqlen > $maxseqlen );
536    
537            $nt += $seq =~ tr/Tt//d;
538            $nu += $seq =~ tr/Uu//d;
539            $n1 += $seq =~ tr/ACGNacgn//d;
540            $n2 += $seq =~ tr/A-Za-z//d;
541        }
542    
543        my $nseq = @seq_list;
544        my $type = ( $n1 < 2 * $n2 ) ?  'protein' : ($nt>$nu) ? 'DNA' : 'RNA';
545    
546        print $fh <<"END_HEAD";
547    #NEXUS
548    
549    BEGIN Data;
550        Dimensions
551            NTax=$nseq
552            NChar=$maxseqlen
553            ;
554        Format
555            DataType=$type
556            Gap=-
557            Missing=?
558            ;
559        Matrix
560    
561    END_HEAD
562    
563        foreach ( @seq_list )
564        {
565            my ( $id, undef, $seq ) = @$_;
566            my $len = length( $seq );
567            printf  $fh  "%-${maxidlen}s  %s%s\n",
568                         $id2{ $id },
569                         $seq,
570                         $len<$maxseqlen ? ("?" x ($maxseqlen-$len)) : "";
571        }
572    
573        print $fh <<"END_TAIL";
574    ;
575    END;
576    END_TAIL
577    
578        close( $fh ) if $close;
579  }  }
580    
581    
582  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
583  #  Print one sequence in fasta format to an open file  #  Print one sequence in fasta format to an open file
584  #  #
585  #     print_seq_as_fasta(*FILEHANDLE, $id, $desc, $seq);  #     print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq );
586  #     print_seq_as_fasta(*FILEHANDLE, @seq_entry);  #     print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
587  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
588  sub print_seq_as_fasta {  sub print_seq_as_fasta {
589      my $fh = shift;      my $fh = shift;
# Line 285  Line 600 
600  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
601  #  Print one sequence in GenBank flat file format:  #  Print one sequence in GenBank flat file format:
602  #  #
603  #     print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #     print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq )
604  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
605  sub print_gb_locus {  sub print_gb_locus {
606      my ($fh, $loc, $def, $acc, $seq) = @_;      my ($fh, $loc, $def, $acc, $seq) = @_;
# Line 314  Line 629 
629  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
630  #  #
631  #     my \%seq_ind  = index_seq_list(@seq_list);  #     my \%seq_ind  = index_seq_list(@seq_list);
632    #     my \%seq_ind  = index_seq_list( \@seq_list );
633  #  #
634  #  Usage example:  #  Usage example:
635  #  #
636  #  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries  #  my  @seq_list   = read_fasta_seqs(\*STDIN);  # list of pointers to entries
637  #  my \%seq_ind   = index_seq_list(@seq_list); # hash from names to pointers  #  my \%seq_ind   = index_seq_list(@seq_list); # hash from names to pointers
638  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry
639  #  #
640  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
641  sub index_seq_list {  sub index_seq_list {
642      my %seq_index = map { @{$_}[0] => $_ } @_;      ( ref( $_[0] )      ne 'ARRAY' ) ? {}
643      return \%seq_index;    : ( ref( $_[0]->[0] ) ne 'ARRAY' ) ? { map { $_->[0] => $_ } @_ }
644      :                                    { map { $_->[0] => $_ } @{ $_[0] } }
645  }  }
646    
647    
648  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
649  #  Three routines to access all or part of sequence entry by id:  #  Three routines to access all or part of sequence entry by id:
650  #  #
651  #     my @seq_entry  = seq_entry_by_id( \%seq_index, $seq_id );  #     @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
652  #     my $seq_desc   = seq_desc_by_id(  \%seq_index, $seq_id );  #    \@seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
653  #     my $seq        = seq_data_by_id(  \%seq_index, $seq_id );  #     $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
654    #     $seq       = seq_data_by_id(  \%seq_index, $seq_id );
655  #  #
656  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
657  sub seq_entry_by_id {  sub seq_entry_by_id {
658      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";
659      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";
660      wantarray || die "entry_by_id requires list context\n";      return wantarray ? @{ $ind_ref->{$id} } : $ind_ref->{$id};
     return @{ $ind_ref->{$id} };  
661  }  }
662    
663    
# Line 849  Line 1166 
1166  );  );
1167    
1168    
1169  %one_letter_to_three_letter_aa = {  %one_letter_to_three_letter_aa = (
1170           A  => "Ala", a  => "Ala",           A  => "Ala", a  => "Ala",
1171           B  => "Asx", b  => "Asx",           B  => "Asx", b  => "Asx",
1172           C  => "Cys", c  => "Cys",           C  => "Cys", c  => "Cys",
# Line 875  Line 1192 
1192           Y  => "Tyr", y  => "Tyr",           Y  => "Tyr", y  => "Tyr",
1193           Z  => "Glx", z  => "Glx",           Z  => "Glx", z  => "Glx",
1194          '*' => "***"          '*' => "***"
1195          };          );
1196    
1197    
1198  %three_letter_to_one_letter_aa = (  %three_letter_to_one_letter_aa = (
# Line 1134  Line 1451 
1451  #  Read a list of intervals from a file.  #  Read a list of intervals from a file.
1452  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1453  #  #
1454  #     @intervals = read_intervals( *FILEHANDLE )  #     @intervals = read_intervals( \*FILEHANDLE )
1455  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1456  sub read_intervals {  sub read_intervals {
1457      my $fh = shift;      my $fh = shift;
# Line 1230  Line 1547 
1547  #  Read a list of oriented intervals from a file.  #  Read a list of oriented intervals from a file.
1548  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1549  #  #
1550  #     @intervals = read_oriented_intervals( *FILEHANDLE )  #     @intervals = read_oriented_intervals( \*FILEHANDLE )
1551  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1552  sub read_oriented_intervals {  sub read_oriented_intervals {
1553      my $fh = shift;      my $fh = shift;
# Line 1261  Line 1578 
1578  }  }
1579    
1580    
1581    #-----------------------------------------------------------------------------
1582    #  Convert GenBank locations to SEED locations
1583    #
1584    #     @seed_locs = gb_location_2_seed( $contig, @gb_locs )
1585    #-----------------------------------------------------------------------------
1586    sub gb_location_2_seed
1587    {
1588        my $contig = shift @_;
1589        $contig or die "First arg of gb_location_2_seed must be contig_id\n";
1590    
1591        map { join( ',', gb_loc_2_seed_2( $contig, $_ ) ) || undef } @_
1592    }
1593    
1594    sub gb_loc_2_seed_2
1595    {
1596        my ( $contig, $loc ) = @_;
1597    
1598        if ( $loc =~ /^(\d+)\.\.(\d+)$/ )
1599        {
1600            join( '_', $contig, $1, $2 )
1601        }
1602    
1603        elsif ( $loc =~ /^join\((.*)\)$/ )
1604        {
1605            $loc = $1;
1606            my $lvl = 0;
1607            for ( my $i = length( $loc )-1; $i >= 0; $i-- )
1608            {
1609                for ( substr( $loc, $i, 1 ) )
1610                {
1611                    /,/ && ! $lvl and substr( $loc, $i, 1 ) = "\t";
1612                    /\(/          and $lvl--;
1613                    /\)/          and $lvl++;
1614                }
1615            }
1616            $lvl == 0 or print STDERR "Paren matching error: $loc\n" and die;
1617            map { gb_loc_2_seed_2( $contig, $_ ) } split /\t/, $loc
1618        }
1619    
1620        elsif ( $loc =~ /^complement\((.*)\)$/ )
1621        {
1622            map { s/_(\d+)_(\d+)$/_$2_$1/; $_ }
1623            reverse
1624            gb_loc_2_seed_2( $contig, $1 )
1625        }
1626    
1627        else
1628        {
1629            ()
1630        }
1631    }
1632    
1633    
1634  1;  1;

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.4

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3