[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Wed Nov 30 18:33:41 2005 UTC revision 1.5, Sun Jun 10 17:17:40 2007 UTC
# Line 3  Line 3 
3  #  A sequence entry is ( $id, $def, $seq )  #  A sequence entry is ( $id, $def, $seq )
4  #  A list of entries is a list of references  #  A list of entries is a list of references
5  #  #
6  #  @seq_entry = read_next_fasta_seq( *FILEHANDLE )  #  @seq_entry   = read_next_fasta_seq( \*FILEHANDLE )
7  #  @seq_entries = read_fasta_seqs( *FILEHANDLE )  #  @seq_entries = read_fasta_seqs( \*FILEHANDLE )   # Original form
8  #  $seq_ind = index_seq_list( @seq_entries ); # hash from ids to entry refs  #  @seq_entries = read_fasta( )                     # STDIN
9    #  @seq_entries = read_fasta( \*FILEHANDLE )
10    #  @seq_entries = read_fasta(  $filename )
11    #  @seq_entries = read_clustal( )                   # STDIN
12    #  @seq_entries = read_clustal( \*FILEHANDLE )
13    #  @seq_entries = read_clustal(  $filename )
14    #  @seq_entries = read_clustal_file(  $filename )
15  #  #
16    #  $seq_ind   = index_seq_list( @seq_entries );   # hash from ids to entries
17  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );  #  @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
18  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );  #  $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
19  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );  #  $seq       = seq_data_by_id(  \%seq_index, $seq_id );
# Line 14  Line 21 
21  #  ( $id, $def ) = parse_fasta_title( $title )  #  ( $id, $def ) = parse_fasta_title( $title )
22  #  ( $id, $def ) = split_fasta_title( $title )  #  ( $id, $def ) = split_fasta_title( $title )
23  #  #
24  #  print_seq_list_as_fasta( *FILEHANDLE, @seq_entry_list );  #  print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );  # Original form
25  #  print_seq_as_fasta( *FILEHANDLE, $id, $desc, $seq) ;  #  print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
26  #  print_seq_as_fasta( *FILEHANDLE, @seq_entry );  #  print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
27  #  print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #  print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
28    #  print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
29    #  print_alignment_as_fasta(  $filename,    @seq_entry_list );
30    #  print_alignment_as_fasta(  $filename,   \@seq_entry_list );
31    #  print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
32    #  print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
33    #  print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
34    #  print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
35    #  print_alignment_as_phylip(  $filename,    @seq_entry_list );
36    #  print_alignment_as_phylip(  $filename,   \@seq_entry_list );
37    #  print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
38    #  print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
39    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
40    #  print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
41    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
42    #  print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
43    #  print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq) ;
44    #  print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
45    #  print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq );
46    #
47    #  @seqs = pack_alignment( @seqs )
48  #  #
49  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_DNA_entry( @seq_entry, $from, $to [, $fix_id] );
50  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );  #  @entry  = subseq_RNA_entry( @seq_entry, $from, $to [, $fix_id] );
# Line 40  Line 67 
67  #  Locations (= oriented intervals) are ( id, start, end )  #  Locations (= oriented intervals) are ( id, start, end )
68  #  Intervals are ( id, left, right )  #  Intervals are ( id, left, right )
69  #  #
70  #  @intervals = read_intervals( *FILEHANDLE )  #  @intervals = read_intervals( \*FILEHANDLE )
71  #  @intervals = read_oriented_intervals( *FILEHANDLE )  #  @intervals = read_oriented_intervals( \*FILEHANDLE )
72  #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)  #  @intervals = standardize_intervals( @interval_refs ) # (id, left, right)
73  #  @joined    = join_intervals( @interval_refs )  #  @joined    = join_intervals( @interval_refs )
74  #  @intervals = locations_2_intervals( @locations )  #  @intervals = locations_2_intervals( @locations )
75  #  $interval  = locations_2_intervals( $location  )  #  $interval  = locations_2_intervals( $location  )
76  #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)  #  @reversed  = reverse_intervals( @interval_refs )      # (id, end, start)
77    #
78    #  Convert GenBank locations to SEED locations
79    #
80    #  @seed_locs = gb_location_2_seed( $contig, @gb_locs )
81    
82    
83  use strict;  use strict;
# Line 75  Line 106 
106  our @ISA = qw(Exporter);  our @ISA = qw(Exporter);
107  our @EXPORT = qw(  our @EXPORT = qw(
108          read_fasta_seqs          read_fasta_seqs
109            read_fasta
110          read_next_fasta_seq          read_next_fasta_seq
111            read_clustal_file
112            read_clustal
113          parse_fasta_title          parse_fasta_title
114          split_fasta_title          split_fasta_title
115          print_seq_list_as_fasta          print_seq_list_as_fasta
116            print_alignment_as_fasta
117            print_alignment_as_phylip
118            print_alignment_as_nexus
119          print_seq_as_fasta          print_seq_as_fasta
120          print_gb_locus          print_gb_locus
121    
# Line 87  Line 124 
124          seq_desc_by_id          seq_desc_by_id
125          seq_data_by_id          seq_data_by_id
126    
127            pack_alignment
128    
129          subseq_DNA_entry          subseq_DNA_entry
130          subseq_RNA_entry          subseq_RNA_entry
131          complement_DNA_entry          complement_DNA_entry
# Line 108  Line 147 
147          locations_2_intervals          locations_2_intervals
148          read_oriented_intervals          read_oriented_intervals
149          reverse_intervals          reverse_intervals
150    
151            gb_location_2_seed
152          );          );
153    
154  our @EXPORT_OK = qw(  our @EXPORT_OK = qw(
# Line 129  Line 170 
170    
171    
172  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
173  #  Read fasta sequences from a file.  #  Helper function for defining an input filehandle:
174    #     filehandle is passed through
175    #     string is taken as file name to be openend
176    #     undef or "" defaults to STDOUT
177    #
178    #    ( \*FH, $name, $close [, $file] ) = input_filehandle( $file );
179    #
180    #-----------------------------------------------------------------------------
181    sub input_filehandle
182    {
183        my $file = shift;
184    
185        #  FILEHANDLE
186    
187        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
188    
189        #  Null string or undef
190    
191        return ( \*STDIN, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
192    
193        #  File name
194    
195        if ( ! ref( $file ) )
196        {
197            my $fh;
198            -f $file or die "Could not find input file \"$file\"\n";
199            open( $fh, "<$file" ) || die "Could not open \"$file\" for input\n";
200            return ( $fh, $file, 1 );
201        }
202    
203        #  Some other kind of reference; return the unused value
204    
205        return ( \*STDIN, undef, 0, $file );
206    }
207    
208    
209    #-----------------------------------------------------------------------------
210    #  Read fasta sequences from a filehandle (legacy interface; use read_fasta)
211  #  Save the contents in a list of refs to arrays:  (id, description, seq)  #  Save the contents in a list of refs to arrays:  (id, description, seq)
212  #  #
213  #     @seqs = read_fasta_seqs(*FILEHANDLE)  #     @seq_entries = read_fasta_seqs( \*FILEHANDLE )
214  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
215  sub read_fasta_seqs {  sub read_fasta_seqs { read_fasta( @_ ) }
216      my $fh = shift;  
217      wantarray || die "read_fasta_seqs requires list context\n";  
218    #-----------------------------------------------------------------------------
219    #  Read fasta sequences.
220    #  Save the contents in a list of refs to arrays:  (id, description, seq)
221    #
222    #     @seq_entries = read_fasta( )               #  STDIN
223    #    \@seq_entries = read_fasta( )               #  STDIN
224    #     @seq_entries = read_fasta( \*FILEHANDLE )
225    #    \@seq_entries = read_fasta( \*FILEHANDLE )
226    #     @seq_entries = read_fasta(  $filename )
227    #    \@seq_entries = read_fasta(  $filename )
228    #-----------------------------------------------------------------------------
229    sub read_fasta {
230        my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
231        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n";
232    
233      my @seqs = ();      my @seqs = ();
234      my ($id, $desc, $seq) = ("", "", "");      my ($id, $desc, $seq) = ("", "", "");
# Line 152  Line 244 
244              $seq .= $_ ;              $seq .= $_ ;
245          }          }
246      }      }
247        close( $fh ) if $close;
248    
249      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }      if ($id && $seq) { push @seqs, [ $id, $desc, $seq ] }
250      return @seqs;      return wantarray ? @seqs : \@seqs;
251  }  }
252    
253    
# Line 162  Line 255 
255  #  Read one fasta sequence at a time from a file.  #  Read one fasta sequence at a time from a file.
256  #  Return the contents as an array:  (id, description, seq)  #  Return the contents as an array:  (id, description, seq)
257  #  #
258  #     @seq_entry = read_next_fasta_seq(*FILEHANDLE)  #     @seq_entry = read_next_fasta_seq( \*FILEHANDLE )
259  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
260  #  Reading always overshoots, so save next id and description  #  Reading always overshoots, so save next id and description
261    
# Line 171  Line 264 
264      my %next_header;      my %next_header;
265    
266      sub read_next_fasta_seq {      sub read_next_fasta_seq {
         wantarray || die "read_next_fasta_seq requires list context\n";  
   
267          my $fh = shift;          my $fh = shift;
268          my ( $id, $desc );          my ( $id, $desc );
269    
# Line 189  Line 280 
280              chomp;              chomp;
281              if ( /^>/ ) {        #  new id              if ( /^>/ ) {        #  new id
282                  $next_header{$fh} = $_;                  $next_header{$fh} = $_;
283                  if ( defined($id) && $seq ) { return ($id, $desc, $seq) }                  if ( defined($id) && $seq )
284                    {
285                        return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq]
286                    }
287                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
288                  $seq = "";                  $seq = "";
289              }              }
# Line 202  Line 296 
296          #  Done with file, delete "next header"          #  Done with file, delete "next header"
297    
298          delete $next_header{$fh};          delete $next_header{$fh};
299          return (defined($id) && $seq) ? ($id, $desc, $seq) : () ;          return ( defined($id) && $seq ) ? ( wantarray ? ($id, $desc, $seq)
300                                                          : [$id, $desc, $seq]
301                                              )
302                                            : () ;
303      }      }
304  }  }
305    
306    
307  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
308    #  Read a clustal alignment from a file.
309    #  Save the contents in a list of refs to arrays:  (id, description, seq)
310    #
311    #     @seq_entries = read_clustal_file( $filename )
312    #-----------------------------------------------------------------------------
313    sub read_clustal_file { read_clustal( @_ ) }
314    
315    
316    #-----------------------------------------------------------------------------
317    #  Read a clustal alignment.
318    #  Save the contents in a list of refs to arrays:  (id, description, seq)
319    #
320    #     @seq_entries = read_clustal( )              # STDIN
321    #     @seq_entries = read_clustal( \*FILEHANDLE )
322    #     @seq_entries = read_clustal(  $filename )
323    #-----------------------------------------------------------------------------
324    sub read_clustal {
325        my ( $fh, undef, $close, $unused ) = input_filehandle( shift );
326        $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_clustal_file\n";
327    
328        my ( %seq, @ids, $line );
329        while ( defined( $line = <$fh> ) )
330        {
331            ( $line =~ /^[A-Za-z0-9]/ ) or next;
332            chomp $line;
333            my @flds = split /\s+/, $line;
334            if ( @flds == 2 )
335            {
336                $seq{ $flds[0] } or push @ids, $flds[0];
337                push @{ $seq{ $flds[0] } }, $flds[1];
338            }
339        }
340        close( $fh ) if $close;
341    
342        map { [ $_, "", join( "", @{$seq{$_}} ) ] } @ids;
343    }
344    
345    
346    #-----------------------------------------------------------------------------
347  #  Parse a fasta file header to id and definition parts  #  Parse a fasta file header to id and definition parts
348  #  #
349  #     ($id, $def) = parse_fasta_title( $title )  #     ($id, $def) = parse_fasta_title( $title )
# Line 233  Line 369 
369    
370    
371  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
372  #  Print list of sequence entries in fasta format  #  Helper function for defining an output filehandle:
373    #     filehandle is passed through
374    #     string is taken as file name to be openend
375    #     undef or "" defaults to STDOUT
376    #
377    #    ( \*FH, $name, $close [, $file] ) = output_filehandle( $file );
378  #  #
 #     print_seq_list_as_fasta(*FILEHANDLE, @seq_entry_list);  
379  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
380  sub print_seq_list_as_fasta {  sub output_filehandle
381      my $fh = shift;  {
382      my @seq_list = @_;      my $file = shift;
383    
384        #  FILEHANDLE
385    
386        return ( $file, $file, 0 ) if ( ref( $file ) eq "GLOB" );
387    
388      foreach my $seq_ptr (@seq_list) {      #  Null string or undef
389          print_seq_as_fasta($fh, @$seq_ptr);  
390        return ( \*STDOUT, "", 0 ) if ( ! defined( $file ) || ( $file eq "" ) );
391    
392        #  File name
393    
394        if ( ! ref( $file ) )
395        {
396            my $fh;
397            open( $fh, ">$file" ) || die "Could not open output $file\n";
398            return ( $fh, $file, 1 );
399      }      }
400    
401        #  Some other kind of reference; return the unused value
402    
403        return ( \*STDOUT, undef, 0, $file );
404    }
405    
406    
407    #-----------------------------------------------------------------------------
408    #  Legacy function for printing fasta sequence set:
409    #
410    #     print_seq_list_as_fasta( \*FILEHANDLE, @seq_entry_list );
411    #-----------------------------------------------------------------------------
412    sub print_seq_list_as_fasta { print_alignment_as_fasta( @_ ) }
413    
414    
415    #-----------------------------------------------------------------------------
416    #  Print list of sequence entries in fasta format.
417    #  Missing, undef or "" filename defaults to STDOUT.
418    #
419    #     print_alignment_as_fasta(                @seq_entry_list ); # STDOUT
420    #     print_alignment_as_fasta(               \@seq_entry_list ); # STDOUT
421    #     print_alignment_as_fasta( \*FILEHANDLE,  @seq_entry_list );
422    #     print_alignment_as_fasta( \*FILEHANDLE, \@seq_entry_list );
423    #     print_alignment_as_fasta(  $filename,    @seq_entry_list );
424    #     print_alignment_as_fasta(  $filename,   \@seq_entry_list );
425    #-----------------------------------------------------------------------------
426    sub print_alignment_as_fasta {
427        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
428        ( unshift @_, $unused ) if $unused;
429    
430        ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_fasta\n";
431    
432        #  Expand the sequence entry list if necessary:
433    
434        if ( ref( $_[0]->[0] ) eq "ARRAY" ) { @_ = @{ $_[0] } }
435    
436        foreach my $seq_ptr ( @_ ) { print_seq_as_fasta( $fh, @$seq_ptr ) }
437    
438        close( $fh ) if $close;
439    }
440    
441    
442    #-----------------------------------------------------------------------------
443    #  Print list of sequence entries in phylip format.
444    #  Missing, undef or "" filename defaults to STDOUT.
445    #
446    #     print_alignment_as_phylip(                @seq_entry_list ); # STDOUT
447    #     print_alignment_as_phylip(               \@seq_entry_list ); # STDOUT
448    #     print_alignment_as_phylip( \*FILEHANDLE,  @seq_entry_list );
449    #     print_alignment_as_phylip( \*FILEHANDLE, \@seq_entry_list );
450    #     print_alignment_as_phylip(  $filename,    @seq_entry_list );
451    #     print_alignment_as_phylip(  $filename,   \@seq_entry_list );
452    #-----------------------------------------------------------------------------
453    sub print_alignment_as_phylip {
454        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
455        ( unshift @_, $unused ) if $unused;
456    
457        ( ref( $_[0] ) eq "ARRAY" ) or die die "Bad sequence entry passed to print_alignment_as_phylip\n";
458    
459        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
460    
461        my ( %id2, %used );
462        my $maxlen = 0;
463        foreach ( @seq_list )
464        {
465            my ( $id, undef, $seq ) = @$_;
466    
467            #  Need a name that is unique within 10 characters
468    
469            my $id2 = substr( $id, 0, 10 );
470            $id2 =~ s/_/ /g;  # PHYLIP sequence files accept spaces
471            my $n = "0";
472            while ( $used{ $id2 } )
473            {
474                $n++;
475                $id2 = substr( $id, 0, 10 - length( $n ) ) . $n;
476            }
477            $used{ $id2 } = 1;
478            $id2{ $id } = $id2;
479    
480                    #  Prepare to pad sequences (should not be necessary, but ...)
481    
482            my $len = length( $seq );
483            $maxlen = $len if ( $len > $maxlen );
484        }
485    
486        my $nseq = @seq_list;
487        print $fh "$nseq  $maxlen\n";
488        foreach ( @seq_list )
489        {
490            my ( $id, undef, $seq ) = @$_;
491            my $len = length( $seq );
492            printf $fh "%-10s  %s%s\n", $id2{ $id },
493                                        $seq,
494                                        $len<$maxlen ? ("?" x ($maxlen-$len)) : "";
495        }
496    
497        close( $fh ) if $close;
498    }
499    
500    
501    #-----------------------------------------------------------------------------
502    #  Print list of sequence entries in nexus format.
503    #  Missing, undef or "" filename defaults to STDOUT.
504    #
505    #     print_alignment_as_nexus(               [ \%label_hash, ]  @seq_entry_list );
506    #     print_alignment_as_nexus(               [ \%label_hash, ] \@seq_entry_list );
507    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ]  @seq_entry_list );
508    #     print_alignment_as_nexus( \*FILEHANDLE, [ \%label_hash, ] \@seq_entry_list );
509    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ]  @seq_entry_list );
510    #     print_alignment_as_nexus(  $filename,   [ \%label_hash, ] \@seq_entry_list );
511    #-----------------------------------------------------------------------------
512    sub print_alignment_as_nexus {
513        my ( $fh, undef, $close, $unused ) = output_filehandle( shift );
514        ( unshift @_, $unused ) if $unused;
515    
516        my $lbls = ( ref( $_[0] ) eq "HASH" ) ? shift : undef;
517    
518        ( ref( $_[0] ) eq "ARRAY" ) or die "Bad sequence entry passed to print_alignment_as_nexus\n";
519    
520        my @seq_list = ( ref( $_[0]->[0] ) eq "ARRAY" ) ? @{ $_[0] } : @_;
521    
522        my %id2;
523        my ( $maxidlen, $maxseqlen ) = ( 0, 0 );
524        my ( $n1, $n2, $nt, $nu ) = ( 0, 0, 0, 0 );
525        foreach ( @seq_list )
526        {
527            my ( $id, undef, $seq ) = @$_;
528            my $id2 = $lbls ? ( $lbls->{ $id } || $id ) : $id;
529            if ( $id2 !~ /^[-+.0-9A-Za-z~_|]+$/ )
530            {
531                    $id2 =~ s/'/''/g;
532                    $id2 = qq('$id2');
533                }
534            $id2{ $id } = $id2;
535            my $idlen = length( $id2 );
536            $maxidlen = $idlen if ( $idlen > $maxidlen );
537    
538            my $seqlen = length( $seq );
539            $maxseqlen = $seqlen if ( $seqlen > $maxseqlen );
540    
541            $nt += $seq =~ tr/Tt//d;
542            $nu += $seq =~ tr/Uu//d;
543            $n1 += $seq =~ tr/ACGNacgn//d;
544            $n2 += $seq =~ tr/A-Za-z//d;
545        }
546    
547        my $nseq = @seq_list;
548        my $type = ( $n1 < 2 * $n2 ) ?  'protein' : ($nt>$nu) ? 'DNA' : 'RNA';
549    
550        print $fh <<"END_HEAD";
551    #NEXUS
552    
553    BEGIN Data;
554        Dimensions
555            NTax=$nseq
556            NChar=$maxseqlen
557            ;
558        Format
559            DataType=$type
560            Gap=-
561            Missing=?
562            ;
563        Matrix
564    
565    END_HEAD
566    
567        foreach ( @seq_list )
568        {
569            my ( $id, undef, $seq ) = @$_;
570            my $len = length( $seq );
571            printf  $fh  "%-${maxidlen}s  %s%s\n",
572                         $id2{ $id },
573                         $seq,
574                         $len<$maxseqlen ? ("?" x ($maxseqlen-$len)) : "";
575        }
576    
577        print $fh <<"END_TAIL";
578    ;
579    END;
580    END_TAIL
581    
582        close( $fh ) if $close;
583  }  }
584    
585    
586  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
587  #  Print one sequence in fasta format to an open file  #  Print one sequence in fasta format to an open file
588  #  #
589  #     print_seq_as_fasta(*FILEHANDLE, $id, $desc, $seq);  #     print_seq_as_fasta( \*FILEHANDLE, $id, $desc, $seq );
590  #     print_seq_as_fasta(*FILEHANDLE, @seq_entry);  #     print_seq_as_fasta( \*FILEHANDLE, @seq_entry );
591  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
592  sub print_seq_as_fasta {  sub print_seq_as_fasta {
593      my $fh = shift;      my $fh = shift;
# Line 268  Line 604 
604  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
605  #  Print one sequence in GenBank flat file format:  #  Print one sequence in GenBank flat file format:
606  #  #
607  #     print_gb_locus( *FILEHANDLE, $locus, $def, $accession, $seq )  #     print_gb_locus( \*FILEHANDLE, $locus, $def, $accession, $seq )
608  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
609  sub print_gb_locus {  sub print_gb_locus {
610      my ($fh, $loc, $def, $acc, $seq) = @_;      my ($fh, $loc, $def, $acc, $seq) = @_;
# Line 297  Line 633 
633  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)  #  Build an index from seq_id to pointer to sequence entry: (id, desc, seq)
634  #  #
635  #     my \%seq_ind  = index_seq_list(@seq_list);  #     my \%seq_ind  = index_seq_list(@seq_list);
636    #     my \%seq_ind  = index_seq_list( \@seq_list );
637  #  #
638  #  Usage example:  #  Usage example:
639  #  #
640  #  my @seq_list   = read_fasta_seqs(*STDIN);   # list of pointers to entries  #  my  @seq_list   = read_fasta_seqs(\*STDIN);  # list of pointers to entries
641  #  my \%seq_ind   = index_seq_list(@seq_list); # hash from names to pointers  #  my \%seq_ind   = index_seq_list(@seq_list); # hash from names to pointers
642  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry  #  my @chosen_seq = @{%seq_ind{"contig1"}};    # extract one entry
643  #  #
644  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
645  sub index_seq_list {  sub index_seq_list {
646      my %seq_index = map { @{$_}[0] => $_ } @_;      ( ref( $_[0] )      ne 'ARRAY' ) ? {}
647      return \%seq_index;    : ( ref( $_[0]->[0] ) ne 'ARRAY' ) ? { map { $_->[0] => $_ } @_ }
648      :                                    { map { $_->[0] => $_ } @{ $_[0] } }
649  }  }
650    
651    
652  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
653  #  Three routines to access all or part of sequence entry by id:  #  Three routines to access all or part of sequence entry by id:
654  #  #
655  #     my @seq_entry  = seq_entry_by_id( \%seq_index, $seq_id );  #     @seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
656  #     my $seq_desc   = seq_desc_by_id(  \%seq_index, $seq_id );  #    \@seq_entry = seq_entry_by_id( \%seq_index, $seq_id );
657  #     my $seq        = seq_data_by_id(  \%seq_index, $seq_id );  #     $seq_desc  = seq_desc_by_id(  \%seq_index, $seq_id );
658    #     $seq       = seq_data_by_id(  \%seq_index, $seq_id );
659  #  #
660  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
661  sub seq_entry_by_id {  sub seq_entry_by_id {
662      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";      (my $ind_ref = shift)  || die "No index supplied to seq_entry_by_id\n";
663      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";      (my $id      = shift)  || die "No id supplied to seq_entry_by_id\n";
664      wantarray || die "entry_by_id requires list context\n";      return wantarray ? @{ $ind_ref->{$id} } : $ind_ref->{$id};
     return @{ $ind_ref->{$id} };  
665  }  }
666    
667    
# Line 340  Line 678 
678      return ${ $ind_ref->{$id} }[2];      return ${ $ind_ref->{$id} }[2];
679  }  }
680    
681    #-----------------------------------------------------------------------------
682    #  Remove columns of alignment gaps from sequences:
683    #
684    #  @packed_seqs = pack_alignment( @seqs )
685    #
686    #-----------------------------------------------------------------------------
687    
688    sub pack_alignment
689    {
690        my @seqs = ( ref( $_[0] ) eq 'ARRAY' and ref( $_[0]->[0] ) eq 'ARRAY' ) ? @{$_[0] } : @_;
691        @seqs or return wantarray ? () : [];
692    
693        my $mask  = pack_mask( $seqs[0]->[2] );
694        foreach ( @seqs[ 1 .. (@seqs-1) ] )
695        {
696            $mask |= pack_mask( $_->[2] );
697        }
698    
699        my $seq;
700        my @seqs2 = map { $seq = $_->[2] & $mask;
701                          $seq =~ tr/\000//d;
702                          [ $_->[0], $_->[1], $seq ]
703                        }
704                    @seqs;
705    
706        return wantarray ? @seqs2 : \@seqs2;
707    }
708    
709    sub pack_mask
710    {
711        my $mask = shift;
712        $mask =~ tr/-/\000/;
713        $mask =~ tr/\000/\377/c;
714        return $mask;
715    }
716    
717  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
718  #  Some simple sequence manipulations:  #  Some simple sequence manipulations:
# Line 832  Line 1205 
1205  );  );
1206    
1207    
1208  %one_letter_to_three_letter_aa = {  %one_letter_to_three_letter_aa = (
1209           A  => "Ala", a  => "Ala",           A  => "Ala", a  => "Ala",
1210           B  => "Asx", b  => "Asx",           B  => "Asx", b  => "Asx",
1211           C  => "Cys", c  => "Cys",           C  => "Cys", c  => "Cys",
# Line 858  Line 1231 
1231           Y  => "Tyr", y  => "Tyr",           Y  => "Tyr", y  => "Tyr",
1232           Z  => "Glx", z  => "Glx",           Z  => "Glx", z  => "Glx",
1233          '*' => "***"          '*' => "***"
1234          };          );
1235    
1236    
1237  %three_letter_to_one_letter_aa = (  %three_letter_to_one_letter_aa = (
# Line 1117  Line 1490 
1490  #  Read a list of intervals from a file.  #  Read a list of intervals from a file.
1491  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1492  #  #
1493  #     @intervals = read_intervals( *FILEHANDLE )  #     @intervals = read_intervals( \*FILEHANDLE )
1494  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1495  sub read_intervals {  sub read_intervals {
1496      my $fh = shift;      my $fh = shift;
# Line 1213  Line 1586 
1586  #  Read a list of oriented intervals from a file.  #  Read a list of oriented intervals from a file.
1587  #  Allow id_start_end, or id \s start \s end formats  #  Allow id_start_end, or id \s start \s end formats
1588  #  #
1589  #     @intervals = read_oriented_intervals( *FILEHANDLE )  #     @intervals = read_oriented_intervals( \*FILEHANDLE )
1590  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1591  sub read_oriented_intervals {  sub read_oriented_intervals {
1592      my $fh = shift;      my $fh = shift;
# Line 1244  Line 1617 
1617  }  }
1618    
1619    
1620    #-----------------------------------------------------------------------------
1621    #  Convert GenBank locations to SEED locations
1622    #
1623    #     @seed_locs = gb_location_2_seed( $contig, @gb_locs )
1624    #-----------------------------------------------------------------------------
1625    sub gb_location_2_seed
1626    {
1627        my $contig = shift @_;
1628        $contig or die "First arg of gb_location_2_seed must be contig_id\n";
1629    
1630        map { join( ',', gb_loc_2_seed_2( $contig, $_ ) ) || undef } @_
1631    }
1632    
1633    sub gb_loc_2_seed_2
1634    {
1635        my ( $contig, $loc ) = @_;
1636    
1637        if ( $loc =~ /^(\d+)\.\.(\d+)$/ )
1638        {
1639            join( '_', $contig, $1, $2 )
1640        }
1641    
1642        elsif ( $loc =~ /^join\((.*)\)$/ )
1643        {
1644            $loc = $1;
1645            my $lvl = 0;
1646            for ( my $i = length( $loc )-1; $i >= 0; $i-- )
1647            {
1648                for ( substr( $loc, $i, 1 ) )
1649                {
1650                    /,/ && ! $lvl and substr( $loc, $i, 1 ) = "\t";
1651                    /\(/          and $lvl--;
1652                    /\)/          and $lvl++;
1653                }
1654            }
1655            $lvl == 0 or print STDERR "Paren matching error: $loc\n" and die;
1656            map { gb_loc_2_seed_2( $contig, $_ ) } split /\t/, $loc
1657        }
1658    
1659        elsif ( $loc =~ /^complement\((.*)\)$/ )
1660        {
1661            map { s/_(\d+)_(\d+)$/_$2_$1/; $_ }
1662            reverse
1663            gb_loc_2_seed_2( $contig, $1 )
1664        }
1665    
1666        else
1667        {
1668            ()
1669        }
1670    }
1671    
1672    
1673  1;  1;

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.5

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3