[Bio] / FigKernelPackages / gjoseqlib.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjoseqlib.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.11, Thu Jan 3 21:36:41 2008 UTC revision 1.12, Mon Feb 11 20:37:27 2008 UTC
# Line 3  Line 3 
3  #  A sequence entry is ( $id, $def, $seq )  #  A sequence entry is ( $id, $def, $seq )
4  #  A list of entries is a list of references  #  A list of entries is a list of references
5  #  #
 #  @seq_entry   = read_next_fasta_seq( \*FILEHANDLE )  
 #  @seq_entries = read_fasta_seqs( \*FILEHANDLE )   # Original form  
6  #  @seq_entries = read_fasta( )                     # STDIN  #  @seq_entries = read_fasta( )                     # STDIN
7  #  @seq_entries = read_fasta( \*FILEHANDLE )  #  @seq_entries = read_fasta( \*FILEHANDLE )
8  #  @seq_entries = read_fasta(  $filename )  #  @seq_entries = read_fasta(  $filename )
9    #  @entry = read_next_fasta_seq( \*FILEHANDLE )
10    # \@entry = read_next_fasta_seq( \*FILEHANDLE )
11    #  @entry = read_next_fasta_seq(  $filename )
12    # \@entry = read_next_fasta_seq(  $filename )
13    #  @entry = read_next_fasta_seq()                   # STDIN
14    # \@entry = read_next_fasta_seq()                   # STDIN
15    #  @seq_entries = read_fasta_seqs( \*FILEHANDLE )   # Original form
16  #  @seq_entries = read_clustal( )                   # STDIN  #  @seq_entries = read_clustal( )                   # STDIN
17  #  @seq_entries = read_clustal( \*FILEHANDLE )  #  @seq_entries = read_clustal( \*FILEHANDLE )
18  #  @seq_entries = read_clustal(  $filename )  #  @seq_entries = read_clustal(  $filename )
# Line 230  Line 235 
235      if ( ! ref( $file ) )      if ( ! ref( $file ) )
236      {      {
237          my $fh;          my $fh;
238          -f $file or die "Could not find input file \"$file\"\n";          if    ( -f $file                       ) { }
239          open( $fh, "<$file" ) || die "Could not open \"$file\" for input\n";          elsif (    $file =~ /^>(.+)$/ && -f $1 ) { $file = $1 }
240            else { die "Could not find input file '$file'\n" }
241            open( $fh, "<$file" ) || die "Could not open '$file' for input\n";
242          return ( $fh, $file, 1 );          return ( $fh, $file, 1 );
243      }      }
244    
# Line 251  Line 258 
258    
259    
260  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
261  #  Read fasta sequences.  #  Read fasta sequences.  Save the contents in a list of refs to arrays:
262  #  Save the contents in a list of refs to arrays:  (id, description, seq)  #
263    #     $seq_entry = [ id, description, seq ]
264  #  #
265  #     @seq_entries = read_fasta( )               #  STDIN  #     @seq_entries = read_fasta( )               #  STDIN
266  #    \@seq_entries = read_fasta( )               #  STDIN  #    \@seq_entries = read_fasta( )               #  STDIN
# Line 260  Line 268 
268  #    \@seq_entries = read_fasta( \*FILEHANDLE )  #    \@seq_entries = read_fasta( \*FILEHANDLE )
269  #     @seq_entries = read_fasta(  $filename )  #     @seq_entries = read_fasta(  $filename )
270  #    \@seq_entries = read_fasta(  $filename )  #    \@seq_entries = read_fasta(  $filename )
271    #  #  @seq_entries = read_fasta( "command |" )   #  open and read from pipe
272    #  # \@seq_entries = read_fasta( "command |" )   #  open and read from pipe
273    #
274    #-----------------------------------------------------------------------------
275    sub read_fasta
276    {
277        my @seqs = map { $_->[2] =~ tr/ \n\r\t//d; $_ }
278                   map { /^(\S+)(\s+([^\n]*\S)?\s*)?\n(.+)$/s ? [ $1, $3 || '', $4 ] : () }
279                   split /^>\s*/m, slurp( @_ );
280        wantarray() ? @seqs : \@seqs;
281    }
282    
283    #-----------------------------------------------------------------------------
284    #  A fast file reader:
285    #
286    #     $data = slurp( )               #  \*STDIN
287    #     $data = slurp( \*FILEHANDLE )  #  an open file handle
288    #     $data = slurp(  $filename )    #  a file name
289    #     $data = slurp( "<$filename" )  #  file with explicit direction
290    #   # $data = slurp( "$command |" )  #  open and read from pipe
291    #
292    #  Note:  It is faster to read lines by reading the file and splitting
293    #         than by reading the lines sequentially.  If space is not an
294    #         issue, this is the way to go.  If space is an issue, then lines
295    #         or records should be processed one-by-one (rather than loading
296    #         the whole input into a string or array).
297    #-----------------------------------------------------------------------------
298    sub slurp
299    {
300        my ( $fh, $close );
301        if ( ref $_[0] eq 'GLOB' )
302        {
303            $fh = shift;
304        }
305        elsif ( $_[0] && ! ref $_[0] )
306        {
307            my $file = shift;
308            if    ( -f $file                       ) { $file = "<$file" }
309            elsif (    $file =~ /^<(.*)$/ && -f $1 ) { }  # Explicit read
310          # elsif (    $file =~ /\S\s*\|$/         ) { }  # Read from a pipe
311            else                                     { return undef }
312            open $fh, $file or return undef;
313            $close = 1;
314        }
315        else
316        {
317            $fh = \*STDIN;
318        }
319    
320        my $out = '';
321        my $inc = 1048576;
322        my $end =       0;
323        my $read;
324        while ( $read = read( $fh, $out, $inc, $end ) ) { $end += $read }
325        close( $fh ) if $close;
326    
327        $out;
328    }
329    
330    
331    #-----------------------------------------------------------------------------
332    #  Previous, 50% slower fasta reader:
333  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
334  sub read_fasta {  sub read_fasta_0
335    {
336      my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );      my ( $fh, $name, $close, $unused ) = input_filehandle( $_[0] );
337      $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n";      $unused && die "Bad reference type (" . ref( $unused ) . ") passed to read_fasta\n";
338    
# Line 287  Line 358 
358    
359    
360  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
361  #  Read one fasta sequence at a time from a file.  #  Read one fasta sequence at a time from a file.  This is half as fast a
362  #  Return the contents as an array:  (id, description, seq)  #  read_fasta(), but can handle an arbitrarily large file.  State information
363    #  is retained in hashes, so any number of streams can be interlaced.
364    #
365    #      @entry = read_next_fasta_seq( \*FILEHANDLE )
366    #     \@entry = read_next_fasta_seq( \*FILEHANDLE )
367    #      @entry = read_next_fasta_seq(  $filename )
368    #     \@entry = read_next_fasta_seq(  $filename )
369    #      @entry = read_next_fasta_seq()                # \*STDIN
370    #     \@entry = read_next_fasta_seq()                # \*STDIN
371  #  #
372  #     @seq_entry = read_next_fasta_seq( \*FILEHANDLE )  #      @entry = ( $id, $description, $seq )
373    #
374    #  When reading at the end of file, () is returned.
375    #  With a filename, reading past this will reopen the file at the beginning.
376  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
377  #  Reading always overshoots, so save next id and description  #  Reading always overshoots, so save next id and description
378    
379  {   #  Use bare block to scope the header hash  {   #  Use bare block to scope the header hash
380    
381      my %next_header;      my %next_header;
382        my %file_handle;
383        my %close_file;
384    
385      sub read_next_fasta_seq {      sub read_next_fasta_seq
386          my $fh = shift;      {
387          my ( $id, $desc );          my $fh = $file_handle{ $_[0] };
388            if ( ! $fh )
389            {
390                if ( ref $_[0] )
391                {
392                    return () if ref $_[0] ne 'GLOB';
393                    $fh = $_[0];
394                }
395                elsif ( $_[0] )
396                {
397                    my $file = $_[0];
398                    if    ( -f $file                       ) { $file = "<$file" }
399                    elsif (    $file =~ /^<(.*)$/ && -f $1 ) { }  # Explicit read
400                  # elsif (    $file =~ /\S\s*\|$/         ) { }  # Read from a pipe
401                    else                                     { return () }
402                    open $fh, $file or return ();
403                    $close_file{ $fh } = 1;
404                }
405                else
406                {
407                    $fh = \*STDIN;
408                }
409                $file_handle{ $_[0] } = $fh;
410            }
411    
412          if ( defined( $next_header{$fh} ) ) {          my ( $id, $desc, $seq ) = ( undef, '', '' );
413            if ( defined( $next_header{$fh} ) )
414            {
415              ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );              ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
416          }          }
417          else {          else
418              $next_header{$fh} = "";          {
419              ( $id, $desc ) = ( undef, "" );              $next_header{$fh} = '';
420          }          }
         my $seq = "";  
421    
422          while ( <$fh> ) {          while ( <$fh> )
423            {
424              chomp;              chomp;
425              if ( /^>/ ) {        #  new id              if ( /^>/ )        #  new id
426                {
427                  $next_header{$fh} = $_;                  $next_header{$fh} = $_;
428                  if ( defined($id) && $seq )                  if ( defined($id) && $seq )
429                  {                  {
430                      return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq]                      return wantarray ? ($id, $desc, $seq) : [$id, $desc, $seq]
431                  }                  }
432                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );                  ( $id, $desc ) = parse_fasta_title( $next_header{$fh} );
433                  $seq = "";                  $seq = '';
434              }              }
435              else {              else
436                  tr/     0-9//d;              {
437                    tr/ \t\r//d;
438                  $seq .= $_ ;                  $seq .= $_ ;
439              }              }
440          }          }
441    
442          #  Done with file, delete "next header"          #  Done with file; there is no next header:
443    
444          delete $next_header{$fh};          delete $next_header{$fh};
445          return ( defined($id) && $seq ) ? ( wantarray ? ($id, $desc, $seq)  
446                                                        : [$id, $desc, $seq]          #  Return last set of data:
447                                            )  
448                                          : () ;          if ( defined($id) && $seq )
449            {
450                return wantarray ? ($id,$desc,$seq) : [$id,$desc,$seq]
451            }
452    
453            #  Or close everything out (returning the empty list tells caller
454            #  that we are done)
455    
456            if ( $close_file{ $fh } ) { close $fh; delete $close_file{ $fh } }
457            delete $file_handle{ $_[0] };
458    
459            return ();
460      }      }
461  }  }
462    
# Line 384  Line 506 
506  #     ($id, $def) = parse_fasta_title( $title )  #     ($id, $def) = parse_fasta_title( $title )
507  #     ($id, $def) = split_fasta_title( $title )  #     ($id, $def) = split_fasta_title( $title )
508  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
509  sub parse_fasta_title {  sub parse_fasta_title
510    {
511      my $title = shift;      my $title = shift;
512      chomp;      chomp $title;
     if ($title =~ /^>?\s*(\S+)(:?\s+(.*\S)\s*)?$/) {  
         return ($1, $3 ? $3 : "");  
     }  
     elsif ($title =~ /^>/) {  
         return ("", "");  
     }  
     else {  
         return (undef, "");  
     }  
 }  
513    
514  sub split_fasta_title {      return $title =~ /^>?\s*(\S+)(\s+(.*\S)?\s*)?$/ ? ( $1, $3 || '' )
515      parse_fasta_title ( shift );           : $title =~ /^>/                           ? ( '', '' )
516             :                                            ( undef, undef )
517  }  }
518    
519    sub split_fasta_title { parse_fasta_title( @_ ) }
520    
521    
522  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
523  #  Helper function for defining an output filehandle:  #  Helper function for defining an output filehandle:
# Line 1567  Line 1683 
1683  #  Translate with user-supplied genetic code hash.  No error check on the code.  #  Translate with user-supplied genetic code hash.  No error check on the code.
1684  #  Should only be called through translate_seq_with_user_code.  #  Should only be called through translate_seq_with_user_code.
1685  #  #
1686  #     $aa = translate_codon_with_user_code( $triplet, \%code, $ambig_table )  #     $aa = translate_codon_with_user_code( $triplet, \%code, \%ambig_table )
1687  #  #
1688  #  $triplet      speaks for itself  #  $triplet      speaks for itself
1689  #  $code         ref to the hash with the codon translations  #  \%code         ref to the hash with the codon translations
1690  #  $ambig_table  ref to hash with lists of nucleotides for each ambig code  #  \%ambig_table  ref to hash with lists of nucleotides for each ambig code
1691  #-----------------------------------------------------------------------------  #-----------------------------------------------------------------------------
1692    
1693  sub translate_codon_with_user_code  sub translate_codon_with_user_code
1694  {  {
1695      my ( $codon, $gc, $N, $X, $ambigs ) = @_;      my ( $codon, $gc, $ambigs ) = @_;
1696    
1697      #  Try a simple lookup:      #  Try a simple lookup:
1698    
# Line 1596  Line 1712 
1712      {      {
1713          $codon .= ( $codon =~ /[a-z]/ ) ? 'n' : 'N';          $codon .= ( $codon =~ /[a-z]/ ) ? 'n' : 'N';
1714      }      }
1715        #  This makes assumptions about the user code, but tranlating ambiguous
1716        #  codons is really a bit off the wall to start with:
1717      elsif ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) # Valid?      elsif ( $codon !~ m/^[ACGTUMY][ACGTU][ACGTUKMRSWYBDHVN]$/i ) # Valid?
1718      {      {
1719          return $X;          return $X;

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.12

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3