[Bio] / FigKernelPackages / gjogenbank.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjogenbank.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Mon Aug 16 20:43:27 2010 UTC revision 1.3, Tue Aug 17 20:13:42 2010 UTC
# Line 1  Line 1 
1  #!/usr/bin/env perl  #
   
2  # This is a SAS component.  # This is a SAS component.
3    #
4  package gjogenbank;  package gjogenbank;
5    
6  #===============================================================================  #===============================================================================
7  #  #  Parse one or more GenBank entries in a file into perl structures.
8  #  Parse one or more GenBank files into perl structures  #  All of the entries in the file as a list:
9  #  #
10  #    @entries = parse_genbank( )           #  \*STDIN  #    @entries = parse_genbank( )           #  \*STDIN
11  #   \@entries = parse_genbank( )           #  \*STDIN  #   \@entries = parse_genbank( )           #  \*STDIN
# Line 15  Line 14 
14  #    @entries = parse_genbank(  $file )  #    @entries = parse_genbank(  $file )
15  #   \@entries = parse_genbank(  $file )  #   \@entries = parse_genbank(  $file )
16  #  #
17  #  Access functions:  #  One entry per call:
18    #
19    #      $entry = parse_next_genbank( )         #  STDIN
20    #      $entry = parse_next_genbank( \*FH )
21    #      $entry = parse_next_genbank( $file )
22    #
23    #  Error or end-of-file returns undef.
24    #
25    #  Each entry is a hash with key value pairs, some of which have special
26    #  processing.
27    #
28    #     Key => Value
29    #
30    #     ACCESSION  => [ Accession, ... ]
31    #     DATE       =>   Date
32    #     DEFINITION =>   Definition
33    #     GEOMETRY   =>   Geometry
34    #     GI         =>   GI_number
35    #     KEYWORDS   => { Key_phrase => 1, Key_phrase => 1, ... }
36    #     LOCUS      =>   Locus
37    #     ORGANISM   =>   Organism_name
38    #     REFERENCES => [ { Field => Value, Field => Value, ... }, ... ]
39    #     SEQUENCE   =>   Sequence
40    #     SOURCE     =>   Source_string
41    #     TAXONOMY   => [ Taxon, Taxon, ... ]
42    #     VERSION    => [ Version, Other_information ]
43    #
44    #  Feature records are merged by type.  Slash is removed from qualifier name.
45    #  Surrounding quotation marks are stripped from qualifier values.
46    #
47    #     FEATURES   => { Type => [ [ Location, { Qualifier => \@values } ],
48    #                               [ Location, { Qualifier => \@values } ],
49    #                               ...
50    #                             ],
51    #                     Type => ...
52    #                   }
53    #
54    #
55    #  Access functions to parts of structure:
56  #  #
57  #  Sequence of a feature, optionally including information on partial ends.  #  Sequence of a feature, optionally including information on partial ends.
58  #  #
59  #     $seq                           = ftr_dna(  $dna, $ftr )  #     $seq                           = ftr_dna(  $dna, $ftr )
60  #     $seq                           = ftr_dna( \$dna, $ftr )  #     $seq                           = ftr_dna( \$dna, $ftr )
61  #   ( $seq, $partial_5, $partial_3 ) = ftr_dna(  $dna, $ftr )  #   ( $seq, $partial_5, $partial_3 ) = ftr_dna(  $dna, $ftr )  # boolean of > or < in location
62  #   ( $seq, $partial_5, $partial_3 ) = ftr_dna( \$dna, $ftr )  #   ( $seq, $partial_5, $partial_3 ) = ftr_dna( \$dna, $ftr )
63  #  #
64  #    $ftr_location = location( $ftr )      #  Returns empty string on failure.  #    $ftr_location = location( $ftr )      #  Returns empty string on failure.
65  #  #
66    #  Identify features with partial 5' or 3' ends.
67    #
68    #     $partial_5_prime = partial_5_prime( $ftr )
69    #     $partial_3_prime = partial_3_prime( $ftr )
70    #
71  #   \%ftr_qualifiers = qualifiers( $ftr )  #  Returns empty hash reference on failure.  #   \%ftr_qualifiers = qualifiers( $ftr )  #  Returns empty hash reference on failure.
72  #  #
73  #    $gene              = gene( $ftr )  #    $gene              = gene( $ftr )
# Line 41  Line 83 
83  #   \@EC_number = EC_number( $ftr )  #   \@EC_number = EC_number( $ftr )
84  #  #
85  #    $translation = CDS_translation( $ftr )  # Uses in situ if found  #    $translation = CDS_translation( $ftr )  # Uses in situ if found
86    #     $translation = CDS_translation( $ftr,  $dna )   # If not in feature, translate
87    #     $translation = CDS_translation( $ftr, \$dna )
88    #     $translation = CDS_translation( $ftr,  $entry )
89  #  #
90  #  Convert GenBank location to [ [ $contig, $begin, $dir, $length ], ... ]  #  Convert GenBank location to [ [ $contig, $begin, $dir, $length ], ... ]
91  #  #
# Line 62  Line 107 
107  #===============================================================================  #===============================================================================
108    
109  use strict;  use strict;
 # eval { require gjoseqlib; }   # required for translations  
110    
111  require Exporter;  require Exporter;
112  our @ISA = qw(Exporter);  our @ISA = qw(Exporter);
113  our @EXPORT = qw( parse_genbank );  our @EXPORT = qw( parse_genbank
114                      parse_next_genbank
115                    );
116    
117  #===============================================================================  #===============================================================================
118  #  #
# Line 77  Line 123 
123  #    @entries = parse_genbank(  $file )  #    @entries = parse_genbank(  $file )
124  #   \@entries = parse_genbank(  $file )  #   \@entries = parse_genbank(  $file )
125  #  #
126  #  Each entry is a hash with key value pairs, some of which have special  #===============================================================================
127  #  processing.  my %genbank_streams;
128  #  
129  #     Key => Value  sub parse_genbank
130    {
131        my $file = shift;
132    
133        my ( $fh, $close ) = &input_filehandle( $file );
134        $fh or return wantarray ? () : [];
135    
136        my @entries;
137        while ( my $entry = parse_one_genbank_entry( $fh ) ) { push @entries, $entry }
138    
139        close $fh if $close;
140    
141        wantarray ? @entries : \@entries;
142    }
143    
144    
145    #-------------------------------------------------------------------------------
146    #  Read and parse a GenBank file, on entry at a time.  Successive calls with
147    #  same parameter will return successive entries.  Calls to different files
148    #  can be interlaced.
149  #  #
150  #     ACCESSION  => [ Accession, ... ]  #      $entry = parse_next_genbank( )         #  STDIN
151  #     DATE       =>   Date  #      $entry = parse_next_genbank( \*FH )
152  #     DEFINITION =>   Definition  #      $entry = parse_next_genbank( $file )
 #     GEOMETRY   =>   Geometry  
 #     GI         =>   GI_number  
 #     KEYWORDS   => { Key_phrase => 1, Key_phrase => 1, ... }  
 #     LOCUS      =>   Locus  
 #     ORGANISM   =>   Organism_name  
 #     REFERENCES => [ { Field => Value, Field => Value, ... }, ... ]  
 #     SEQUENCE   =>   Sequence  
 #     SOURCE     =>   Source_string  
 #     TAXONOMY   => [ Taxon, Taxon, ... ]  
 #     VERSION    => [ Version, Other_information ]  
153  #  #
154  #  Feature records are merged by type.  Slash is removed from qualifier name.  #  Error or end-of-file returns undef.
155  #  Surrounding quotation marks are stripped from qualifier values.  #-------------------------------------------------------------------------------
156    sub parse_next_genbank
157    {
158        my $file = shift;
159    
160        my $stream = $genbank_streams{ $file || '' };
161        if ( ! $stream )
162        {
163            $stream = [ &input_filehandle( $file ) ];   #  Value is ( $fh, $close )
164            $stream->[0] or return undef;               #  Got a file handle?
165            $genbank_streams{ $file || '' } = $stream;
166        }
167    
168        my ( $fh, $close ) = @$stream;
169        my $entry = parse_one_genbank_entry( $fh );
170    
171        if ( ! $entry ) { close $fh if $close; delete $genbank_streams{ $file || '' }; }
172    
173        return $entry;
174    }
175    
176    
177    #-------------------------------------------------------------------------------
178    #  If it should be necessary to close a stream openned by parse_next_genbank()
179    #  before it reaches the end-of-file, this will do it.
180  #  #
181  #     FEATURES   => { Type => [ [ Location, { Qualifier => \@values } ],  #      close_next_genbank( )         # does nothing
182  #                               [ Location, { Qualifier => \@values } ],  #      close_next_genbank( \*FH )
183  #                               ...  #      close_next_genbank( $file )
 #                             ],  
 #                     Type => ...  
 #                   }  
184  #  #
185  #===============================================================================  #-------------------------------------------------------------------------------
186  sub parse_genbank  sub close_next_genbank
187  {  {
188      my $file = shift;      my $file = shift;
189      my ( $fh, $close ) = input_filehandle( $file );      my $stream = $genbank_streams{ $file || '' };
190      $fh or return wantarray() ? () : [];      close $stream->[0] if $stream && ref $stream eq 'ARRAY' && $stream->[1];
191    }
192    
193      my @entries;  
194    #-------------------------------------------------------------------------------
195    #  Parse the next GenBank format entry read from an open file handle.  This is
196    #  primarily intended as an internal function called through parse_genbank()
197    #  or parse_next_genbank().
198    #
199    #     \%entry = parse_one_genbank_entry( \*FH )
200    #
201    #  Error or end-of-file returns undef
202    #-------------------------------------------------------------------------------
203    sub parse_one_genbank_entry
204    {
205        my $fh = shift;
206    
207      my $state = 0;      my $state = 0;
208      if ( defined( $_ = <$fh> ) ) { chomp } else { $state = -1 }      if ( defined( $_ = <$fh> ) ) { chomp } else { $state = -1 }
209    
210      #  0 = Looking for LOCUS      #  0 = Looking for LOCUS
211      #  1 = Header information      #  1 = Header information
212      #  2 = Features      #  2 = Features
213      #  3 = Sequence      #  3 = Sequence
214      # -1 = Error      # -1 = Error
215    
     while ( $state >= 0 )  
     {  
216          my %entry = ();          my %entry = ();
217          my @sequence;          my @sequence;
218    
# Line 326  Line 414 
414              }              }
415              $entry{ SEQUENCE } = join "", @sequence;              $entry{ SEQUENCE } = join "", @sequence;
416    
             push @entries, \%entry;  
   
417              $state = ( $_ eq '//' ) ? 0 : -1 if $state >= 0;              $state = ( $_ eq '//' ) ? 0 : -1 if $state >= 0;
418          }          }
     }  
419    
420      wantarray ? @entries : \@entries;      $state >= 0 ? \%entry : undef;
421  }  }
422    
423    
# Line 379  Line 464 
464  #===============================================================================  #===============================================================================
465  #  Access methods for some features and feature data  #  Access methods for some features and feature data
466  #===============================================================================  #===============================================================================
467  #  Sequence of a feature, optionally including information on partial ends.  #  Sequence of a feature.  In list context, include information on partial ends.
468    #  Can get extract the data from a DNA string, reference to a string, or from
469    #  the SEQUENCE in an entry.
470  #  #
471  #     $seq                           = ftr_dna(  $dna, $ftr )  #     $seq                           = ftr_dna(  $dna, $ftr )
472  #     $seq                           = ftr_dna( \$dna, $ftr )  #     $seq                           = ftr_dna( \$dna, $ftr )
473    #     $seq                           = ftr_dna(  $entry, $ftr )
474  #   ( $seq, $partial_5, $partial_3 ) = ftr_dna(  $dna, $ftr )  #   ( $seq, $partial_5, $partial_3 ) = ftr_dna(  $dna, $ftr )
475  #   ( $seq, $partial_5, $partial_3 ) = ftr_dna( \$dna, $ftr )  #   ( $seq, $partial_5, $partial_3 ) = ftr_dna( \$dna, $ftr )
476    #   ( $seq, $partial_5, $partial_3 ) = ftr_dna(  $entry, $ftr )
477  #  #
478  #-------------------------------------------------------------------------------  #-------------------------------------------------------------------------------
479  sub ftr_dna  sub ftr_dna
480  {  {
481      my ( $dnaR, $ftr ) = @_;      my ( $dna, $ftr ) = @_;
482      $dnaR and ( ! ref( $dnaR ) or ref( $dnaR ) eq 'SCALAR' ) or return undef;      return undef if ! ( $dna && $ftr );
483      $dnaR = \$dnaR if ! ref( $dnaR );  
484        my $dnaR =   ref $dna eq 'SCALAR'                     ?  $dna
485                 :   ref $dna eq 'HASH' && $dna->{ SEQUENCE } ? \$dna->{ SEQUENCE }
486                 : ! ref $dna                                 ? \$dna
487                 :                                               undef;
488        return undef if ! $dnaR;
489    
490      my $loc = &location( $ftr );      my $loc = &location( $ftr );
491      $loc or return undef;      $loc or return undef;
# Line 434  Line 528 
528    
529    
530  #-------------------------------------------------------------------------------  #-------------------------------------------------------------------------------
531    #  Identify features with partial 5' or 3' ends.
532    #
533    #     $partial_5_prime = partial_5_prime( $ftr )
534    #     $partial_3_prime = partial_3_prime( $ftr )
535    #
536    #-------------------------------------------------------------------------------
537    sub partial_5_prime
538    {
539        my $ftr = shift             or return undef;
540        my $loc = &location( $ftr ) or return undef;
541        my $complement = ( $loc =~ s/^complement\((.*)\)$/$1/ );
542        $loc =~ s/^join\((.*)\)$/$1/;
543        my @spans = split /,/, $loc;
544    
545        $complement ? $spans[-1] =~ /\.\.>/ : $spans[0] =~ /^</;
546    }
547    
548    
549    sub partial_3_prime
550    {
551        my $ftr = shift             or return undef;
552        my $loc = &location( $ftr ) or return undef;
553        my $complement = ( $loc =~ s/^complement\((.*)\)$/$1/ );
554        $loc =~ s/^join\((.*)\)$/$1/;
555        my @spans = split /,/, $loc;
556    
557        $complement ? $spans[0] =~ /^</ : $spans[-1] =~ /\.\.>/;
558    }
559    
560    
561    #-------------------------------------------------------------------------------
562  #  #
563  #    $ftr_location = location( $ftr )   #  Returns empty string on failure.  #    $ftr_location = location( $ftr )   #  Returns empty string on failure.
564  #  #
# Line 582  Line 707 
707  #   $translation = CDS_translation( $ftr )  #   $translation = CDS_translation( $ftr )
708  #   $translation = CDS_translation( $ftr,  $dna )  #   $translation = CDS_translation( $ftr,  $dna )
709  #   $translation = CDS_translation( $ftr, \$dna )  #   $translation = CDS_translation( $ftr, \$dna )
710    #   $translation = CDS_translation( $ftr,  $entry )
711    #
712  #  #
713  #-------------------------------------------------------------------------------  #-------------------------------------------------------------------------------
714  sub CDS_translation  sub CDS_translation
715  {  {
716      my ( $ftr, $seqR ) = @_;      my ( $ftr, $dna ) = @_;
717      my $qual = &qualifiers( $ftr );      my $qual = &qualifiers( $ftr );
718    
719      return $qual->{ translation }->[0] if $qual->{ translation };      return $qual->{ translation }->[0] if $qual->{ translation };
     return undef unless $seqR;  
720    
721      my $CDS_dna = ftr_dna( $seqR, $ftr ) or return undef;      return undef if ! $dna;
722    
723      my $have_lib = 0;      my $have_lib = 0;
724      eval { require gjoseqlib; $have_lib = 1; };      eval { require gjoseqlib; $have_lib = 1; };
725        return undef if ! $have_lib;
726    
727        my $CDS_dna = ftr_dna( $dna, $ftr ) or return undef;
728        my $pep = gjoseqlib::translate_seq( $CDS_dna, ! partial_5_prime( $ftr ) );
729        $pep =~ s/\*$// if $pep;
730    
731      return      $pep;
732  }  }
733    
734    

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.3

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3