[Bio] / FigKernelPackages / gjogenbank.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/gjogenbank.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.11, Tue Mar 26 23:16:17 2013 UTC revision 1.12, Mon Aug 31 20:29:13 2015 UTC
# Line 13  Line 13 
13  #     \@entries = parse_genbank( \*FH )  #     \@entries = parse_genbank( \*FH )
14  #      @entries = parse_genbank(  $file )  #      @entries = parse_genbank(  $file )
15  #     \@entries = parse_genbank(  $file )  #     \@entries = parse_genbank(  $file )
16    #      @entries = parse_genbank( \$text )
17    #     \@entries = parse_genbank( \$text )
18  #  #
19  #  One entry per call:  #  One entry per call:
20  #  #
# Line 68  Line 70 
70  #     @types = feature_types( $entry );  #     @types = feature_types( $entry );
71  #    \@types = feature_types( $entry );  #    \@types = feature_types( $entry );
72  #  #
73    #  Note that the returned features DO NOT include the type!
74    #
75  #     @ftrs = features_of_type( $entry,  @types );  #     @ftrs = features_of_type( $entry,  @types );
76  #    \@ftrs = features_of_type( $entry,  @types );  #    \@ftrs = features_of_type( $entry,  @types );
77  #     @ftrs = features_of_type( $entry, \@types );  #     @ftrs = features_of_type( $entry, \@types );
# Line 191  Line 195 
195    
196                             rpt_type                             rpt_type
197                             rpt_family                             rpt_family
198                               rpt_unit_range
199                               rpt_unit_seq
200    
201                             operon                             operon
202                             gene                             gene
203                             gene_synonym                             gene_synonym
204                             allele                             allele
205                             locus_tag                             locus_tag
206                               old_locus_tag
207                             codon_start                             codon_start
208                             transl_table                             transl_table
209    
210                             anticodon                             anticodon
211                             product                             product
212                             function                             function
213                               GO_component
214                               GO_function
215                               GO_process
216                             EC_number                             EC_number
217                             protein_id                             protein_id
218                             locus_peptide                             locus_peptide
# Line 233  Line 243 
243                           trans_splicing                           trans_splicing
244                        );                        );
245    
246    #  Qualifiers whose values are not wrapped in quotation marks:
247    
248    my %unquoted_qual  = map { $_ => 1 }
249                         qw( anticodon
250                             citation
251                             codon_start
252                             compare
253                             direction
254                             estimated_length
255                             mod_base
256                             number
257                             rpt_type
258                             rpt_unit_range
259                             tag_peptide
260                             transl_except
261                             transl_table
262                          );
263    
264    
265    my %genbank_streams;   # used by parse_next_genbank() to track open streams
266    
267  #===============================================================================  #===============================================================================
268    #  Read GenBank entries from one or more files and/or strings.
269  #  #
270  #    @entries = parse_genbank( )           #  \*STDIN  #    @entries = parse_genbank( )           #  \*STDIN
271  #   \@entries = parse_genbank( )           #  \*STDIN  #   \@entries = parse_genbank( )           #  \*STDIN
272  #    @entries = parse_genbank( \*FH )  #    @entries = parse_genbank( \*FH, ... )       #  open file
273  #   \@entries = parse_genbank( \*FH )  #   \@entries = parse_genbank( \*FH, ... )       #  open file
274  #    @entries = parse_genbank(  $file )  #    @entries = parse_genbank(  $gb_file, ... )  #  file name
275  #   \@entries = parse_genbank(  $file )  #   \@entries = parse_genbank(  $gb_file, ... )  #  file name
276    #    @entries = parse_genbank( \$gb_text, ... )  #  reference to data string
277    #   \@entries = parse_genbank( \$gb_text, ... )  #  reference to data string
278  #  #
279  #===============================================================================  #===============================================================================
 my %genbank_streams;  
280    
281  sub parse_genbank  sub parse_genbank
282  {  {
283        my @entries;
284        my $first = 1;
285        while ( @_ || $first )
286        {
287      my $file = shift;      my $file = shift;
   
288      my ( $fh, $close ) = &input_filehandle( $file );      my ( $fh, $close ) = &input_filehandle( $file );
289      $fh or return wantarray ? () : [];          if ( $fh )
290            {
     my @entries;  
291      while ( my $entry = parse_one_genbank_entry( $fh ) ) { push @entries, $entry }      while ( my $entry = parse_one_genbank_entry( $fh ) ) { push @entries, $entry }
   
292      close $fh if $close;      close $fh if $close;
293            }
294            $first = 0;
295        }
296    
297      wantarray ? @entries : \@entries;      wantarray ? @entries : \@entries;
298  }  }
299    
300    
301  #-------------------------------------------------------------------------------  #-------------------------------------------------------------------------------
302  #  Read and parse a GenBank file, on entry at a time.  Successive calls with  #  Read and parse a GenBank file, one entry at a time.  Successive calls with
303  #  same parameter will return successive entries.  Calls to different files  #  same parameter will return successive entries.  Calls to different files
304  #  can be interlaced.  #  can be interlaced.
305  #  #
# Line 290  Line 326 
326    
327      if ( ! $entry ) { close $fh if $close; delete $genbank_streams{ $file || '' }; }      if ( ! $entry ) { close $fh if $close; delete $genbank_streams{ $file || '' }; }
328    
329      return $entry;      $entry;
330  }  }
331    
332    
333  #-------------------------------------------------------------------------------  #-------------------------------------------------------------------------------
334  #  If it should be necessary to close a stream openned by parse_next_genbank()  #  If it should be necessary to close a stream openned by parse_next_genbank()
335  #  before it reaches the end-of-file, this will do it.  #  before it reaches the end-of-file, this will do it.
# Line 366  Line 403 
403              if ( defined( $_ = <$fh> ) ) { chomp } else { $state = -1 }              if ( defined( $_ = <$fh> ) ) { chomp } else { $state = -1 }
404          }          }
405    
406            elsif ( /^CONTIG / )
407            {
408                $state = 3;
409                last;
410            }
411    
412          elsif ( /^ORIGIN / )          elsif ( /^ORIGIN / )
413          {          {
414              $state = 3;              $state = 3;
# Line 450  Line 493 
493    
494              else              else
495              {              {
496                  my $imax = @value - 2;                  if ( @value > 1 )
                 if ( $imax > 0 )  
497                  {                  {
498                      foreach ( @value[ 0 .. $imax ] ) { $_ .= ' ' if ! /-$/ }                      foreach ( @value[ 0 .. @value-2 ] ) { $_ .= ' ' if ! /-$/ }
499                  }                  }
500                  my $value = join( '', @value );                  my $value = join( '', @value );
501    
# Line 463  Line 505 
505                  }                  }
506                  elsif ( $tag eq 'VERSION' )                  elsif ( $tag eq 'VERSION' )
507                  {                  {
508                      if ( $value =~ / +GI:(\d+)/ )                      if ( $value =~ s/ +GI:(\d+)// )
509                      {                      {
510                          $entry{ gi } = $1;                          $entry{ gi } = $1;
                         $value =~ s/ +GI:\d+//;  
511                      }                      }
512                      $entry{ VERSION } = [ split / +/, $value ];                      $entry{ VERSION } = [ split / +/, $value ];
513                  }                  }
# Line 580  Line 621 
621    
622      while ( $state == 3 )      while ( $state == 3 )
623      {      {
624            #  Introducer to sequence data
625    
626          if ( /^ORIGIN/ )          if ( /^ORIGIN/ )
627          {          {
628              $entry{ ORIGIN } = $1 if /^ORIGIN \s+(\S.*\S)\s*$/;              $entry{ ORIGIN } = $1 if /^ORIGIN \s+(\S.*\S)\s*$/;
# Line 623  Line 666 
666                  push @{ $entry{ COMMENT } }, @value;                  push @{ $entry{ COMMENT } }, @value;
667              }              }
668    
669                elsif ( $tag eq 'CONTIG' )
670                {
671                    #  The value of CONTIG is a location string and is merged
672                    #  without spaces.
673                    $entry{ CONTIG } = join( '', @value );
674                }
675    
676              else              else
677              {              {
678                  my $imax = @value - 2;                  if ( @value > 1 )
                 if ( $imax > 0 )  
679                  {                  {
680                      foreach ( @value[ 0 .. $imax ] ) { $_ .= ' ' if ! /-$/ }                      foreach ( @value[ 0 .. @value-2 ] ) { $_ .= ' ' if ! /-$/ }
681                  }                  }
682                  $entry{ $tag } = join( '', @value );                  $entry{ $tag } = join( '', @value );
683              }              }
684          }          }
685    
686            #  End of entry without sequence data.  These can be replaced by
687            #  WGS and WGS_SCAFLD, or CONTIG or possibly something else.
688    
689            elsif ( $_ eq '//' )
690            {
691                #  We could fetch and assemble the data
692                if ( 0 and $entry{ CONTIG } and eval { require NCBI_sequence } )
693                {
694                    $entry{ SEQUENCE } = NCBI_sequence::contig( $entry{CONTIG} );
695                }
696                $state = 0;
697            }
698    
699          else  # This is really a format error, but let's skip it.          else  # This is really a format error, but let's skip it.
700          {          {
701              if ( defined( $_ = <$fh> ) ) { chomp } else { $state = -1 }              if ( defined( $_ = <$fh> ) ) { chomp } else { $state = -1 }
# Line 849  Line 911 
911  #-------------------------------------------------------------------------------  #-------------------------------------------------------------------------------
912  #  Sequence of a feature.  In list context, include information on partial ends.  #  Sequence of a feature.  In list context, include information on partial ends.
913  #  Can get extract the data from a DNA string, reference to a string, or from  #  Can get extract the data from a DNA string, reference to a string, or from
914  #  the SEQUENCE in an entry.  #  the SEQUENCE in an entry.  However, this does not handle locations that
915    #  specify contigs.  Also, this adjusts CDS features to the first nucleotide
916    #  of the first complete codon, which is not really what we should be doing.
917  #  #
918  #     $seq                           = ftr_dna(  $dna,   $ftr )  #     $seq                           = ftr_dna(  $dna,   $ftr )
919  #     $seq                           = ftr_dna( \$dna,   $ftr )  #     $seq                           = ftr_dna( \$dna,   $ftr )
# Line 882  Line 946 
946      my $complement = ( $loc =~ s/^complement\((.*)\)$/$1/ );      my $complement = ( $loc =~ s/^complement\((.*)\)$/$1/ );
947      $loc =~ s/^join\((.*)\)$/$1/;      $loc =~ s/^join\((.*)\)$/$1/;
948      my @spans = split /,/, $loc;      my @spans = split /,/, $loc;
949        #  For each substring, see if it needs to be complemented.  This does
950        #  not occur unless pieces are drawn from multiple contigs, which we
951        #  are not dealing with here anyway.
952        my @cspan = map { s/^complement\((.+)\)$/$1/ } @spans;
953      if ( grep { ! /^<?\d+\.\.>?\d+$/ } @spans )      if ( grep { ! /^<?\d+\.\.>?\d+$/ } @spans )
954      {      {
955          print STDERR "*** Feature location parse error: $loc0\n";          print STDERR "*** Feature location parse error: $loc0\n";
# Line 892  Line 960 
960      my $partial_3 = $spans[-1] =~ s/\.\.>/../;      my $partial_3 = $spans[-1] =~ s/\.\.>/../;
961      ( $partial_5, $partial_3 ) = ( $partial_3, $partial_5 ) if $complement;      ( $partial_5, $partial_3 ) = ( $partial_3, $partial_5 ) if $complement;
962    
963      my $seq = join( '', map { extract_span( $dnaR, $_ ) } @spans );      my @parts;
964        for ( my $i = 0; $i < @spans; $i++ )
965        {
966            $parts[$i] = $cspan[$i] ? gjoseqlib::complement_DNA_seq( extract_span( $dnaR, $spans[$i] ) )
967                                    :                                extract_span( $dnaR, $spans[$i] );
968        }
969        my $seq = join( '', @parts );
970      $seq = gjoseqlib::complement_DNA_seq( $seq ) if $complement;      $seq = gjoseqlib::complement_DNA_seq( $seq ) if $complement;
971    
972      #  Sequences that run off the end can start at other than the first      #  Sequences that run off the end can start at other than the first
973      #  nucleotide of a codon.      #  nucleotide of a codon.
974        #  This should not really be done here, even though it makes life easier
975        #  for the calling program. -- GJO, 2015-08-27
976    
977      my $qual = &qualifiers( $ftr );      my $qual = &qualifiers( $ftr );
978      my $codon_start = $qual->{ codon_start } ? $qual->{ codon_start }->[0] : 1;      my $codon_start = $qual->{ codon_start } ? $qual->{ codon_start }->[0] : 1;
# Line 1193  Line 1269 
1269  #   (accession:)?<?\d+^>?\d+                   # site between residues  #   (accession:)?<?\d+^>?\d+                   # site between residues
1270  #   (accession:)?\d+                           # single residue  #   (accession:)?\d+                           # single residue
1271  #   (accession:)?complement\(element\)  #   (accession:)?complement\(element\)
1272    #                gap\(\w+\)
1273  #   (accession:)?join\(element,element,...\)  #   (accession:)?join\(element,element,...\)
1274  #   (accession:)?order\(element,element,...\)  #   (accession:)?order\(element,element,...\)
1275  #  #
# Line 1565  Line 1642 
1642  #     mol_type   =>   Mol_type  #     mol_type   =>   Mol_type
1643  #  #
1644  #  The record types marked with a * are required.  #  The record types marked with a * are required.
1645  #  At least of of the types marked with a + must also be present.  #  At least one of the types marked with a + must also be present.
1646  #  #
1647  #  Feature records are merged by type.  Slash is removed from qualifier name.  #  Feature records are merged by type.  Slash is removed from qualifier name.
1648  #  Surrounding quotation marks are stripped from qualifier values.  #  Surrounding quotation marks are stripped from qualifier values.
# Line 2010  Line 2087 
2087  sub add_feature_qualifier  sub add_feature_qualifier
2088  {  {
2089      my ( $key, $value ) = @_;      my ( $key, $value ) = @_;
2090        $key or return;
2091        $value = '' if ! defined $value;
2092    
2093      my $qualif;      my $qualif;
2094      if ( ! defined $value || $value eq '' )      if ( $valueless_qual{$key} )
2095      {      {
2096          $qualif = "/$key";          $qualif = "/$key";
2097      }      }
2098        elsif ( $unquoted_qual{$key} )
2099        {
2100            $qualif = "/key=$value";
2101        }
2102      else      else
2103      {      {
2104          if ( $value !~ /^\d+$/ ) { $value =~ s/"/""/g; $value = qq("$value"); }          $value =~ s/"/""/g;
2105          $qualif = "/$key=$value";          $qualif = qq(/$key="$value");
2106      }      }
2107    
2108      formline( <<'End_of_continuation', $qualif ) if defined $qualif && length $qualif;      formline( <<'End_of_continuation', $qualif ) if defined $qualif && length $qualif;
# Line 2105  Line 2188 
2188  #  #
2189  #     filehandle is passed through  #     filehandle is passed through
2190  #     string is taken as file name to be opened  #     string is taken as file name to be opened
2191    #     scalar reference is taken as data string to be opened
2192  #     undef or "" defaults to STDIN  #     undef or "" defaults to STDIN
2193  #  #
2194  #      \*FH           = input_filehandle( $file );  #      \*FH           = input_filehandle( $file );
# Line 2131  Line 2215 
2215    
2216      #  File name      #  File name
2217    
2218      if ( ! ref( $file ) )      if ( ! ref( $file ) || ref( $file ) eq 'SCALAR' )
2219      {      {
2220          -f $file or die "Could not find input file '$file'.\n";          ref( $file ) or -f $file or die "Could not find input file '$file'.\n";
2221          my $fh;          my $fh;
2222          open( $fh, "<$file" ) || die "Could not open '$file' for input.\n";          open( $fh, "<", $file ) || die "Could not open '$file' for input.\n";
2223          return wantarray ? ( $fh, 1 ) : $fh;          return wantarray ? ( $fh, 1 ) : $fh;
2224      }      }
2225    
# Line 2148  Line 2232 
2232  #  #
2233  #     filehandle is passed through  #     filehandle is passed through
2234  #     string is taken as file name to be opened  #     string is taken as file name to be opened
2235    #     scalar reference is taken as data string to be opened
2236  #     undef or "" defaults to STDOUT  #     undef or "" defaults to STDOUT
2237  #  #
2238  #      \*FH           = output_filehandle( $file );  #      \*FH           = output_filehandle( $file );
# Line 2174  Line 2259 
2259    
2260      #  File name      #  File name
2261    
2262      if ( ! ref( $file ) )      if ( ! ref( $file ) || ref( $file ) eq 'SCALAR' )
2263      {      {
2264          my $fh;          my $fh;
2265          open( $fh, ">$file" ) || die "Could not open '$file' for output.\n";          open( $fh, ">", $file ) || die "Could not open '$file' for output.\n";
2266          return wantarray ? ( $fh, 1 ) : $fh;          return wantarray ? ( $fh, 1 ) : $fh;
2267      }      }
2268    

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.12

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3