[Bio] / FigKernelPackages / FigGFF.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FigGFF.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.8, Sat Apr 9 21:09:30 2005 UTC revision 1.22, Fri May 18 13:59:59 2007 UTC
# Line 1  Line 1 
1  #  #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18    #
19  # FIG GFF utilities.  # FIG GFF utilities.
20  #  #
21    
# Line 12  Line 29 
29    
30  use base qw(Exporter);  use base qw(Exporter);
31  use vars qw(@EXPORT);  use vars qw(@EXPORT);
32  @EXPORT = qw(map_seed_alias_to_dbxref);  @EXPORT = qw(map_seed_alias_to_dbxref map_dbxref_to_seed_alias);
33    
34  #  #
35  # General GFF-related routines.  # General GFF-related routines.
# Line 57  Line 74 
74      {      {
75          return "NCBI_NP:$1";          return "NCBI_NP:$1";
76      }      }
77        elsif (/^NM_(\d+.*)/)
78        {
79            return "NCBI_NM:$1";
80        }
81      elsif (/^gi\|(\d+)/)      elsif (/^gi\|(\d+)/)
82      {      {
83          return "NCBI_gi:$1";          return "NCBI_gi:$1";
# Line 86  Line 107 
107      my($dbxref) = @_;      my($dbxref) = @_;
108    
109      # if it is not a valid xref just return it      # if it is not a valid xref just return it
110      return $dbxref unless (m/:/);      return $dbxref unless $dbxref =~ m/:/;
111    
112      my($type, $ref) = split(/:/, $dbxref, 2);      my($type, $ref) = split(/:/, $dbxref, 2);
113    
114      if ($type eq "NCBI_NP")      if (lc($type) eq "ncbi_np")
115      {      {
116          return "NP_$ref";          return "$ref";
117      }      }
118      elsif ($type eq "NCBI_gi")      elsif (lc($type) eq "ncbi_nm")
119        {
120            return "$ref";
121        }
122        elsif (lc($type) eq "ncbi_pid")
123        {
124            return "$ref";
125        }
126        elsif (lc($type) eq "ncbi_gi")
127      {      {
128          return "gi|$ref";          return "gi|$ref";
129      }      }
130      elsif ($type eq "KEGG")      elsif (lc($type) eq "kegg")
131      {      {
132          $ref =~ s/ /:/;          $ref =~ s/ /:/;
133          return "kegg|$ref";          return "kegg|$ref";
134      }      }
135      elsif ($type eq "UniProt")      elsif (lc($type) eq "uniprot")
136      {      {
137          return "uni|$ref";          return "uni|$ref";
138      }      }
139      elsif ($type eq "Swiss-Prot")      elsif (lc($type) eq "swiss-prot")
140      {      {
141          return "sp|$ref";          return "sp|$ref";
142      }      }
# Line 381  Line 410 
410                  chomp($addseq);                  chomp($addseq);
411                  $fastasequences .= ">$protein_id\n$addseq\n";                  $fastasequences .= ">$protein_id\n$addseq\n";
412    
413                  my $addseq = uc($fig->dna_seq($genome, @location));                  $addseq = uc($fig->dna_seq($genome, @location));
414                  $addseq =~ s/(.{$linelength})/$1\n/g; chomp($addseq);                  $addseq =~ s/(.{$linelength})/$1\n/g; chomp($addseq);
415    
416                  $fastasequences .= ">$cds_id\n$addseq\n";                  $fastasequences .= ">$cds_id\n$addseq\n";
# Line 541  Line 570 
570    
571  use base qw(Class::Accessor);  use base qw(Class::Accessor);
572    
573  __PACKAGE__->mk_accessors(qw(fig current_file));  __PACKAGE__->mk_accessors(qw(fig current_file features_by_genome feature_index features filename fasta_data contig_checksum genome_checksum contigs));
574    
575  my $count;  my $count;
576    
577    =pod
578    
579    =head1 GFFParser
580    
581    A parser for GFF3 files.
582    
583    =head2 new()
584    
585    Instantiate
586    my $fgff = GFFParser->new($fig);
587    
588    =cut
589    
590    
591  #  #
592  # GFF file parser. Creates GFFFiles.  # GFF file parser. Creates GFFFiles.
# Line 561  Line 603 
603      return bless($self, $class);      return bless($self, $class);
604  }  }
605    
606    =head2 parse()
607    
608    Takes a filename as an argument, and returns a file object.
609    
610    The file object is a reference to a hash with the following keys:
611            features_by_genome
612                    An array of all the features in this genome
613            feature_index
614                    A hash with a key of the features by ID and the value being the GFFFeature
615            features
616                    All the features in the genome, as an array with each element being a GFFFeature element
617            filename
618                    The filename of the file that was parsed
619            fasta_data
620                    A hash with the key being the ID and the value being the sequence
621    
622    Not sure about:
623            contig_checksum
624            genome_checksum
625            contigs
626            fig
627    
628    This is method now stores the data internally, so you can then access the data as:
629            $fgff->features_by_genome->{}
630            $fgff->feature_index->{}
631            $fgff->features->{}
632            $fgff->filename->{}
633            $fgff->fasta_data->{}
634            $fgff->contig_checksum->{}
635            $fgff->genome_checksum->{}
636            $fgff->contigs->{}
637            $fgff->fig->{}
638    =cut
639    
640  sub parse  sub parse
641  {  {
642      my($self, $file) = @_;      my($self, $file) = @_;
# Line 579  Line 655 
655      }      }
656      else      else
657      {      {
658          open($fh, "<$file") or confess "Cannot open $file: $!";          if ($file =~ /\.gz$/) {open($fh, "gunzip -c $file |") or confess "can't open a pipe to gunzip $file"}
659            else {open($fh, "<$file") or confess "Cannot open $file: $!";}
660          $fobj->filename($file);          $fobj->filename($file);
661          $close_handle = 1;          $close_handle = 1;
662      }      }
# Line 605  Line 682 
682      while (<$fh>)      while (<$fh>)
683      {      {
684          chomp;          chomp;
685            next unless ($_); # ignore empty lines
686          #          #
687          # Check first for the fasta directive so we can run off and parse that          # Check first for the fasta directive so we can run off and parse that
688          # separately.          # separately.
689          #          #
690    
691    
692          if (/^>/)          if (/^>/)
693          {          {
694              $self->parse_fasta($fh, $_);              $self->parse_fasta($fh, $_);
# Line 617  Line 696 
696          }          }
697          elsif (/^\#\#FASTA/)          elsif (/^\#\#FASTA/)
698          {          {
699              print "Got fasta directive\n";              # print "Got fasta directive\n";
700              $_ = <$fh>;              $_ = <$fh>;
701              chomp;              chomp;
702              $self->parse_fasta($fh, $_);              $self->parse_fasta($fh, $_);
# Line 630  Line 709 
709              #              #
710              next;              next;
711          }          }
712            elsif (/^\#$/)
713            {
714                #
715                # blank line starting with #
716                #
717                next;
718            }
719          elsif (/^\#\#(\S+)(?:\t(.*))?/)          elsif (/^\#\#(\S+)(?:\t(.*))?/)
720          {          {
721              #              #
# Line 665  Line 751 
751          }          }
752      }      }
753    
754        foreach my $k (qw[features_by_genome feature_index features filename fasta_data contig_checksum genome_checksum contigs])
755        {
756                $self->{$k}=$fobj->{$k};
757        }
758    
759      return $fobj;      return $fobj;
760  }  }
761    
762  sub parse_gff3_directive  =head2 feature_tree
 {  
     my($self, $directive, $rest) = @_;  
763    
764      $directive = lc($directive);  Generate and return a feature tree for the features in the GFF3 file. Most features have Parent/Child relationships, eg. an exon is a child of a gene, and a CDS is a child of an mRNA. This method will return the tree so that you can recurse up and down it.
765      my @rest=split /\t/, $rest;  
766    =cut
767    
768      if ($directive eq "genome")  sub feature_tree {
769            my $self=shift;
770            return $self->{'tree'} if (defined $self->{'tree'});
771            my $tree;
772            my $fc;
773            foreach my $k (keys %{$self->features_by_genome})
774      {      {
775          $self->current_file->genome_id($rest[0]);  # first create a hash with only parents, and an array that houses thei children
776          $self->current_file->genome_name($rest[1]);                  my $children;
777                    foreach my $feat (@{$self->features_by_genome->{$k}})
778                    {
779                            my $parent;
780                            if (defined $feat->{'Parent'}) {$parent=$feat->{'Parent'}}
781                            elsif (defined $feat->{'attributes'}->{'Parent'}) {$parent=$feat->{'attributes'}->{'Parent'}}
782    
783                            if (defined $parent) {push @{$children->{$parent}}, $feat->{'ID'}}
784                            else {$tree->{$feat->{'ID'}}=undef}
785      }      }
786      elsif ($directive eq "genome_md5")  
787    # now add them to a tree
788                    $self->_add2tree($tree, [keys %$tree], $children);
789            }
790            $self->{'tree'}=$tree;
791            return $tree;
792    }
793    
794    sub _add2tree {
795            my ($self, $tree, $parents, $children)=@_;
796            foreach my $parent (@$parents)
797      {      {
798          $self->current_file->set_genome_checksum(@rest[0,1]);                  if ($children->{$parent})
799                    {
800                            map {$tree->{$parent}->{$_}=undef} @{$children->{$parent}};
801                            $self->_add2tree($tree->{$parent}, $children->{$parent}, $children);
802                    }
803      }      }
804      elsif ($directive eq "origin")  }
805    
806    
807    
808    
809    
810    =head2 parse_gff3_directive()
811    
812    Pases the directives within the files (e.g. headers, flags for FASTA, and so on).
813    
814    =cut
815    
816    
817    
818    sub parse_gff3_directive
819      {      {
820          print STDERR "We have a directive called origin but this should be changed as it will conflict with NCBI's ORIGIN indicating beginning of the sequence\n";      my($self, $directive, $rest) = @_;
821          print STDERR "At the moment ORIGIN is returned by \$feat->project\n";  
822          $self->current_file->project($rest[0]);      $directive = lc($directive);
823        # this should catch both #seed and ##seed :-)
824        if ($directive eq "seed")
825        {
826          return $self->parse_seed_directive($rest);
827      }      }
828      elsif ($directive eq "taxonomy")  
829        my @rest=split /\t/, $rest;
830    
831        # removed genome, genome_md5, origin, taxnomy as they are not real gff directives. These are in seed_directives below
832        if ($directive eq "project")
833      {      {
834          $self->current_file->taxonomy($rest);          # I am not sure if PROJECT is a seed directive or a GFF directive
835            $self->current_file->project($rest[0]);
836      }      }
837      elsif ($directive eq "sequence-region")      elsif ($directive eq "sequence-region")
838      {      {
# Line 703  Line 843 
843      }      }
844      else      else
845      {      {
846          print "Have gff3 directive '$directive' rest='$rest'\n";          print STDERR "Have gff3 directive '$directive' rest='$rest'\n";
847      }      }
848    
849  }  }
850    
851    =head2 parse_seed_directive()
852    
853    Parse out seed information that we hide in the headers, eg, project, name, taxid, and so on. These are our internal representations, but are generally treated as comments by other gff3 parsers
854    
855    =cut
856    
857  sub parse_seed_directive  sub parse_seed_directive
858  {  {
859      my($self, $rest) = @_;      my($self, $rest) = @_;
# Line 715  Line 861 
861      my($verb, @rest) = split(/\t/, $rest);      my($verb, @rest) = split(/\t/, $rest);
862    
863      # are we case sensitive? I don't think so      # are we case sensitive? I don't think so
864      $verb-lc($verb);      $verb=lc($verb);
865    
866      if ($verb eq "anno_start")      if ($verb eq "genome_id")
867        {
868            $self->current_file->genome_id($rest[0]);
869        }
870        elsif ($verb eq "name")
871        {
872            $self->current_file->genome_name($rest[0]);
873        }
874        elsif ($verb eq "genome_md5")
875        {
876            $self->current_file->set_genome_checksum($rest[0]);
877        }
878        elsif ($verb eq "project")
879        {
880            # I am not sure if PROJECT is a seed directive or a GFF directive
881            $self->current_file->project($rest[0]);
882        }
883        elsif ($verb eq "taxonomy")
884        {
885            $self->current_file->taxonomy($rest[0]);
886        }
887        elsif ($verb eq "taxon_id")
888        {
889            $self->current_file->taxon_id($rest[0]);
890        }
891        elsif ($verb eq "anno_start")
892      {      {
893          $self->current_file->anno_start($rest[0]);          $self->current_file->anno_start($rest[0]);
894      }      }
# Line 731  Line 902 
902      }      }
903  }  }
904    
905    =head2 parse_local_directive()
906    
907    I haven't seen one of these :)
908    
909    =cut
910    
911  sub parse_local_directive  sub parse_local_directive
912  {  {
913      my($self, $directive, $rest) = @_;      my($self, $directive, $rest) = @_;
914    
915      print "Have local directive '$directive' rest='$rest'\n";      print STDERR "Have local directive '$directive' rest='$rest'\n";
916  }  }
917    
918    =head2 parse_feature
919    
920    Reads a feature line and stuffs it into the right places, as appropriate.
921    
922    =cut
923    
924  sub parse_feature  sub parse_feature
925  {  {
926      my($self, $seqid, $source, $type, $start, $end, $score, $strand, $phase, $attributes) = @_;      my($self, $seqid, $source, $type, $start, $end, $score, $strand, $phase, $attributes) = @_;
# Line 797  Line 980 
980    
981              $value = \@values;              $value = \@values;
982          }          }
983            else
984            {
985                $value = $values[0];
986            }
987    
988    
989          $athash->{$name} = $value;          $athash->{$name} = $value;
990    
# Line 837  Line 1025 
1025  # in order to support the backward-compatiblity syntax that  # in order to support the backward-compatiblity syntax that
1026  # lets a file skip the ##FASTA directive if it wishes.  # lets a file skip the ##FASTA directive if it wishes.
1027  #  #
1028    
1029    =head2 parse_fasta()
1030    
1031    Read the fasta sequence into memory
1032    
1033    =cut
1034    
1035  sub parse_fasta  sub parse_fasta
1036  {  {
1037      my($self, $fh, $first_line) = @_;      my($self, $fh, $first_line) = @_;
1038      my($cur, $cur_id);      my($cur, $cur_id);
1039    
1040      for ($_ = $first_line; $_;  $_ = <$fh>, chomp)      for ($_ = $first_line; defined($_);  $_ = <$fh>, chomp)
1041      {      {
1042          if (/^>\s*(\S+)/)          if (/^>\s*(\S+)/)
1043          {          {
# Line 879  Line 1074 
1074      $self->current_file->fasta_data($id, $data);      $self->current_file->fasta_data($id, $data);
1075  }  }
1076    
1077    =pod
1078    
1079    =head1 GFFFeature
1080    
1081    A GFFFeature that acceesses the data
1082    
1083    =head2 methods
1084    
1085    fig seqid source type start end score strand phase attributes genome fig_id
1086    
1087    =cut
1088    
1089  package GFFFeature;  package GFFFeature;
1090    
1091  use strict;  use strict;
# Line 889  Line 1096 
1096    
1097  map { $GFF_Tags{$_} = 1 } @GFF_Tags;  map { $GFF_Tags{$_} = 1 } @GFF_Tags;
1098    
1099  __PACKAGE__->mk_accessors(qw(fig seqid source type start end score strand phase attributes genome fig_id),  __PACKAGE__->mk_accessors(qw(fig seqid source type start end score strand phase attributes
1100                                 genome fig_id),
1101                            @GFF_Tags);                            @GFF_Tags);
1102    
1103    
# Line 968  Line 1176 
1176  use strict;  use strict;
1177  use base qw(Class::Accessor);  use base qw(Class::Accessor);
1178    
1179  __PACKAGE__->mk_accessors(qw(fig filename features feature_index anno_start anno_end));  __PACKAGE__->mk_accessors(qw(fig filename features feature_index anno_start anno_end taxon_id genome_id));
1180    
1181  #  #
1182  # Package to hold the contents of a GFF file, and to hold the code  # Package to hold the contents of a GFF file, and to hold the code
# Line 986  Line 1194 
1194          features => [],          features => [],
1195          contigs  => [],          contigs  => [],
1196          feature_index => {},          feature_index => {},
1197          genome_checksum => {},          genome_checksum => '',
1198          contig_checksum => {},          contig_checksum => {},
1199          features_by_genome => {},          features_by_genome => {},
1200      };      };
# Line 1011  Line 1219 
1219    
1220  sub genome_checksum  sub genome_checksum
1221  {  {
     my($self, $genome) = @_;  
   
     return $self->{genome_checksum}->{$genome};  
 }  
   
 sub genome_checksums  
 {  
1222      my($self) = @_;      my($self) = @_;
1223    
1224      return [map { [$_, $self->{genome_checksum}->{$_}] } keys(%{$self->{genome_checksum}})];      return $self->{genome_checksum};
1225  }  }
1226    
1227  sub set_genome_checksum  sub set_genome_checksum
1228  {  {
1229      my($self, $genome, $md5sum) = @_;      my($self, $md5sum) = @_;
1230      $self->{genome_checksum}->{$genome} = $md5sum;      $self->{genome_checksum} = $md5sum;
1231  }  }
1232    
1233  sub set_contig_checksum  sub set_contig_checksum
# Line 1061  Line 1262 
1262  sub contigs  sub contigs
1263  {  {
1264      my($self, $contig) = @_;      my($self, $contig) = @_;
1265        if ($contig && $contig =~ /\w\w\_\d+\.\d+/) {
1266          print STDERR "WARNING: $contig appears to have a version number. We should standardize on timming that somewhere\n";
1267        }
1268      $contig && (push @{$self->{contigs}}, $contig);      $contig && (push @{$self->{contigs}}, $contig);
1269      return $self->{contigs};      return $self->{contigs};
1270  }  }

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.22

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3