[Bio] / FigKernelPackages / FigGFF.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FigGFF.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.4, Wed Mar 30 20:49:24 2005 UTC revision 1.20, Mon Dec 5 19:06:30 2005 UTC
# Line 1  Line 1 
1  #  #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18    #
19  # FIG GFF utilities.  # FIG GFF utilities.
20  #  #
21    
# Line 12  Line 29 
29    
30  use base qw(Exporter);  use base qw(Exporter);
31  use vars qw(@EXPORT);  use vars qw(@EXPORT);
32  @EXPORT = qw(map_seed_alias_to_dbxref);  @EXPORT = qw(map_seed_alias_to_dbxref map_dbxref_to_seed_alias);
33    
34  #  #
35  # General GFF-related routines.  # General GFF-related routines.
# Line 57  Line 74 
74      {      {
75          return "NCBI_NP:$1";          return "NCBI_NP:$1";
76      }      }
77        elsif (/^NM_(\d+.*)/)
78        {
79            return "NCBI_NM:$1";
80        }
81      elsif (/^gi\|(\d+)/)      elsif (/^gi\|(\d+)/)
82      {      {
83          return "NCBI_gi:$1";          return "NCBI_gi:$1";
# Line 85  Line 106 
106  {  {
107      my($dbxref) = @_;      my($dbxref) = @_;
108    
109        # if it is not a valid xref just return it
110        return $dbxref unless $dbxref =~ m/:/;
111    
112      my($type, $ref) = split(/:/, $dbxref, 2);      my($type, $ref) = split(/:/, $dbxref, 2);
113    
114      if ($type eq "NCBI_NP")      if (lc($type) eq "ncbi_np")
115        {
116            return "$ref";
117        }
118        elsif (lc($type) eq "ncbi_nm")
119      {      {
120          return "NP_$ref";          return "$ref";
121      }      }
122      elsif ($type eq "NCBI_gi")      elsif (lc($type) eq "ncbi_pid")
123        {
124            return "$ref";
125        }
126        elsif (lc($type) eq "ncbi_gi")
127      {      {
128          return "gi|$ref";          return "gi|$ref";
129      }      }
130      elsif ($type eq "KEGG")      elsif (lc($type) eq "kegg")
131      {      {
132          $ref =~ s/ /:/;          $ref =~ s/ /:/;
133          return "kegg|$ref";          return "kegg|$ref";
134      }      }
135      elsif ($type eq "UniProt")      elsif (lc($type) eq "uniprot")
136      {      {
137          return "uni|$ref";          return "uni|$ref";
138      }      }
139      elsif ($type eq "Swiss-Prot")      elsif (lc($type) eq "swiss-prot")
140      {      {
141          return "sp|$ref";          return "sp|$ref";
142      }      }
143    
144      return undef;      return $dbxref; # just return itself if we don't know what it is.
145  }  }
146    
147  package GFFWriter;  package GFFWriter;
# Line 134  Line 166 
166    
167      map { $default_options->{$_} = $options{$_} } keys(%options);      map { $default_options->{$_} = $options{$_} } keys(%options);
168    
169        # added contig_start_cache and contig_end_cache because we have something like
170        # sequence-region   contigname      1       10000
171        # in general we will set contig_start_cache == 1
172        # and contig_end_cache == contig_length_cache
173    
174      my $self = {      my $self = {
175          options => $default_options,          options => $default_options,
176          contig_length_cache => {},          contig_length_cache => {},
177            contig_start_cache  => {},
178            contig_end_cache    => {},
179          fig => $fig,          fig => $fig,
180      };      };
181    
# Line 371  Line 410 
410                  chomp($addseq);                  chomp($addseq);
411                  $fastasequences .= ">$protein_id\n$addseq\n";                  $fastasequences .= ">$protein_id\n$addseq\n";
412    
413                  my $addseq = uc($fig->dna_seq($genome, @location));                  $addseq = uc($fig->dna_seq($genome, @location));
414                  $addseq =~ s/(.{$linelength})/$1\n/g; chomp($addseq);                  $addseq =~ s/(.{$linelength})/$1\n/g; chomp($addseq);
415    
416                  $fastasequences .= ">$cds_id\n$addseq\n";                  $fastasequences .= ">$cds_id\n$addseq\n";
# Line 595  Line 634 
634      while (<$fh>)      while (<$fh>)
635      {      {
636          chomp;          chomp;
637            next unless ($_); # ignore empty lines
638          #          #
639          # Check first for the fasta directive so we can run off and parse that          # Check first for the fasta directive so we can run off and parse that
640          # separately.          # separately.
# Line 607  Line 647 
647          }          }
648          elsif (/^\#\#FASTA/)          elsif (/^\#\#FASTA/)
649          {          {
650              print "Got fasta directive\n";              # print "Got fasta directive\n";
651              $_ = <$fh>;              $_ = <$fh>;
652              chomp;              chomp;
653              $self->parse_fasta($fh, $_);              $self->parse_fasta($fh, $_);
# Line 635  Line 675 
675              # Directive.              # Directive.
676              #              #
677    
678              if ($1 eq "seed")              if (lc($1) eq "seed")
679              {              {
680                  $self->parse_seed_directive($2);                  $self->parse_seed_directive($2);
681              }              }
# Line 662  Line 702 
702  {  {
703      my($self, $directive, $rest) = @_;      my($self, $directive, $rest) = @_;
704    
705      print "Have gff3 directive '$directive' rest='$rest'\n";      $directive = lc($directive);
706        # this should catch both #seed and ##seed :-)
707        if ($directive eq "seed")
708        {
709          return $self->parse_seed_directive($rest);
710        }
711    
712        my @rest=split /\t/, $rest;
713    
714        # removed genome, genome_md5, origin, taxnomy as they are not real gff directives. These are in seed_directives below
715        if ($directive eq "project")
716        {
717            # I am not sure if PROJECT is a seed directive or a GFF directive
718            $self->current_file->project($rest[0]);
719        }
720        elsif ($directive eq "sequence-region")
721        {
722            $self->current_file->contigs($rest[0]);
723            $self->{contig_length_cache}->{$rest[0]}=$rest[2]-$rest[1];
724            $self->{contig_start_cache}->{$rest[0]}=$rest[1];
725            $self->{contig_end_cache}->{$rest[0]}=$rest[2];
726        }
727        else
728        {
729            print STDERR "Have gff3 directive '$directive' rest='$rest'\n";
730        }
731    
732  }  }
733    
734  sub parse_seed_directive  sub parse_seed_directive
# Line 671  Line 737 
737    
738      my($verb, @rest) = split(/\t/, $rest);      my($verb, @rest) = split(/\t/, $rest);
739    
740      if ($verb eq "anno_start")      # are we case sensitive? I don't think so
741        $verb=lc($verb);
742    
743        if ($verb eq "genome_id")
744      {      {
745          $self->current_file->anno_start($rest[0]);          $self->current_file->genome_id($rest[0]);
746      }      }
747      elsif ($verb eq "anno_end")      elsif ($verb eq "name")
748      {      {
749          $self->current_file->anno_start($rest[0]);          $self->current_file->genome_name($rest[0]);
750      }      }
751      elsif ($verb eq "genome_md5")      elsif ($verb eq "genome_md5")
752      {      {
753          $self->current_file->set_genome_checksum(@rest[0,1]);          $self->current_file->set_genome_checksum($rest[0]);
754        }
755        elsif ($verb eq "project")
756        {
757            # I am not sure if PROJECT is a seed directive or a GFF directive
758            $self->current_file->project($rest[0]);
759        }
760        elsif ($verb eq "taxonomy")
761        {
762            $self->current_file->taxonomy($rest[0]);
763        }
764        elsif ($verb eq "taxon_id")
765        {
766            $self->current_file->taxon_id($rest[0]);
767        }
768        elsif ($verb eq "anno_start")
769        {
770            $self->current_file->anno_start($rest[0]);
771        }
772        elsif ($verb eq "anno_end")
773        {
774            $self->current_file->anno_start($rest[0]);
775      }      }
776      elsif ($verb eq "contig_md5")      elsif ($verb eq "contig_md5")
777      {      {
# Line 693  Line 783 
783  {  {
784      my($self, $directive, $rest) = @_;      my($self, $directive, $rest) = @_;
785    
786      print "Have local directive '$directive' rest='$rest'\n";      print STDERR "Have local directive '$directive' rest='$rest'\n";
787  }  }
788    
789  sub parse_feature  sub parse_feature
# Line 727  Line 817 
817    
818          my @values = map { uri_unescape($_) } split(/,/, $value);          my @values = map { uri_unescape($_) } split(/,/, $value);
819    
820            # handle the aliases
821            if ($name eq "Alias") {
822             foreach my $val (@values)
823             {
824               $val = FigGFF::map_dbxref_to_seed_alias($val);
825             }
826            }
827    
828          #          #
829          # This might be a little goofy for the users, but we will use it          # This might be a little goofy for the users, but we will use it
830          # for now:          # for now:
# Line 747  Line 845 
845    
846              $value = \@values;              $value = \@values;
847          }          }
848            else
849            {
850                $value = $values[0];
851            }
852    
853    
854          $athash->{$name} = $value;          $athash->{$name} = $value;
855    
# Line 792  Line 895 
895      my($self, $fh, $first_line) = @_;      my($self, $fh, $first_line) = @_;
896      my($cur, $cur_id);      my($cur, $cur_id);
897    
898      for ($_ = $first_line; $_;  $_ = <$fh>, chomp)      for ($_ = $first_line; defined($_);  $_ = <$fh>, chomp)
899      {      {
900          if (/^>\s*(\S+)/)          if (/^>\s*(\S+)/)
901          {          {
# Line 826  Line 929 
929      my($self, $id, $data) = @_;      my($self, $id, $data) = @_;
930    
931      my $len = length($data);      my $len = length($data);
932      $self->current_file->set_fasta_data($id, $data);      $self->current_file->fasta_data($id, $data);
933  }  }
934    
935  package GFFFeature;  package GFFFeature;
# Line 839  Line 942 
942    
943  map { $GFF_Tags{$_} = 1 } @GFF_Tags;  map { $GFF_Tags{$_} = 1 } @GFF_Tags;
944    
945  __PACKAGE__->mk_accessors(qw(fig seqid source type start end score strand phase attributes genome fig_id),  __PACKAGE__->mk_accessors(qw(fig seqid source type start end score strand phase attributes
946                                 genome fig_id),
947                            @GFF_Tags);                            @GFF_Tags);
948    
949    
# Line 918  Line 1022 
1022  use strict;  use strict;
1023  use base qw(Class::Accessor);  use base qw(Class::Accessor);
1024    
1025  __PACKAGE__->mk_accessors(qw(fig filename features feature_index anno_start anno_end));  __PACKAGE__->mk_accessors(qw(fig filename features feature_index anno_start anno_end taxon_id genome_id));
1026    
1027  #  #
1028  # Package to hold the contents of a GFF file, and to hold the code  # Package to hold the contents of a GFF file, and to hold the code
# Line 934  Line 1038 
1038      my $self = {      my $self = {
1039          fig => $fig,          fig => $fig,
1040          features => [],          features => [],
1041            contigs  => [],
1042          feature_index => {},          feature_index => {},
1043          genome_checksum => {},          genome_checksum => '',
1044          contig_checksum => {},          contig_checksum => {},
1045          features_by_genome => {},          features_by_genome => {},
1046      };      };
# Line 960  Line 1065 
1065    
1066  sub genome_checksum  sub genome_checksum
1067  {  {
     my($self, $genome) = @_;  
   
     return $self->{genome_checksum}->{$genome};  
 }  
   
 sub genome_checksums  
 {  
1068      my($self) = @_;      my($self) = @_;
1069    
1070      return [map { [$_, $self->{genome_checksum}->{$_}] } keys(%{$self->{genome_checksum}})];      return $self->{genome_checksum};
1071  }  }
1072    
1073  sub set_genome_checksum  sub set_genome_checksum
1074  {  {
1075      my($self, $genome, $md5sum) = @_;      my($self, $md5sum) = @_;
1076      $self->{genome_checksum}->{$genome} = $md5sum;      $self->{genome_checksum} = $md5sum;
1077  }  }
1078    
1079  sub set_contig_checksum  sub set_contig_checksum
# Line 984  Line 1082 
1082      $self->{contig_checksum}->{$genome}->{$contig} = $md5sum;      $self->{contig_checksum}->{$genome}->{$contig} = $md5sum;
1083  }  }
1084    
1085  sub set_fasta_data  =head2 fasta_data()
1086    
1087    Get or set the fasta data. Given an id and some data will set the data for that id. Given an id will return the data for that id. Called without arguments will return a reference to a hash of sequences.
1088    
1089    This means that if you give it an id and sequence it will return that sequence. Hmmm.
1090    
1091    =cut
1092    
1093    sub fasta_data
1094  {  {
1095      my($self, $id, $data) = @_;      my($self, $id, $data) = @_;
1096        $id && $data && ($self->{fasta_data}->{$id} = $data);
1097        $id && return $self->{fasta_data}->{$id};
1098        return $self->{fasta_data};
1099    }
1100    
1101    
1102    =head2 contigs()
1103    
1104    Add a contig to the list, or return a reference to an array of contigs
1105    
1106    =cut
1107    
1108    sub contigs
1109    {
1110        my($self, $contig) = @_;
1111        if ($contig && $contig =~ /\w\w\_\d+\.\d+/) {
1112          print STDERR "WARNING: $contig appears to have a version number. We should standardize on timming that somewhere\n";
1113        }
1114        $contig && (push @{$self->{contigs}}, $contig);
1115        return $self->{contigs};
1116    }
1117    
1118    =head2 contig_length()
1119    
1120      $self->{fasta_data}->{$id} = $data;  Get or set the length of a specfic contig.
1121      my $length=$fob->contig_length($contig, $length);
1122      my $length=$fob->contig_length($contig);
1123    
1124    =cut
1125    
1126    sub contig_length
1127    {
1128       my($self, $contig, $length) = @_;
1129       $length && ($self->{contig_length_cache}->{$contig}=$length);
1130       return $self->{contig_length_cache}->{$contig};
1131  }  }
1132    
1133    =head1 Information about the source of the sequence.
1134    
1135    These are things that we have parsed out the GFF3 file, or want to add into the GFF3 file. We can use these methods to get or set them as required. In general, if a value is supplied that will be used as the new value.
1136    
1137    =cut
1138    
1139    =head2 genome_id()
1140    
1141    Get or set a genome id for this file.
1142    
1143    =cut
1144    
1145    sub genome_id
1146    {
1147        my($self, $genomeid) = @_;
1148        $genomeid && ($self->{genome_id}=$genomeid);
1149        return $self->{genome_id};
1150    }
1151    
1152    =head2 genome_name()
1153    
1154    Get or set a genome id for this file.
1155    
1156    =cut
1157    
1158    sub genome_name
1159    {
1160        my($self, $genomename) = @_;
1161        $genomename && ($self->{genome_name}=$genomename);
1162        return $self->{genome_name};
1163    }
1164    
1165    =head2 project()
1166    
1167    Get or set the project.
1168    
1169    =cut
1170    
1171    sub project
1172    {
1173         my ($self, $pro) = @_;
1174         $pro && ($self->{project}=$pro);
1175         return $self->{project};
1176    }
1177    
1178    =head2 taxonomy()
1179    
1180    Get or set the taxonomy
1181    
1182    =cut
1183    
1184    sub taxonomy
1185    {
1186        my($self, $tax) = @_;
1187        $tax && ($self->{taxonomy}=$tax);
1188        return $self->{taxonomy};
1189    }
1190    
1191    
1192    
1193    
1194    
1195  1;  1;

Legend:
Removed from v.1.4  
changed lines
  Added in v.1.20

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3