[Bio] / FigKernelPackages / FigGFF.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FigGFF.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.5, Fri Apr 8 20:22:23 2005 UTC revision 1.21, Sun Feb 5 00:45:19 2006 UTC
# Line 1  Line 1 
1  #  #
2    # Copyright (c) 2003-2006 University of Chicago and Fellowship
3    # for Interpretations of Genomes. All Rights Reserved.
4    #
5    # This file is part of the SEED Toolkit.
6    #
7    # The SEED Toolkit is free software. You can redistribute
8    # it and/or modify it under the terms of the SEED Toolkit
9    # Public License.
10    #
11    # You should have received a copy of the SEED Toolkit Public License
12    # along with this program; if not write to the University of Chicago
13    # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14    # Genomes at veronika@thefig.info or download a copy from
15    # http://www.theseed.org/LICENSE.TXT.
16    #
17    
18    #
19  # FIG GFF utilities.  # FIG GFF utilities.
20  #  #
21    
# Line 12  Line 29 
29    
30  use base qw(Exporter);  use base qw(Exporter);
31  use vars qw(@EXPORT);  use vars qw(@EXPORT);
32  @EXPORT = qw(map_seed_alias_to_dbxref);  @EXPORT = qw(map_seed_alias_to_dbxref map_dbxref_to_seed_alias);
33    
34  #  #
35  # General GFF-related routines.  # General GFF-related routines.
# Line 57  Line 74 
74      {      {
75          return "NCBI_NP:$1";          return "NCBI_NP:$1";
76      }      }
77        elsif (/^NM_(\d+.*)/)
78        {
79            return "NCBI_NM:$1";
80        }
81      elsif (/^gi\|(\d+)/)      elsif (/^gi\|(\d+)/)
82      {      {
83          return "NCBI_gi:$1";          return "NCBI_gi:$1";
# Line 85  Line 106 
106  {  {
107      my($dbxref) = @_;      my($dbxref) = @_;
108    
109        # if it is not a valid xref just return it
110        return $dbxref unless $dbxref =~ m/:/;
111    
112      my($type, $ref) = split(/:/, $dbxref, 2);      my($type, $ref) = split(/:/, $dbxref, 2);
113    
114      if ($type eq "NCBI_NP")      if (lc($type) eq "ncbi_np")
115      {      {
116          return "NP_$ref";          return "$ref";
117      }      }
118      elsif ($type eq "NCBI_gi")      elsif (lc($type) eq "ncbi_nm")
119        {
120            return "$ref";
121        }
122        elsif (lc($type) eq "ncbi_pid")
123        {
124            return "$ref";
125        }
126        elsif (lc($type) eq "ncbi_gi")
127      {      {
128          return "gi|$ref";          return "gi|$ref";
129      }      }
130      elsif ($type eq "KEGG")      elsif (lc($type) eq "kegg")
131      {      {
132          $ref =~ s/ /:/;          $ref =~ s/ /:/;
133          return "kegg|$ref";          return "kegg|$ref";
134      }      }
135      elsif ($type eq "UniProt")      elsif (lc($type) eq "uniprot")
136      {      {
137          return "uni|$ref";          return "uni|$ref";
138      }      }
139      elsif ($type eq "Swiss-Prot")      elsif (lc($type) eq "swiss-prot")
140      {      {
141          return "sp|$ref";          return "sp|$ref";
142      }      }
143    
144      return undef;      return $dbxref; # just return itself if we don't know what it is.
145  }  }
146    
147  package GFFWriter;  package GFFWriter;
# Line 134  Line 166 
166    
167      map { $default_options->{$_} = $options{$_} } keys(%options);      map { $default_options->{$_} = $options{$_} } keys(%options);
168    
169        # added contig_start_cache and contig_end_cache because we have something like
170        # sequence-region   contigname      1       10000
171        # in general we will set contig_start_cache == 1
172        # and contig_end_cache == contig_length_cache
173    
174      my $self = {      my $self = {
175          options => $default_options,          options => $default_options,
176          contig_length_cache => {},          contig_length_cache => {},
177            contig_start_cache  => {},
178            contig_end_cache    => {},
179          fig => $fig,          fig => $fig,
180      };      };
181    
# Line 371  Line 410 
410                  chomp($addseq);                  chomp($addseq);
411                  $fastasequences .= ">$protein_id\n$addseq\n";                  $fastasequences .= ">$protein_id\n$addseq\n";
412    
413                  my $addseq = uc($fig->dna_seq($genome, @location));                  $addseq = uc($fig->dna_seq($genome, @location));
414                  $addseq =~ s/(.{$linelength})/$1\n/g; chomp($addseq);                  $addseq =~ s/(.{$linelength})/$1\n/g; chomp($addseq);
415    
416                  $fastasequences .= ">$cds_id\n$addseq\n";                  $fastasequences .= ">$cds_id\n$addseq\n";
# Line 595  Line 634 
634      while (<$fh>)      while (<$fh>)
635      {      {
636          chomp;          chomp;
637            next unless ($_); # ignore empty lines
638          #          #
639          # Check first for the fasta directive so we can run off and parse that          # Check first for the fasta directive so we can run off and parse that
640          # separately.          # separately.
641          #          #
642    
643    
644          if (/^>/)          if (/^>/)
645          {          {
646              $self->parse_fasta($fh, $_);              $self->parse_fasta($fh, $_);
# Line 607  Line 648 
648          }          }
649          elsif (/^\#\#FASTA/)          elsif (/^\#\#FASTA/)
650          {          {
651              print "Got fasta directive\n";              # print "Got fasta directive\n";
652              $_ = <$fh>;              $_ = <$fh>;
653              chomp;              chomp;
654              $self->parse_fasta($fh, $_);              $self->parse_fasta($fh, $_);
# Line 620  Line 661 
661              #              #
662              next;              next;
663          }          }
664            elsif (/^\#$/)
665            {
666                #
667                # blank line starting with #
668                #
669                next;
670            }
671          elsif (/^\#\#(\S+)(?:\t(.*))?/)          elsif (/^\#\#(\S+)(?:\t(.*))?/)
672          {          {
673              #              #
# Line 662  Line 710 
710  {  {
711      my($self, $directive, $rest) = @_;      my($self, $directive, $rest) = @_;
712    
713      print "Have gff3 directive '$directive' rest='$rest'\n";      $directive = lc($directive);
714        # this should catch both #seed and ##seed :-)
715        if ($directive eq "seed")
716        {
717          return $self->parse_seed_directive($rest);
718        }
719    
720        my @rest=split /\t/, $rest;
721    
722        # removed genome, genome_md5, origin, taxnomy as they are not real gff directives. These are in seed_directives below
723        if ($directive eq "project")
724        {
725            # I am not sure if PROJECT is a seed directive or a GFF directive
726            $self->current_file->project($rest[0]);
727        }
728        elsif ($directive eq "sequence-region")
729        {
730            $self->current_file->contigs($rest[0]);
731            $self->{contig_length_cache}->{$rest[0]}=$rest[2]-$rest[1];
732            $self->{contig_start_cache}->{$rest[0]}=$rest[1];
733            $self->{contig_end_cache}->{$rest[0]}=$rest[2];
734        }
735        else
736        {
737            print STDERR "Have gff3 directive '$directive' rest='$rest'\n";
738        }
739    
740  }  }
741    
742  sub parse_seed_directive  sub parse_seed_directive
# Line 671  Line 745 
745    
746      my($verb, @rest) = split(/\t/, $rest);      my($verb, @rest) = split(/\t/, $rest);
747    
748      if ($verb eq "anno_start")      # are we case sensitive? I don't think so
749        $verb=lc($verb);
750    
751        if ($verb eq "genome_id")
752      {      {
753          $self->current_file->anno_start($rest[0]);          $self->current_file->genome_id($rest[0]);
754      }      }
755      elsif ($verb eq "anno_end")      elsif ($verb eq "name")
756      {      {
757          $self->current_file->anno_start($rest[0]);          $self->current_file->genome_name($rest[0]);
758      }      }
759      elsif ($verb eq "genome_md5")      elsif ($verb eq "genome_md5")
760      {      {
761          $self->current_file->set_genome_checksum(@rest[0,1]);          $self->current_file->set_genome_checksum($rest[0]);
762        }
763        elsif ($verb eq "project")
764        {
765            # I am not sure if PROJECT is a seed directive or a GFF directive
766            $self->current_file->project($rest[0]);
767        }
768        elsif ($verb eq "taxonomy")
769        {
770            $self->current_file->taxonomy($rest[0]);
771        }
772        elsif ($verb eq "taxon_id")
773        {
774            $self->current_file->taxon_id($rest[0]);
775        }
776        elsif ($verb eq "anno_start")
777        {
778            $self->current_file->anno_start($rest[0]);
779        }
780        elsif ($verb eq "anno_end")
781        {
782            $self->current_file->anno_start($rest[0]);
783      }      }
784      elsif ($verb eq "contig_md5")      elsif ($verb eq "contig_md5")
785      {      {
# Line 693  Line 791 
791  {  {
792      my($self, $directive, $rest) = @_;      my($self, $directive, $rest) = @_;
793    
794      print "Have local directive '$directive' rest='$rest'\n";      print STDERR "Have local directive '$directive' rest='$rest'\n";
795  }  }
796    
797  sub parse_feature  sub parse_feature
# Line 727  Line 825 
825    
826          my @values = map { uri_unescape($_) } split(/,/, $value);          my @values = map { uri_unescape($_) } split(/,/, $value);
827    
828            # handle the aliases
829            if ($name eq "Alias") {
830             foreach my $val (@values)
831             {
832               $val = FigGFF::map_dbxref_to_seed_alias($val);
833             }
834            }
835    
836          #          #
837          # This might be a little goofy for the users, but we will use it          # This might be a little goofy for the users, but we will use it
838          # for now:          # for now:
# Line 747  Line 853 
853    
854              $value = \@values;              $value = \@values;
855          }          }
856            else
857            {
858                $value = $values[0];
859            }
860    
861    
862          $athash->{$name} = $value;          $athash->{$name} = $value;
863    
# Line 792  Line 903 
903      my($self, $fh, $first_line) = @_;      my($self, $fh, $first_line) = @_;
904      my($cur, $cur_id);      my($cur, $cur_id);
905    
906      for ($_ = $first_line; $_;  $_ = <$fh>, chomp)      for ($_ = $first_line; defined($_);  $_ = <$fh>, chomp)
907      {      {
908          if (/^>\s*(\S+)/)          if (/^>\s*(\S+)/)
909          {          {
# Line 826  Line 937 
937      my($self, $id, $data) = @_;      my($self, $id, $data) = @_;
938    
939      my $len = length($data);      my $len = length($data);
940      $self->current_file->set_fasta_data($id, $data);      $self->current_file->fasta_data($id, $data);
941  }  }
942    
943  package GFFFeature;  package GFFFeature;
# Line 839  Line 950 
950    
951  map { $GFF_Tags{$_} = 1 } @GFF_Tags;  map { $GFF_Tags{$_} = 1 } @GFF_Tags;
952    
953  __PACKAGE__->mk_accessors(qw(fig seqid source type start end score strand phase attributes genome fig_id),  __PACKAGE__->mk_accessors(qw(fig seqid source type start end score strand phase attributes
954                                 genome fig_id),
955                            @GFF_Tags);                            @GFF_Tags);
956    
957    
# Line 918  Line 1030 
1030  use strict;  use strict;
1031  use base qw(Class::Accessor);  use base qw(Class::Accessor);
1032    
1033  __PACKAGE__->mk_accessors(qw(fig filename features feature_index anno_start anno_end));  __PACKAGE__->mk_accessors(qw(fig filename features feature_index anno_start anno_end taxon_id genome_id));
1034    
1035  #  #
1036  # Package to hold the contents of a GFF file, and to hold the code  # Package to hold the contents of a GFF file, and to hold the code
# Line 934  Line 1046 
1046      my $self = {      my $self = {
1047          fig => $fig,          fig => $fig,
1048          features => [],          features => [],
1049            contigs  => [],
1050          feature_index => {},          feature_index => {},
1051          genome_checksum => {},          genome_checksum => '',
1052          contig_checksum => {},          contig_checksum => {},
1053          features_by_genome => {},          features_by_genome => {},
1054      };      };
# Line 960  Line 1073 
1073    
1074  sub genome_checksum  sub genome_checksum
1075  {  {
     my($self, $genome) = @_;  
   
     return $self->{genome_checksum}->{$genome};  
 }  
   
 sub genome_checksums  
 {  
1076      my($self) = @_;      my($self) = @_;
1077    
1078      return [map { [$_, $self->{genome_checksum}->{$_}] } keys(%{$self->{genome_checksum}})];      return $self->{genome_checksum};
1079  }  }
1080    
1081  sub set_genome_checksum  sub set_genome_checksum
1082  {  {
1083      my($self, $genome, $md5sum) = @_;      my($self, $md5sum) = @_;
1084      $self->{genome_checksum}->{$genome} = $md5sum;      $self->{genome_checksum} = $md5sum;
1085  }  }
1086    
1087  sub set_contig_checksum  sub set_contig_checksum
# Line 984  Line 1090 
1090      $self->{contig_checksum}->{$genome}->{$contig} = $md5sum;      $self->{contig_checksum}->{$genome}->{$contig} = $md5sum;
1091  }  }
1092    
1093  sub set_fasta_data  =head2 fasta_data()
1094    
1095    Get or set the fasta data. Given an id and some data will set the data for that id. Given an id will return the data for that id. Called without arguments will return a reference to a hash of sequences.
1096    
1097    This means that if you give it an id and sequence it will return that sequence. Hmmm.
1098    
1099    =cut
1100    
1101    sub fasta_data
1102  {  {
1103      my($self, $id, $data) = @_;      my($self, $id, $data) = @_;
1104        $id && $data && ($self->{fasta_data}->{$id} = $data);
1105        $id && return $self->{fasta_data}->{$id};
1106        return $self->{fasta_data};
1107    }
1108    
1109    
1110    =head2 contigs()
1111    
1112    Add a contig to the list, or return a reference to an array of contigs
1113    
1114    =cut
1115    
1116    sub contigs
1117    {
1118        my($self, $contig) = @_;
1119        if ($contig && $contig =~ /\w\w\_\d+\.\d+/) {
1120          print STDERR "WARNING: $contig appears to have a version number. We should standardize on timming that somewhere\n";
1121        }
1122        $contig && (push @{$self->{contigs}}, $contig);
1123        return $self->{contigs};
1124    }
1125    
1126    =head2 contig_length()
1127    
1128    Get or set the length of a specfic contig.
1129      my $length=$fob->contig_length($contig, $length);
1130      my $length=$fob->contig_length($contig);
1131    
1132    =cut
1133    
1134    sub contig_length
1135    {
1136       my($self, $contig, $length) = @_;
1137       $length && ($self->{contig_length_cache}->{$contig}=$length);
1138       return $self->{contig_length_cache}->{$contig};
1139    }
1140    
1141    =head1 Information about the source of the sequence.
1142    
1143    These are things that we have parsed out the GFF3 file, or want to add into the GFF3 file. We can use these methods to get or set them as required. In general, if a value is supplied that will be used as the new value.
1144    
1145    =cut
1146    
1147      $self->{fasta_data}->{$id} = $data;  =head2 genome_id()
1148    
1149    Get or set a genome id for this file.
1150    
1151    =cut
1152    
1153    sub genome_id
1154    {
1155        my($self, $genomeid) = @_;
1156        $genomeid && ($self->{genome_id}=$genomeid);
1157        return $self->{genome_id};
1158  }  }
1159    
1160    =head2 genome_name()
1161    
1162    Get or set a genome id for this file.
1163    
1164    =cut
1165    
1166    sub genome_name
1167    {
1168        my($self, $genomename) = @_;
1169        $genomename && ($self->{genome_name}=$genomename);
1170        return $self->{genome_name};
1171    }
1172    
1173    =head2 project()
1174    
1175    Get or set the project.
1176    
1177    =cut
1178    
1179    sub project
1180    {
1181         my ($self, $pro) = @_;
1182         $pro && ($self->{project}=$pro);
1183         return $self->{project};
1184    }
1185    
1186    =head2 taxonomy()
1187    
1188    Get or set the taxonomy
1189    
1190    =cut
1191    
1192    sub taxonomy
1193    {
1194        my($self, $tax) = @_;
1195        $tax && ($self->{taxonomy}=$tax);
1196        return $self->{taxonomy};
1197    }
1198    
1199    
1200    
1201    
1202    
1203  1;  1;

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.21

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3