[Bio] / FigKernelPackages / FigGFF.pm Repository:
ViewVC logotype

Diff of /FigKernelPackages/FigGFF.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Thu Mar 17 20:02:40 2005 UTC revision 1.4, Wed Mar 30 20:49:24 2005 UTC
# Line 6  Line 6 
6  # A GFFWriter handles the generation of GFF files from SEED data structures.  # A GFFWriter handles the generation of GFF files from SEED data structures.
7  #  #
8    
9    package FigGFF;
10    
11    use strict;
12    
13    use base qw(Exporter);
14    use vars qw(@EXPORT);
15    @EXPORT = qw(map_seed_alias_to_dbxref);
16    
17    #
18    # General GFF-related routines.
19    #
20    
21    
22    #
23    # Alias translation.
24    #
25    # These routines map between the SEED aliases and the standard
26    # dbxref names as defined here:
27    #
28    # ftp://ftp.geneontology.org/pub/go/doc/GO.xrf_abbs
29    #
30    # In particular:
31    #
32    # abbreviation: NCBI_gi
33    # database: NCBI databases.
34    # object: Identifier.
35    # example_id: NCBI_gi:10727410
36    # generic_url: http://www.ncbi.nlm.nih.gov/
37    # url_syntax:
38    # url_example: http://www.ncbi.nlm.nih.gov:80/entrez/query.fcgi?cmd=Retrieve&db=nucleotide&list_uids=10727410&dopt=GenBank
39    #
40    # abbreviation: NCBI_NP
41    # database: NCBI RefSeq.
42    # object: Protein identifier.
43    # example_id: NCBI_NP:123456
44    # generic_url: http://www.ncbi.nlm.nih.gov/
45    # url_syntax:
46    #
47    # abbreviation: KEGG
48    # database: Kyoto Encyclopedia of Genes and Genomes.
49    # generic_url: http://www.genome.ad.jp/kegg/
50    
51    sub map_seed_alias_to_dbxref
52    {
53        my($alias) = @_;
54    
55        $_ = $alias;
56        if (/^NP_(\d+.*)/)
57        {
58            return "NCBI_NP:$1";
59        }
60        elsif (/^gi\|(\d+)/)
61        {
62            return "NCBI_gi:$1";
63        }
64        elsif (/^kegg\|(\S+):(\S+)/)
65        {
66            return "KEGG:$1 $2";
67        }
68        elsif (/^uni\|(\S+)/)
69        {
70            return "UniProt:$1";
71        }
72        elsif (/^sp\|(\S+)/)
73        {
74            return "Swiss-Prot:$1";
75        }
76    
77        return undef;
78    }
79    
80    #
81    # And map back again.
82    #
83    
84    sub map_dbxref_to_seed_alias
85    {
86        my($dbxref) = @_;
87    
88        my($type, $ref) = split(/:/, $dbxref, 2);
89    
90        if ($type eq "NCBI_NP")
91        {
92            return "NP_$ref";
93        }
94        elsif ($type eq "NCBI_gi")
95        {
96            return "gi|$ref";
97        }
98        elsif ($type eq "KEGG")
99        {
100            $ref =~ s/ /:/;
101            return "kegg|$ref";
102        }
103        elsif ($type eq "UniProt")
104        {
105            return "uni|$ref";
106        }
107        elsif ($type eq "Swiss-Prot")
108        {
109            return "sp|$ref";
110        }
111    
112        return undef;
113    }
114    
115  package GFFWriter;  package GFFWriter;
116    
117  use strict;  use strict;
118  use FIG;  use FIG;
119    use FigGFF;
120    
121  use Carp;  use Carp;
122  use URI::Escape;  use URI::Escape;
# Line 52  Line 158 
158    
159  sub gff3_for_feature  sub gff3_for_feature
160  {  {
161      my($self, $fid, $user, $user_note) = @_;      my($self, $fid, $user, $user_note, $in_aliases, $in_loc) = @_;
162    
163      #      #
164      # Options we need to figure out somehow.      # Options we need to figure out somehow.
# Line 77  Line 183 
183      #      #
184      # Do this first to make sure that we really have a feature.      # Do this first to make sure that we really have a feature.
185      #      #
186      my @location = $fig->feature_location($fid);      my @location = ref($in_loc) ? @$in_loc : $fig->feature_location($fid);
187      if (@location == 0 or !defined($location[0]))      if (@location == 0 or !defined($location[0]))
188      {      {
189          warn "No location found for feature $fid\n";          warn "No location found for feature $fid\n";
# Line 100  Line 206 
206      # all the aliases we are going to use      # all the aliases we are going to use
207      #      #
208      my @alias;      my @alias;
209        my @xref;
210    
211      if ($options->{with_assignments})      if ($options->{with_assignments})
212      {      {
213          my $func = $fig->function_of($fid, $user);          my $func = $fig->function_of($fid, $user);
214          if ($func)          if ($func)
215          {          {
216              push @$note, ("Note=\"" . uri_escape($func) . '"');              push @$note, ("Note=" . uri_escape($func));
217          }          }
218      }      }
219    
220      if ($options->{with_aliases})      if ($options->{with_aliases})
221      {      {
222      # now find aliases      # now find aliases
223          foreach my $alias ($fig->feature_aliases($fid))          my @feat_aliases = ref($in_aliases) ? @$in_aliases : $fig->feature_aliases($fid);
224          {          foreach my $alias (@feat_aliases)
             if ($alias =~ /^NP/)  
             {  
                 push @$note, "Dbxref=\"NCBI_genpept:$alias\"";  
             }  
             elsif ($alias =~ /gi\|/)  
             {  
                 $alias =~ s/^gi\|//;  
                 push @$note, "Dbxref=\"NCBI_gi:$alias\"";  
             }  
             elsif ($alias =~ /^kegg/i)  
225              {              {
226                  $alias =~ s/kegg\|/KEGG:/i;              my $mapped = FigGFF::map_seed_alias_to_dbxref($alias);
227                  $alias =~ s/^(.*):/$1+/;              if ($mapped)
                 push @$note, "Dbxref=\"$alias\"";  
             }  
             elsif ($alias =~ /^uni/)  
228              {              {
229                  $alias =~ s/uni\|/UniProt:/;                  push(@xref, $mapped);
                 # $note = check_go($note, $alias) if ($go);  
                 push @$note, "Dbxref=\"$alias\"";  
             }  
             elsif ($alias =~ /^sp\|/)  
             {  
                 $alias =~ s/sp\|/Swiss-Prot:/;  
                 push @$note, "Dbxref=\"$alias\"";  
230              }              }
231              else              else
232              {              {
233                  # don't know what it is so keep it as an alias                  push(@alias, $alias);
                 $alias = uri_escape($alias); # just in case!  
                 push @alias, $alias;  
234              }              }
235          }          }
236      }      }
# Line 153  Line 238 
238      # now just join all the aliases and put them into @$note so we can add it to the array      # now just join all the aliases and put them into @$note so we can add it to the array
239      if (@alias)      if (@alias)
240      {      {
241          push @$note, "Alias=\"". join (",", @alias) . '"';          push @$note, "Alias=". join (",", map { uri_escape($_) } @alias);
242      }      }
243    
244      #      #
# Line 166  Line 251 
251      }      }
252    
253      # the LAST thing I am going to add as a note is the FIG id so that I can grep it out easily      # the LAST thing I am going to add as a note is the FIG id so that I can grep it out easily
254      push @$note, "Dbxref=\"SEED:$fid\"";      #
255        # for now, make SEED xref the first in the list so we can search for DBxref=SEEd
256        #
257    
258        unshift(@xref, "SEED:$fid");
259    
260        push(@$note, "Dbxref=" . join(",", map { uri_escape($_) } @xref));
261    
262      # finally join all the notes into a long string that can be added as column 9      # finally join all the notes into a long string that can be added as column 9
263      my $allnotes;      my $allnotes;
# Line 280  Line 371 
371                  chomp($addseq);                  chomp($addseq);
372                  $fastasequences .= ">$protein_id\n$addseq\n";                  $fastasequences .= ">$protein_id\n$addseq\n";
373    
374                  my $addseq = uc($fig->dna_seq($genome, $fig->feature_location($fid)));                  my $addseq = uc($fig->dna_seq($genome, @location));
375                  $addseq =~ s/(.{$linelength})/$1\n/g; chomp($addseq);                  $addseq =~ s/(.{$linelength})/$1\n/g; chomp($addseq);
376    
377                  $fastasequences .= ">$cds_id\n$addseq\n";                  $fastasequences .= ">$cds_id\n$addseq\n";
# Line 302  Line 393 
393              my ($id, $type)=("rna.$geneid", "tRNA");              my ($id, $type)=("rna.$geneid", "tRNA");
394              if ($outputfasta)              if ($outputfasta)
395              {              {
396                  my $addseq = $fig->dna_seq($genome, $fig->feature_location($fid));                  my $addseq = $fig->dna_seq($genome, @location);
397                  $addseq =~ s/(.{$linelength})/$1\n/g; chomp($addseq);                  $addseq =~ s/(.{$linelength})/$1\n/g; chomp($addseq);
398                  $fastasequences .= ">$id\n$addseq\n";                  $fastasequences .= ">$id\n$addseq\n";
399              }              }
# Line 634  Line 725 
725      {      {
726          my($name, $value) = split(/=/, $attr);          my($name, $value) = split(/=/, $attr);
727    
728            my @values = map { uri_unescape($_) } split(/,/, $value);
729    
730          #          #
731          # Handle the GFF3-defined attributes          # This might be a little goofy for the users, but we will use it
732            # for now:
733            #
734            # if there is more than one value, the value is a ref to a list
735            # of the values.
736            #
737            # Otherwise, the value is a scalar.
738          #          #
   
         my @values = split(/,/, $value);  
739    
740          if (@values > 1)          if (@values > 1)
741          {          {
742              my $vlist = [];              #
743              for my $value (@values)              # Yes, you can do this ... I had to look it up :-).
744              {              #
745                  $value = uri_unescape($value);              # It's in 'man perlfaq3'.
746                  push(@$vlist, $value);              #
747              }  
748              $value = $vlist;              $value = \@values;
749          }          }
750    
751          $athash->{$name} = $value;          $athash->{$name} = $value;
752    
753            #
754            # Handle the GFF3-defined attributes.
755            #
756            # These show up as Class::Accessor's on the feature object.
757            #
758    
759          if ($GFFFeature::GFF_Tags{$name})          if ($GFFFeature::GFF_Tags{$name})
760          {          {
761              $feature->set($name, $value);              $feature->set($name, $value);
762    
763                if ($name eq "Dbxref")
764                {
765                    #
766                    # If we have a SEED:figid DBxref, set the genome and fig_id attributes.
767                    #
768    
769                    my @seed_xref = grep /^"?SEED:/, @values;
770                    if (@seed_xref and $seed_xref[0] =~ /^"?SEED:(fig\|(\d+\.\d+)\..*)/)
771                    {
772                        $feature->genome($2);
773                        $feature->fig_id($1);
774          }          }
775    
776      }      }
777            }
778        }
779      $feature->attributes($athash);      $feature->attributes($athash);
780    
781    
782      $self->current_file->add_feature($feature);      $self->current_file->add_feature($feature);
783  }  }
784    
# Line 721  Line 839 
839    
840  map { $GFF_Tags{$_} = 1 } @GFF_Tags;  map { $GFF_Tags{$_} = 1 } @GFF_Tags;
841    
842  __PACKAGE__->mk_accessors(qw(fig seqid source type start end score strand phase attributes),  __PACKAGE__->mk_accessors(qw(fig seqid source type start end score strand phase attributes genome fig_id),
843                            @GFF_Tags);                            @GFF_Tags);
844    
845    
# Line 736  Line 854 
854      return bless($self, $class);      return bless($self, $class);
855  }  }
856    
857    sub find_local_feature
858    {
859        my($self, $local_genome) = @_;
860        my $db = $self->fig->db_handle;
861    
862        # For debugging.
863        undef $local_genome;
864        if ($local_genome)
865        {
866            #
867            # It's a precise match. We need to determine if we have this
868            # particular feature in this SEED (it is possible for one to
869            # have exported an annotation for a feature that was added
870            # to a genome after its initial release).
871            #
872            # We do this by searching for a local feature with the same contig,
873            # start, and stop as this feature.
874            #
875    
876            my $qry = qq(SELECT id
877                         FROM features
878                         WHERE (genome = ? AND
879                                contig = ? AND
880                                minloc = ? AND
881                                maxloc = ?));
882            my $res = $db->SQL($qry, undef, $local_genome, $self->seqid,
883                               $self->start, $self->end);
884    
885            return map { $_->[0] } @$res;
886        }
887    
888        #
889        # Otherwise, we need to try a set of heuristics to match
890        # this id.
891        #
892    
893        #
894        # Try matching aliases first.
895        #
896    
897        my @aliases = grep { !/^\"?SEED/ } ref($self->Dbxref) ? @{$self->Dbxref} : ($self->Dbxref);
898    
899        my @maliases = map { FigGFF::map_dbxref_to_seed_alias($_) } @aliases;
900    
901        print "Found aliases @aliases\n";
902        print "Found mapped aliases @maliases\n";
903    
904        for my $malias (@maliases)
905        {
906            my $fid = $self->fig->by_alias($malias);
907            if ($fid)
908            {
909                print "Mapped $malias to $fid\n";
910            }
911        }
912    
913    }
914    
915    
916  package GFFFile;  package GFFFile;
917    
918  use strict;  use strict;
# Line 758  Line 935 
935          fig => $fig,          fig => $fig,
936          features => [],          features => [],
937          feature_index => {},          feature_index => {},
938            genome_checksum => {},
939            contig_checksum => {},
940            features_by_genome => {},
941      };      };
942      return bless($self, $class);      return bless($self, $class);
943  }  }
# Line 768  Line 948 
948    
949      push(@{$self->features}, $feature);      push(@{$self->features}, $feature);
950      $self->feature_index->{$feature->ID} = $feature;      $self->feature_index->{$feature->ID} = $feature;
951        push(@{$self->{features_by_genome}->{$feature->genome}}, $feature);
952    }
953    
954    sub features_for_genome
955    {
956        my($self, $genome) = @_;
957    
958        return $self->{features_by_genome}->{$genome};
959    }
960    
961    sub genome_checksum
962    {
963        my($self, $genome) = @_;
964    
965        return $self->{genome_checksum}->{$genome};
966    }
967    
968    sub genome_checksums
969    {
970        my($self) = @_;
971    
972        return [map { [$_, $self->{genome_checksum}->{$_}] } keys(%{$self->{genome_checksum}})];
973  }  }
974    
975  sub set_genome_checksum  sub set_genome_checksum

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.4

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3