package Observation; require Exporter; @EXPORT_OK = qw(get_objects); use strict; use warnings; use HTML; 1; # $Id: Observation.pm,v 1.10 2007/06/20 20:55:36 arodri7 Exp $ =head1 NAME Observation -- A presentation layer for observations in SEED. =head1 DESCRIPTION The SEED environment contains various sources of information for sequence features. The purpose of this library is to provide a single interface to this data. The data can be used to display information for a given sequence feature (protein or other, but primarily information is computed for proteins). Example: use FIG; use Observation; my $fig = new FIG; my $fid = "fig|83333.1.peg.3"; my $observations = Observation::get_objects($fid); foreach my $observation (@$observations) { print "ID: " . $fid . "\n"; print "Start: " . $observation->start() . "\n"; ... } B print "$Observation->acc\n" prints the Accession number if present for the Observation =cut =head1 BACKGROUND =head2 Data incorporated in the Observations As the goal of this library is to provide an integrated view, we combine diverse sources of evidence. =head3 SEED core evidence The core SEED data structures provided by FIG.pm. These are Similarities, BBHs and PCHs. =head3 Attribute based Evidence We use the SEED attribute infrastructure to store information computed by a variety of computational procedures. These are e.g. InterPro hits via InterProScan (ipr), NCBI Conserved Domain Database Hits via PSSM(cdd), PFAM hits via HMM(pfam), SignalP results(signalp), and various others. =head1 METHODS The public methods this package provides are listed below: =head3 acc() A valid accession or remote ID (in the style of a db_xref) or a valid local ID (FID) in case this is supported. =cut sub acc { my ($self) = @_; return $self->{acc}; } =head3 description() The description of the hit. Taken from the data or from the our Ontology database for some cases e.g. IPR or PFAM. B Either remoteid or description is required. =cut sub description { my ($self) = @_; return $self->{description}; } =head3 class() The class of evidence (required). This is usually simply the name of the tool or the name of the SEED data structure. B the connection of class and display_method and URL. Current valid classes are: =over 9 =item IDENTICAL (seq) =item SIM (seq) =item BBH (seq) =item PCH (fc) =item FIGFAM (seq) =item IPR (dom) =item CDD (dom) =item PFAM (dom) =item SIGNALP (dom) =item CELLO(loc) =item TMHMM (loc) =item HMMTOP (loc) =back =cut sub class { my ($self) = @_; return $self->{class}; } =head3 type() The type of evidence (required). Where type is one of the following: =over 8 =item seq=Sequence similarity =item dom=domain based match =item loc=Localization of the feature =item fc=Functional coupling. =back =cut sub type { my ($self) = @_; return $self->{acc}; } =head3 start() Start of hit in query sequence. =cut sub start { my ($self) = @_; return $self->{start}; } =head3 end() End of the hit in query sequence. =cut sub stop { my ($self) = @_; return $self->{stop}; } =head3 evalue() E-value or P-Value if present. =cut sub evalue { my ($self) = @_; return $self->{evalue}; } =head3 score() Score if present. B Either score or eval are required. =cut sub score { my ($self) = @_; return $self->{score}; } =head3 display_method() If available use the function specified here to display the "raw" observation. In the case of a BLAST alignment of fid1 and fid2 a cgi script will be called to display the results of running the command "bl2seq fid1 fid2". B that URL linked to in display_method() is an external component and needs to added to the code for every class of evidence. =cut sub display { die "Abstract Method Called\n"; } =head3 rank() Returns an integer from 1 - 10 indicating the importance of this observations. Currently always returns 1. =cut sub rank { my ($self) = @_; # return $self->{rank}; return 1; } =head3 supports_annotation() Does a this observation support the annotation of its feature? Returns =over 3 =item 10, if feature annotation is identical to $self->description =item 1, Feature annotation is similar to $self->annotation; this is computed using FIG::SameFunc() =item undef =back =cut sub supports_annotation { my ($self) = @_; # no code here so far return $self->{supports_annotation}; } =head3 url() URL describing the subject. In case of a BLAST hit against a sequence, this URL will lead to a page displaying the sequence record for the sequence. In case of an HMM hit, the URL will be to the URL description. =cut sub url { my ($self) = @_; my $url = get_url($self->type, $self->acc); return $url; } =head3 get_objects() This is the B method of this Package. It will probably have to: - get all sims for the feature - get all bbhs for the feature - copy information from sim to bbh (bbh have no match location etc) - get pchs (difficult) - get attributes (there is code for this that in get_attribute_based_observations - get_attributes_based_observations returns an array of arrays of hashes like this" my $dataset [ [ { name => 'acc', value => '1234' }, { name => 'from', value => '4' }, { name => 'to', value => '400' }, .... ], [ { name => 'acc', value => '456' }, { name => 'from', value => '1' }, { name => 'to', value => '100' }, .... ], ... ]; return $datasets; } It will invoke the required calls to the SEED API to retrieve the information required. =cut sub get_objects { my ($self,$fid,$classes) = @_; my $objects = []; my @matched_datasets=(); # call function that fetches attribute based observations # returns an array of arrays of hashes if(scalar(@$classes) < 1){ get_attribute_based_observations($fid,\@matched_datasets); get_sims_observations($fid,\@matched_datasets); get_identical_proteins($fid,\@matched_datasets); get_functional_coupling($fid,\@matched_datasets); } else{ #IPR,CDD,CELLO,PFAM,SIGNALP - attribute based my %domain_classes; my $identical_flag=0; my $pch_flag=0; my $sims_flag=0; foreach my $class (@$classes){ if($class =~ /(IPR|CDD|PFAM)/){ $domain_classes{$class} = 1; } elsif ($class eq "IDENTICAL") { $identical_flag = 1; } elsif ($class eq "PCH") { $pch_flag = 1; } elsif ($class eq "SIM") { $sims_flag = 1; } } if ($identical_flag ==1) { get_identical_proteins($fid,\@matched_datasets); } if ( (defined($domain_classes{IPR})) || (defined($domain_classes{CDD})) || (defined($domain_classes{PFAM})) ) { get_attribute_based_domain_observations($fid,\%domain_classes,\@matched_datasets); } if ($pch_flag == 1) { get_functional_coupling($fid,\@matched_datasets); } if ($sims_flag == 1) { get_sims_observations($fid,\@matched_datasets); } #add CELLO and SignalP later } foreach my $dataset (@matched_datasets) { my $object; if($dataset->{'type'} eq "dom"){ $object = Observation::Domain->new($dataset); } if($dataset->{'class'} eq "PCH"){ $object = Observation::FC->new($dataset); } if ($dataset->{'class'} eq "IDENTICAL"){ $object = Observation::Identical->new($dataset); } if ($dataset->{'class'} eq "SIM"){ $object = Observation::Sims->new($dataset); } push (@$objects, $object); } return $objects; } =head1 Internal Methods These methods are not meant to be used outside of this package. B =cut =head3 get_url (internal) get_url() return a valid URL or undef for any observation. URLs are constructed by looking at the Accession acc() and name() Info from both attributes is combined with a table of base URLs stored in this function. =cut sub get_url { my ($self) = @_; my $url=''; # a hash with a URL for each observation; identified by name() #my $URL => { 'PFAM' => "http://www.sanger.ac.uk/cgi-bin/Pfam/getacc?" ,\ # 'IPR' => "http://www.ebi.ac.uk/interpro/DisplayIproEntry?ac=" ,\ # 'CDD' => "http://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid=",\ # 'PIR' => "http://www.ncbi.nlm.nih.gov/Structure/cdd/cddsrv.cgi?uid=",\ # 'FIGFAM' => '',\ # 'sim'=> "http://www.theseed.org/linkin.cgi?id=",\ # 'bbh'=> "http://www.theseed.org/linkin.cgi?id=" #}; # if (defined $URL{$self->name}) { # $url = $URL{$self->name}.$self->acc; # return $url; # } # else return undef; } =head3 get_display_method (internal) get_display_method() return a valid URL or undef for any observation. URLs are constructed by looking at the Accession acc() and name() and Info from both attributes is combined with a table of base URLs stored in this function. =cut sub get_display_method { my ($self) = @_; # a hash with a URL for each observation; identified by name() #my $URL => { 'sim'=> "http://www.theseed.org/featalign.cgi?id1=",\ # 'bbh'=> "http://www.theseed.org/featalign.cgi?id1=" # }; #if (defined $URL{$self->name}) { # $url = $URL{$self->name}.$self->feature_id."&id2=".$self->acc; # return $url; # } # else return undef; } sub get_attribute_based_domain_observations{ # we read a FIG ID and a reference to an array (of arrays of hashes, see above) my ($fid,$domain_classes,$datasets_ref) = (@_); my $fig = new FIG; foreach my $attr_ref ($fig->get_attributes($fid)) { my $key = @$attr_ref[1]; my @parts = split("::",$key); my $class = $parts[0]; if($domain_classes->{$parts[0]}){ my $val = @$attr_ref[2]; if($val =~/^(\d+\.\d+|0\.0);(\d+)-(\d+)/){ my $raw_evalue = $1; my $from = $2; my $to = $3; my $evalue; if($raw_evalue =~/(\d+)\.(\d+)/){ my $part2 = 1000 - $1; my $part1 = $2/100; $evalue = $part1."e-".$part2; } else{ $evalue = "0.0"; } my $dataset = {'class' => $class, 'acc' => $key, 'type' => "dom" , 'evalue' => $evalue, 'start' => $from, 'stop' => $to }; push (@{$datasets_ref} ,$dataset); } } } } =head3 get_attribute_based_evidence (internal) This method retrieves evidence from the attribute server =cut sub get_attribute_based_observations{ # we read a FIG ID and a reference to an array (of arrays of hashes, see above) my ($fid,$datasets_ref) = (@_); my $_myfig = new FIG; foreach my $attr_ref ($_myfig->get_attributes($fid)) { # convert the ref into a string for easier handling my ($string) = "@$attr_ref"; # print "S:$string\n"; my ($key,$val) = ( $string =~ /\S+\s(\S+)\s(\S+)/); # THIS SHOULD BE DONE ANOTHER WAY FM->TD # we need to do the right thing for each type, ie no evalue for CELLO and no coordinates, but a score, etc # as fas as possible this should be configured so that the type of observation and the regexp are # stored somewhere for easy expansion # if (($key =~ /PFAM::/) || ( $key =~ /IPR::/) || ( $key =~ /CDD::/) ) { # some keys are composite CDD::1233244 or PFAM:PF1233 if ( $key =~ /::/ ) { my ($firstkey,$restkey) = ( $key =~ /([a-zA-Z0-9]+)::(.*)/); $val=$restkey.";".$val; $key=$firstkey; } my ($acc,$raw_evalue, $from,$to) = ($val =~ /(\S+);(\S+);(\d+)-(\d+)/ ); my $evalue= 255; if (defined $raw_evalue) { # some of the tool do not give us an evalue my ($k,$expo) = ( $raw_evalue =~ /(\d+).(\d+)/); my ($new_k, $new_exp); # # THIS DOES NOT WORK PROPERLY # if($raw_evalue =~/(\d+).(\d+)/){ # $new_exp = (1000+$expo); # $new_k = $k / 100; } $evalue = "0.01"#new_k."e-".$new_exp; } # unroll it all into an array of hashes # this needs to be done differently for different types of observations my $dataset = [ { name => 'class', value => $key }, { name => 'acc' , value => $acc}, { name => 'type', value => "dom"} , # this clearly needs to be done properly FM->TD { name => 'evalue', value => $evalue }, { name => 'start', value => $from}, { name => 'stop' , value => $to} ]; push (@{$datasets_ref} ,$dataset); } } } =head3 get_sims_observations() (internal) This methods retrieves sims fills the internal data structures. =cut sub get_sims_observations{ my ($fid,$datasets_ref) = (@_); my $fig = new FIG; my @sims= $fig->nsims($fid,100,1e-20,"fig"); my ($dataset); foreach my $sim (@sims){ my $hit = $sim->[1]; my $evalue = $sim->[10]; my $from = $sim->[8]; my $to = $sim->[9]; $dataset = {'class' => 'SIM', 'acc' => $hit, 'type' => 'seq', 'evalue' => $evalue, 'start' => $from, 'stop' => $to }; push (@{$datasets_ref} ,$dataset); } } =head3 get_identical_proteins() (internal) This methods retrieves sims fills the internal data structures. =cut sub get_identical_proteins{ my ($fid,$datasets_ref) = (@_); my $fig = new FIG; my @funcs = (); my @maps_to = grep { $_ ne $fid and $_ !~ /^xxx/ } map { $_->[0] } $fig->mapped_prot_ids($fid); foreach my $id (@maps_to) { my ($tmp, $who); if (($id ne $fid) && ($tmp = $fig->function_of($id))) { if ($id =~ /^fig\|/) { $who = "FIG" } elsif ($id =~ /^gi\|/) { $who = "NCBI" } elsif ($id =~ /^^[NXYZA]P_/) { $who = "RefSeq" } elsif ($id =~ /^sp\|/) { $who = "SwissProt" } elsif ($id =~ /^uni\|/) { $who = "UniProt" } elsif ($id =~ /^tigr\|/) { $who = "TIGR" } elsif ($id =~ /^pir\|/) { $who = "PIR" } elsif ($id =~ /^kegg\|/) { $who = "KEGG" } elsif ($id =~ /^tr\|/) { $who = "TrEMBL" } elsif ($id =~ /^eric\|/) { $who = "ASAP" } push(@funcs, [$id,$who,$tmp]); } } my ($dataset); foreach my $row (@funcs){ my $id = $row->[0]; my $organism = $fig->org_of($fid); my $who = $row->[1]; my $assignment = $row->[2]; my $dataset = {'class' => 'IDENTICAL', 'id' => $id, 'organism' => $organism, 'type' => 'seq', 'database' => $who, 'function' => $assignment }; push (@{$datasets_ref} ,$dataset); } } =head3 get_functional_coupling() (internal) This methods retrieves the functional coupling of a protein given a peg ID =cut sub get_functional_coupling{ my ($fid,$datasets_ref) = (@_); my $fig = new FIG; my @funcs = (); # initialize some variables my($sc,$neigh); # set default parameters for coupling and evidence my ($bound,$sim_cutoff,$coupling_cutoff) = (5000, 1.0e-10, 4); # get the fc data my @fc_data = $fig->coupling_and_evidence($fid,$bound,$sim_cutoff,$coupling_cutoff,1); # retrieve data my @rows = map { ($sc,$neigh) = @$_; [$sc,$neigh,scalar $fig->function_of($neigh)] } @fc_data; my ($dataset); foreach my $row (@rows){ my $id = $row->[1]; my $score = $row->[0]; my $description = $row->[2]; my $dataset = {'class' => 'PCH', 'score' => $score, 'id' => $id, 'type' => 'fc', 'function' => $description }; push (@{$datasets_ref} ,$dataset); } } =head3 get_sims_and_bbhs() (internal) This methods retrieves sims and also BBHs and fills the internal data structures. =cut # sub get_sims_and_bbhs{ # # blast m8 output format # # id1, id2, %ident, align len, mismatches, gaps, q.start, q.stop, s. start, s.stop, eval, bit # my $Sims=(); # @sims_src = $fig->sims($fid,80,500,"fig",0); # print "found $#sims_src SIMs\n"; # foreach $sims (@sims_src) { # my ($sims_string) = "@$sims"; # # print "$sims_string\n"; # my ($rfid,$start,$stop,$eval) = ( $sims_string =~ /\S+\s+(\S+)\s+\S+\s\S+\s+(\S+)\s+(\S+)\s+ # \S+\s+\S+\s+\S+\s+\S+\s+(\S+)+.*/); # # print "ID: $rfid, E:$eval, Start:$start stop:$stop\n"; # $Sims{$rfid}{'eval'}=$eval; # $Sims{$rfid}{'start'}=$start; # $Sims{$rfid}{'stop'}=$stop; # print "$rfid $Sims{$rfid}{'eval'}\n"; # } # # BBHs # my $BBHs=(); # @bbhs_src = $fig->bbhs($fid,1.0e-10); # print "found $#bbhs_src BBHs\n"; # foreach $bbh (@bbhs_src) { # #print "@$bbh\n"; # my ($bbh_string) = "@$bbh"; # my ($rfid,$eval,$score) = ( $bbh_string =~ /(\S+)\s(\S+)\s(\S+)/); # #print "ID: $rfid, E:$eval, S:$score\n"; # $BBHs{$rfid}{'eval'}=$eval; # $BBHs{$rfid}{'score'}=$score; # #print "$rfid $BBHs{$rfid}{'eval'}\n"; # } # } =head3 new (internal) Instantiate a new object. =cut sub new { my ($class,$dataset) = @_; #$self = { acc => '', # description => '', # class => '', # type => '', # start => '', # stop => '', # evalue => '', # score => '', # display_method => '', # feature_id => '', # rank => '', # supports_annotation => '', # id => '', # organism => '', # who => '' # }; my $self = { class => $dataset->{'class'}, type => $dataset->{'type'} }; bless($self,$class); return $self; } =head3 feature_id (internal) =cut sub feature_id { my ($self) = @_; return $self->{feature_id}; } =head3 id (internal) Returns the ID of the identical sequence =cut sub id { my ($self) = @_; return $self->{id}; } =head3 organism (internal) Returns the organism of the identical sequence =cut sub organism { my ($self) = @_; return $self->{organism}; } =head3 function (internal) Returns the function of the identical sequence =cut sub function { my ($self) = @_; return $self->{function}; } =head3 database (internal) Returns the database of the identical sequence =cut sub database { my ($self) = @_; return $self->{database}; } ############################################################ ############################################################ package Observation::Identical; use base qw(Observation); sub new { my ($class,$dataset) = @_; my $self = $class->SUPER::new($dataset); $self->{id} = $dataset->{'id'}; $self->{organism} = $dataset->{'organism'}; $self->{function} = $dataset->{'function'}; $self->{database} = $dataset->{'database'}; bless($self,$class); return $self; } =head3 display() If available use the function specified here to display the "raw" observation. This code will display a table for the identical protein B that URL linked to in display_method() is an external component and needs to added to the code for every class of evi dence. =cut sub display{ my ($self, $cgi, $dataset) = @_; my $all_domains = []; my $count_identical = 0; my $content; foreach my $thing (@$dataset) { next if ($thing->class ne "IDENTICAL"); my $single_domain = []; push(@$single_domain,$thing->database); my $id = $thing->id; $count_identical++; push(@$single_domain,&HTML::set_prot_links($cgi,$id)); push(@$single_domain,$thing->organism); #push(@$single_domain,$thing->type); push(@$single_domain,$thing->function); push(@$all_domains,$single_domain); } if ($count_identical >0){ $content = $all_domains; } else{ $content = "

This PEG does not have any essentially identical proteins

"; } return ($content); } 1; ######################################### ######################################### package Observation::FC; 1; use base qw(Observation); sub new { my ($class,$dataset) = @_; my $self = $class->SUPER::new($dataset); $self->{score} = $dataset->{'score'}; $self->{id} = $dataset->{'id'}; $self->{function} = $dataset->{'function'}; bless($self,$class); return $self; } =head3 display() If available use the function specified here to display the "raw" observation. This code will display a table for the identical protein B that URL linked to in display_method() is an external component and needs to added to the code for every class of evi dence. =cut sub display { my ($self,$cgi,$dataset, $fid) = @_; my $functional_data = []; my $count = 0; my $content; foreach my $thing (@$dataset) { my $single_domain = []; next if ($thing->class ne "PCH"); $count++; # construct the score link my $score = $thing->score; my $toid = $thing->id; my $link = $cgi->url(-relative => 1) . "?user=master&request=show_coupling_evidence&prot=$fid&to=$toid&SPROUT="; my $sc_link = "$score"; push(@$single_domain,$sc_link); push(@$single_domain,$thing->id); push(@$single_domain,$thing->function); push(@$functional_data,$single_domain); } if ($count >0){ $content = $functional_data; } else { $content = "

This PEG does not have any functional coupling

"; } return ($content); } ######################################### ######################################### package Observation::Domain; use base qw(Observation); sub new { my ($class,$dataset) = @_; my $self = $class->SUPER::new($dataset); $self->{evalue} = $dataset->{'evalue'}; $self->{acc} = $dataset->{'acc'}; $self->{start} = $dataset->{'start'}; $self->{stop} = $dataset->{'stop'}; bless($self,$class); return $self; } sub display { my ($thing,$gd) = @_; my $lines = []; my $line_config = { 'title' => $thing->acc, 'short_title' => $thing->type, 'basepair_offset' => '1' }; my $color = "4"; my $line_data = []; my $links_list = []; my $descriptions = []; my $description_function; $description_function = {"title" => $thing->class, "value" => $thing->acc}; push(@$descriptions,$description_function); my $score; $score = {"title" => "score", "value" => $thing->evalue}; push(@$descriptions,$score); my $link_id; if ($thing->acc =~/CDD::(\d+)/){ $link_id = $1; } my $link; $link = {"link_title" => $thing->acc, "link" => "http://0-www.ncbi.nlm.nih.gov.library.vu.edu.au:80/Structure/cdd/cddsrv.cgi?uid=$link_id"}; push(@$links_list,$link); my $element_hash = { "title" => $thing->type, "start" => $thing->start, "end" => $thing->stop, "color"=> $color, "zlayer" => '2', "links_list" => $links_list, "description" => $descriptions}; push(@$line_data,$element_hash); $gd->add_line($line_data, $line_config); return $gd; } ######################################### ######################################### package Observation::Sims; use base qw(Observation); sub new { my ($class,$dataset) = @_; my $self = $class->SUPER::new($dataset); $self->{acc} = $dataset->{'acc'}; $self->{evalue} = $dataset->{'evalue'}; $self->{start} = $dataset->{'start'}; $self->{stop} = $dataset->{'stop'}; bless($self,$class); return $self; } =head3 display() If available use the function specified here to display the "raw" observation. This code will display a table for the similarities protein B that URL linked to in display_method() is an external component and needs to added to the code for every class of evidence. =cut sub display { my ($self,$cgi,$dataset) = @_; my $data = []; my $count = 0; my $content; foreach my $thing (@$dataset) { my $single_domain = []; next if ($thing->class ne "SIM"); $count++; push(@$single_domain,&HTML::set_prot_links($cgi,$thing->acc)); push(@$single_domain,$thing->start); push(@$single_domain,$thing->stop); push(@$single_domain,$thing->evalue); push(@$data,$single_domain); } if ($count >0){ $content = $data; } else { $content = "

This PEG does not have any similarities

"; } return ($content); }