[Bio] / FigWebServices / proteinfamilies.cgi Repository:
ViewVC logotype

View of /FigWebServices/proteinfamilies.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.19 - (download) (annotate)
Thu Oct 6 22:44:02 2005 UTC (14 years, 6 months ago) by overbeek
Branch: MAIN
Changes since 1.18: +26 -7 lines
adding more functions with external ids

# -*- perl -*-

=pod

=head1 proteinfamilies.cgi

A base web interface for getting information about protein families in and out. Initially we are going to make a 3 (or 4) column table of protein, family and other proteins.

PLEASE NOTE: Do not attempt to read or understand this code. Please leave now. It is a complete mess because it is very experimental and we are trying stuff out. none of it will work. The exit is this way ---->

=cut

use strict;
use FIG;
use HTML;
use raelib;
use CGI;
use CGI::Carp qw(fatalsToBrowser);
my $cgi=new CGI;
use LWP::Simple qw(!head); # see the caveat in perldoc LWP about importing two head methods.

my $fig;
eval {
    $fig = new FIG;
};  

if ($@ ne "")
{
    my $err = $@;
    
    my(@html);
    
    push(@html, $cgi->p("Error connecting to SEED database."));
    if ($err =~ /Could not connect to DBI:.*could not connect to server/)
    {
        push(@html, $cgi->p("Could not connect to relational database of type $FIG_Config::dbms named $FIG_Config::db on port $FIG_Config::dbport."));
    }   
    else
    {
        push(@html, $cgi->pre($err));
    }   
    &HTML::show_page($cgi, \@html, 1);
    exit;
}   
    
my $html = [];
my $user = $cgi->param('user');

unshift(@$html, "<TITLE>The SEED - Global Protein Families </TITLE>\n");

my %proteinbase=(
 "fig" 	=> "/FIG/protein.cgi?user=$user&prot=fig|",
 "cog"  => "http://www.ncbi.nlm.nih.gov/COG/old/palox.cgi?",
 "sp"   => "http://www.expasy.org/uniprot/",
 "tr"   => "http://www.expasy.org/uniprot/",
 "kegg" => "http://www.genome.jp/dbget-bin/www_bget?",
);



if ($cgi->param('Show Proteins In Each Family')) 
{
 my @needed=grep {$cgi->param($_)} $cgi->param("allfams");
 $cgi->param(-name=>'family', -value=>\@needed);
 &show_family($fig,$cgi,$html);
}
elsif ($cgi->param('analyse_family')) {
 # finding what is there is the same as findingh what is missing you just need one extra !
 # these two things call the same method.
 &analyse_family($fig,$cgi,$html, 0);
}
elsif ($cgi->param('reverse_analyse_family')) {
 &analyse_family($fig,$cgi,$html, 1);
}
elsif ($cgi->param('equivalence'))
{
 &set_of_equivs($fig,$cgi,$html);
}
elsif ($cgi->param('differentiate'))
{
 &differentiate($fig,$cgi,$html);
}
elsif ($cgi->param('family'))
{
 &show_family($fig,$cgi,$html);
}
elsif ($cgi->param('prot')) 
{
 &show_protein($fig,$cgi,$html);
}
elsif ($cgi->param('fig2kegg')) 
{
 &comparefig2kegg($fig,$cgi,$html);
}
else
{
  &show_initial($fig,$cgi,$html);
}

&HTML::show_page($cgi,$html,1);
exit;


sub show_initial {
 my ($fig,$cgi,$html)=@_;
 # generate a blank page
 push @$html, 
 "<h2>Protein Families</h2>\n",
 "<p>Please enter a protein ID . You will recieve a list of all the families that protein is in. \n",
 "You can use a FIG ID such as fig|83333.1.peg.3, or an ID from SwissProt, KEGG, NCBI, and others.</p>",
 $cgi->start_form(-method=>'get'), 
 "Please enter a protein id: ", $cgi->textfield(-name=>"prot", -size=>40), "<br>", 
 $cgi->submit(-name=>'equivalence', -value=>"Show an equivalence table"),
 "<p>Alternately, you can enter a family. Please enter a family name in the format pir|PIRSF001547 or fig|PF002363.</p>",
 "Please enter a family id:  ", $cgi->textfield(-name=>"family", -size=>40), "<br>",
 $cgi->submit, $cgi->reset, $cgi->end_form;
 return $html;
}


sub show_family {
 my ($fig,$cgi,$html)=@_;
 foreach my $fam ($cgi->param('family')) {
  my @cids=sort {$a <=> $b} $fig->ids_in_family($fam);
  my $tab=[];
  my $col_hdrs=['Cluster ID', 'Polypeptides with same amino acid sequence'];
  foreach my $cid (@cids) {
   my @pegs=$fig->cid_to_prots($cid);
   foreach my $p (@pegs) {
    foreach my $k (keys %proteinbase) {
     if ($p =~ /^$k/) {$p =~ s/^(.*?)\|//; $p = "<a href='$proteinbase{$k}$p'>$1|$p</a>"}
    }
   }
   push @$tab, [$cid, (join ", ", (@pegs))];
  }

  push @$html, "<h2>$fam Family</h2>\n",
  "<p>The family $fam has the function ", $fig->family_function($fam), ", and contains ", $fig->sz_family($fam), " proteins, as shown in the table below.<br>",
  "Each of the sequences with a given ID have the same amino acid sequence, and hence are the same polypeptide, ",
  "even though they may come from different organisms.</p>",
  "<p>The links will take you to the respective databases for each of the other protein families.\n</p>",
  $cgi->start_form(-method=>'get'),
  &HTML::make_table($col_hdrs, $tab, "Proteins in " . $fig->family_function($fam) . " ($fam)"),
  $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>'family', -value=>"$fam"), 
  $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
  $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),    
  $cgi->end_form;
 }
}

sub show_protein {
 my ($fig,$cgi,$html)=@_;
 foreach my $peg ($cgi->param('prot')) {
  my @families=$fig->families_for_protein($peg);
  unless (@families) 
  {
   push @$html, "<h2 style='color: red'>Sorry, $peg is not in any protein families</h2>";
   return;
  }
 
  my $tab=[];
  my $self=$cgi->url;
  foreach my $fam (@families) {
   my %idcount;
   my $noprots=scalar(map {$idcount{$_}=1} $fig->ids_in_family($fam));
   #push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $fig->sz_family($fam), $cgi->checkbox(-name=>$fam, -label=>'')];
   push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $noprots, $cgi->checkbox(-name=>$fam, -label=>'')];
  }
 
  my $col_hdrs=['Family ID', 'Family Function', 'Number of CIDs in Family', 'Choose Family'];
  push @$html, "<h2>Families for $peg</h2>\n", 
  $cgi->start_form(-method=>'get'),
  "<p>$peg is in the following ", scalar(@families), " families. Please choose one or more families using the checkboxes</p>\n",
  "A CID is a unique, internal ID we have assigned to proteins with identical sequences",
  &HTML::make_table($col_hdrs, $tab, "Families for $peg"),  "\n",
  $cgi->submit('Show Proteins In Each Family'), 
  $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
  $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
  $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>"allfams", -value=>\@families), "\n",
  $cgi->reset, $cgi->end_form;
 }
}




sub analyse_family {
 my ($fig,$cgi,$html, $reverse)=@_;
# here are the questions:
# 1. Given a column in a spreadsheet:
# 2. Here are the proteins in that column
# 3. For each protein, here are the families that they are in. How many families are unique and how many families is every protein in?
# 	if we start with a column of 10 proteins, and nine of them are all in the same families and one is not, we want to exclude the one and keep the nine.
#  	so we recommend that a protein be removed from a family.
# 4. For each of the families that are good, which proteins are there in some/most of the families that are not in the column that we are looking at
# 5. For each of the families that are good, which proteins are only in one of those families and not in any others?

# Note that column == family, But start with fig and then  allow a replace ID feature like before.
 
 my $focus=$cgi->param('focus') or 'all'; # these are the things that we are interested in
 undef $focus if ($focus eq "all");

 my @cols;
 if ($cgi->param("allfams")) {@cols=grep {$cgi->param($_)} $cgi->param("allfams")}
 elsif ($cgi->param("family")) {push @cols, $cgi->param('family')}
 else {die "No families declared!"}
 
 foreach my $col (@cols)
 {
  # $col is the column in the spreadsheet. This is really a family, but to visualize and code this I am doing it in a FIG-centric way
  my $proteins_in_col;
  map {$proteins_in_col->{$_}=1} $fig->ids_in_family($col);

  # @proteins are the proteins in that column, although these are cids and not fids at the moment
  my $familycount;
  foreach my $prot (keys %$proteins_in_col) {
   foreach my $fam ($fig->in_family($prot)) {
    $familycount->{$fam}++;
   }
  }
  
  my $count_of;
  my $fams;
  foreach my $f (sort {$familycount->{$b} <=> $familycount->{$a}} keys %$familycount)
  { 
   if ($reverse) {($fams, $count_of)=&ids_missing_from_fam($fig, $f, $focus, $proteins_in_col, $fams, $count_of)} 
   else {($fams, $count_of)=&ids_are_in_fam($fig, $f, $focus, $proteins_in_col, $fams, $count_of)}
  } 
  
  # create a list of families that we know about
  my @fams=grep {!/$col/} sort {scalar(keys %{$fams->{$b}}) <=> scalar(keys %{$fams->{$a}})} keys %$fams; 
  unshift @fams, $col;

  my $tab=[[["Number of proteins in family", "th colspan=4"], map {scalar(keys %{$fams->{$_}})} @fams]];
  
  my @fids;
  if ($cgi->param('sort') eq "genome")  
  {
   @fids=sort {$fig->genome_of($a) <=> $fig->genome_of($b)} keys %$count_of;
  } 
  elsif ($cgi->param('sort') eq "cid")
  {
   @fids=sort {$fig->prot_to_cid($a) <=> $fig->prot_to_cid($b)} keys %$count_of;
  }
  else 
  {
   @fids=sort {scalar(keys %{$count_of->{$b}}) <=> scalar(keys %{$count_of->{$a}})} keys %$count_of;
  }
 
  my $rowcount;
  foreach my $fid (@fids) 
  {
   my @row=(++$rowcount, $fig->prot_to_cid($fid), $fid, scalar(keys %{$count_of->{$fid}}));
    
   foreach my $fam (@fams) {
    $count_of->{$fid}->{$fam} ? push @row, ["Y", "td style='background-color: lightgrey; text-align: center'"] : push @row, " &nbsp ";
   }
   push @$tab, \@row;
  }

  push @$html,  $cgi->start_form(-method=>'get'), $cgi->p("Limit the display to proteins from ", &choose_focus($cgi), "\n"), $cgi->p("Sort the order by ", &choose_sort($cgi),"\n");
  if ($reverse) {
   push @$html, $cgi->p("These are proteins that ARE NOT in ", $fig->family_function($col), " ($col) but are in other families that have proteins in this family.");
  } else {
   push @$html, $cgi->p("These are proteins that ARE in ", $fig->family_function($col), " ($col) and are in other families that have proteins in this family.");
  }

  # merge cells in the table
  my $skip;
  map {$skip->{$_}=1} (0, 2 .. 40); # ignore everything except column 2
  $tab=&HTML::merge_table_rows($tab, $skip);

  push @$html, 
    $cgi->hidden(-name=>"family", -value=>@cols), $cgi->hidden("prot"), $cgi->hidden(-name=>"user"), 
    $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
    $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
    &HTML::make_table(["Count", "Unique ID", "Protein ID", "Number of fams protein is in", @fams], $tab,' &nbsp; ');
 }
}


sub ids_are_in_fam {
   my ($fig, $f, $focus, $proteins_in_col, $fams, $count_of)=@_;
   # It seems that $sz_family is not right
   map {$fams->{$f}->{$_}++; $count_of->{$_}->{$f}++} 
	grep {/^$focus/} 
   	map {$fig->cid_to_prots($_)}
      	grep {$proteins_in_col->{$_}} 
	($fig->ids_in_family($f));
   return ($fams, $count_of);
}

sub ids_missing_from_fam {
   my ($fig, $f, $focus, $proteins_in_col, $fams, $count_of)=@_;
   # It seems that $sz_family is not right
   map {$fams->{$f}->{$_}++; $count_of->{$_}->{$f}++} 
	grep {/^$focus/} 
   	map {$fig->cid_to_prots($_)}
      	grep {!$proteins_in_col->{$_}} 
	($fig->ids_in_family($f));
   return ($fams, $count_of);
}
   


sub choose_focus {
 my ($cgi)=@_;
 my %choices=(
  "all"		=> "All",
  "fig"		=> "FIGfams",
  "tigr"	=> "TIGRfams",
  "pfam"	=> "PFAM",
  "sp"		=> "SwissProt",
  "kegg"	=> "KEGG",
  "pir"		=> "PIR SuperFams",
  "mcl"		=> "MCL",
  "cog"		=> "COG",
 );

 my $default = $cgi->param("focus"); unless ($default) {$default="all"}
  
 return $cgi->popup_menu(
  -name     => "focus",
  -values   => [sort {$choices{$a} cmp $choices{$b}} keys %choices],
  -labels   => \%choices,
  -default  => $default,
 );
}

sub choose_sort {
 my ($cgi)=@_;
 my %choices=(
  "none"	=> "Number of Proteins",
  "genome"	=> "Genome",
  "cid"		=> "Unique ID",
 );

 my $default = $cgi->param("sort"); unless ($default) {$default="none"}
  
 return $cgi->popup_menu(
  -name     => "sort",
  -values   => [sort {$choices{$a} cmp $choices{$b}} keys %choices],
  -labels   => \%choices,
  -default  => $default,
 );
}


sub comparefig2kegg {
 my ($fig,$cgi,$html)=@_;

 my $classification; my %subsystem;
 # read classification from kegg file
 if (open(IN, "$FIG_Config::global/ProteinFamilies/kegg_classificaation.txt")) {
  while (<IN>) {
   chomp; 
   my @a=split /\t/;
   my $id=shift(@a);
   $subsystem{"kegg|$id"}=pop(@a);
   push @{$classification->{"kegg|$id"}}, \@a;
  }
 }


 my $tab=[];
 # find out what families our proteins are in
 map {
  my $prot=$_;
  map {
   my $fam=$_;
   if ($fam =~ /^fig/) {
    my %ss;
    map {$ss{$_->[0]}++} ($fig->subsystems_for_role($fig->family_function($fam)));
    map {my $ss=$_; push @$tab, [$prot, @{$fig->subsystem_classification($ss)}, $ss, $fam, $fig->family_function($fam)]} keys %ss; 
   }
   else {
    map {push @$tab, [$prot, @{$_}, $subsystem{$fam}, $fam, $fig->family_function($fam)]} @{$classification->{$fam}}
   }
   } grep {/^fig/ || /^kegg/} $fig->families_for_protein($prot);
   } $cgi->param('proteins');

 my $col_hdrs=['Protein', ['Classification', 'th colspan=2'], 'Subsystem', 'Family ID', 'Family Function'];
 push @$html, &HTML::make_table($col_hdrs, $tab, "Families"),  "\n",
}

## Based on request from Ross:
#	Subject: 	Re: fig.pl
#	Date: 	October 4, 2005 6:21:00 AM PDT
#	From: 	  Ross@theFIG.info
#	To: 	  raedwards@gmail.com
#
#Rob,
#
#It seems to me that you got that right, and the function is certainly at the
#core of what is needed.  I have been thinking about what I would want with
#protein families,
#and it goes something like this:
#
#1. Given a protein FIG1, you can get the set of proteins with the same CID
#(call it CID1).  Call this set EQUIV, since it is really a set of IDs that are
#equivalent.
#
#2. From the set of IDs in EQUIV, you can get the set of protein families (from
#all sources) that contain the IDs in EQUIV.  This gives a table
#
#
#            [,ID,Function,Family,FamilyFunction]
#
#    All of the table entries describe a family containing CID1.
#
#3.  From this table you select two Families to be compared (e.g., one KEGG
#family vs a FIG family).  This ends the first part -- selecting the precise
#two
#      families to be compared.  Each of the two families  should be thought of
#as [CID,ID,Family].
#
#4.  The comparison of SET1 and SET2 uses essentially the function you
#implemented.  You need to form three sets:
#
#            the intersection of SET1 and SET2
#            SET1 - SET2
#            SET2 - SET1
#
#       You may or may not wish to display each of the three sets.  The user
#should be able to select which.  When you think
#        of one of these sets, it is useful to think of
#{CID,Family,Set-of-CIDs}.  That is, it is not just a set of CIDs; it should be
#viewed as a
#        set of CIDs from a specific family that was chosen because it
#contained a specific CID.
#
#5. When displaying a set of proteins from a given family, you start with
#(CID,Family,Set-of-CIDs).  Each line should contain
#
#            1. A single CID from the Set-of-CIDs  (call this CID2).
#
#            2. A count of the number of sources that place both CID1 and CID2
#in the same family (note that this is not a count of the families that include
#both CID1 and CID2)
#
#            3.  For each source a "Y" or space indicating whether or not the
#source placed CID1 and CID2 into the same family (i.e., whether or not there
#                  is at least one family from the source that contains both
#CID1 and CID2).
#
#That is what I think should be done.  Can we discuss it?
#



sub set_of_equivs {
 my ($fig, $cgi, $html)=@_;
 foreach my $peg ($cgi->param('prot')) {
  my $cid=$fig->prot_to_cid($peg);
  my @equiv=$fig->cid_to_prots($cid);
  my $tab=[];
  my $allfams;
  
  # this block will make a table with all IDs of all proteins in CID.
  # Ross doesn't want that, so we don't use it
  if (0) 
  {
   map 
   {
    my $id=$_;
    map 
    {
     push @$tab, [$fig->prot_to_cid($id), $id, &functionate($id), $_, $fig->family_function($_)];
     $allfams->{$_}="$_ : " . $fig->family_function($_);
    } $fig->families_for_protein($id);
   } @equiv;
  }
  else
  {
   # instead we just show our query protein
   map 
   {
    my $ffunc=$fig->family_function($_) || " &nbsp; ";
    push @$tab, [$fig->prot_to_cid($peg), $peg, &functionate($peg), $_, $ffunc, $fig->ext_sz_family($_)];
    $allfams->{$_}="$_ : " . $fig->family_function($_);
   } $fig->families_for_protein($peg);
  }

  
  $tab=&HTML::merge_table_rows($tab, {3=>1, 4=>1});
  my $col_hdrs=['CID', 'Protein', 'Function of Proteins', 'Family', 'Family Function', 'No. Prots In Family'];
  push @$html, $cgi->start_form(-method=>'get'), $cgi->p("For protein <b>$peg</b>, which has the unique ID <b>$cid</b>, this is the EQUIV set."),
  &HTML::make_table($col_hdrs, $tab, ""),  "\n", $cgi->p("To differentiate families in this table, please choose two families:"),
  "Family 1: &nbsp; ", $cgi->popup_menu(-name=>"family1", -values=>[sort {$a cmp $b} keys %$allfams], -labels=>$allfams),
  " &nbsp; Family 2: &nbsp; ", $cgi->popup_menu(-name=>"family2", -values=>[sort {$a cmp $b} keys %$allfams], -labels=>$allfams),
  $cgi->p("Limit the display to proteins from ", &choose_focus($cgi), "\n"),
  $cgi->p("Show proteins:<br /><ul>\n", 
  $cgi->checkbox(-name=>"1not2", -label=>"In family one and NOT family two"), "<br />\n",
  $cgi->checkbox(-name=>"2not1", -label=>"In family two and NOT family one"), "<br />\n",
  $cgi->checkbox(-name=>"1and2", -label=>"In both families"), "\n</ul>\n"),
  $cgi->submit(-name=>"differentiate", -value=>"Differentiate these families"), $cgi->reset, $cgi->end_form();
 }
}

sub differentiate {
 my ($fig, $cgi, $html)=@_;
 
 my $focus=$cgi->param('focus') or 'all'; # these are the things that we are interested in
 undef $focus if ($focus eq "all");
  
 my ($fam_id1, $fam_id2)=($cgi->param('family1'), $cgi->param('family2'));
 if ($fam_id1 eq $fam_id2) {
  push @$html, "<h2 style='color: red'>Please choose two different protein families</h2>";
  return;
 }
 my ($fam1, $fam2)=([$fig->ids_in_family($fam_id1)], [$fig->ids_in_family($fam_id2)]);
	 
 if ($cgi->param("1not2")) {
  my $tab;
  map {
    my $cid=$_; 
    map {push @$tab, [$cid, $_, &functionate($_)]} sort grep {/^$focus/} $fig->cid_to_prots($cid)
    } sort @{&set_utilities::set_diff($fam1, $fam2)};

  $tab=&HTML::merge_table_rows($tab);
  push @$html, $cgi->h3("Proteins that are in $fam_id1 (" , $fig->family_function($fam_id1) , ") and not in $fam_id2 (" ,
          $fig->family_function($fam_id2) . ")\n"), &HTML::make_table(["CID", "Protein ID", "Function"], $tab, "");
  }
	 
	 
 if ($cgi->param("2not1")) {
  my $tab;
  map {
    my $cid=$_; 
    map {push @$tab, [$cid, $_, &functionate($_)]} sort grep {/^$focus/} $fig->cid_to_prots($cid)
    } sort @{&set_utilities::set_diff($fam2, $fam1)};

  $tab=&HTML::merge_table_rows($tab);
  push @$html, $cgi->hr, $cgi->h3("Proteins that are in $fam_id2 (" , $fig->family_function($fam_id2) , ") and not in $fam_id1 (" ,
          $fig->family_function($fam_id1) . ")\n"), &HTML::make_table(["CID", "Protein ID", "Function"], $tab, "");
 }
	 
 if ($cgi->param("1and2")) {
  my $tab;
  map {
    my $cid=$_; 
    map {push @$tab, [$cid, $_, &functionate($_)]} sort grep {/^$focus/} $fig->cid_to_prots($cid)
    } @{&set_utilities::intersection($fam1, $fam2)};

  $tab=&HTML::merge_table_rows($tab);
  push @$html, $cgi->hr, $cgi->h3("Proteins that are in both $fam_id1 (" , $fig->family_function($fam_id1) , ") and in $fam_id2 (" ,
          $fig->family_function($fam_id2) . ")\n"), &HTML::make_table(["CID", "Protein ID", "Function"], $tab, "");
 }
}



sub functionate {
 my $peg=shift;
 return scalar($fig->function_of($peg));
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3