[Bio] / FigWebServices / proteinfamilies.cgi Repository:
ViewVC logotype

View of /FigWebServices/proteinfamilies.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (download) (annotate)
Thu Jul 14 05:05:28 2005 UTC (14 years, 9 months ago) by redwards
Branch: MAIN
Changes since 1.6: +74 -2 lines
Changes to protein families that no sane person should ever view. Pay no attention to me

# -*- perl -*-

=pod

=head1 proteinfamilies.cgi

A base web interface for getting information about protein families in and out. Initially we are going to make a 3 (or 4) column table of protein, family and other proteins.

PLEASE NOTE: Do not attempt to read or understand this code. Please leave now. It is a complete mess because it is very experimental and we are trying stuff out. none of it will work. The exit is this way ---->

=cut

use strict;
use FIG;
use HTML;
use raelib;
use CGI;
my $cgi=new CGI;
use LWP::Simple qw(!head); # see the caveat in perldoc LWP about importing two head methods.

my $fig;
eval {
    $fig = new FIG;
};  

if ($@ ne "")
{
    my $err = $@;
    
    my(@html);
    
    push(@html, $cgi->p("Error connecting to SEED database."));
    if ($err =~ /Could not connect to DBI:.*could not connect to server/)
    {
        push(@html, $cgi->p("Could not connect to relational database of type $FIG_Config::dbms named $FIG_Config::db on port $FIG_Config::dbport."));
    }   
    else
    {
        push(@html, $cgi->pre($err));
    }   
    &HTML::show_page($cgi, \@html, 1);
    exit;
}   
    
my $html = [];
my $user = $cgi->param('user');

unshift(@$html, "<TITLE>The SEED - Global Protein Families </TITLE>\n");

my %proteinbase=(
 "fig" 	=> "/FIG/protein.cgi?user=$user&prot=fig|",
 "cog"  => "http://www.ncbi.nlm.nih.gov/COG/old/palox.cgi?",
 "sp"   => "http://www.expasy.org/uniprot/",
 "tr"   => "http://www.expasy.org/uniprot/",
 "kegg" => "http://www.genome.jp/dbget-bin/www_bget?",
);



if ($cgi->param('Show Proteins In Each Family')) 
{
 my @needed=grep {$cgi->param($_)} $cgi->param("allfams");
 $cgi->param(-name=>'family', -value=>\@needed);
 &show_family($fig,$cgi,$html);
}
elsif ($cgi->param("Combine Families With And"))
{
 &combine_families($fig,$cgi,$html);
}
elsif ($cgi->param('proteins_between_two')) 
{
 &proteins_between_two($fig, $cgi, $html);
}
elsif ($cgi->param('Compare FIG Functions'))
{
 my @needed=grep {$cgi->param($_)} $cgi->param("allfams");
 $cgi->param(-name=>'family', -value=>\@needed);
 &compare_functions($fig,$cgi,$html);
}
elsif ($cgi->param('Suggest')) {
 &suggest_proteins($fig,$cgi,$html);
}
elsif ($cgi->param('extend_family')) {
 &extend_family($fig,$cgi,$html);
}
elsif ($cgi->param('family'))
{
 &show_family($fig,$cgi,$html);
}
elsif ($cgi->param('prot')) 
{
 &show_protein($fig,$cgi,$html);
}
else
{
  &show_initial($fig,$cgi,$html);
}

&HTML::show_page($cgi,$html,1);
exit;


sub show_initial {
 my ($fig,$cgi,$html)=@_;
 # generate a blank page
 push @$html, 
 "<h2>Protein Families</h2>\n",
 "<p>Please enter a protein ID . You will recieve a list of all the families that protein is in. \n",
 "You can use a FIG ID such as fig|83333.1.peg.3, or an ID from SwissProt, KEGG, NCBI, and others.</p>",
 $cgi->start_form(), 
 "Please enter a protein id: ", $cgi->textfield(-name=>"prot", -size=>40), "<br>", 
 "<p>Alternately, you can enter a family. Please enter a family name in the format pir|PIRSF001547 or fig|PF002363.</p>",
 "Please enter a family id:  ", $cgi->textfield(-name=>"family", -size=>40), "<br>",
 $cgi->submit, $cgi->reset, $cgi->end_form;
 return $html;
}

sub show_protein {
 my ($fig,$cgi,$html)=@_;
 foreach my $peg ($cgi->param('prot')) {
  my @families=$fig->families_for_protein($peg);
  unless (@families) 
  {
   push @$html, "<h2 style='color: red'>Sorry, $peg is not in any protein families</h2>";
   return;
  }
 
  my $tab=[];
  my $self=$cgi->url;
  foreach my $fam (@families) {
   push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $fig->sz_family($fam), $cgi->checkbox(-name=>$fam, -label=>'')];
  }
 
  my $col_hdrs=['Family ID', 'Family Function', 'Number of Unique Proteins in Family', 'Choose Family'];
  push @$html, "<h2>Families for $peg</h2>\n", 
  $cgi->start_form,
  "<p>$peg is in the following ", scalar(@families), " families. Please choose one or more families using the checkboxes</p>\n",
  &HTML::make_table($col_hdrs, $tab, "Families for $peg"),  "\n",
  $cgi->submit('Show Proteins In Each Family'), 
  $cgi->submit(-name=>'proteins_between_two', -value=>"Compare two or more families for missing"), "<br>\n",
  $cgi->submit(-name=>'extend_family', -value=>"Extend and contract family"),
  $cgi->submit("Combine Families With And"), $cgi->submit("Compare FIG Functions"),
  $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>"allfams", -value=>\@families), "\n",
  $cgi->reset, $cgi->end_form;
 }
}

# this was deleted from above
#$cgi->submit(-name=>"Suggest", -value=>"Suggest Additional families for this protein"), 

sub show_family {
 my ($fig,$cgi,$html)=@_;
 foreach my $fam ($cgi->param('family')) {
  my @cids=sort {$a <=> $b} $fig->ids_in_family($fam);
  my $tab=[];
  my $col_hdrs=['Cluster ID', 'Polypeptides with same amino acid sequence'];
  foreach my $cid (@cids) {
   my @pegs=$fig->cid_to_prots($cid);
   foreach my $p (@pegs) {
    foreach my $k (keys %proteinbase) {
     if ($p =~ /^$k/) {$p =~ s/^(.*?)\|//; $p = "<a href='$proteinbase{$k}$p'>$1|$p</a>"}
    }
   }
   push @$tab, [$cid, (join ", ", (@pegs))];
  }

  push @$html, "<h2>$fam Family</h2>\n",
  "<p>The family $fam has the function ", $fig->family_function($fam), ", and contains ", $fig->sz_family($fam), " proteins, as shown in the table below.<br>",
  "Each of the sequences with a given ID have the same amino acid sequence, and hence are the same polypeptide, ",
  "even though they may come from different organisms.</p>",
  "<p>The links will take you to the respective databases for each of the other protein families.\n</p>",
  $cgi->start_form,
  &HTML::make_table($col_hdrs, $tab, "Proteins in " . $fig->family_function($fam) . " ($fam)"),
  $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>'family', -value=>"$fam"), $cgi->submit("Compare FIG Functions"),
  $cgi->end_form;
 }
}

sub combine_families {
 my ($fig,$cgi,$html)=@_;
 # first find all the families and all the cids in those families
 my $cids; my @families; my $allprots;
 foreach my $f (grep {$cgi->param($_)} $cgi->param("allfams"))
 {
  push @families, $f;
  foreach my $e ($fig->ids_in_family($f)) {$cids->{$e}->{$f}=1; $allprots++}
 }
 
 # now figure out those cids that are in all families
 # we are going to do this with a boolean switch rather than just counting occurences
 my @wanted;
 foreach my $cid (keys %$cids) {
  my $keep=1;
  foreach my $f (@families) {undef $keep unless ($cids->{$cid}->{$f})}
  push @wanted, $cid if ($keep);
 }
 
 my $tab=[];
 my $col_hdrs=['Cluster ID', 'Polypeptides with same amino acid sequence'];
 foreach my $cid (sort {$a <=> $b} @wanted) {
  my @pegs=$fig->cid_to_prots($cid);
  foreach my $p (@pegs) {
   foreach my $k (keys %proteinbase) {
    if ($p =~ /^$k/) {$p =~ s/^(.*?)\|//; $p = "<a href='$proteinbase{$k}$p'>$1|$p</a>"}
   }
  }
  push @$tab, [$cid, (join ", ", (@pegs))];
 }
			      
  
 push @$html, "<h2>Proteins in ", (join ", ", (@families)), " families</h2>\n",
 "<h3>Summary</h3>\n<p>There were $allprots proteins in the ", scalar(@families), " families that you selected.\n<br>",
 "Out of a those proteins, there were ", scalar(keys %$cids),
 " unique proteins, and ", scalar(@wanted), " proteins are present in the ",
 scalar(@families), " families.</p>\n",
 "Each of the sequences with a given ID have the same amino acid sequence, and hence are the same polypeptide, even though they may come from different organisms.</p>",
 "<p>The links will take you to the respective databases for each of the other protein families.\n</p>",
 $cgi->start_form,
 &HTML::make_table($col_hdrs, $tab, "Proteins in " . (join ", ", (@families)) . " families"),
 $cgi->end_form;  
}
 
sub compare_functions {
 my ($fig,$cgi,$html)=@_;
 foreach my $fam ($cgi->param('family')) {
  my @cids=sort {$a <=> $b} $fig->ids_in_family($fam);
  my $tab=[];
  my $col_hdrs=['FIG ID', 'Genome', 'Assigned Function'];
  foreach my $cid (@cids) {
   foreach my $peg (grep {/^fig/} $fig->cid_to_prots($cid)) {
    my $p=$peg;
    $p =~ s/fig\|//; 
    $p = "<a href='$proteinbase{'fig'}$p'>$peg</a>";
    push @$tab, [$p, ($fig->genus_species($fig->genome_of($peg))), scalar($fig->function_of($peg))];
   }
  }

  push @$html, "<h2>Functions in $fam Family</h2>\n",
  "<p>The family $fam has the function ", $fig->family_function($fam), ".\n",
  "The functions shown here are the functions of just the proteins in the SEED database from this family</p>",
  $cgi->start_form,
  &HTML::make_table($col_hdrs, $tab, "Proteins in " . $fig->family_function($fam) . " ($fam)"),
  $cgi->hidden(-name=>'prot'), $cgi->hidden(-name=>'family', -value=>"$fam"), $cgi->submit("Compare FIG Functions"),
  $cgi->end_form;
 }
}
 
sub suggest_proteins {
 my ($fig,$cgi,$html)=@_;
 my $prot=$cgi->param('prot');
 my @families=$fig->families_for_protein($prot);
 my $knownfamily;
 map {$knownfamily->{$_}=1} @families;

 print STDERR "Looking through ", scalar(@families), " families\n";
 my $famcount=1;my $time=time;
 my $content;
 my $row;
 foreach my $fam (@families) {
  print STDERR "$famcount in ", time-$time, "\n"; $time=time; $famcount++;
  
  push @$row, [$fam, 'td style="background-color: grey'];
  
  my @newprots=$fig->proteins_in_family($fam);
  push @$row, scalar(@newprots);
  
  print STDERR "$fam has ", scalar(@newprots), "\n";
  foreach my $p (@newprots) {
   my @newfamilies=$fig->families_for_protein($p);
   foreach my $nf (@newfamilies) {
    next if ($knownfamily->{$nf});
    push @{$content->{$nf}}, $p;
   }
  }
 }
 
 my $tab=[];
 @$tab=map {[$_, $fig->family_function($_), scalar(@{$content->{$_}})]} keys %$content;
 my $col_hdrs=["Family ID", "Family function", "Number of proteins in family"];
 
 push @$html, $cgi->p("$prot is in families ", join ",", @families),
 $cgi->p("These are the other families that $prot should also appear in:"),
 &HTML::make_table($col_hdrs, $tab, "Other Families"), "\n";
}


sub proteins_between_two {
 my ($fig,$cgi,$html)=@_;
 # first find all the families and all the cids in those families
 my $cids; my @families; my $allprots;
 foreach my $f (grep {$cgi->param($_)} $cgi->param("allfams"))
 {
  push @families, $f;
  foreach my $e ($fig->ids_in_family($f)) {$cids->{$e}->{$f}=1; $allprots++}
 }

 # now figure out those cids that are in all families
 # we are going to do this with a boolean switch rather than just counting occurences
 my @wanted; my @missing;
 foreach my $cid (keys %$cids) {
  my $keep=1;
  foreach my $f (@families) {
   unless ($cids->{$cid}->{$f}) {
    undef $keep;
    push @missing, $cid;
   }
  }
  push @wanted, $cid if ($keep);
 }
 
 # make some HTML header
 push @$html, "<h2>Comparison of different protein families</h2><ul>\n";
 foreach my $f (sort @families) {push @$html, "<li>", $fig->family_function($f), " ($f) </li>\n"}
 push @$html, "</ul><p>These are the proteins that are unique to one or other of these families:</p>";

 foreach my $f (sort @families) {

  my $tab=[];
  foreach my $p (sort {$a <=> $b} @missing) {
   next unless ($cids->{$p}->{$f});
   push @$tab, [$p, (join ", ", $fig->cid_to_prots($p))];
  }
  
  my $list;
  foreach my $x (@families) {next if ($f eq $x); $list .= "<li>". $fig->family_function($x). " ($x)</li>\n"}
  push @$html, $cgi->p, "<div style='font-weight: bolder'>Proteins that are in <br><center style='font-weight: bigger'>",  
  $fig->family_function($f), " ($f)</center><br>\nthat are not in <ul>$list</ul>\n</div>\n",
  &HTML::make_table(["Cluster ID", "Other Proteins"], $tab, " &nbsp; ");
 }
}
 

sub extend_family {
 my ($fig,$cgi,$html)=@_;
# here are the questions:
# 1. Given a column in a spreadsheet:
# 2. Here are the proteins in that column
# 3. For each protein, here are the families that they are in. How many families are unique and how many families is every protein in?
# 	if we start with a column of 10 proteins, and nine of them are all in the same families and one is not, we want to exclude the one and keep the nine.
#  	so we recommend that a protein be removed from a family.
# 4. For each of the families that are good, which proteins are there in some/most of the families that are not in the column that we are looking at
# 5. For each of the families that are good, which proteins are only in one of those families and not in any others?

# Note that column == family, But start with fig and then  allow a replace ID feature like before.
 
 #my $focus=$cgi->param('focus') or "fig"; # these are the things that we are interested in
 my $focus='fig';
 push @$html, "<h2>Testing: Limited to fig</h2>\n";
 foreach my $col (grep {$cgi->param($_)} $cgi->param("allfams"))
 {
  # $col is the column in the spreadsheet. This is really a family, but to visualize and code this I am doing it in a FIG-centric way
  my %proteins_in_col;
  map {$proteins_in_col{$_}=1} $fig->ids_in_family($col);

  # @proteins are the proteins in that column, although these are cids and not fids at the moment
  my $familycount;
  foreach my $prot (keys %proteins_in_col) {
   foreach my $fam ($fig->in_family($prot)) {
    $familycount->{$fam}++;
   }
  }
  
  my $tab=[];
  foreach my $f (sort {$familycount->{$b} <=> $familycount->{$a}} keys %$familycount) {
   next if ($f eq $col);
   # It seems that $sz_family is not right
   my @all=$fig->ids_in_family($f);
   #my @allmissing=grep {!$proteins_in_col{$_}} ($fig->ids_in_family($f)); # note this is done in two separate lines so I can get the value of @allmissing
   my @allmissing=grep {!$proteins_in_col{$_}} @all;
   print STDERR "ALL: ", scalar(@all), " ALLMISSING: ", scalar(@allmissing), " IN COL: ", scalar(keys %proteins_in_col), "\n";
   my @missing=
	map {"<a href=\'/FIG/protein.cgi?user=$user&prot=$_'>$_</a>"} 
	grep {/^$focus/} 
   	map {$fig->cid_to_prots($_)} @allmissing;
      	#grep {!$proteins_in_col{$_}} 
	#($fig->ids_in_family($f));
		
   my $prots_by_fam=join ", ", @missing;
   push @$tab, [$f, scalar(@all), $fig->family_function($f), $familycount->{$f}, scalar(@allmissing), scalar(@missing), $prots_by_fam];
  }

  my $col_hdrs=["Family ID", "Size of Family", "Family Function", 
  		"Number of proteins in $col that are also in the family in column 1", 
		"Number of proteins in the family in column 1 that are not in $col",
  		"Number of proteins in the family in column 1 that are not in $col and begin with $focus",
  		"Proteins that are in the family in column 1 that are not"];
  push @$html, $cgi->p, "<h3>Your chosen family was $col that has the function ", $fig->family_function($col), " and has ", 
  $fig->sz_family($col), " proteins</h3>", 
  $cgi->p("\nThese are the proteins that should also be in $col. Or maybe not.\n"), &HTML::make_table($col_hdrs, $tab, ' &nbsp; ');
 }
}

  


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3