[Bio] / FigWebServices / proteinfamilies.cgi Repository:
ViewVC logotype

View of /FigWebServices/proteinfamilies.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.48 - (download) (annotate)
Thu Aug 7 18:13:25 2008 UTC (11 years, 3 months ago) by arodri7
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.47: +6 -4 lines
redirect to new FIGfam page

# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


=pod

=head1 proteinfamilies.cgi

A base web interface for getting information about protein families in and out. Initially we are going to make a 3 (or 4) column table of protein, family and other proteins.


=cut

use strict;
use FIG;
use FIGjs;
use HTML;
use raelib;
use CGI;
use CGI::Carp qw(fatalsToBrowser);
my $cgi=new CGI;
use LWP::Simple qw(!head); # see the caveat in perldoc LWP about importing two head methods.

my $fig;
eval {
    $fig = new FIG;
};  

if ($cgi->param("family"))
{
 print $cgi->redirect("http://anno-3.nmpdr.org/anno/FIG/seedviewer.cgi?page=FigFamViewer&figfam=" . $cgi->param("family"));
 #push @$html, $cgi->h1({class=>"center"}, $cgi->param("family"), " Family : ", $fig->family_function($cgi->param("family")), "\n");
}

if ($@ ne "")
{
    my $err = $@;
    
    my(@html);
    
    push(@html, $cgi->p("Error connecting to SEED database."));
    if ($err =~ /Could not connect to DBI:.*could not connect to server/)
    {
        push(@html, $cgi->p("Could not connect to relational database of type $FIG_Config::dbms named $FIG_Config::db on port $FIG_Config::dbport."));
    }   
    else
    {
        push(@html, $cgi->pre($err));
    }   
    &HTML::show_page($cgi, \@html, 1);
    exit;
}   
    
my $html = [];
my $user = $cgi->param('user');

unshift(@$html, "<TITLE>The SEED - Global Protein Families </TITLE>\n");





if ($cgi->param('equivalence'))
{
 &set_of_equivs($fig,$cgi,$html);
}
elsif ($cgi->param('bysource'))
{
 &show_sources;
 if ($cgi->param('family')) {&show_family($fig,$cgi,$html)}
}
elsif ($cgi->param('freetextsearch'))
{
 &freetextsearch($fig,$cgi,$html);
}
elsif ($cgi->param('statistics'))
{
 &statistics($fig,$cgi,$html);
}
elsif ($cgi->param('differentiate'))
{
 &set_of_equivs($fig,$cgi,$html) if ($cgi->param('prot'));
 &differentiate($fig,$cgi,$html);
}
elsif ($cgi->param('suggestmore'))
{
 &suggest_more_proteins($fig,$cgi,$html);
}
elsif ($cgi->param('family'))
{
 &show_family($fig,$cgi,$html);
}
elsif ($cgi->param('prot')) 
{
 &show_protein($fig,$cgi,$html);
}
elsif ($cgi->param('fig2kegg')) 
{
 &comparefig2kegg($fig,$cgi,$html);
}
else
{
  &show_initial($fig,$cgi,$html);
}





&HTML::show_page($cgi,$html,1);
exit;


sub show_initial {
 my ($fig,$cgi,$html)=@_;
 # generate a blank page

 # generate the buttons for each of the submits
 my $fams=&sources();
 my $buttons="<table width=100%><tr>".
             join("\n", 
	            map {
                          "<td><p><center><input type='submit' name='bysource' value='".$fams->{$_}."' /></p>\n" .
                          "<p><a href=\"".&source_url($_)."\" target=\"window$$\">".$fams->{$_}." website</a></p>\n" .
                          "<p><a href=\"proteinfamilies.cgi?statistics=$_\" target=\"window$$\">statistics</a></p></center></td>\n"
                        }  sort {$fams->{$a} cmp $fams->{$b}} keys %$fams
		 )."</tr></table>";

 push @$html, 
 "<h1>Protein Families</h1>\n",

 $cgi->p("SEED Protein Families have been built from the sources shown below. This is an interface designed to compare the different families"),
 
 
 $cgi->div(
             {class=>"bordermargin"},
             $cgi->start_form(-method=>'get'), "\n",
             $cgi->p("You can enter a protein ID . You will recieve a list of all the families that protein is in. \n",
             "You can use a FIG ID such as fig|83333.1.peg.3, or an ID from SwissProt, KEGG, NCBI, and others.\n"),
             "Please enter a protein id: ", $cgi->textfield(-name=>"prot", -size=>40), "<br>\n", 
             $cgi->p("You can enter a family ID. You will receive a list of all proteins in that family. \n",
             "You can use a family ID such as fig|PF003462 or pir|PIRSF012283.\n"),
             "Please enter a family id: ", $cgi->textfield(-name=>"family", -size=>40), "<br>\n",
             $cgi->submit(-name=>'equivalence', -value=>"Show Protein Families"), "\n", $cgi->reset,
             $cgi->end_form(),
           ),

 $cgi->div(
             {class=>"bordermargin"},
             $cgi->start_form(-method=>'get'), "\n",
             "<p>You can select one of the data sources below to choose families from that source:</p>", $buttons,
             $cgi->end_form(),
        ),
 $cgi->div( 
             {class=>"bordermargin"},
             $cgi->start_form(-method=>'get'), "\n",
             $cgi->p("You can search through the family functions for some text. Please note that at the moment this is a case insensitive search only."), 
             $cgi->textfield(-name=>"freetext", -size=>40),
             $cgi->submit(-name=>"freetextsearch", -value=>'Text Search'), $cgi->reset,
             $cgi->end_form(),
          ),

 $cgi->div(
            {class=>"bordermargin"},
            $cgi->p("Generate some <a href=\"proteinfamilies.cgi?statistics=all\" target=\"window$$\">statistics on all the protein families"),
    ),
                                                                                          

 
 $cgi->end_form;
 return $html;
}

# CUT::             {class=>'bytext', style=>'border: solid 1px black; margin: 20px 20px; padding: 20px 20px'},


# "<p>Alternately, you can enter a family. Please enter a family name in the format pir|PIRSF001547 or fig|PF002363.</p>",
# "Please enter a family id:  ", $cgi->textfield(-name=>"family", -size=>40), "<br>",
# $cgi->submit, $cgi->reset, $cgi->end_form;

sub show_family {
 my ($fig,$cgi,$html)=@_;
 my $fam=$cgi->param('family');
 
 # proteins table
 my $tab1=[];
 my $colhdr1=['#', 'Protein', 'Other proteins with same amino acid sequence'];
 
 # families table
 my $tab2=[];
 my $colhdr2=["Source", "Family Function", "Proteins<br />in both<br />families", "Combine with"];
 
 my $count=1;
 my $otherfamilies;

 # first, generate the list of identical proteins
 foreach my $extid (sort {$a cmp $b} $fig->ext_ids_in_family($fam)) 
 {
  my $cid=$fig->prot_to_cid($extid);
  foreach my $newfamily ($fig->in_family($cid)) {$otherfamilies->{$newfamily}++}
  my @pegs=map {&protein_link($_)} grep {$_ ne $extid} $fig->cid_to_prots($cid);
  push @$tab1, [$count, &protein_link($extid, 1), (join ", ", (@pegs))];
  $count++;
 }

 # next, generate the list of families that these are also in
 if ($cgi->param('comparepairs'))
 {
  &comparepairs($fig, $cgi, $html);
 }
 
 foreach my $newfam (sort {&source_for_family($a) cmp &source_for_family($b) || $otherfamilies->{$b} <=> $otherfamilies->{$a}} keys %$otherfamilies)   
 {
    next if ($newfam eq $fam);
    my $func=$fig->family_function($newfam);
    $func or ($func=$newfam);
    my $link="<a href=\'proteinfamilies.cgi?family=$newfam&user=$user\'>$func</a>";
    my $compare=<<EOF;
<a href="proteinfamilies.cgi?user=$user&family1=$fam&family2=$newfam&diff=1and2&differentiate=Compare+these+families" target="window_$$">And</a> | 
<a href="proteinfamilies.cgi?user=$user&family1=$fam&family2=$newfam&diff=1or2&differentiate=Compare+these+families" target="window_$$">Or</a> | 
<a href="proteinfamilies.cgi?user=$user&family1=$fam&family2=$newfam&diff=1not2&differentiate=Compare+these+families" target="window_$$">Not</a>

EOF
    push @$tab2, [&source_for_family($newfam), $link, $otherfamilies->{$newfam}, $compare];
 }
 push @$html, (
    $cgi->div(
        {class=>"bordermargin"},
        $cgi->p("The family $fam has the function <em>", $fig->family_function($fam), "</em>, and contains ", $fig->sz_family($fam), " proteins.",
        "Each of these proteins are present in other databases, and we have cross mapped them"),
        $cgi->p("The table below shows you the other families that proteins in $fam are also in, and the number of other proteins.",
        "Click on the family name to see that protein, or on the compare link to compare the two families",),
        &HTML::make_table($colhdr2, $tab2, "Other families that these proteins are in", (class=>'white')),
        $cgi->h4("<a href=\"proteinfamilies.cgi?family=$fam&comparepairs=1&user=$user\" target=\"window$$\">Compare this family to these other protein families.</a> This is a more detailed comparison that will show total number of proteins for and against, but is much slower to calculate."),
    ),
  
    $cgi->div(
        {class=>"bordermargin"},
        $cgi->p("This table shows you the ID of the identical proteins in other databases</p>\n",
        "<p>The links will take you to the respective databases for each of the other protein families.\n</p>",),
        &HTML::make_table($colhdr1, $tab1, "Proteins in " . $fig->family_function($fam) . " ($fam)", (class=>'white')),
    ),
    );
}

sub show_protein {
 my ($fig,$cgi,$html)=@_;
 foreach my $peg ($cgi->param('prot')) {
  my @families;
  if ($peg =~ /^\d+$/) 
  {
   # it is a cid
   @families=$fig->in_family($peg);
   $peg = "CID $peg";
  }
  else 
  {
   @families=$fig->families_for_protein($peg);
  }

  unless (@families) 
  {
   push @$html, "<h2 style='color: red'>Sorry, $peg is not in any protein families</h2>";
   return;
  }
 
  my $tab=[];
  my $self=$cgi->url(-relative => 1);
  foreach my $fam (@families) {
   my %idcount;
   my $noprots=scalar(map {$idcount{$_}=1} $fig->ids_in_family($fam));
   #push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $fig->sz_family($fam), $cgi->checkbox(-name=>$fam, -label=>'')];
   push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $noprots, $cgi->checkbox(-name=>$fam, -label=>'')];
  }
 
  my $col_hdrs=['Family ID', 'Family Function', 'Number of CIDs in Family', 'Choose Family'];
  push @$html, "<h2>Families for $peg</h2>\n", 
  $cgi->start_form(-method=>'get'),
  "<p>$peg is in the following ", scalar(@families), " families. Please choose one or more families using the checkboxes</p>\n",
  "A CID is a unique, internal ID we have assigned to proteins with identical sequences",
  &HTML::make_table($col_hdrs, $tab, "Families for $peg", (class=>'white')),  "\n",
  $cgi->submit('Show Proteins In Each Family'), 
  $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
  $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
  $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>"allfams", -value=>\@families), "\n",
  $cgi->reset, $cgi->end_form;
 }
}



sub set_of_equivs {
 ($fig, $cgi, $html)=@_;
 my $peg=$cgi->param('prot');
 my $tab=[];
 my $allfams;
  
 #begin the html so we can add hidden things 
 my $genusspecies=$fig->genus_species($fig->genome_of($peg));
 push @$html, (
 	$cgi->start_form(-method=>'get'),  "\n",
	$cgi->p("<h1>Protein <b>$peg</b>: $genusspecies</h1>"), "\n",
        "<div class=\"bordermargin\">",
 	$cgi->hidden(-name=>'querycid', -value=>$fig->prot_to_cid($peg)), "\n",
        $cgi->p("This table shows all the families that contain a protein with the same sequence as <b>$peg</b>, although each family uses different IDs. The number of different IDs in each protein family and the number of unique protein sequences in each family are also shown. The latter is often less than the former since many databases contain identical proteins with different identifiers (for example the same protein from different <i>$genusspecies</i> genome sequences)."),
       );
 
   foreach my $fam ($fig->families_for_protein($peg))
   {
   	my $ffunc=$fig->family_function($fam) || " &nbsp; ";
	push @$tab, [
		    	&protein_link($peg), 
		    	scalar($fig->function_of($peg)), 
	 	    	"<a href=\'proteinfamilies.cgi?family=$fam&user=$user\'>$fam</a>", 
		    	$ffunc, 
		    	$fig->ext_sz_family($fam),
		    	$fig->sz_family($fam),
		    ];
	#$allfams->{$fam}="$fam :" . $fig->family_function($fam);
        $allfams->{$fam}=substr("$fam : " . $fig->family_function($fam), 0, 100);
   }

  
  $tab=&HTML::merge_table_rows($tab, {3=>1, 4=>1});
  
  my $radbut={
   "1not2"=>"In family one and NOT family two\n",
   "2not1"=>"In family two and NOT family one\n",
   "1and2"=>"In family one and family two\n",
   "1or2" =>"In family one or  family two\n",
   };
  
  # sort the list of families in this table but put the fig families at the beginning of the list
  my @familylist=sort {$a cmp $b} grep {$_ !~ /^fig/} keys %$allfams;
  unshift @familylist, sort {$a cmp $b} grep {$_ =~ /^fig/} keys %$allfams;
  
  my $col_hdrs=['Protein', 'Function', 'Family', 'Family Function', 'External<br>IDs In<br>Family', 'Unique<br>Proteins<br>In Family'];
  push @$html, &HTML::make_table($col_hdrs, $tab, "", (class=>'white')),  "\n", $cgi->p("To differentiate families in this table, please choose two families:"),
  "Family 1: &nbsp; ", $cgi->popup_menu(-name=>"family1", -values=>\@familylist, -labels=>$allfams, -default=>$familylist[0]),
  " <br /> Family 2: &nbsp; ", $cgi->popup_menu(-name=>"family2", -values=>\@familylist, -labels=>$allfams, -default=>$familylist[1]),
  $cgi->p("Show proteins:<br /><ul>\n", 
  $cgi->radio_group(-name=>"diff", -values=>[keys %$radbut], -labels=>$radbut, -rows=>4),
  "</ul>\n"),
  $cgi->p($cgi->checkbox(-name=>"show", -value=>"all", -label=>""), " &nbsp; Show all proteins in the two families (otherwise, just show candidate family members that have different function)."),
  $cgi->hidden(-name=>'prot', -value=>$peg),
  $cgi->submit(-name=>"differentiate", -value=>"Compare these families"), $cgi->reset, $cgi->end_form(), "</div>";
}

  # initially I added this option in, with appropriate help text, but then I added the union option, and I think that surplants this, so I removed it!
  # $cgi->a({class=>"help", onMouseover=>"javascript:if(!this.tooltip) this.tooltip=new Popup_Tooltip(this, 'Help', 'The table will show only those proteins from one of the families that contains proteins that could be in the other family but are not. Most likely this will only be a few proteins, but the table above shows that there are a large number of proteins in one family but not another. Many of these are proteins that are missing from the database. Checking this box will show all the proteins from the chosen families, not just those with missing proteins. By default, you should not check this box.', ''); this.tooltip.addHandler(); return false;", href=>"Html/ProteinFamilies.html"}, "Help"),
  # "Show all correspondences, not just those with missing proteins: ", $cgi->checkbox(-name=>"show", -value=>"all", -label=>""),


sub differentiate {
 ($fig, $cgi, $html)=@_;
 
 #my $focus=$cgi->param('focus') or 'all'; # these are the things that we are interested in
 #undef $focus if ($focus eq "all");
 my $focus=$cgi->param('family2');
 $focus =~ s/\|.*//;
  
 my ($fam_id1, $fam_id2)=($cgi->param('family1'), $cgi->param('family2'));
 if ($fam_id1 eq $fam_id2) 
 {
  push @$html, "<h2 style='color: red'>Please choose two different protein families</h2>";
  return;
 }
 my ($fam1, $fam2)=([$fig->ids_in_family($fam_id1)], [$fig->ids_in_family($fam_id2)]);
 
 # This block of code is the one that really decides what the data is that is analyzed. 
 # $fam1 and $fam2 are references to arrays containing all the CIDs in each family. We combine them with intersection, union, or differences
 # and then process the remaining IDs.
 
 my $set=[];
 if ($cgi->param("diff") eq "1and2")
 {
 	$set=&set_utilities::intersection($fam1, $fam2);
 }
 elsif ($cgi->param("diff") eq "1not2")
 {
 	$set=&set_utilities::set_diff($fam1, $fam2);
 }
 elsif ($cgi->param("diff") eq "2not1")
 {
	$set=&set_utilities::set_diff($fam2, $fam1);
 }
 elsif ($cgi->param("diff") eq "1or2")
 {
 	$set=&set_utilities::union($fam1, $fam2);
 }

 #### which families
 #
 # We need to figure out which other families to compare to (i.e. what families will be in the columns on the table).
 # There are two ways to do this. If we come from the perspective of a single protein, we just want to look at the families
 # that protein maps to. This is the regular view.
 # 
 # Alternatively, if we come from the perspective of the whole family, we will just work out all the families
 # that are present in @$set
 #
 # Note that in both cases we grep out our two query families and then unshift them to make them the first two query columns
 
 my @families; my $peg;
 if ($cgi->param('prot'))
 {
  $peg=$cgi->param('prot');
  @families=sort {$a cmp $b} grep {$_ ne $fam_id1} grep {$_ ne $fam_id2} $fig->families_for_protein($peg);
 }
 else 
 {
  my %allfams;
  foreach my $cid (@$set)
  {
   foreach my $infamily ($fig->in_family($cid))
   {
    $allfams{$infamily}++;
   }
  }
  
  # limit based on how many proteins are in each
  # we want to eliminate some families that only have one protein
  my $lz=0;
  $cgi->param('limitsz') && ($lz=$cgi->param('limitsz'));
  @families=sort {$allfams{$b} <=> $allfams{$a}} grep {$allfams{$_}>$lz} grep {$_ ne $fam_id1} grep {$_ ne $fam_id2} keys %allfams;
 }

 unshift @families, ($fam_id1, $fam_id2);
 my @source=@families;
 map {/^(.*?)\|/; $_=$1} @source;
 
 # now figure out all the external IDs in those families
 my $extids;
 foreach my $fam (@families)
 {
  map {$extids->{$_}->{$fam}=1} $fig->ext_ids_in_family($fam);
 }

 # finally generate the table. Note that there are three different arrays that we operate on depending on the user input
 # but it really only changes which set algorith we use. Each array is handled identically.
 my $tab=[];
 my $firstrow=[]; # this will hold the first row with the peg we selected.
 # add the first line of the table, with a nice colored background, that shows what the original protein is.
 if ($peg)
 {
    my $seen;
    foreach my $prot ($fig->cid_to_prots($fig->prot_to_cid($peg)))
    {
        for (my $i=0; $i<=$#families; $i++)
        {
            if ($extids->{$prot}->{$families[$i]} && !($seen->{$prot}->{$i}))
            {
                if ($i == 0) { $firstrow->[$i] .= &protein_link($prot, 1, $families[$i]) . "<br />" if ($peg eq $prot)}
                else {$firstrow->[$i] .= &protein_link($prot, 1, $families[$i]) . "<br />"}
                $seen->{$prot}->{$i}=1;
            }
        }
    }
    unshift @$firstrow, "Your<br>peg";
    map {$firstrow->[$_] = [$firstrow->[$_], "td style='background-color: #FF9966'"]} (0 .. $#$firstrow);
    
 } 
        
 

 my ($totalfor, $totalagainst);
 foreach my $cid (@$set)
 {
  #my $row=["<a href='proteinfamilies.cgi?prot=$cid'>$cid</a>"];
  my $row=[];
  my $seen; my $mismatchcolor; my $nofamcolor;
  my ($for, $against)=(0,0);
  foreach my $prot (sort $fig->cid_to_prots($cid)) 
  {
   for (my $i=0; $i<=$#families; $i++)
   {
    # add the protein info to the cell in the table if it this family has that protein. Note that we have seen it, and increment the column counter
    # this if is if the protein that we are looking at is in the family for this column then add it

    # if the protein is not added, we want to know if it has the same start characters as the family (i.e. from the same source), and note that.
    if ($extids->{$prot}->{$families[$i]})
    {
    	$seen->{$prot}=1; 
	$row->[$i] .= &protein_link($prot, 1, $families[$i]) . "<br />";
	$for++;
    }
    elsif ($prot =~ /^$source[$i]/) 
    {
    	if ($fig->ext_in_family($prot)) {$mismatchcolor->{$i}=1} else {$nofamcolor->{$i}=1}
	$row->[$i] .= &protein_link($prot, 1, $families[$i]) . "<br />";
	$against--; # note that against is a negative score!
    }
   }
  }
  
  unless ($#$row == $#families) {$#$row=$#families}
   
  # color those cells that have a mismatch. note that this colors the whole cell even if there is more than one protein mismatching 
  map {$row->[$_] = [$row->[$_], "td style='background-color: #CCCCFF'"]} keys %$nofamcolor;
  map {$row->[$_] = [$row->[$_], "td style='background-color: #FF3366'"]} keys %$mismatchcolor;
  # change empty cells
  map {$row->[$_] = " &nbsp; " unless ($row->[$_])} (0 .. $#$row); 
  
  # add the score
  splice(@$row, 0, 0, "$for/$against");
  if ($for == 0 && $against == 0) {push @$html, $cgi->p({class=>"diagnostic"}, "Cant map $cid\n")}
  
  
  # if we want to show everything do so, otherwise only show the rows where there is a missing protein
  if (($cgi->param("diff") eq "1and2") || ($cgi->param("diff") eq "1or2") || ($cgi->param('show') eq "all"))
  {
   push @$tab, $row;
   $totalfor+=$for; $totalagainst+=$against;
  }
  elsif ($cgi->param("diff") eq "1not2" && $row->[2] ne " &nbsp; ")
  {
   push @$tab, $row;
   $totalfor+=$for; $totalagainst+=$against;
  }
  elsif ($cgi->param("diff") eq "2not1" && $row->[1] ne " &nbsp; ")
  {
   #($row->[1], $row->[2])=($row->[2], $row->[1]);
   push @$tab, $row;
   $totalfor+=$for; $totalagainst+=$against;
  }
 } 

 # sort the table
 @$tab=sort {$a->[1] cmp $b->[1] || $a->[2] cmp $b->[2]} @$tab;
 $firstrow && (unshift @$tab, $firstrow);

 #generate the titles
 my $title;
 ($cgi->param("diff") eq "1and2") ? $title.=$fig->family_function($fam_id1). " ($fam_id1) AND in " . $fig->family_function($fam_id2). " ($fam_id2)\n" :
 ($cgi->param("diff") eq "1or2") ? $title.=$fig->family_function($fam_id1). " ($fam_id1) OR in " . $fig->family_function($fam_id2). " ($fam_id2)\n" :
 ($cgi->param("diff") eq "1not2") ? $title.=$fig->family_function($fam_id1). " ($fam_id1) BUT NOT in " . $fig->family_function($fam_id2). " ($fam_id2)\n" :
 ($cgi->param("diff") eq "2not1") ? $title.=$fig->family_function($fam_id2). " ($fam_id2) BUT NOT in " . $fig->family_function($fam_id1). " ($fam_id1)\n" : 1;
 
 push @$html, (
    "<div class=\"bordermargin\">\n<h3 style='text-align: center'>Comparison of proteins in $title</h3>\n",
    "<p>The table shows those proteins that are in $title. The rows show proteins with the same sequence. All the IDs along the row are different IDs from identical protein sequences. The columns of the table are different protein families. If a protein in an individual cell has a red background, that protein is either not in the family for that column, or is in that family and another family in the same database.</p>\n",
    "<p>To find out which families each protein is in, scroll the mouse over the link to the protein database. The popup window will show you the protein families for each protein. Note, however, if this protein is in the family for that column only that family is currently shown. If the pop-up window has a green background the protein is in the same family as the column as a whole. If the pop-up window has a red background the protein is in a different family than the column as a whole. If the pop-up window has a blue background the protein is not in any protein families. The red background and the red cell color and the blue background and blue cell color are complimentary and reinforce that these are proteins you should look at.</p>\n",
    );
 

 my @headers=@families;
 map {$_ = "<a " . FIGjs::mouseover("Column Family", $fig->family_function($_) . " ($_)", '') . " href='proteinfamilies.cgi?family=$_'>$_</a>"} @headers;
 splice(@headers, 0, 0, "Score");
 if (($firstrow && $tab->[1]) || (!($firstrow) && $tab->[0]))
 {
  push @$html, $cgi->start_form(), $cgi->p("You can also eliminate columns from this table based on how many proteins that family has correctly grouped.", $cgi->br,
  " Set the minimum number of correct proteins and click update to see the revised table: ", $cgi->textfield(-name=>"limitsz", -size=>3, -default=>2),
  $cgi->hidden(-name=>"prot"), $cgi->hidden(-name=>"user"), $cgi->hidden(-name=>"family1"), $cgi->hidden(-name=>"family2"), 
  $cgi->hidden(-name=>"diff"), $cgi->hidden(-name=>"differentiate"),
  $cgi->submit(-value=>"Update"), $cgi->reset, $cgi->end_form,),
  HTML::make_table(\@headers, $tab, "Proteins In $title", (class=>'white')),
  "<p><b>Total Score: </b><br>\nSupporting this comparison between families: <b>$totalfor</b><br>\n",
               "Not supporting this comparison between families: <b>$totalagainst</b></p>\n</div>\n";
	       
  
 }
 else
 {
  my $sorry="<p>Sorry there were no protein families that satisfied looking for</p>\n<p>$title</p>";
  if (($cgi->param("diff") eq "1not2") || ($cgi->param("diff") eq "2not1")) {$sorry .= "<p>and had candidate proteins that could be in those families</p>"}
  push @$html, $cgi->h3("<div style='color: blue; text-align: center'>$sorry</span>\n</div>");
 }
}

=head2 protein_link()

This takes a protein ID and returns the link (full link including ID) back to the appropriate database.

If addmouseover is true then a mouseover will be added showing the families the peg is in

If $fam is provided it will be the header of the mouseover popup.

=cut

sub protein_link {
 my ($p, $addmouseover, $fam) =@_;
 my %proteinbase=(
  # put here the link to the site for the proteins. 
  # Note the use of single quotes. Later, anywhere where the protein ID should be I'll replace \$p with the protein id.
  # This is because for some reason cog needs the protein ID twice.
  "fig" 	=> "protein.cgi?user=$user&prot=fig|".'$p',
  "cog"  => 'http://www.ncbi.nlm.nih.gov/COG/old/blux.cgi?cog=$p&$p',
  "sp"   => 'http://www.expasy.org/uniprot/$p',
  "tr"   => 'http://www.expasy.org/uniprot/$p',
  "kegg" => 'http://www.genome.jp/dbget-bin/www_bget?$p',
  "pfam" => 'http://pfam.wustl.edu/cgi-bin/getswisspfam?key=$p',
  "pir"  => 'http://pir.georgetown.edu/cgi-bin/ipcEntry?id=$p',
 );

# this was the old link that I think is wrong
#  "cog"  => "http://www.ncbi.nlm.nih.gov/COG/old/palox.cgi?",

 my $mouseovertitle="Protein Families";
 if ($fam)
 {
  	$mouseovertitle="<i>Column family: " . $fig->family_function($fam) . " ($fam)</i>";
 }
 my $familiesforp = "<b>Families for $p from ".$fig->genus_species($fig->genome_of($p))."</b><br>";
 my ($hcolor, $bgcolor)=('#11AA66','#BBFFBB'); # text background color.
 
 # if the protein is in our family of interest show just that, otherwise show all the families
 my @thisfam=$fig->ext_in_family($p);
 if (grep {$_ eq $fam} @thisfam)
 {
  	$familiesforp .= $fig->family_function($fam) . " ($fam)";
 }
 elsif (scalar(@thisfam))
 {
  	$familiesforp .= "<ul>" . join("", map {"<li> " . $fig->family_function($_) . " ($_)</li>"} @thisfam) . "</ul>";
        if (!$fam) {($hcolor, $bgcolor)=('','')} # use the default colors 
	else {($hcolor, $bgcolor)=('#CC0000', '#FF3366')} # we're doing a comparison and the families are different so color them red
 }
 else
 {
     	$familiesforp="<b>Families for $p:</b><br>No protein families";
	($hcolor, $bgcolor)=('','');
 }
 
 foreach my $key (keys %proteinbase)
 {
 	if ($p =~ /^$key/ && $addmouseover)
	{
		$p =~ s/^(.*?)\|//; 
		my $dbase=$1;
		my $location=$proteinbase{$key};
		$location =~ s/\$p/$p/g; # this is for stupid cog that requires two invocations of the protein ID
		$p = "<a " . FIGjs::mouseover($mouseovertitle, $familiesforp, '', '1', $hcolor, $bgcolor) . " href='$location' target='window_$$'>$dbase|$p</a>";
	}
	elsif ($p =~ /^$key/)
	{
		$p =~ s/^(.*?)\|//;
		my $dbase=$1;
		my $location=$proteinbase{$key};
		$location =~ s/\$p/$p/g; # this is for stupid cog that requires two invocations of the protein ID
		$p = "<a href='$location' target='window_$$'>$dbase|$p</a>";
	}
 }
 return $p;
}


sub comparepairs{
 ($fig, $cgi, $html)=@_;
 # very very experimental. Don't look at this code or use it
 # we are going to take a protein family and then look at all the ids in that family and count the occurences of other families
 # then find the proteins and give them a score

 my $fam=$cgi->param("family");
 return unless ($fam);
 
 my $count;
 foreach my $cid ($fig->ids_in_family($fam))
 {
  foreach my $newfamily ($fig->in_family($cid))
  {
   $count->{$newfamily}++;
  }
 }
 
 my $tab;
 foreach my $test (keys %$count)
 {
  next if ($test eq $fam);
  my $res=&score_two_families($fam, $test, "or");
  my $f1=($fig->family_function($fam)) ? $fig->family_function($fam) : $fam;
  my $f2=($fig->family_function($test)) ? $fig->family_function($test) : $test;
  
  my $link1="<a href=\"proteinfamilies.cgi?user=$user&family1=$fam&family2=$test&diff=2not1&differentiate=Compare+these+families\" target=\"window_$$\">$f1</a>";
  my $link2="<a href=\"proteinfamilies.cgi?user=$user&family=$test\" target=\"window_$$\">$f2</a>";
  push @$tab, [$link1, $link2, $count->{$test}, @$res];
 }

 @$tab=sort {$b->[2] <=> $a->[2] || $b->[3] <=> $a->[3] || $a->[4] <=> $b->[4]} @$tab;

 
 push @$html, ( 
    $cgi->div(
        {class=>"bordermargin"},
        $cgi->h3("Comparisons between protein families"),
        $cgi->p("The table below shows a comparison of <b>", $fig->family_function($fam), " ($fam)</b> with all the other protein families ",
        " that proteins in $fam are also present in. The link in the first column will take you to a table showing proteins that are in family two ",
        " that are not in $fam. These are proteins that you should investigate for being in your family. ",
        " The link in the second column will take you to that families page. The <b>frequency</b> is the number of proteins in $fam that are ",
        " also in the second family.",
        " The <b>For</b> score is the total number of proteins that agree both families are similar. The <b>Against</b> score is the total number of proteins ",
        " that disagree that the two proteins are similar. These numbers are the sum of the for and against scores shown under the side-by-side comparison ",
        " when you click the link in column 1."),
        HTML::make_table(["Family 1", "Family 2", "Frequency", "For", "Against"], $tab, "Scores", (class=>'white')),
 ));
}

sub score_two_families {
 my ($fam_id1, $fam_id2, $method)=@_;
 my $focus=$fam_id1;
 $focus =~ s/\|.*//;
 
 return unless ($fam_id1 && $fam_id2);
 my ($fam1, $fam2)=([$fig->ids_in_family($fam_id1)], [$fig->ids_in_family($fam_id2)]);
	 
 my $set=[];
 if ($method eq "and")     {$set=&set_utilities::intersection($fam1, $fam2)}
 elsif ($method eq "not")  {$set=&set_utilities::set_diff($fam1, $fam2)}
 elsif ($method eq "or")   {$set=&set_utilities::union($fam1, $fam2)}

 # now figure out all the families represented by @$set
 my %allfams;
 foreach my $cid (@$set)
 {
  foreach my $infamily ($fig->in_family($cid))
  {
   $allfams{$infamily}++;
  }
 }

 my @families=sort {$allfams{$b} <=> $allfams{$a}} grep {$_ ne $fam_id1} grep {$_ ne $fam_id2} keys %allfams;
 unshift @families, ($fam_id1, $fam_id2);
 my @source=@families;
 map {/^(.*?)\|/; $_=$1} @source;
 
 # now figure out all the external IDs in those families
 my $extids;
 foreach my $fam (@families) {map {$extids->{$_}->{$fam}=1} $fig->ext_ids_in_family($fam)}

 my ($totalfor, $totalagainst);
 foreach my $cid (@$set)
 {
  my ($for, $against)=(0,0);
  foreach my $prot (sort $fig->cid_to_prots($cid)) 
  {
   for (my $i=0; $i<=$#families; $i++)
   {
    if ($extids->{$prot}->{$families[$i]})
    {
	$for++;
    }
    elsif ($prot =~ /^$source[$i]/) 
    {
	$against--; # note that against is a negative score!
    }
   }
  }
  
  $totalfor+=$for; $totalagainst+=$against;
 }
 return [$totalfor, $totalagainst];
}


sub show_sources {
 my $source=$cgi->param('bysource');
 # this is the display name and so we need to reverse map it to the source from $fams
 my $fams=&sources();
 my $extrahtml;
 map {$source=$_ if ($source eq $fams->{$_})} keys %$fams;
 # now source will be something like fig, kegg, mcl, etc
 # get all families
 my @families = $fig->families_by_source($source);
 my $label;
 map {$label->{$_}=substr($fig->family_function($_), 0, 100); unless ($label->{$_}) {$label->{$_}=$_}} @families;
 @families=sort {$label->{$a} cmp $label->{$b}} @families;
 # limit the families to some text
 if ($cgi->param('limitfamiliesto'))
 {
  my $text=$cgi->param('limitfamiliesto');
  @families=grep{$label->{$_}=~/$text/i} @families;
 }

 if ($source eq "pir") 
 {
  my $full='full';
  if ($cgi->param('pirsflimit')) {$full=$cgi->param('pirsflimit')} else {$cgi->param('pirsflimit', $full)}
  if ($full ne "No Limit")
  {
   print STDERR "Looking for  $full\n";
   @families=grep{$label->{$_}=~/^\($full/i} @families;
  }
  my $pirsflimits={full=>"Full", prelim=>"Preliminary", 'NoLimit'=>'No Limits', none=>"None of these"};
  $extrahtml.=$cgi->p("Limit to PIR SF: ", $cgi->popup_menu(-name=>"pirsflimit", -values=>[keys %$pirsflimits], -labels=>$pirsflimits));
 }
 
 my $error=""; 
 if (scalar(@families) > 1000)
 {
    $error .= "<p><b>Warning:</b> This list has been limited to 1,000 families so that it displays easily.<br>\n";
    $error .= "Please use the text search function below to find the families your are interested in.</p>";
    @families=splice(@families, 0, 1000);
 }
 my $sl=$cgi->scrolling_list(-name=>'family', -values=>\@families, -labels=>$label, -size=>10, -multiple=>0);
 push @$html,
    $cgi->start_form(-method=>'get'), "\n",
    $cgi->div({class=>'bordermargin'}, "<h2>These are ", scalar(@families), " protein families from the ", $fams->{$source}, " databse</h2>\n",
    $error,
    "Please choose a protein family from the list and then you will be shown the proteins in that family</p>\n",
    $sl,
    $cgi->p("You can limit this table to some text: ", $cgi->textfield(-name=>"limitfamiliesto", -value=>"", -size=>20)),
    $extrahtml,
    $cgi->hidden(-name=>'bysource', -value=>$fams->{$source}),
    $cgi->p($cgi->submit(-name=>'bysource', -value=>"Rebuild table"), $cgi->submit('submit', 'Show Family'), $cgi->reset()),
 ), 
 $cgi->end_form;
 return $html;
}


sub sources {
    my ($fam)=@_;
    my %choices=(
            "aclame"	=> "Aclame",
            "fig"	=> "FIGfams",
            "tigr"	=> "TIGRfams",
            "pfam"	=> "PFAM",
            "sp"	=> "Prosite",
            "kegg"	=> "KEGG",
            "pir"	=> "PIR SuperFams",
            "mcl"	=> "MCL",
            "cog"	=> "COG",
            );
    if ($fam) {return $choices{$fam}}
    else {return \%choices}
}

sub source_url {
    my $fam=shift;
    my %urls=(
            "aclame"      => "http://aclame.ulb.ac.be/",
            "fig"         => "proteinfamilies.cgi",
            "tigr"        => "http://www.tigr.org/TIGRFAMs/",
            "pfam"        => "http://pfam.wustl.edu/",
            "sp"          => "http://www.expasy.org/prosite/",
            "kegg"        => "http://www.genome.ad.jp/kegg/",
            "pir"         => "http://pir.georgetown.edu/",
            "mcl"         => "http://www.cbil.upenn.edu/gene-family/",
            "cog"         => "http://www.ncbi.nlm.nih.gov/COG/",
            );
    return $urls{$fam};
}

sub source_for_family {
 my $fam=shift;
 return unless ($fam);
 $fam =~ /^(.*?)\|/;
 return &sources($1) if ($1);
}


sub freetextsearch {
    ($fig, $cgi, $html)=@_;
    
   my $text=$cgi->param("freetext");
   my @tab=map {[&source_for_family($_), "<a href=\"proteinfamilies.cgi?family=$_\" target=\"window$$\">" . $fig->family_function($_) . "</a>", $fig->sz_family($_)]} 
               sort {&source_for_family($a) cmp &source_for_family($b) || $fig->sz_family($b) <=> $fig->sz_family($a)} $fig->family_by_function($text);
   push @$html,
    $cgi->div(
      {class=>"bordermargin"},
      $cgi->p("These are the protein families that have <b>$text</b> somewhere in their name. Please click on one of the families to see more information about them"),
      &HTML::make_table(["Source", "Family", "Size"], \@tab, "", (class=>'white')),
    );
    return $html;
}


sub statistics {
    ($fig, $cgi, $html)=@_;

    my $sources=&sources(undef);
    my @sources;
    my $need=$cgi->param("statistics");
    (lc($need) eq "all") ? eval{@sources=sort {$a cmp $b} keys %$sources} : eval{@sources=grep {/$need/i} sort {$a cmp $b} keys %$sources};
     
    my $tab; 
    my $totalprots=$fig->number_of_cids();
    foreach my $s (@sources)
    {
     my $numberoffams=$fig->number_of_families($s);
     my $numberofprots=$fig->number_of_proteins_in_families($s);
     my $numberofuniqueprots=$fig->number_of_proteins_in_families($s, 1);
     push @$tab, 
        [
            "<a href=\"".&source_url($s)."\">".$sources->{$s}."</a>",
            &commify($numberoffams),
            &commify($numberofprots),
            &commify($numberofuniqueprots),
            int($numberofprots/$numberoffams * 10)/10,
            int($numberofuniqueprots/$totalprots * 1000)/1000,
        ];
     }
     
    my $headers=["Source", "Number of families", "Total number<br />of proteins<br />in families", "Number of unique<br />protein ids<br />in families", "Proteins per family", "Fraction"]; 
 
    push @$html,
        $cgi->h2("Protein Family Statistics"),
        $cgi->div(
              {class=>"bordermargin"},
              $cgi->p("There are ", &commify($totalprots), " unique proteins in the database."),
              $cgi->p("This table shows the coverage of proteins in the FIG protein family database from each of the sources that we have used"),
              &HTML::make_table($headers, $tab, "", (class=>'white')),
              $cgi->p("The difference between the two columns is because some proteins can occur in more than one family. In the first column they are counted each time they appear, and in the second they are counted only once regardless of how many times they appear. Proteins per family is the total number of proteins in families divided by the number of families. The fraction is the number of unique proteins divided by the total number of unique CIDs in the database. (CIDs are our internal IDs and there are currently $totalprots CIDs). This fraction is not exactly right since two different proteins can have the same CID if they have identical sequences, however it is a close approximation."),
        );
    return $html;
}


# put commas in numbers. This is taken directly from the perl cookbook
sub commify {
    my $text = reverse $_[0];
    $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
    return scalar reverse $text;
}




###################### SUGGEST MORE PROTEINS
# 
# The aim is to be able to provide a link from the subsystem page to here, saying suggest proteins that should be in this family, but which are not.
# Then annotators can come to this page and see proteins/genomes that they should extend into the ss.
#
# We need to give evidence, current assignments/roles/locations in other spreadsheets, and perhaps a score based on that and the number of votes.
#


sub suggest_more_proteins {
    my ($fig,$cgi,$html)=@_;
 
    # families table
    my $tab=[];
 
    my $count=1;
    my $otherfamilies;
    
    my $fam=$cgi->param('family');
    $fam =~ /^(.*?)\|/;
    my $source=$1;
    
    # first, generate the list of identical proteins and find what families they are in
    foreach my $cid ($fig->ids_in_family($fam))
    {
        foreach my $newfamily ($fig->in_family($cid)) {$otherfamilies->{$newfamily}++}
    }

    # members of %$otherfamilies should also be members of $fam. We just need to map backward
    push @$html, $cgi->div({class=>"bordermargin"}, $cgi->p("Proteins in $fam are also in: "),
        $cgi->ol("<li>", join("</li>\n<li>", sort {$a cmp $b} keys %$otherfamilies), "</li>\n"));

    my $same; my $diff;  my $conflicting;
    foreach my $nf (keys %$otherfamilies)
    {
        foreach my $cid ($fig->ids_in_family($nf))
        {
            foreach my $extid ($fig->cid_to_prots($cid))
            {
                next if (!($extid =~ /^$source/) || $same->{$extid});
                if (!$fig->ext_in_family($extid)) {$diff->{$extid}->{$nf}++; $conflicting->{$extid}->{undef}++; next}
                foreach my $nef ($fig->ext_in_family($extid))
                {
                    if ($fam eq $nef) {$same->{$extid}=1; delete $diff->{$extid}; last}
                    else {$diff->{$extid}->{$nf}++; $conflicting->{$extid}->{$nef}++}
                }
            }
        }
    }

    
    unless (keys %$diff) 
    {
        my $sorry="Sorry. There are no proteins in the families associated with $fam that are not in $fam already!";
        push @$html, $cgi->h3("<div class='bordermargin' style='color: blue; text-align: center'>$sorry</span>\n</div>");
        return;
    }
    
    foreach my $id (sort {$fig->genus_species($fig->genome_of($a)) cmp $fig->genus_species($fig->genome_of($b))} keys %$diff)
    {
        my $row=[];
        push @$row, $fig->genus_species($fig->genome_of($id)), "<a href='protein.cgi?prot=$id&user=$user' target='new'>".scalar($fig->function_of($id))."</a>";
        my @arr=map {
            $_=$_->[0]; 
            my $l=$_; 
            s/_/ /g; 
            $_="<a href='subsys.cgi?request=show_ssa&ssa_name=$l&user=$user' target='new'>$_</a>"
        } $fig->subsystems_for_peg($id);
        if (@arr) {push @$row, join("<br />\n", @arr)} else {push @$row, " &nbsp; "}
        push @$row, join("<br />\n", keys %{$diff->{$id}});
        push @$row, join("<br />\n", keys %{$conflicting->{$id}});
        push @$tab, $row;
    }

    my $colhdr=["Genome", "Current Role", "Current Subsystems", "Supporting families", "Conflicting Families"];
    
    
    
    push @$html, $cgi->div({class=>"bordermargin"}, $cgi->p("These are the IDs that could be in this family but are not"),
     &HTML::make_table($colhdr, $tab, "", (class=>'white')));

        
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3