[Bio] / FigWebServices / proteinfamilies.cgi Repository:
ViewVC logotype

View of /FigWebServices/proteinfamilies.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.24 - (download) (annotate)
Tue Oct 25 14:54:26 2005 UTC (14 years, 5 months ago) by overbeek
Branch: MAIN
Changes since 1.23: +1 -3 lines
pf was right, forgot to load ext ids! Correcting again

# -*- perl -*-

=pod

=head1 proteinfamilies.cgi

A base web interface for getting information about protein families in and out. Initially we are going to make a 3 (or 4) column table of protein, family and other proteins.

PLEASE NOTE: Do not attempt to read or understand this code. Please leave now. It is a complete mess because it is very experimental and we are trying stuff out. none of it will work. The exit is this way ---->

=cut

use strict;
use FIG;
use HTML;
use raelib;
use CGI;
use CGI::Carp qw(fatalsToBrowser);
my $cgi=new CGI;
use LWP::Simple qw(!head); # see the caveat in perldoc LWP about importing two head methods.

my $fig;
eval {
    $fig = new FIG;
};  

if ($@ ne "")
{
    my $err = $@;
    
    my(@html);
    
    push(@html, $cgi->p("Error connecting to SEED database."));
    if ($err =~ /Could not connect to DBI:.*could not connect to server/)
    {
        push(@html, $cgi->p("Could not connect to relational database of type $FIG_Config::dbms named $FIG_Config::db on port $FIG_Config::dbport."));
    }   
    else
    {
        push(@html, $cgi->pre($err));
    }   
    &HTML::show_page($cgi, \@html, 1);
    exit;
}   
    
my $html = [];
my $user = $cgi->param('user');

unshift(@$html, "<TITLE>The SEED - Global Protein Families </TITLE>\n");


if ($cgi->param('Show Proteins In Each Family')) 
{
 my @needed=grep {$cgi->param($_)} $cgi->param("allfams");
 $cgi->param(-name=>'family', -value=>\@needed);
 &show_family($fig,$cgi,$html);
}
elsif ($cgi->param('analyse_family')) {
 # finding what is there is the same as findingh what is missing you just need one extra !
 # these two things call the same method.
 &analyse_family($fig,$cgi,$html, 0);
}
elsif ($cgi->param('reverse_analyse_family')) {
 &analyse_family($fig,$cgi,$html, 1);
}
elsif ($cgi->param('equivalence'))
{
 &set_of_equivs($fig,$cgi,$html);
}
elsif ($cgi->param('differentiate'))
{
 push @$html, $cgi->h3({style=>"color: red"}, "Scroll down to see the table");
 &set_of_equivs($fig,$cgi,$html);
 &differentiate($fig,$cgi,$html);
}
elsif ($cgi->param('family'))
{
 &show_family($fig,$cgi,$html);
}
elsif ($cgi->param('prot')) 
{
 &show_protein($fig,$cgi,$html);
}
elsif ($cgi->param('fig2kegg')) 
{
 &comparefig2kegg($fig,$cgi,$html);
}
else
{
  &show_initial($fig,$cgi,$html);
}

&HTML::show_page($cgi,$html,1);
exit;


sub show_initial {
 my ($fig,$cgi,$html)=@_;
 # generate a blank page
 push @$html, 
 "<h2>Protein Families</h2>\n",
 "<p>Please enter a protein ID . You will recieve a list of all the families that protein is in. \n",
 "You can use a FIG ID such as fig|83333.1.peg.3, or an ID from SwissProt, KEGG, NCBI, and others.</p>",
 $cgi->start_form(-method=>'get'), 
 "Please enter a protein id: ", $cgi->textfield(-name=>"prot", -size=>40), "<br>", 
 $cgi->submit(-name=>'equivalence', -value=>"Show an equivalence table"), $cgi->reset, $cgi->end_form;
 return $html;
}

# "<p>Alternately, you can enter a family. Please enter a family name in the format pir|PIRSF001547 or fig|PF002363.</p>",
# "Please enter a family id:  ", $cgi->textfield(-name=>"family", -size=>40), "<br>",
# $cgi->submit, $cgi->reset, $cgi->end_form;

sub show_family {
 my ($fig,$cgi,$html)=@_;
 foreach my $fam ($cgi->param('family')) {
  my @cids=sort {$a <=> $b} $fig->ids_in_family($fam);
  my $tab=[];
  my $col_hdrs=['CID', 'Polypeptides with same amino acid sequence'];
  foreach my $cid (@cids) {
   my @pegs=$fig->cid_to_prots($cid);
   @pegs=map {&protein_link($_)} @pegs;
   push @$tab, [$cid, (join ", ", (@pegs))];
  }

  push @$html, "<h2>$fam Family</h2>\n",
  "<p>The family $fam has the function ", $fig->family_function($fam), ", and contains ", $fig->sz_family($fam), " proteins, as shown in the table below.<br>",
  "Each of the sequences with a given ID have the same amino acid sequence, and hence are the same polypeptide, ",
  "even though they may come from different organisms.</p>",
  "<p>The links will take you to the respective databases for each of the other protein families.\n</p>",
  $cgi->start_form(-method=>'get'),
  &HTML::make_table($col_hdrs, $tab, "Proteins in " . $fig->family_function($fam) . " ($fam)"),
  $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>'family', -value=>"$fam"), 
  $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
  $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),    
  $cgi->end_form;
 }
}

sub show_protein {
 my ($fig,$cgi,$html)=@_;
 foreach my $peg ($cgi->param('prot')) {
  my @families;
  if ($peg =~ /^\d+$/) 
  {
   # it is a cid
   @families=$fig->in_family($peg);
   $peg = "CID $peg";
  }
  else 
  {
   @families=$fig->families_for_protein($peg);
  }

  unless (@families) 
  {
   push @$html, "<h2 style='color: red'>Sorry, $peg is not in any protein families</h2>";
   return;
  }
 
  my $tab=[];
  my $self=$cgi->url(-relative => 1);
  foreach my $fam (@families) {
   my %idcount;
   my $noprots=scalar(map {$idcount{$_}=1} $fig->ids_in_family($fam));
   #push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $fig->sz_family($fam), $cgi->checkbox(-name=>$fam, -label=>'')];
   push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $noprots, $cgi->checkbox(-name=>$fam, -label=>'')];
  }
 
  my $col_hdrs=['Family ID', 'Family Function', 'Number of CIDs in Family', 'Choose Family'];
  push @$html, "<h2>Families for $peg</h2>\n", 
  $cgi->start_form(-method=>'get'),
  "<p>$peg is in the following ", scalar(@families), " families. Please choose one or more families using the checkboxes</p>\n",
  "A CID is a unique, internal ID we have assigned to proteins with identical sequences",
  &HTML::make_table($col_hdrs, $tab, "Families for $peg"),  "\n",
  $cgi->submit('Show Proteins In Each Family'), 
  $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
  $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
  $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>"allfams", -value=>\@families), "\n",
  $cgi->reset, $cgi->end_form;
 }
}




sub analyse_family {
 my ($fig,$cgi,$html, $reverse)=@_;
# here are the questions:
# 1. Given a column in a spreadsheet:
# 2. Here are the proteins in that column
# 3. For each protein, here are the families that they are in. How many families are unique and how many families is every protein in?
# 	if we start with a column of 10 proteins, and nine of them are all in the same families and one is not, we want to exclude the one and keep the nine.
#  	so we recommend that a protein be removed from a family.
# 4. For each of the families that are good, which proteins are there in some/most of the families that are not in the column that we are looking at
# 5. For each of the families that are good, which proteins are only in one of those families and not in any others?

# Note that column == family, But start with fig and then  allow a replace ID feature like before.
 
 my $focus=$cgi->param('focus') or 'all'; # these are the things that we are interested in
 undef $focus if ($focus eq "all");

 my @cols;
 if ($cgi->param("allfams")) {@cols=grep {$cgi->param($_)} $cgi->param("allfams")}
 elsif ($cgi->param("family")) {push @cols, $cgi->param('family')}
 else {die "No families declared!"}
 
 foreach my $col (@cols)
 {
  # $col is the column in the spreadsheet. This is really a family, but to visualize and code this I am doing it in a FIG-centric way
  my $proteins_in_col;
  map {$proteins_in_col->{$_}=1} $fig->ids_in_family($col);

  # @proteins are the proteins in that column, although these are cids and not fids at the moment
  my $familycount;
  foreach my $prot (keys %$proteins_in_col) {
   foreach my $fam ($fig->in_family($prot)) {
    $familycount->{$fam}++;
   }
  }
  
  my $count_of;
  my $fams;
  foreach my $f (sort {$familycount->{$b} <=> $familycount->{$a}} keys %$familycount)
  { 
   if ($reverse) {($fams, $count_of)=&ids_missing_from_fam($fig, $f, $focus, $proteins_in_col, $fams, $count_of)} 
   else {($fams, $count_of)=&ids_are_in_fam($fig, $f, $focus, $proteins_in_col, $fams, $count_of)}
  } 
  
  # create a list of families that we know about
  my @fams=grep {!/$col/} sort {scalar(keys %{$fams->{$b}}) <=> scalar(keys %{$fams->{$a}})} keys %$fams; 
  unshift @fams, $col;

  my $tab=[[["Number of proteins in family", "th colspan=4"], map {scalar(keys %{$fams->{$_}})} @fams]];
  
  my @fids;
  if ($cgi->param('sort') eq "genome")  
  {
   @fids=sort {$fig->genome_of($a) <=> $fig->genome_of($b)} keys %$count_of;
  } 
  elsif ($cgi->param('sort') eq "cid")
  {
   @fids=sort {$fig->prot_to_cid($a) <=> $fig->prot_to_cid($b)} keys %$count_of;
  }
  else 
  {
   @fids=sort {scalar(keys %{$count_of->{$b}}) <=> scalar(keys %{$count_of->{$a}})} keys %$count_of;
  }
 
  my $rowcount;
  foreach my $fid (@fids) 
  {
   my @row=(++$rowcount, $fig->prot_to_cid($fid), $fid, scalar(keys %{$count_of->{$fid}}));
    
   foreach my $fam (@fams) {
    $count_of->{$fid}->{$fam} ? push @row, ["Y", "td style='background-color: lightgrey; text-align: center'"] : push @row, " &nbsp ";
   }
   push @$tab, \@row;
  }

  push @$html,  $cgi->start_form(-method=>'get'), $cgi->p("Limit display to proteins from ", &choose_focus($cgi), "\n"), $cgi->p("Sort the order by ", &choose_sort($cgi),"\n");
  if ($reverse) {
   push @$html, $cgi->p("These are proteins that ARE NOT in ", $fig->family_function($col), " ($col) but are in other families that have proteins in this family.");
  } else {
   push @$html, $cgi->p("These are proteins that ARE in ", $fig->family_function($col), " ($col) and are in other families that have proteins in this family.");
  }

  # merge cells in the table
  my $skip;
  map {$skip->{$_}=1} (0, 2 .. 40); # ignore everything except column 2
  $tab=&HTML::merge_table_rows($tab, $skip);

  push @$html, 
    $cgi->hidden(-name=>"family", -value=>@cols), $cgi->hidden("prot"), $cgi->hidden(-name=>"user"), 
    $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
    $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
    &HTML::make_table(["Count", "Unique ID", "Protein ID", "Number of fams protein is in", @fams], $tab,' &nbsp; ');
 }
}


sub ids_are_in_fam {
   my ($fig, $f, $focus, $proteins_in_col, $fams, $count_of)=@_;
   # It seems that $sz_family is not right
   map {$fams->{$f}->{$_}++; $count_of->{$_}->{$f}++} 
	grep {/^$focus/} 
   	map {$fig->cid_to_prots($_)}
      	grep {$proteins_in_col->{$_}} 
	($fig->ids_in_family($f));
   return ($fams, $count_of);
}

sub ids_missing_from_fam {
   my ($fig, $f, $focus, $proteins_in_col, $fams, $count_of)=@_;
   # It seems that $sz_family is not right
   map {$fams->{$f}->{$_}++; $count_of->{$_}->{$f}++} 
	grep {/^$focus/} 
   	map {$fig->cid_to_prots($_)}
      	grep {!$proteins_in_col->{$_}} 
	($fig->ids_in_family($f));
   return ($fams, $count_of);
}
   


sub choose_focus {
 my ($cgi)=@_;
 my %choices=(
  "all"		=> "All",
  "fig"		=> "FIGfams",
  "tigr"	=> "TIGRfams",
  "pfam"	=> "PFAM",
  "sp"		=> "SwissProt",
  "kegg"	=> "KEGG",
  "pir"		=> "PIR SuperFams",
  "mcl"		=> "MCL",
  "cog"		=> "COG",
 );

 my $default = $cgi->param("focus"); unless ($default) {$default="fig"}
  
 return $cgi->popup_menu(
  -name     => "focus",
  -values   => [sort {$choices{$a} cmp $choices{$b}} keys %choices],
  -labels   => \%choices,
  -default  => $default,
 );
}

sub choose_sort {
 my ($cgi)=@_;
 my %choices=(
  "none"	=> "Number of Proteins",
  "genome"	=> "Genome",
  "cid"		=> "Unique ID",
 );

 my $default = $cgi->param("sort"); unless ($default) {$default="none"}
  
 return $cgi->popup_menu(
  -name     => "sort",
  -values   => [sort {$choices{$a} cmp $choices{$b}} keys %choices],
  -labels   => \%choices,
  -default  => $default,
 );
}


sub comparefig2kegg {
 my ($fig,$cgi,$html)=@_;

 my $classification; my %subsystem;
 # read classification from kegg file
 if (open(IN, "$FIG_Config::global/ProteinFamilies/kegg_classificaation.txt")) {
  while (<IN>) {
   chomp; 
   my @a=split /\t/;
   my $id=shift(@a);
   $subsystem{"kegg|$id"}=pop(@a);
   push @{$classification->{"kegg|$id"}}, \@a;
  }
 }


 my $tab=[];
 # find out what families our proteins are in
 map {
  my $prot=$_;
  map {
   my $fam=$_;
   if ($fam =~ /^fig/) {
    my %ss;
    map {$ss{$_->[0]}++} ($fig->subsystems_for_role($fig->family_function($fam)));
    map {my $ss=$_; push @$tab, [$prot, @{$fig->subsystem_classification($ss)}, $ss, $fam, $fig->family_function($fam)]} keys %ss; 
   }
   else {
    map {push @$tab, [$prot, @{$_}, $subsystem{$fam}, $fam, $fig->family_function($fam)]} @{$classification->{$fam}}
   }
   } grep {/^fig/ || /^kegg/} $fig->families_for_protein($prot);
   } $cgi->param('proteins');

 my $col_hdrs=['Protein', ['Classification', 'th colspan=2'], 'Subsystem', 'Family ID', 'Family Function'];
 push @$html, &HTML::make_table($col_hdrs, $tab, "Families"),  "\n",
}

## Based on request from Ross:
#	Subject: 	Re: fig.pl
#	Date: 	October 4, 2005 6:21:00 AM PDT
#	From: 	  Ross@theFIG.info
#	To: 	  raedwards@gmail.com
#
#Rob,
#
#It seems to me that you got that right, and the function is certainly at the
#core of what is needed.  I have been thinking about what I would want with
#protein families,
#and it goes something like this:
#
#1. Given a protein FIG1, you can get the set of proteins with the same CID
#(call it CID1).  Call this set EQUIV, since it is really a set of IDs that are
#equivalent.
#
#2. From the set of IDs in EQUIV, you can get the set of protein families (from
#all sources) that contain the IDs in EQUIV.  This gives a table
#
#
#            [,ID,Function,Family,FamilyFunction]
#
#    All of the table entries describe a family containing CID1.
#
#3.  From this table you select two Families to be compared (e.g., one KEGG
#family vs a FIG family).  This ends the first part -- selecting the precise
#two
#      families to be compared.  Each of the two families  should be thought of
#as [CID,ID,Family].
#
#4.  The comparison of SET1 and SET2 uses essentially the function you
#implemented.  You need to form three sets:
#
#            the intersection of SET1 and SET2
#            SET1 - SET2
#            SET2 - SET1
#
#       You may or may not wish to display each of the three sets.  The user
#should be able to select which.  When you think
#        of one of these sets, it is useful to think of
#{CID,Family,Set-of-CIDs}.  That is, it is not just a set of CIDs; it should be
#viewed as a
#        set of CIDs from a specific family that was chosen because it
#contained a specific CID.
#
#5. When displaying a set of proteins from a given family, you start with
#(CID,Family,Set-of-CIDs).  Each line should contain
#
#            1. A single CID from the Set-of-CIDs  (call this CID2).
#
#            2. A count of the number of sources that place both CID1 and CID2
#in the same family (note that this is not a count of the families that include
#both CID1 and CID2)
#
#            3.  For each source a "Y" or space indicating whether or not the
#source placed CID1 and CID2 into the same family (i.e., whether or not there
#                  is at least one family from the source that contains both
#CID1 and CID2).
#
#That is what I think should be done.  Can we discuss it?
#



sub set_of_equivs {
 ($fig, $cgi, $html)=@_;
 foreach my $peg ($cgi->param('prot')) {
  my $cid=$fig->prot_to_cid($peg);
  my @equiv=$fig->cid_to_prots($cid);
  my $tab=[];
  my $allfams;
  
  #begin the html so we can add hidden things 
  push @$html, $cgi->start_form(-method=>'get'), $cgi->p("For protein <b>$peg</b>, which has the unique ID <b>$cid</b>, this is the EQUIV set.");
  push @$html, $cgi->hidden(-name=>'querycid', -value=>$fig->prot_to_cid($peg));
  
  # this block will make a table with all IDs of all proteins in CID.
  # Ross doesn't want that, so we don't use it
  if (0) 
  {
   map 
   {
    my $id=$_;
    map 
    {
     push @$tab, [$fig->prot_to_cid($id), $id, &functionate($id), $_, $fig->family_function($_)];
     $allfams->{$_}="$_ : " . $fig->family_function($_);
    } $fig->families_for_protein($id);
   } @equiv;
  }
  else
  {
   # instead we just show our query protein
   map 
   {
    my $ffunc=$fig->family_function($_) || " &nbsp; ";
    push @$tab, [$fig->prot_to_cid($peg), $peg, &functionate($peg), $_, $ffunc, $fig->ext_sz_family($_)];
    $allfams->{$_}="$_ : " . $fig->family_function($_);
   } $fig->families_for_protein($peg);
  }

  
  $tab=&HTML::merge_table_rows($tab, {3=>1, 4=>1});
  my $radbut={
   "1not2"=>"In family one and NOT family two\n",
   "2not1"=>"In family two and NOT family one\n",
   "1and2"=>"In both families\n"
   };
  my $col_hdrs=['CID', 'Protein', 'Function of Proteins', 'Family', 'Family Function', 'Ext. IDs In Family'];
  push @$html, &HTML::make_table($col_hdrs, $tab, ""),  "\n", $cgi->p("To differentiate families in this table, please choose two families:"),
  "Family 1: &nbsp; ", $cgi->popup_menu(-name=>"family1", -values=>[sort {$a cmp $b} keys %$allfams], -labels=>$allfams, -default=>[sort {$a cmp $b} keys %$allfams]->[0]),
  " <br /> Family 2: &nbsp; ", $cgi->popup_menu(-name=>"family2", -values=>[sort {$a cmp $b} keys %$allfams], -labels=>$allfams, -default=>[sort {$a cmp $b} keys %$allfams]->[1]),
  $cgi->p("Show proteins:<br /><ul>\n", 
  $cgi->radio_group(-name=>"diff", -values=>[keys %$radbut], -labels=>$radbut, -rows=>3),
  "</ul>\n"),
  $cgi->hidden(-name=>'prot', -value=>$peg),
  $cgi->submit(-name=>"differentiate", -value=>"Differentiate these families"), $cgi->reset, $cgi->end_form();
 }
}

#  $cgi->p("I am most interested in proteins from ", &choose_focus($cgi), " that are missing from these families\n"), 

sub differentiate {
 ($fig, $cgi, $html)=@_;
 
 #my $focus=$cgi->param('focus') or 'all'; # these are the things that we are interested in
 #undef $focus if ($focus eq "all");
 my $focus=$cgi->param('family2');
 $focus =~ s/\|.*//;
  
 my ($fam_id1, $fam_id2)=($cgi->param('family1'), $cgi->param('family2'));
 if ($fam_id1 eq $fam_id2) 
 {
  push @$html, "<h2 style='color: red'>Please choose two different protein families</h2>";
  return;
 }
 my ($fam1, $fam2)=([$fig->ids_in_family($fam_id1)], [$fig->ids_in_family($fam_id2)]);
	 
	 
 # figure out all the families we know about
 my %families;
 #map 
 #{
 # my $cid=$_;
 # map
 # {
 #  $families{$_}=1; 
 # }
 # $fig->in_family($cid);
 #} @{&set_utilities::union($fam1, $fam2)};
 
 map {$families{$_}=1} $fig->in_family($cgi->param('querycid'));
 
 my @families=sort {$a cmp $b} grep {!/$fam_id1/} grep {!/$fam_id2/} keys %families;
 unshift @families, ($fam_id1, $fam_id2);
 my @source=@families;
 map {/^(.*?)\|/; $_=$1} @source;
 
 # now figure out all the external IDs in those families
 my $extids;
 map 
 {
  my $fam=$_;
  map 
  {
   $extids->{$_}->{$fam}=1;
  } $fig->ext_ids_in_family($fam);
 } @families;

 # finally generate the table. Note that there are three different arrays that we operate on depending on the user input
 # but it really only changes which set algorith we use. Each array is handled identically.
 my $tab;
 map 
 {
  my $cid=$_;
  my $row=["<a href='proteinfamilies.cgi?prot=$cid'>$cid</a>"];
  my $seen; my $mismatchcolor;
  map 
  {
   my $prot=$_;
   map 
   {
    # add the protein info to the cell in the table if it this family has that protein. Note that we have seen it, and increment the column counter
    # this if is if the protein that we are looking at is in the family for this column then add it

    # if the protein is not added, we want to know if it has the same start characters as the family (i.e. from the same source), and note that.
    ($extids->{$prot}->{$families[$_]}) ? 
    eval {$seen->{$prot}=1; $row->[$_+1] .= &protein_link($prot) . "<br />" } : 
    ($prot =~ /^$source[$_]/) ? eval 
    {$mismatchcolor->{$_+1}=1; $row->[$_+1] .= &protein_link($prot) . "<br />" } :
    1;
   } (0 .. $#families); 
  } sort $fig->cid_to_prots($cid);
  unless ($#$row == $#families+1) {$#$row=$#families+1}
   
  #$row->[1] = [join("<br />", map {&protein_link($_)} grep {/^$focus/} grep {!$seen->{$_}} $fig->cid_to_prots($cid)), "td style='background-color: #FF3366'"];
  #$row->[1] = [join("<br />", map {&protein_link($_)} grep {/^$focus/} grep {!$mismatch->{$_}} grep {!$seen->{$_}} $fig->cid_to_prots($cid)), "td style='background-color: #FF3366'"];
  
  map {$row->[$_] = [$row->[$_], "td style='background-color: #FF3366'"]} keys %$mismatchcolor;
  map {$row->[$_] = " &nbsp; " unless ($row->[$_])} (0 .. $#$row); 
  #push @$tab, $row if ($row->[1]->[0]);
  
  # if we want to show everything do so, otherwise only show the rows where there is a missing protein
  if ($cgi->param('show') eq "all")
  {
   push @$tab, $row;
  }
  elsif ($row->[2] ne " &nbsp; ")
  {
   push @$tab, $row;
  }
 } 
 ($cgi->param("diff") eq "1and2") ? @{&set_utilities::intersection($fam1, $fam2)} : 
 ($cgi->param("diff") eq "1not2") ? @{&set_utilities::set_diff($fam1, $fam2)} :
 ($cgi->param("diff") eq "2not1") ? @{&set_utilities::set_diff($fam2, $fam1)} : ();
 
 
 my $title="Proteins in ";
 ($cgi->param("diff") eq "1and2") ? $title.=$fig->family_function($fam_id1). " ($fam_id1) AND in " . $fig->family_function($fam_id2). " ($fam_id2)\n" :
 ($cgi->param("diff") eq "1not2") ? $title.=$fig->family_function($fam_id1). " ($fam_id1) BUT NOT in " . $fig->family_function($fam_id2). " ($fam_id2)\n" :
 ($cgi->param("diff") eq "2not1") ? $title.=$fig->family_function($fam_id2). " ($fam_id2) BUT NOT in " . $fig->family_function($fam_id1). " ($fam_id1)\n" : 1;

 # add the count of the number of members in the family
 #my @colcounts=map {$column->{$_}} (1 .. $#families+2); $colcounts[0] = [$colcounts[0], "td style='background-color: #FF3366'"];
 #my @sz=map {$fig->ext_sz_family($_)} @families;
 #unshift @$tab, [["# Ext. IDs in fam", "th "], ['', "td style='background-color: #FF3366'"], @sz];
 #unshift @$tab, [["Ext. IDs shown", "th "], @colcounts];
 
 # remove empty columns from the table
 #for (my $y=$#families+2; $y>3; $y--) 
 #{
  # do it backwards so the splice still works, and stop at 3 since the first four columns are the ones we need
  #next if ($column->{$y});
  #splice(@families, $y-2, 1);
  #map {splice(@$_, $y, 1)} @$tab;
 #}


 #my @headers=map {$_ = "$_: <a href='proteinfamilies.cgi?family=$families[$_]'>$source[$_]</a>"} (0..$#families);
 my @headers=@families;
 map {$_ = "<a href='proteinfamilies.cgi?family=$_'>$_</a>"} @headers;
 #my $colhdrs=["CID", ["Proteins not in families", "th style='background-color: #FF3366'"], @headers];
 my $colhdrs=["CID", @headers];
 push @$html, HTML::make_table($colhdrs, $tab, $title);
}


sub old_differentiate {
 my ($fig, $cgi, $html)=@_;
 
 my $focus=$cgi->param('focus') or 'all'; # these are the things that we are interested in
 undef $focus if ($focus eq "all");
  
 my ($fam_id1, $fam_id2)=($cgi->param('family1'), $cgi->param('family2'));
 if ($fam_id1 eq $fam_id2) {
  push @$html, "<h2 style='color: red'>Please choose two different protein families</h2>";
  return;
 }
 my ($fam1, $fam2)=([$fig->ids_in_family($fam_id1)], [$fig->ids_in_family($fam_id2)]);
	 
 if ($cgi->param("1not2")) {
  my $tab;
  map {
    my $cid=$_; 
    map {push @$tab, [$cid, $_, &functionate($_)]} sort grep {/^$focus/} $fig->cid_to_prots($cid)
    } sort @{&set_utilities::set_diff($fam1, $fam2)};

  $tab=&HTML::merge_table_rows($tab);
  push @$html, $cgi->h3("Proteins that are in $fam_id1 (" , $fig->family_function($fam_id1) , ") and not in $fam_id2 (" ,
          $fig->family_function($fam_id2) . ")\n"), &HTML::make_table(["CID", "Protein ID", "Function"], $tab, "");
  }
	 
	 
 if ($cgi->param("2not1")) {
   
   ###########NOTICE THIS REDIRECT!!!
   ($fam_id1, $fam_id2)=($fam_id2, $fam_id1);
   ($fam1, $fam2)=($fam2, $fam1);
   $cgi->param("1not2c", "1");
  }
  if (0) {
  my $tab;
  map {
    my $cid=$_; 
    map {push @$tab, [$cid, $_, &functionate($_)]} sort grep {/^$focus/} $fig->cid_to_prots($cid)
    } sort @{&set_utilities::set_diff($fam2, $fam1)};

  $tab=&HTML::merge_table_rows($tab);
  push @$html, $cgi->hr, $cgi->h3("Proteins that are in $fam_id2 (" , $fig->family_function($fam_id2) , ") and not in $fam_id1 (" ,
          $fig->family_function($fam_id1) . ")\n"), &HTML::make_table(["CID", "Protein ID", "Function"], $tab, "");
 }
	 
 if ($cgi->param("1and2")) {
  my $tab;
  my @evidence;
  map {
    my $cid=$_; 
    #push @evidence, &compare_two_cids($cgi->param('querycid'), $cid);
    push @evidence, &IHatePfams($cgi->param('querycid'), $cid);
    map {push @$tab, [$cid, $_, &functionate($_)]} sort grep {/^$focus/} $fig->cid_to_prots($cid)
    } @{&set_utilities::intersection($fam1, $fam2)};

  $tab=&HTML::merge_table_rows($tab);
  push @$html, $cgi->hr, $cgi->h3("Proteins that are in both $fam_id1 (" , $fig->family_function($fam_id1) , ") and in $fam_id2 (" ,
          $fig->family_function($fam_id2) . ")\n"), &HTML::make_table(["CID", "Protein ID", "Function"], $tab, ""),
	  $cgi->hr, $cgi->h3("Evidence for the connections between proteins and the families they are in"),
	  $cgi->p("These tables show the connections between the families. In the first row of each table is the initial family that you chose, and in the second row is the comparator. The first column is the CID and the remaining columns are the protein families that these two CIDs are in. In each cell are the external IDs that are (a) in the CIDs per the row headers AND (b) in the families per column headers. Thus, if you are here from <a href=\"http://listeria.uchicago.edu/dev/FIG/proteinfamilies.cgi?querycid=965112&family1=fig%7CPF002033&family2=mcl%7CORTHOMCL676&focus=all&1and2=on&differentiate=Differentiate+these+families&.cgifields=1not2&.cgifields=2not1&.cgifields=1and2\">this link</a> cog|thrB is in the family cog|COG0083 and has the ID 965112 and sp|P00547 has the ID 965112 and is in the families pfam|PB000121, pfam|PB007585, pfam|PF00288.12, pir|PIRSF000676, sp|PS00627, and tigr|TIGR00191"),
	  @evidence;
 }

 if ($cgi->param("1and2b")) {
  # figure out all the families we know about
  my %families;
  map 
  {
   my $cid=$_;
   map
   {
    $families{$_}=1; 
   }
   $fig->in_family($cid);
  } @{&set_utilities::intersection($fam1, $fam2)};
  my @families=sort {$a cmp $b} keys %families;
  
  # now figure out all the external IDs in those families
  my $extids;
  map 
  {
   my $fam=$_;
   map 
   {
    $extids->{$_}->{$fam}=1;
   } $fig->ext_ids_in_family($fam);
  } @families;

  my $tab;
  map 
  {
   my $cid=$_;
   map 
   {
    my $prot=$_;
    my $row=[$cid, $prot, &functionate($prot)];
    map {($extids->{$prot}->{$_}) ? push @$row, $prot : push @$row, " &nbsp; "} @families; 
    push @$tab, $row;
   } sort grep {/^$focus/} $fig->cid_to_prots($cid)
  } @{&set_utilities::intersection($fam1, $fam2)};

  my $skip;
  map {$skip->{$_+4}=1} (0 .. $#families);
  $tab=&HTML::merge_table_rows($tab, $skip);
 
  map {$_ = "<a href='proteinfamilies.cgi?family=$_'>$_</a>"} @families;
  my $colhdrs=["CID", "Protein", "Function", @families];
  my $title="Proteins in ". $fig->family_function($fam_id1). " ($fam_id1) AND in " . $fig->family_function($fam_id2). " ($fam_id2)\n";#$fam_id1, $fam_id2
  push @$html, HTML::make_table($colhdrs, $tab, $title);
 }
 
 if ($cgi->param("1and2c")) {
  # figure out all the families we know about
  my %families;
  map 
  {
   my $cid=$_;
   map
   {
    $families{$_}=1; 
   }
   $fig->in_family($cid);
  } @{&set_utilities::intersection($fam1, $fam2)};
  my @families=sort {$a cmp $b} grep {!/$fam_id1/} grep {!/$fam_id2/} keys %families;
  unshift @families, ($fam_id1, $fam_id2);
  
  # now figure out all the external IDs in those families
  my $extids;
  map 
  {
   my $fam=$_;
   map 
   {
    $extids->{$_}->{$fam}=1;
   } $fig->ext_ids_in_family($fam);
  } @families;

  my $tab;
  map 
  {
   my $cid=$_;
   my $row=[$cid];
   map 
   {
    my $prot=$_;
    map {($extids->{$prot}->{$families[$_]}) ? $row->[$_+1] = $prot : 1 } (0 .. $#families); 
   } sort grep {/^$focus/} $fig->cid_to_prots($cid);
   unless ($#$row == $#families+1) {$#$row=$#families+1}
   push @$tab, $row;
  } @{&set_utilities::intersection($fam1, $fam2)};

  $tab=&HTML::merge_table_rows($tab);
  
  map {$_ = "<a href='proteinfamilies.cgi?family=$_'>$_</a>"} @families;
  my $colhdrs=["CID", @families];
  my $title="Proteins in ". $fig->family_function($fam_id1). " ($fam_id1) AND in " . $fig->family_function($fam_id2). " ($fam_id2)\n";#$fam_id1, $fam_id2
  push @$html, HTML::make_table($colhdrs, $tab, $title);
 }
 
 if ($cgi->param("1not2c")) {
  # figure out all the families we know about
  my %families;
  map 
  {
   my $cid=$_;
   map
   {
    $families{$_}=1; 
   }
   $fig->in_family($cid);
  } @{&set_utilities::set_diff($fam1, $fam2)};
  my @families=sort {$a cmp $b} grep {!/$fam_id1/} grep {!/$fam_id2/} keys %families;
  unshift @families, ($fam_id1, $fam_id2);
  
  # now figure out all the external IDs in those families
  my $extids;
  map 
  {
   my $fam=$_;
   map 
   {
    $extids->{$_}->{$fam}=1;
   } $fig->ext_ids_in_family($fam);
  } @families;

  my $tab;
  map 
  {
   my $cid=$_;
   my $row=["<a href='proteinfamilies.cgi?prot=$cid'>CID $cid</a>"];
   my $seen;
   map 
   {
    my $prot=$_;
    my $column;
    map 
    {
     ($extids->{$prot}->{$families[$_]}) ? 
     eval {$seen->{$prot}=1; $row->[$_+2] = &protein_link($prot) ; $column->{$_+2}++} : 
     1 
    } (0 .. $#families); 
   } sort grep {/^$focus/} $fig->cid_to_prots($cid);
   unless ($#$row == $#families+2) {$#$row=$#families+2}
   
   $row->[1] = [join("<br />", map {&protein_link($_)} grep {!$seen->{$_}} $fig->cid_to_prots($cid)), "td style='background-color: #FF3366'"];
   
   push @$tab, $row;
  } @{&set_utilities::set_diff($fam1, $fam2)};
 
  $tab=&HTML::merge_table_rows($tab);
 
  map {$_ = "<a href='proteinfamilies.cgi?family=$_'>$_</a>"} @families;
  my $colhdrs=["CID", "<span style='background-color: #FF3366'>Proteins not one of the families listed</span>", @families];
  my $title="Proteins in ". $fig->family_function($fam_id1). " ($fam_id1) BUT NOT in " . $fig->family_function($fam_id2). " ($fam_id2)\n";#$fam_id1, $fam_id2
  push @$html, HTML::make_table($colhdrs, $tab, $title);
 }

   
}



sub functionate {
 my $peg=shift;
 return scalar($fig->function_of($peg));
}


sub compare_two_cids {
 my ($id1, $id2)=@_;
 # which families is ID in and why?
 # going via the external IDs, although I am not convinced we need to do this
 my $family;
 map {
  my $id=$_;
  map 
  { 
   map {$family->{$_}->{$id}++} $fig->ext_family_for_id($_)
  } $fig->cid_to_prots($id);
 } ($id1, $id2);

 # find the families that have both id1 and id2
 my $tab;
 map 
 {
  push @$tab, [$_, $family->{$_}->{$id1}, $family->{$_}->{$id2}] if ( $family->{$_}->{$id1} &&  $family->{$_}->{$id2} ); 
 } keys %$family;
 
 return &HTML::make_table(["Family", "Cnx to $id1", "Cnx to $id2"], $tab, "Connections (cnx) between proteins and families"), $cgi->p("\n");
}
 
 
sub IHatePfams {
 my ($id1, $id2)=@_;
 # id1 and id2 are cids
 
 # what families are the cids in?
 my %families;
 map {$families{$_}=1} ($fig->in_family($id1), $fig->in_family($id2));
 my @families=sort {$a cmp $b} keys %families;
 
 # what external IDs are the cids
 my %external;
 map {$external{$_}=$id1} $fig->cid_to_prots($id1);
 map {$external{$_}=$id2} $fig->cid_to_prots($id2);
 
 
 my $tab=[];
 foreach my $cid ($id1, $id2)
 {
  my $row=[$cid];
  foreach my $fam (@families) {
   # which proteins are in $fam
   my $cell;
   foreach my $id ($fig->ext_ids_in_family($fam))
   {
    $cell .= "$id<br>" if ($external{$id} eq $cid);
   }
   $cell = " &nbsp; " unless ($cell);
   push @$row, $cell;
  }
  push @$tab, $row;
 }

 &HTML::make_table(["CID", @families], $tab, "Proteins in families");
}
  
  

sub protein_link {
 my $p=shift;
 my %proteinbase=(
  "fig" 	=> "protein.cgi?user=$user&prot=fig|",
  "cog"  => "http://www.ncbi.nlm.nih.gov/COG/old/palox.cgi?",
  "sp"   => "http://www.expasy.org/uniprot/",
  "tr"   => "http://www.expasy.org/uniprot/",
  "kegg" => "http://www.genome.jp/dbget-bin/www_bget?",
 );
 
 map {($p =~ /^$_/) ? eval {$p =~ s/^(.*?)\|//; $p = "<a href='$proteinbase{$_}$p'>$1|$p</a>"} : 1;} keys %proteinbase;
 return $p;
}


 

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3