[Bio] / FigWebServices / proteinfamilies.cgi Repository:
ViewVC logotype

Annotation of /FigWebServices/proteinfamilies.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.32 - (view) (download)

1 : redwards 1.1 # -*- perl -*-
2 :    
3 :     =pod
4 :    
5 :     =head1 proteinfamilies.cgi
6 :    
7 : redwards 1.3 A base web interface for getting information about protein families in and out. Initially we are going to make a 3 (or 4) column table of protein, family and other proteins.
8 : redwards 1.1
9 : redwards 1.7 PLEASE NOTE: Do not attempt to read or understand this code. Please leave now. It is a complete mess because it is very experimental and we are trying stuff out. none of it will work. The exit is this way ---->
10 :    
11 : redwards 1.1 =cut
12 :    
13 :     use strict;
14 :     use FIG;
15 : overbeek 1.26 use FIGjs;
16 : redwards 1.1 use HTML;
17 :     use raelib;
18 :     use CGI;
19 : overbeek 1.11 use CGI::Carp qw(fatalsToBrowser);
20 : redwards 1.1 my $cgi=new CGI;
21 :     use LWP::Simple qw(!head); # see the caveat in perldoc LWP about importing two head methods.
22 :    
23 :     my $fig;
24 :     eval {
25 :     $fig = new FIG;
26 :     };
27 :    
28 :     if ($@ ne "")
29 :     {
30 :     my $err = $@;
31 :    
32 :     my(@html);
33 :    
34 :     push(@html, $cgi->p("Error connecting to SEED database."));
35 :     if ($err =~ /Could not connect to DBI:.*could not connect to server/)
36 :     {
37 :     push(@html, $cgi->p("Could not connect to relational database of type $FIG_Config::dbms named $FIG_Config::db on port $FIG_Config::dbport."));
38 :     }
39 :     else
40 :     {
41 :     push(@html, $cgi->pre($err));
42 :     }
43 :     &HTML::show_page($cgi, \@html, 1);
44 :     exit;
45 :     }
46 :    
47 :     my $html = [];
48 :     my $user = $cgi->param('user');
49 :    
50 :     unshift(@$html, "<TITLE>The SEED - Global Protein Families </TITLE>\n");
51 :    
52 : redwards 1.2
53 : overbeek 1.27 if ($cgi->param('equivalence'))
54 : overbeek 1.17 {
55 :     &set_of_equivs($fig,$cgi,$html);
56 :     }
57 :     elsif ($cgi->param('differentiate'))
58 :     {
59 : overbeek 1.32 &set_of_equivs($fig,$cgi,$html) if ($cgi->param('prot'));
60 : overbeek 1.17 &differentiate($fig,$cgi,$html);
61 :     }
62 : redwards 1.2 elsif ($cgi->param('family'))
63 :     {
64 :     &show_family($fig,$cgi,$html);
65 : redwards 1.1 }
66 :     elsif ($cgi->param('prot'))
67 :     {
68 : redwards 1.2 &show_protein($fig,$cgi,$html);
69 : redwards 1.1 }
70 : overbeek 1.14 elsif ($cgi->param('fig2kegg'))
71 :     {
72 :     &comparefig2kegg($fig,$cgi,$html);
73 :     }
74 : redwards 1.1 else
75 :     {
76 :     &show_initial($fig,$cgi,$html);
77 :     }
78 :    
79 :     &HTML::show_page($cgi,$html,1);
80 :     exit;
81 :    
82 :    
83 :     sub show_initial {
84 :     my ($fig,$cgi,$html)=@_;
85 :     # generate a blank page
86 : redwards 1.2 push @$html,
87 :     "<h2>Protein Families</h2>\n",
88 :     "<p>Please enter a protein ID . You will recieve a list of all the families that protein is in. \n",
89 : overbeek 1.27 "You can use a FIG ID such as fig|83333.1.peg.3, or an ID from SwissProt, KEGG, NCBI, and others.</p>\n",
90 :     $cgi->start_form(-method=>'get'), "\n",
91 :     "Please enter a protein id: ", $cgi->textfield(-name=>"prot", -size=>40), "<br>\n",
92 :     $cgi->submit(-name=>'equivalence', -value=>"Show Protein Families"), "\n", $cgi->reset, $cgi->end_form;
93 : redwards 1.1 return $html;
94 :     }
95 :    
96 : overbeek 1.22 # "<p>Alternately, you can enter a family. Please enter a family name in the format pir|PIRSF001547 or fig|PF002363.</p>",
97 :     # "Please enter a family id: ", $cgi->textfield(-name=>"family", -size=>40), "<br>",
98 :     # $cgi->submit, $cgi->reset, $cgi->end_form;
99 : redwards 1.7
100 : redwards 1.2 sub show_family {
101 : redwards 1.1 my ($fig,$cgi,$html)=@_;
102 : overbeek 1.32 my $fam=$cgi->param('family');
103 :     my $tab=[];
104 :     my $col_hdrs=['#', 'Protein', 'Other proteins with same amino acid sequence'];
105 :     my $count=1;
106 : overbeek 1.27
107 : overbeek 1.32 foreach my $extid (sort {$a cmp $b} $fig->ext_ids_in_family($fam))
108 :     {
109 :     my $cid=$fig->prot_to_cid($extid);
110 :     my @pegs=map {&protein_link($_)} grep {$_ ne $extid} $fig->cid_to_prots($cid);
111 :     push @$tab, [$count, &protein_link($extid, 1), (join ", ", (@pegs))];
112 :     $count++;
113 :     }
114 :    
115 :     push @$html, "<h2>$fam Family</h2>\n",
116 :     "<p>The family $fam has the function ", $fig->family_function($fam), ", and contains ", $fig->sz_family($fam), " proteins, as shown in the table below.<br>",
117 :     "Each of these proteins are present in other databases, and this table shows you the ID of the identical proteins in those databases</p>\n",
118 :     "<p>The links will take you to the respective databases for each of the other protein families.\n</p>",
119 :     &HTML::make_table($col_hdrs, $tab, "Proteins in " . $fig->family_function($fam) . " ($fam)");
120 :    
121 :     if ($cgi->param('comparepairs'))
122 :     {
123 :     &comparepairs($fig, $cgi, $html);
124 :     }
125 :     else
126 :     {
127 :     push @$html, $cgi->h3("<a href=\"proteinfamilies.cgi?family=$fam&comparepairs=1&user=$user\">Compare this family to other protein families</a>");
128 : redwards 1.1 }
129 :     }
130 :    
131 : overbeek 1.11 sub show_protein {
132 : redwards 1.1 my ($fig,$cgi,$html)=@_;
133 : overbeek 1.11 foreach my $peg ($cgi->param('prot')) {
134 : overbeek 1.22 my @families;
135 :     if ($peg =~ /^\d+$/)
136 :     {
137 :     # it is a cid
138 :     @families=$fig->in_family($peg);
139 :     $peg = "CID $peg";
140 :     }
141 :     else
142 :     {
143 :     @families=$fig->families_for_protein($peg);
144 :     }
145 :    
146 : overbeek 1.11 unless (@families)
147 :     {
148 :     push @$html, "<h2 style='color: red'>Sorry, $peg is not in any protein families</h2>";
149 :     return;
150 : redwards 1.1 }
151 :    
152 : redwards 1.2 my $tab=[];
153 : overbeek 1.20 my $self=$cgi->url(-relative => 1);
154 : overbeek 1.11 foreach my $fam (@families) {
155 :     my %idcount;
156 :     my $noprots=scalar(map {$idcount{$_}=1} $fig->ids_in_family($fam));
157 :     #push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $fig->sz_family($fam), $cgi->checkbox(-name=>$fam, -label=>'')];
158 :     push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $noprots, $cgi->checkbox(-name=>$fam, -label=>'')];
159 : redwards 1.2 }
160 : overbeek 1.11
161 : overbeek 1.17 my $col_hdrs=['Family ID', 'Family Function', 'Number of CIDs in Family', 'Choose Family'];
162 : overbeek 1.11 push @$html, "<h2>Families for $peg</h2>\n",
163 : overbeek 1.17 $cgi->start_form(-method=>'get'),
164 : overbeek 1.11 "<p>$peg is in the following ", scalar(@families), " families. Please choose one or more families using the checkboxes</p>\n",
165 : overbeek 1.17 "A CID is a unique, internal ID we have assigned to proteins with identical sequences",
166 : overbeek 1.11 &HTML::make_table($col_hdrs, $tab, "Families for $peg"), "\n",
167 :     $cgi->submit('Show Proteins In Each Family'),
168 :     $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
169 :     $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
170 :     $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>"allfams", -value=>\@families), "\n",
171 :     $cgi->reset, $cgi->end_form;
172 : redwards 1.2 }
173 :     }
174 : redwards 1.10
175 :    
176 : overbeek 1.12
177 :    
178 : redwards 1.10 sub analyse_family {
179 : overbeek 1.12 my ($fig,$cgi,$html, $reverse)=@_;
180 : redwards 1.10 # here are the questions:
181 :     # 1. Given a column in a spreadsheet:
182 :     # 2. Here are the proteins in that column
183 :     # 3. For each protein, here are the families that they are in. How many families are unique and how many families is every protein in?
184 :     # if we start with a column of 10 proteins, and nine of them are all in the same families and one is not, we want to exclude the one and keep the nine.
185 :     # so we recommend that a protein be removed from a family.
186 :     # 4. For each of the families that are good, which proteins are there in some/most of the families that are not in the column that we are looking at
187 :     # 5. For each of the families that are good, which proteins are only in one of those families and not in any others?
188 :    
189 :     # Note that column == family, But start with fig and then allow a replace ID feature like before.
190 :    
191 : overbeek 1.15 my $focus=$cgi->param('focus') or 'all'; # these are the things that we are interested in
192 :     undef $focus if ($focus eq "all");
193 : overbeek 1.12
194 :     my @cols;
195 :     if ($cgi->param("allfams")) {@cols=grep {$cgi->param($_)} $cgi->param("allfams")}
196 :     elsif ($cgi->param("family")) {push @cols, $cgi->param('family')}
197 :     else {die "No families declared!"}
198 :    
199 :     foreach my $col (@cols)
200 : redwards 1.10 {
201 :     # $col is the column in the spreadsheet. This is really a family, but to visualize and code this I am doing it in a FIG-centric way
202 : overbeek 1.12 my $proteins_in_col;
203 :     map {$proteins_in_col->{$_}=1} $fig->ids_in_family($col);
204 : redwards 1.10
205 :     # @proteins are the proteins in that column, although these are cids and not fids at the moment
206 :     my $familycount;
207 : overbeek 1.12 foreach my $prot (keys %$proteins_in_col) {
208 : redwards 1.10 foreach my $fam ($fig->in_family($prot)) {
209 :     $familycount->{$fam}++;
210 :     }
211 :     }
212 :    
213 :     my $count_of;
214 :     my $fams;
215 : overbeek 1.12 foreach my $f (sort {$familycount->{$b} <=> $familycount->{$a}} keys %$familycount)
216 :     {
217 :     if ($reverse) {($fams, $count_of)=&ids_missing_from_fam($fig, $f, $focus, $proteins_in_col, $fams, $count_of)}
218 :     else {($fams, $count_of)=&ids_are_in_fam($fig, $f, $focus, $proteins_in_col, $fams, $count_of)}
219 :     }
220 : redwards 1.10
221 :     # create a list of families that we know about
222 : overbeek 1.27 my @fams=grep {$_ ne $col} sort {scalar(keys %{$fams->{$b}}) <=> scalar(keys %{$fams->{$a}})} keys %$fams;
223 : overbeek 1.11 unshift @fams, $col;
224 :    
225 : overbeek 1.15 my $tab=[[["Number of proteins in family", "th colspan=4"], map {scalar(keys %{$fams->{$_}})} @fams]];
226 : redwards 1.10
227 : overbeek 1.11 my @fids;
228 :     if ($cgi->param('sort') eq "genome")
229 :     {
230 :     @fids=sort {$fig->genome_of($a) <=> $fig->genome_of($b)} keys %$count_of;
231 :     }
232 : overbeek 1.15 elsif ($cgi->param('sort') eq "cid")
233 :     {
234 :     @fids=sort {$fig->prot_to_cid($a) <=> $fig->prot_to_cid($b)} keys %$count_of;
235 :     }
236 : overbeek 1.11 else
237 :     {
238 :     @fids=sort {scalar(keys %{$count_of->{$b}}) <=> scalar(keys %{$count_of->{$a}})} keys %$count_of;
239 :     }
240 :    
241 :     my $rowcount;
242 :     foreach my $fid (@fids)
243 :     {
244 : overbeek 1.15 my @row=(++$rowcount, $fig->prot_to_cid($fid), $fid, scalar(keys %{$count_of->{$fid}}));
245 : overbeek 1.11
246 :     foreach my $fam (@fams) {
247 :     $count_of->{$fid}->{$fam} ? push @row, ["Y", "td style='background-color: lightgrey; text-align: center'"] : push @row, " &nbsp ";
248 : redwards 1.10 }
249 :     push @$tab, \@row;
250 :     }
251 : overbeek 1.12
252 : overbeek 1.22 push @$html, $cgi->start_form(-method=>'get'), $cgi->p("Limit display to proteins from ", &choose_focus($cgi), "\n"), $cgi->p("Sort the order by ", &choose_sort($cgi),"\n");
253 : overbeek 1.12 if ($reverse) {
254 :     push @$html, $cgi->p("These are proteins that ARE NOT in ", $fig->family_function($col), " ($col) but are in other families that have proteins in this family.");
255 :     } else {
256 :     push @$html, $cgi->p("These are proteins that ARE in ", $fig->family_function($col), " ($col) and are in other families that have proteins in this family.");
257 :     }
258 :    
259 : redwards 1.16 # merge cells in the table
260 :     my $skip;
261 :     map {$skip->{$_}=1} (0, 2 .. 40); # ignore everything except column 2
262 :     $tab=&HTML::merge_table_rows($tab, $skip);
263 :    
264 : overbeek 1.12 push @$html,
265 :     $cgi->hidden(-name=>"family", -value=>@cols), $cgi->hidden("prot"), $cgi->hidden(-name=>"user"),
266 : overbeek 1.11 $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
267 :     $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
268 : overbeek 1.15 &HTML::make_table(["Count", "Unique ID", "Protein ID", "Number of fams protein is in", @fams], $tab,' &nbsp; ');
269 : redwards 1.10 }
270 :     }
271 :    
272 :    
273 : overbeek 1.12 sub ids_are_in_fam {
274 :     my ($fig, $f, $focus, $proteins_in_col, $fams, $count_of)=@_;
275 :     # It seems that $sz_family is not right
276 :     map {$fams->{$f}->{$_}++; $count_of->{$_}->{$f}++}
277 :     grep {/^$focus/}
278 :     map {$fig->cid_to_prots($_)}
279 :     grep {$proteins_in_col->{$_}}
280 :     ($fig->ids_in_family($f));
281 :     return ($fams, $count_of);
282 :     }
283 : redwards 1.10
284 : overbeek 1.12 sub ids_missing_from_fam {
285 :     my ($fig, $f, $focus, $proteins_in_col, $fams, $count_of)=@_;
286 : redwards 1.10 # It seems that $sz_family is not right
287 : overbeek 1.11 map {$fams->{$f}->{$_}++; $count_of->{$_}->{$f}++}
288 : redwards 1.10 grep {/^$focus/}
289 :     map {$fig->cid_to_prots($_)}
290 : overbeek 1.12 grep {!$proteins_in_col->{$_}}
291 : redwards 1.10 ($fig->ids_in_family($f));
292 : overbeek 1.12 return ($fams, $count_of);
293 :     }
294 :    
295 : overbeek 1.11
296 : redwards 1.10
297 : overbeek 1.11 sub choose_focus {
298 :     my ($cgi)=@_;
299 :     my %choices=(
300 : overbeek 1.15 "all" => "All",
301 : overbeek 1.11 "fig" => "FIGfams",
302 :     "tigr" => "TIGRfams",
303 :     "pfam" => "PFAM",
304 :     "sp" => "SwissProt",
305 :     "kegg" => "KEGG",
306 :     "pir" => "PIR SuperFams",
307 :     "mcl" => "MCL",
308 :     "cog" => "COG",
309 :     );
310 :    
311 : overbeek 1.22 my $default = $cgi->param("focus"); unless ($default) {$default="fig"}
312 : overbeek 1.11
313 :     return $cgi->popup_menu(
314 :     -name => "focus",
315 : overbeek 1.15 -values => [sort {$choices{$a} cmp $choices{$b}} keys %choices],
316 :     -labels => \%choices,
317 :     -default => $default,
318 :     );
319 :     }
320 :    
321 :     sub choose_sort {
322 :     my ($cgi)=@_;
323 :     my %choices=(
324 :     "none" => "Number of Proteins",
325 :     "genome" => "Genome",
326 :     "cid" => "Unique ID",
327 :     );
328 :    
329 :     my $default = $cgi->param("sort"); unless ($default) {$default="none"}
330 :    
331 :     return $cgi->popup_menu(
332 :     -name => "sort",
333 :     -values => [sort {$choices{$a} cmp $choices{$b}} keys %choices],
334 : overbeek 1.11 -labels => \%choices,
335 :     -default => $default,
336 :     );
337 :     }
338 : redwards 1.10
339 : redwards 1.7
340 : overbeek 1.14 sub comparefig2kegg {
341 :     my ($fig,$cgi,$html)=@_;
342 :    
343 :     my $classification; my %subsystem;
344 :     # read classification from kegg file
345 :     if (open(IN, "$FIG_Config::global/ProteinFamilies/kegg_classificaation.txt")) {
346 :     while (<IN>) {
347 :     chomp;
348 :     my @a=split /\t/;
349 :     my $id=shift(@a);
350 :     $subsystem{"kegg|$id"}=pop(@a);
351 :     push @{$classification->{"kegg|$id"}}, \@a;
352 :     }
353 :     }
354 :    
355 :    
356 :     my $tab=[];
357 :     # find out what families our proteins are in
358 :     map {
359 :     my $prot=$_;
360 :     map {
361 :     my $fam=$_;
362 :     if ($fam =~ /^fig/) {
363 :     my %ss;
364 :     map {$ss{$_->[0]}++} ($fig->subsystems_for_role($fig->family_function($fam)));
365 :     map {my $ss=$_; push @$tab, [$prot, @{$fig->subsystem_classification($ss)}, $ss, $fam, $fig->family_function($fam)]} keys %ss;
366 :     }
367 :     else {
368 :     map {push @$tab, [$prot, @{$_}, $subsystem{$fam}, $fam, $fig->family_function($fam)]} @{$classification->{$fam}}
369 :     }
370 :     } grep {/^fig/ || /^kegg/} $fig->families_for_protein($prot);
371 :     } $cgi->param('proteins');
372 :    
373 :     my $col_hdrs=['Protein', ['Classification', 'th colspan=2'], 'Subsystem', 'Family ID', 'Family Function'];
374 :     push @$html, &HTML::make_table($col_hdrs, $tab, "Families"), "\n",
375 :     }
376 : overbeek 1.17
377 :     ## Based on request from Ross:
378 :     # Subject: Re: fig.pl
379 :     # Date: October 4, 2005 6:21:00 AM PDT
380 :     # From: Ross@theFIG.info
381 :     # To: raedwards@gmail.com
382 :     #
383 :     #Rob,
384 :     #
385 :     #It seems to me that you got that right, and the function is certainly at the
386 :     #core of what is needed. I have been thinking about what I would want with
387 :     #protein families,
388 :     #and it goes something like this:
389 :     #
390 :     #1. Given a protein FIG1, you can get the set of proteins with the same CID
391 :     #(call it CID1). Call this set EQUIV, since it is really a set of IDs that are
392 :     #equivalent.
393 :     #
394 :     #2. From the set of IDs in EQUIV, you can get the set of protein families (from
395 :     #all sources) that contain the IDs in EQUIV. This gives a table
396 :     #
397 :     #
398 :     # [,ID,Function,Family,FamilyFunction]
399 :     #
400 :     # All of the table entries describe a family containing CID1.
401 :     #
402 :     #3. From this table you select two Families to be compared (e.g., one KEGG
403 :     #family vs a FIG family). This ends the first part -- selecting the precise
404 :     #two
405 :     # families to be compared. Each of the two families should be thought of
406 :     #as [CID,ID,Family].
407 :     #
408 :     #4. The comparison of SET1 and SET2 uses essentially the function you
409 :     #implemented. You need to form three sets:
410 :     #
411 :     # the intersection of SET1 and SET2
412 :     # SET1 - SET2
413 :     # SET2 - SET1
414 :     #
415 :     # You may or may not wish to display each of the three sets. The user
416 :     #should be able to select which. When you think
417 :     # of one of these sets, it is useful to think of
418 :     #{CID,Family,Set-of-CIDs}. That is, it is not just a set of CIDs; it should be
419 :     #viewed as a
420 :     # set of CIDs from a specific family that was chosen because it
421 :     #contained a specific CID.
422 :     #
423 :     #5. When displaying a set of proteins from a given family, you start with
424 :     #(CID,Family,Set-of-CIDs). Each line should contain
425 :     #
426 :     # 1. A single CID from the Set-of-CIDs (call this CID2).
427 :     #
428 :     # 2. A count of the number of sources that place both CID1 and CID2
429 :     #in the same family (note that this is not a count of the families that include
430 :     #both CID1 and CID2)
431 :     #
432 :     # 3. For each source a "Y" or space indicating whether or not the
433 :     #source placed CID1 and CID2 into the same family (i.e., whether or not there
434 :     # is at least one family from the source that contains both
435 :     #CID1 and CID2).
436 :     #
437 :     #That is what I think should be done. Can we discuss it?
438 :     #
439 :    
440 :    
441 :    
442 :     sub set_of_equivs {
443 : overbeek 1.22 ($fig, $cgi, $html)=@_;
444 : overbeek 1.27 my $peg=$cgi->param('prot');
445 :     my $tab=[];
446 :     my $allfams;
447 : overbeek 1.19
448 : overbeek 1.27 #begin the html so we can add hidden things
449 :     my $genusspecies=$fig->genus_species($fig->genome_of($peg));
450 :     push @$html, (
451 :     $cgi->start_form(-method=>'get'), "\n",
452 :     $cgi->p("<h1>Protein <b>$peg</b>: $genusspecies</h1>"), "\n",
453 :     $cgi->hidden(-name=>'querycid', -value=>$fig->prot_to_cid($peg)), "\n",
454 :     $cgi->a({class=>"help", onMouseover=>"javascript:if(!this.tooltip) this.tooltip=new Popup_Tooltip(this, 'Help', 'The table below shows all the families that contain a protein with the same sequence as <b>$peg</b>, although each family uses different IDs. The number of different IDs in each protein family and the number of unique protein sequences in each family are also shown. The latter is often less than the former since many databases contain identical proteins with different identifiers (for example the same protein from different <i>$genusspecies</i> genome sequences).', ''); this.tooltip.addHandler(); return false;", href=>"Html/ProteinFamilies.html"}, "Help"),
455 :     );
456 :    
457 :     foreach my $fam ($fig->families_for_protein($peg))
458 :     {
459 :     my $ffunc=$fig->family_function($fam) || " &nbsp; ";
460 :     push @$tab, [
461 :     &protein_link($peg),
462 :     scalar($fig->function_of($peg)),
463 :     "<a href=\'proteinfamilies.cgi?family=$fam&user=$user\'>$fam</a>",
464 :     $ffunc,
465 :     $fig->ext_sz_family($fam),
466 :     $fig->sz_family($fam),
467 :     ];
468 :     $allfams->{$fam}="$fam :" . $fig->family_function($fam);
469 :     }
470 : overbeek 1.19
471 : overbeek 1.17
472 :     $tab=&HTML::merge_table_rows($tab, {3=>1, 4=>1});
473 : overbeek 1.27
474 : overbeek 1.22 my $radbut={
475 :     "1not2"=>"In family one and NOT family two\n",
476 :     "2not1"=>"In family two and NOT family one\n",
477 : overbeek 1.31 "1and2"=>"In family one and family two\n",
478 :     "1or2" =>"In family one or family two\n",
479 : overbeek 1.22 };
480 : overbeek 1.27
481 :     # sort the list of families in this table but put the fig families at the beginning of the list
482 :     my @familylist=sort {$a cmp $b} grep {$_ !~ /^fig/} keys %$allfams;
483 :     unshift @familylist, sort {$a cmp $b} grep {$_ =~ /^fig/} keys %$allfams;
484 :    
485 :     my $col_hdrs=['Protein', 'Function', 'Family', 'Family Function', 'External<br>IDs In<br>Family', 'Unique<br>Proteins<br>In Family'];
486 : overbeek 1.21 push @$html, &HTML::make_table($col_hdrs, $tab, ""), "\n", $cgi->p("To differentiate families in this table, please choose two families:"),
487 : overbeek 1.27 "Family 1: &nbsp; ", $cgi->popup_menu(-name=>"family1", -values=>\@familylist, -labels=>$allfams, -default=>$familylist[0]),
488 :     " <br /> Family 2: &nbsp; ", $cgi->popup_menu(-name=>"family2", -values=>\@familylist, -labels=>$allfams, -default=>$familylist[1]),
489 : overbeek 1.18 $cgi->p("Show proteins:<br /><ul>\n",
490 : overbeek 1.27 $cgi->radio_group(-name=>"diff", -values=>[keys %$radbut], -labels=>$radbut, -rows=>4),
491 : overbeek 1.22 "</ul>\n"),
492 :     $cgi->hidden(-name=>'prot', -value=>$peg),
493 : overbeek 1.27 $cgi->submit(-name=>"differentiate", -value=>"Compare these families"), $cgi->reset, $cgi->end_form();
494 : overbeek 1.17 }
495 :    
496 : overbeek 1.27 # initially I added this option in, with appropriate help text, but then I added the union option, and I think that surplants this, so I removed it!
497 :     # $cgi->a({class=>"help", onMouseover=>"javascript:if(!this.tooltip) this.tooltip=new Popup_Tooltip(this, 'Help', 'The table will show only those proteins from one of the families that contains proteins that could be in the other family but are not. Most likely this will only be a few proteins, but the table above shows that there are a large number of proteins in one family but not another. Many of these are proteins that are missing from the database. Checking this box will show all the proteins from the chosen families, not just those with missing proteins. By default, you should not check this box.', ''); this.tooltip.addHandler(); return false;", href=>"Html/ProteinFamilies.html"}, "Help"),
498 :     # "Show all correspondences, not just those with missing proteins: ", $cgi->checkbox(-name=>"show", -value=>"all", -label=>""),
499 :    
500 : overbeek 1.22
501 : overbeek 1.17 sub differentiate {
502 : overbeek 1.22 ($fig, $cgi, $html)=@_;
503 :    
504 :     #my $focus=$cgi->param('focus') or 'all'; # these are the things that we are interested in
505 :     #undef $focus if ($focus eq "all");
506 :     my $focus=$cgi->param('family2');
507 :     $focus =~ s/\|.*//;
508 :    
509 :     my ($fam_id1, $fam_id2)=($cgi->param('family1'), $cgi->param('family2'));
510 :     if ($fam_id1 eq $fam_id2)
511 :     {
512 :     push @$html, "<h2 style='color: red'>Please choose two different protein families</h2>";
513 :     return;
514 :     }
515 :     my ($fam1, $fam2)=([$fig->ids_in_family($fam_id1)], [$fig->ids_in_family($fam_id2)]);
516 : overbeek 1.27
517 :     # This block of code is the one that really decides what the data is that is analyzed.
518 :     # $fam1 and $fam2 are references to arrays containing all the CIDs in each family. We combine them with intersection, union, or differences
519 :     # and then process the remaining IDs.
520 :    
521 :     my $set=[];
522 :     if ($cgi->param("diff") eq "1and2")
523 :     {
524 :     $set=&set_utilities::intersection($fam1, $fam2);
525 :     }
526 :     elsif ($cgi->param("diff") eq "1not2")
527 :     {
528 :     $set=&set_utilities::set_diff($fam1, $fam2);
529 :     }
530 :     elsif ($cgi->param("diff") eq "2not1")
531 :     {
532 :     $set=&set_utilities::set_diff($fam2, $fam1);
533 :     }
534 :     elsif ($cgi->param("diff") eq "1or2")
535 : overbeek 1.22 {
536 : overbeek 1.27 $set=&set_utilities::union($fam1, $fam2);
537 :     }
538 :    
539 : overbeek 1.32 #### which families
540 :     #
541 :     # We need to figure out which other families to compare to (i.e. what families will be in the columns on the table).
542 :     # There are two ways to do this. If we come from the perspective of a single protein, we just want to look at the families
543 :     # that protein maps to. This is the regular view.
544 :     #
545 :     # Alternatively, if we come from the perspective of the whole family, we will just work out all the families
546 :     # that are present in @$set
547 :     #
548 :     # Note that in both cases we grep out our two query families and then unshift them to make them the first two query columns
549 :    
550 :     my @families;
551 :     if ($cgi->param('prot'))
552 :     {
553 :     my $peg=$cgi->param('prot');
554 :     @families=sort {$a cmp $b} grep {$_ ne $fam_id1} grep {$_ ne $fam_id2} $fig->families_for_protein($peg);
555 :     }
556 :     else
557 :     {
558 :     my %allfams;
559 :     foreach my $cid (@$set)
560 :     {
561 :     foreach my $infamily ($fig->in_family($cid))
562 :     {
563 :     $allfams{$infamily}++;
564 :     }
565 :     }
566 :     @families=sort {$allfams{$b} <=> $allfams{$a}} grep {$_ ne $fam_id1} grep {$_ ne $fam_id2} keys %allfams;
567 :     }
568 :    
569 :     unshift @families, ($fam_id1, $fam_id2);
570 :     my @source=@families;
571 :     map {/^(.*?)\|/; $_=$1} @source;
572 :    
573 :     # now figure out all the external IDs in those families
574 :     my $extids;
575 :     foreach my $fam (@families)
576 :     {
577 :     map {$extids->{$_}->{$fam}=1} $fig->ext_ids_in_family($fam);
578 :     }
579 : overbeek 1.31
580 : overbeek 1.32 # finally generate the table. Note that there are three different arrays that we operate on depending on the user input
581 :     # but it really only changes which set algorith we use. Each array is handled identically.
582 :     my $tab=[];
583 :    
584 :     my ($totalfor, $totalagainst);
585 : overbeek 1.27 foreach my $cid (@$set)
586 :     {
587 :     #my $row=["<a href='proteinfamilies.cgi?prot=$cid'>$cid</a>"];
588 :     my $row=[];
589 : overbeek 1.32 my $seen; my $mismatchcolor; my $nofamcolor;
590 : overbeek 1.31 my ($for, $against)=(0,0);
591 : overbeek 1.27 foreach my $prot (sort $fig->cid_to_prots($cid))
592 : overbeek 1.22 {
593 : overbeek 1.27 for (my $i=0; $i<=$#families; $i++)
594 : overbeek 1.22 {
595 :     # add the protein info to the cell in the table if it this family has that protein. Note that we have seen it, and increment the column counter
596 :     # this if is if the protein that we are looking at is in the family for this column then add it
597 :    
598 :     # if the protein is not added, we want to know if it has the same start characters as the family (i.e. from the same source), and note that.
599 : overbeek 1.27 if ($extids->{$prot}->{$families[$i]})
600 :     {
601 :     $seen->{$prot}=1;
602 :     $row->[$i] .= &protein_link($prot, 1, $families[$i]) . "<br />";
603 : overbeek 1.31 $for++;
604 : overbeek 1.27 }
605 :     elsif ($prot =~ /^$source[$i]/)
606 :     {
607 : overbeek 1.32 if ($fig->ext_family_for_id($prot)) {$mismatchcolor->{$i}=1} else {$nofamcolor->{$i}=1}
608 : overbeek 1.27 $row->[$i] .= &protein_link($prot, 1, $families[$i]) . "<br />";
609 : overbeek 1.31 $against--; # note that against is a negative score!
610 : overbeek 1.27 }
611 :     }
612 :     }
613 :    
614 :     unless ($#$row == $#families) {$#$row=$#families}
615 : overbeek 1.22
616 : overbeek 1.27 # color those cells that have a mismatch. note that this colors the whole cell even if there is more than one protein mismatching
617 : overbeek 1.32 map {$row->[$_] = [$row->[$_], "td style='background-color: #CCCCFF'"]} keys %$nofamcolor;
618 : overbeek 1.22 map {$row->[$_] = [$row->[$_], "td style='background-color: #FF3366'"]} keys %$mismatchcolor;
619 : overbeek 1.27 # change empty cells
620 : overbeek 1.22 map {$row->[$_] = " &nbsp; " unless ($row->[$_])} (0 .. $#$row);
621 :    
622 : overbeek 1.31 # add the score
623 : overbeek 1.32 splice(@$row, 0, 0, "$for/$against");
624 : overbeek 1.31
625 : overbeek 1.22 # if we want to show everything do so, otherwise only show the rows where there is a missing protein
626 : overbeek 1.27 if (($cgi->param("diff") eq "1and2") || ($cgi->param("diff") eq "1or2") || ($cgi->param('show') eq "all"))
627 : overbeek 1.22 {
628 :     push @$tab, $row;
629 : overbeek 1.32 $totalfor+=$for; $totalagainst+=$against;
630 : overbeek 1.22 }
631 : overbeek 1.31 elsif ($cgi->param("diff") eq "1not2" && $row->[2] ne " &nbsp; ")
632 : overbeek 1.22 {
633 :     push @$tab, $row;
634 : overbeek 1.32 $totalfor+=$for; $totalagainst+=$against;
635 : overbeek 1.22 }
636 : overbeek 1.32 elsif ($cgi->param("diff") eq "2not1" && $row->[1] ne " &nbsp; ")
637 : overbeek 1.25 {
638 : overbeek 1.26 #($row->[1], $row->[2])=($row->[2], $row->[1]);
639 : overbeek 1.25 push @$tab, $row;
640 : overbeek 1.32 $totalfor+=$for; $totalagainst+=$against;
641 : overbeek 1.25 }
642 : overbeek 1.22 }
643 : overbeek 1.32
644 :     # sort the table
645 :     @$tab=sort {$a->[1] cmp $b->[1] || $a->[2] cmp $b->[2]} @$tab;
646 : overbeek 1.27
647 :     #generate the titles
648 :     my $title;
649 : overbeek 1.22 ($cgi->param("diff") eq "1and2") ? $title.=$fig->family_function($fam_id1). " ($fam_id1) AND in " . $fig->family_function($fam_id2). " ($fam_id2)\n" :
650 : overbeek 1.27 ($cgi->param("diff") eq "1or2") ? $title.=$fig->family_function($fam_id1). " ($fam_id1) OR in " . $fig->family_function($fam_id2). " ($fam_id2)\n" :
651 : overbeek 1.22 ($cgi->param("diff") eq "1not2") ? $title.=$fig->family_function($fam_id1). " ($fam_id1) BUT NOT in " . $fig->family_function($fam_id2). " ($fam_id2)\n" :
652 :     ($cgi->param("diff") eq "2not1") ? $title.=$fig->family_function($fam_id2). " ($fam_id2) BUT NOT in " . $fig->family_function($fam_id1). " ($fam_id1)\n" : 1;
653 : overbeek 1.27
654 :     push @$html, (
655 :     "<h3 style='text-align: center'>Comparison of proteins in $title</h3>\n",
656 :     "<p>The table shows those proteins that are in $title. The rows show proteins with the same sequence. All the IDs along the row are different IDs from identical protein sequences. The columns of the table are different protein families. If a protein in an individual cell has a red background, that protein is either not in the family for that column, or is in that family and another family in the same database.</p>\n",
657 : overbeek 1.32 "<p>To find out which families each protein is in, scroll the mouse over the link to the protein database. The popup window will show you the protein families for each protein. Note, however, if this protein is in the family for that column only that family is currently shown. If the pop-up window has a green background the protein is in the same family as the column as a whole. If the pop-up window has a red background the protein is in a different family than the column as a whole. If the pop-up window has a blue background the protein is not in any protein families. The red background and the red cell color and the blue background and blue cell color are complimentary and reinforce that these are proteins you should look at.</p>\n",
658 : overbeek 1.27 );
659 :    
660 : overbeek 1.22
661 :     my @headers=@families;
662 : overbeek 1.26 map {$_ = "<a " . FIGjs::mouseover("Column Family", $fig->family_function($_) . " ($_)", '') . " href='proteinfamilies.cgi?family=$_'>$_</a>"} @headers;
663 : overbeek 1.32 splice(@headers, 0, 0, "Score");
664 :     if ($tab->[0])
665 : overbeek 1.26 {
666 : overbeek 1.27 push @$html, HTML::make_table(\@headers, $tab, "Proteins In $title");
667 : overbeek 1.32 push @$html, "<p><b>Total Score: </b><br>\nSupporting this comparison between families: <b>$totalfor</b><br>\n",
668 :     "Not supporting this comparison between families: <b>$totalagainst</b></p>\n";
669 :    
670 :    
671 : overbeek 1.26 }
672 :     else
673 :     {
674 : overbeek 1.27 my $sorry="<p>Sorry there were no protein families that satisfied looking for</p>\n<p>$title</p>";
675 :     if (($cgi->param("diff") eq "1not2") || ($cgi->param("diff") eq "2not1")) {$sorry .= "<p>and had candidate proteins that could be in those families</p>"}
676 :     push @$html, $cgi->h3("<div style='color: blue; text-align: center'>$sorry</span>");
677 : overbeek 1.18 }
678 :     }
679 :    
680 : overbeek 1.27 =head2 protein_link()
681 : overbeek 1.18
682 : overbeek 1.27 This takes a protein ID and returns the link (full link including ID) back to the appropriate database.
683 : overbeek 1.18
684 : overbeek 1.27 If addmouseover is true then a mouseover will be added showing the families the peg is in
685 : overbeek 1.21
686 : overbeek 1.27 If $fam is provided it will be the header of the mouseover popup.
687 : overbeek 1.21
688 : overbeek 1.27 =cut
689 : overbeek 1.22
690 :     sub protein_link {
691 : overbeek 1.27 my ($p, $addmouseover, $fam) =@_;
692 : overbeek 1.22 my %proteinbase=(
693 : overbeek 1.30 # put here the link to the site for the proteins.
694 :     # Note the use of single quotes. Later, anywhere where the protein ID should be I'll replace \$p with the protein id.
695 :     # This is because for some reason cog needs the protein ID twice.
696 :     "fig" => "protein.cgi?user=$user&prot=fig|" . '$p',
697 :     "cog" => "http://www.ncbi.nlm.nih.gov/COG/old/blux.cgi?cog=" . '$p&$p',
698 :     "sp" => "http://www.expasy.org/uniprot/" . '$p',
699 :     "tr" => "http://www.expasy.org/uniprot/" . '$p',
700 :     "kegg" => "http://www.genome.jp/dbget-bin/www_bget?" .'$p',
701 : overbeek 1.22 );
702 : overbeek 1.30
703 :     # this was the old link that I think is wrong
704 :     # "cog" => "http://www.ncbi.nlm.nih.gov/COG/old/palox.cgi?",
705 :    
706 : overbeek 1.26 my $mouseovertitle="Protein Families";
707 :     if ($fam)
708 :     {
709 : overbeek 1.27 $mouseovertitle="<i>Column family: " . $fig->family_function($fam) . " ($fam)</i>";
710 :     }
711 :     my $familiesforp = "<b>Families for $p:</b><br>";
712 :     my ($hcolor, $bgcolor)=('#11AA66','#BBFFBB'); # text background color.
713 :    
714 :     # if the protein is in our family of interest show just that, otherwise show all the families
715 :     my @thisfam=$fig->ext_family_for_id($p);
716 :     if (grep {$_ eq $fam} @thisfam)
717 :     {
718 :     $familiesforp .= $fig->family_function($fam) . " ($fam)";
719 :     }
720 :     elsif (scalar(@thisfam))
721 :     {
722 :     $familiesforp .= "<ul>" . join("", map {"<li> " . $fig->family_function($_) . " ($_)</li>"} @thisfam) . "</ul>";
723 :     if (!$fam) {($hcolor, $bgcolor)=('','')} # use the default colors
724 :     else {($hcolor, $bgcolor)=('#CC0000', '#FF3366')} # we're doing a comparison and the families are different so color them red
725 :     }
726 :     else
727 :     {
728 :     $familiesforp="<b>Families for $p:</b><br>No protein families";
729 : overbeek 1.29 ($hcolor, $bgcolor)=('','');
730 : overbeek 1.27 }
731 :    
732 :     foreach my $key (keys %proteinbase)
733 :     {
734 :     if ($p =~ /^$key/ && $addmouseover)
735 :     {
736 :     $p =~ s/^(.*?)\|//;
737 : overbeek 1.30 my $dbase=$1;
738 :     my $location=$proteinbase{$key};
739 :     $location =~ s/\$p/$p/g; # this is for stupid cog that requires two invocations of the protein ID
740 : overbeek 1.32 $p = "<a " . FIGjs::mouseover($mouseovertitle, $familiesforp, '', '1', $hcolor, $bgcolor) . " href='$location' target='window_$$'>$dbase|$p</a>";
741 : overbeek 1.27 }
742 :     elsif ($p =~ /^$key/)
743 :     {
744 :     $p =~ s/^(.*?)\|//;
745 : overbeek 1.30 my $dbase=$1;
746 :     my $location=$proteinbase{$key};
747 :     $location =~ s/\$p/$p/g; # this is for stupid cog that requires two invocations of the protein ID
748 : overbeek 1.32 $p = "<a href='$location' target='window_$$'>$dbase|$p</a>";
749 : overbeek 1.27 }
750 : overbeek 1.26 }
751 : overbeek 1.22 return $p;
752 :     }
753 :    
754 : overbeek 1.21
755 : overbeek 1.32 sub comparepairs{
756 :     ($fig, $cgi, $html)=@_;
757 :     # very very experimental. Don't look at this code or use it
758 :     # we are going to take a protein family and then look at all the ids in that family and count the occurences of other families
759 :     # then find the proteins and give them a score
760 :    
761 :     my $fam=$cgi->param("family");
762 :     return unless ($fam);
763 : overbeek 1.21
764 : overbeek 1.32 my $count;
765 :     foreach my $cid ($fig->ids_in_family($fam))
766 :     {
767 :     foreach my $newfamily ($fig->in_family($cid))
768 :     {
769 :     $count->{$newfamily}++;
770 :     }
771 :     }
772 :    
773 :     my $tab;
774 :     foreach my $test (keys %$count)
775 :     {
776 :     next if ($test eq $fam);
777 :     my $res=&score_two_families($fam, $test, "or");
778 :     my $link1="<a href=\"proteinfamilies.cgi?user=$user&family1=$fam&family2=$test&diff=2not1&differentiate=Compare+these+families\" target=\"window_$$\">$fam</a>";
779 :     my $link2="<a href=\"proteinfamilies.cgi?user=$user&family=$test\" target=\"window_$$\">$test</a>";
780 :     push @$tab, [$link1, $link2, $count->{$test}, @$res];
781 :     }
782 :    
783 :     @$tab=sort {$b->[2] <=> $a->[2] || $b->[3] <=> $a->[3] || $a->[4] <=> $b->[4]} @$tab;
784 :    
785 :    
786 :     push @$html,
787 :     $cgi->h3("Comparisons between protein families"),
788 :     $cgi->p("The table below shows a comparison of <b>", $fig->family_function($fam), " ($fam)</b> with all the other protein families ",
789 :     " that proteins in $fam are also present in. The link in the first column will take you to a table showing proteins that are in family two ",
790 :     " that are not in $fam. These are proteins that you should investigate for being in your family. ",
791 :     " The link in the second column will take you to that families page. The <b>frequency</b> is the number of proteins in $fam that are also in the second family.",
792 :     " The <b>For</b> score is the total number of proteins that agree both families are similar. The <b>Against</b> score is the total number of proteins ",
793 :     " that disagree that the two proteins are similar. These numbers are the sum of the for and against scores shown under the side-by-side comparison ",
794 :     " when you click the link in column 1."),
795 :     HTML::make_table(["Family 1", "Family 2", "Frequency", "For", "Against"], $tab, "Scores");
796 :     }
797 :    
798 :     sub score_two_families {
799 :     my ($fam_id1, $fam_id2, $method)=@_;
800 :     my $focus=$fam_id1;
801 :     $focus =~ s/\|.*//;
802 :    
803 :     return unless ($fam_id1 && $fam_id2);
804 :     my ($fam1, $fam2)=([$fig->ids_in_family($fam_id1)], [$fig->ids_in_family($fam_id2)]);
805 :    
806 :     my $set=[];
807 :     if ($method eq "and") {$set=&set_utilities::intersection($fam1, $fam2)}
808 :     elsif ($method eq "not") {$set=&set_utilities::set_diff($fam1, $fam2)}
809 :     elsif ($method eq "or") {$set=&set_utilities::union($fam1, $fam2)}
810 :    
811 :     # now figure out all the families represented by @$set
812 :     my %allfams;
813 :     foreach my $cid (@$set)
814 :     {
815 :     foreach my $infamily ($fig->in_family($cid))
816 :     {
817 :     $allfams{$infamily}++;
818 :     }
819 :     }
820 :    
821 :     my @families=sort {$allfams{$b} <=> $allfams{$a}} grep {$_ ne $fam_id1} grep {$_ ne $fam_id2} keys %allfams;
822 :     unshift @families, ($fam_id1, $fam_id2);
823 :     my @source=@families;
824 :     map {/^(.*?)\|/; $_=$1} @source;
825 :    
826 :     # now figure out all the external IDs in those families
827 :     my $extids;
828 :     foreach my $fam (@families) {map {$extids->{$_}->{$fam}=1} $fig->ext_ids_in_family($fam)}
829 :    
830 :     my ($totalfor, $totalagainst);
831 :     foreach my $cid (@$set)
832 :     {
833 :     my ($for, $against)=(0,0);
834 :     foreach my $prot (sort $fig->cid_to_prots($cid))
835 :     {
836 :     for (my $i=0; $i<=$#families; $i++)
837 :     {
838 :     if ($extids->{$prot}->{$families[$i]})
839 :     {
840 :     $for++;
841 :     }
842 :     elsif ($prot =~ /^$source[$i]/)
843 :     {
844 :     $against--; # note that against is a negative score!
845 :     }
846 :     }
847 :     }
848 :    
849 :     $totalfor+=$for; $totalagainst+=$against;
850 :     }
851 :     return [$totalfor, $totalagainst];
852 :     }
853 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3