[Bio] / FigWebServices / proteinfamilies.cgi Repository:
ViewVC logotype

Annotation of /FigWebServices/proteinfamilies.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.24 - (view) (download)

1 : redwards 1.1 # -*- perl -*-
2 :    
3 :     =pod
4 :    
5 :     =head1 proteinfamilies.cgi
6 :    
7 : redwards 1.3 A base web interface for getting information about protein families in and out. Initially we are going to make a 3 (or 4) column table of protein, family and other proteins.
8 : redwards 1.1
9 : redwards 1.7 PLEASE NOTE: Do not attempt to read or understand this code. Please leave now. It is a complete mess because it is very experimental and we are trying stuff out. none of it will work. The exit is this way ---->
10 :    
11 : redwards 1.1 =cut
12 :    
13 :     use strict;
14 :     use FIG;
15 :     use HTML;
16 :     use raelib;
17 :     use CGI;
18 : overbeek 1.11 use CGI::Carp qw(fatalsToBrowser);
19 : redwards 1.1 my $cgi=new CGI;
20 :     use LWP::Simple qw(!head); # see the caveat in perldoc LWP about importing two head methods.
21 :    
22 :     my $fig;
23 :     eval {
24 :     $fig = new FIG;
25 :     };
26 :    
27 :     if ($@ ne "")
28 :     {
29 :     my $err = $@;
30 :    
31 :     my(@html);
32 :    
33 :     push(@html, $cgi->p("Error connecting to SEED database."));
34 :     if ($err =~ /Could not connect to DBI:.*could not connect to server/)
35 :     {
36 :     push(@html, $cgi->p("Could not connect to relational database of type $FIG_Config::dbms named $FIG_Config::db on port $FIG_Config::dbport."));
37 :     }
38 :     else
39 :     {
40 :     push(@html, $cgi->pre($err));
41 :     }
42 :     &HTML::show_page($cgi, \@html, 1);
43 :     exit;
44 :     }
45 :    
46 :     my $html = [];
47 :     my $user = $cgi->param('user');
48 :    
49 :     unshift(@$html, "<TITLE>The SEED - Global Protein Families </TITLE>\n");
50 :    
51 : redwards 1.2
52 :     if ($cgi->param('Show Proteins In Each Family'))
53 : redwards 1.1 {
54 : redwards 1.2 my @needed=grep {$cgi->param($_)} $cgi->param("allfams");
55 :     $cgi->param(-name=>'family', -value=>\@needed);
56 :     &show_family($fig,$cgi,$html);
57 :     }
58 : overbeek 1.11 elsif ($cgi->param('analyse_family')) {
59 : overbeek 1.12 # finding what is there is the same as findingh what is missing you just need one extra !
60 :     # these two things call the same method.
61 :     &analyse_family($fig,$cgi,$html, 0);
62 : redwards 1.7 }
63 : redwards 1.10 elsif ($cgi->param('reverse_analyse_family')) {
64 : overbeek 1.12 &analyse_family($fig,$cgi,$html, 1);
65 : redwards 1.10 }
66 : overbeek 1.17 elsif ($cgi->param('equivalence'))
67 :     {
68 :     &set_of_equivs($fig,$cgi,$html);
69 :     }
70 :     elsif ($cgi->param('differentiate'))
71 :     {
72 : overbeek 1.22 push @$html, $cgi->h3({style=>"color: red"}, "Scroll down to see the table");
73 :     &set_of_equivs($fig,$cgi,$html);
74 : overbeek 1.17 &differentiate($fig,$cgi,$html);
75 :     }
76 : redwards 1.2 elsif ($cgi->param('family'))
77 :     {
78 :     &show_family($fig,$cgi,$html);
79 : redwards 1.1 }
80 :     elsif ($cgi->param('prot'))
81 :     {
82 : redwards 1.2 &show_protein($fig,$cgi,$html);
83 : redwards 1.1 }
84 : overbeek 1.14 elsif ($cgi->param('fig2kegg'))
85 :     {
86 :     &comparefig2kegg($fig,$cgi,$html);
87 :     }
88 : redwards 1.1 else
89 :     {
90 :     &show_initial($fig,$cgi,$html);
91 :     }
92 :    
93 :     &HTML::show_page($cgi,$html,1);
94 :     exit;
95 :    
96 :    
97 :     sub show_initial {
98 :     my ($fig,$cgi,$html)=@_;
99 :     # generate a blank page
100 : redwards 1.2 push @$html,
101 :     "<h2>Protein Families</h2>\n",
102 :     "<p>Please enter a protein ID . You will recieve a list of all the families that protein is in. \n",
103 :     "You can use a FIG ID such as fig|83333.1.peg.3, or an ID from SwissProt, KEGG, NCBI, and others.</p>",
104 : overbeek 1.17 $cgi->start_form(-method=>'get'),
105 : redwards 1.2 "Please enter a protein id: ", $cgi->textfield(-name=>"prot", -size=>40), "<br>",
106 : overbeek 1.22 $cgi->submit(-name=>'equivalence', -value=>"Show an equivalence table"), $cgi->reset, $cgi->end_form;
107 : redwards 1.1 return $html;
108 :     }
109 :    
110 : overbeek 1.22 # "<p>Alternately, you can enter a family. Please enter a family name in the format pir|PIRSF001547 or fig|PF002363.</p>",
111 :     # "Please enter a family id: ", $cgi->textfield(-name=>"family", -size=>40), "<br>",
112 :     # $cgi->submit, $cgi->reset, $cgi->end_form;
113 : redwards 1.7
114 : redwards 1.2 sub show_family {
115 : redwards 1.1 my ($fig,$cgi,$html)=@_;
116 : redwards 1.2 foreach my $fam ($cgi->param('family')) {
117 : redwards 1.4 my @cids=sort {$a <=> $b} $fig->ids_in_family($fam);
118 : redwards 1.2 my $tab=[];
119 : overbeek 1.22 my $col_hdrs=['CID', 'Polypeptides with same amino acid sequence'];
120 : redwards 1.4 foreach my $cid (@cids) {
121 :     my @pegs=$fig->cid_to_prots($cid);
122 : overbeek 1.22 @pegs=map {&protein_link($_)} @pegs;
123 : redwards 1.4 push @$tab, [$cid, (join ", ", (@pegs))];
124 : redwards 1.1 }
125 : redwards 1.2
126 :     push @$html, "<h2>$fam Family</h2>\n",
127 :     "<p>The family $fam has the function ", $fig->family_function($fam), ", and contains ", $fig->sz_family($fam), " proteins, as shown in the table below.<br>",
128 : redwards 1.3 "Each of the sequences with a given ID have the same amino acid sequence, and hence are the same polypeptide, ",
129 :     "even though they may come from different organisms.</p>",
130 : redwards 1.2 "<p>The links will take you to the respective databases for each of the other protein families.\n</p>",
131 : overbeek 1.17 $cgi->start_form(-method=>'get'),
132 : redwards 1.2 &HTML::make_table($col_hdrs, $tab, "Proteins in " . $fig->family_function($fam) . " ($fam)"),
133 : overbeek 1.13 $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>'family', -value=>"$fam"),
134 : overbeek 1.12 $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
135 :     $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
136 : redwards 1.2 $cgi->end_form;
137 : redwards 1.1 }
138 :     }
139 :    
140 : overbeek 1.11 sub show_protein {
141 : redwards 1.1 my ($fig,$cgi,$html)=@_;
142 : overbeek 1.11 foreach my $peg ($cgi->param('prot')) {
143 : overbeek 1.22 my @families;
144 :     if ($peg =~ /^\d+$/)
145 :     {
146 :     # it is a cid
147 :     @families=$fig->in_family($peg);
148 :     $peg = "CID $peg";
149 :     }
150 :     else
151 :     {
152 :     @families=$fig->families_for_protein($peg);
153 :     }
154 :    
155 : overbeek 1.11 unless (@families)
156 :     {
157 :     push @$html, "<h2 style='color: red'>Sorry, $peg is not in any protein families</h2>";
158 :     return;
159 : redwards 1.1 }
160 :    
161 : redwards 1.2 my $tab=[];
162 : overbeek 1.20 my $self=$cgi->url(-relative => 1);
163 : overbeek 1.11 foreach my $fam (@families) {
164 :     my %idcount;
165 :     my $noprots=scalar(map {$idcount{$_}=1} $fig->ids_in_family($fam));
166 :     #push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $fig->sz_family($fam), $cgi->checkbox(-name=>$fam, -label=>'')];
167 :     push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $noprots, $cgi->checkbox(-name=>$fam, -label=>'')];
168 : redwards 1.2 }
169 : overbeek 1.11
170 : overbeek 1.17 my $col_hdrs=['Family ID', 'Family Function', 'Number of CIDs in Family', 'Choose Family'];
171 : overbeek 1.11 push @$html, "<h2>Families for $peg</h2>\n",
172 : overbeek 1.17 $cgi->start_form(-method=>'get'),
173 : overbeek 1.11 "<p>$peg is in the following ", scalar(@families), " families. Please choose one or more families using the checkboxes</p>\n",
174 : overbeek 1.17 "A CID is a unique, internal ID we have assigned to proteins with identical sequences",
175 : overbeek 1.11 &HTML::make_table($col_hdrs, $tab, "Families for $peg"), "\n",
176 :     $cgi->submit('Show Proteins In Each Family'),
177 :     $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
178 :     $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
179 :     $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>"allfams", -value=>\@families), "\n",
180 :     $cgi->reset, $cgi->end_form;
181 : redwards 1.2 }
182 :     }
183 : redwards 1.10
184 :    
185 : overbeek 1.12
186 :    
187 : redwards 1.10 sub analyse_family {
188 : overbeek 1.12 my ($fig,$cgi,$html, $reverse)=@_;
189 : redwards 1.10 # here are the questions:
190 :     # 1. Given a column in a spreadsheet:
191 :     # 2. Here are the proteins in that column
192 :     # 3. For each protein, here are the families that they are in. How many families are unique and how many families is every protein in?
193 :     # if we start with a column of 10 proteins, and nine of them are all in the same families and one is not, we want to exclude the one and keep the nine.
194 :     # so we recommend that a protein be removed from a family.
195 :     # 4. For each of the families that are good, which proteins are there in some/most of the families that are not in the column that we are looking at
196 :     # 5. For each of the families that are good, which proteins are only in one of those families and not in any others?
197 :    
198 :     # Note that column == family, But start with fig and then allow a replace ID feature like before.
199 :    
200 : overbeek 1.15 my $focus=$cgi->param('focus') or 'all'; # these are the things that we are interested in
201 :     undef $focus if ($focus eq "all");
202 : overbeek 1.12
203 :     my @cols;
204 :     if ($cgi->param("allfams")) {@cols=grep {$cgi->param($_)} $cgi->param("allfams")}
205 :     elsif ($cgi->param("family")) {push @cols, $cgi->param('family')}
206 :     else {die "No families declared!"}
207 :    
208 :     foreach my $col (@cols)
209 : redwards 1.10 {
210 :     # $col is the column in the spreadsheet. This is really a family, but to visualize and code this I am doing it in a FIG-centric way
211 : overbeek 1.12 my $proteins_in_col;
212 :     map {$proteins_in_col->{$_}=1} $fig->ids_in_family($col);
213 : redwards 1.10
214 :     # @proteins are the proteins in that column, although these are cids and not fids at the moment
215 :     my $familycount;
216 : overbeek 1.12 foreach my $prot (keys %$proteins_in_col) {
217 : redwards 1.10 foreach my $fam ($fig->in_family($prot)) {
218 :     $familycount->{$fam}++;
219 :     }
220 :     }
221 :    
222 :     my $count_of;
223 :     my $fams;
224 : overbeek 1.12 foreach my $f (sort {$familycount->{$b} <=> $familycount->{$a}} keys %$familycount)
225 :     {
226 :     if ($reverse) {($fams, $count_of)=&ids_missing_from_fam($fig, $f, $focus, $proteins_in_col, $fams, $count_of)}
227 :     else {($fams, $count_of)=&ids_are_in_fam($fig, $f, $focus, $proteins_in_col, $fams, $count_of)}
228 :     }
229 : redwards 1.10
230 :     # create a list of families that we know about
231 : overbeek 1.11 my @fams=grep {!/$col/} sort {scalar(keys %{$fams->{$b}}) <=> scalar(keys %{$fams->{$a}})} keys %$fams;
232 :     unshift @fams, $col;
233 :    
234 : overbeek 1.15 my $tab=[[["Number of proteins in family", "th colspan=4"], map {scalar(keys %{$fams->{$_}})} @fams]];
235 : redwards 1.10
236 : overbeek 1.11 my @fids;
237 :     if ($cgi->param('sort') eq "genome")
238 :     {
239 :     @fids=sort {$fig->genome_of($a) <=> $fig->genome_of($b)} keys %$count_of;
240 :     }
241 : overbeek 1.15 elsif ($cgi->param('sort') eq "cid")
242 :     {
243 :     @fids=sort {$fig->prot_to_cid($a) <=> $fig->prot_to_cid($b)} keys %$count_of;
244 :     }
245 : overbeek 1.11 else
246 :     {
247 :     @fids=sort {scalar(keys %{$count_of->{$b}}) <=> scalar(keys %{$count_of->{$a}})} keys %$count_of;
248 :     }
249 :    
250 :     my $rowcount;
251 :     foreach my $fid (@fids)
252 :     {
253 : overbeek 1.15 my @row=(++$rowcount, $fig->prot_to_cid($fid), $fid, scalar(keys %{$count_of->{$fid}}));
254 : overbeek 1.11
255 :     foreach my $fam (@fams) {
256 :     $count_of->{$fid}->{$fam} ? push @row, ["Y", "td style='background-color: lightgrey; text-align: center'"] : push @row, " &nbsp ";
257 : redwards 1.10 }
258 :     push @$tab, \@row;
259 :     }
260 : overbeek 1.12
261 : overbeek 1.22 push @$html, $cgi->start_form(-method=>'get'), $cgi->p("Limit display to proteins from ", &choose_focus($cgi), "\n"), $cgi->p("Sort the order by ", &choose_sort($cgi),"\n");
262 : overbeek 1.12 if ($reverse) {
263 :     push @$html, $cgi->p("These are proteins that ARE NOT in ", $fig->family_function($col), " ($col) but are in other families that have proteins in this family.");
264 :     } else {
265 :     push @$html, $cgi->p("These are proteins that ARE in ", $fig->family_function($col), " ($col) and are in other families that have proteins in this family.");
266 :     }
267 :    
268 : redwards 1.16 # merge cells in the table
269 :     my $skip;
270 :     map {$skip->{$_}=1} (0, 2 .. 40); # ignore everything except column 2
271 :     $tab=&HTML::merge_table_rows($tab, $skip);
272 :    
273 : overbeek 1.12 push @$html,
274 :     $cgi->hidden(-name=>"family", -value=>@cols), $cgi->hidden("prot"), $cgi->hidden(-name=>"user"),
275 : overbeek 1.11 $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
276 :     $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
277 : overbeek 1.15 &HTML::make_table(["Count", "Unique ID", "Protein ID", "Number of fams protein is in", @fams], $tab,' &nbsp; ');
278 : redwards 1.10 }
279 :     }
280 :    
281 :    
282 : overbeek 1.12 sub ids_are_in_fam {
283 :     my ($fig, $f, $focus, $proteins_in_col, $fams, $count_of)=@_;
284 :     # It seems that $sz_family is not right
285 :     map {$fams->{$f}->{$_}++; $count_of->{$_}->{$f}++}
286 :     grep {/^$focus/}
287 :     map {$fig->cid_to_prots($_)}
288 :     grep {$proteins_in_col->{$_}}
289 :     ($fig->ids_in_family($f));
290 :     return ($fams, $count_of);
291 :     }
292 : redwards 1.10
293 : overbeek 1.12 sub ids_missing_from_fam {
294 :     my ($fig, $f, $focus, $proteins_in_col, $fams, $count_of)=@_;
295 : redwards 1.10 # It seems that $sz_family is not right
296 : overbeek 1.11 map {$fams->{$f}->{$_}++; $count_of->{$_}->{$f}++}
297 : redwards 1.10 grep {/^$focus/}
298 :     map {$fig->cid_to_prots($_)}
299 : overbeek 1.12 grep {!$proteins_in_col->{$_}}
300 : redwards 1.10 ($fig->ids_in_family($f));
301 : overbeek 1.12 return ($fams, $count_of);
302 :     }
303 :    
304 : overbeek 1.11
305 : redwards 1.10
306 : overbeek 1.11 sub choose_focus {
307 :     my ($cgi)=@_;
308 :     my %choices=(
309 : overbeek 1.15 "all" => "All",
310 : overbeek 1.11 "fig" => "FIGfams",
311 :     "tigr" => "TIGRfams",
312 :     "pfam" => "PFAM",
313 :     "sp" => "SwissProt",
314 :     "kegg" => "KEGG",
315 :     "pir" => "PIR SuperFams",
316 :     "mcl" => "MCL",
317 :     "cog" => "COG",
318 :     );
319 :    
320 : overbeek 1.22 my $default = $cgi->param("focus"); unless ($default) {$default="fig"}
321 : overbeek 1.11
322 :     return $cgi->popup_menu(
323 :     -name => "focus",
324 : overbeek 1.15 -values => [sort {$choices{$a} cmp $choices{$b}} keys %choices],
325 :     -labels => \%choices,
326 :     -default => $default,
327 :     );
328 :     }
329 :    
330 :     sub choose_sort {
331 :     my ($cgi)=@_;
332 :     my %choices=(
333 :     "none" => "Number of Proteins",
334 :     "genome" => "Genome",
335 :     "cid" => "Unique ID",
336 :     );
337 :    
338 :     my $default = $cgi->param("sort"); unless ($default) {$default="none"}
339 :    
340 :     return $cgi->popup_menu(
341 :     -name => "sort",
342 :     -values => [sort {$choices{$a} cmp $choices{$b}} keys %choices],
343 : overbeek 1.11 -labels => \%choices,
344 :     -default => $default,
345 :     );
346 :     }
347 : redwards 1.10
348 : redwards 1.7
349 : overbeek 1.14 sub comparefig2kegg {
350 :     my ($fig,$cgi,$html)=@_;
351 :    
352 :     my $classification; my %subsystem;
353 :     # read classification from kegg file
354 :     if (open(IN, "$FIG_Config::global/ProteinFamilies/kegg_classificaation.txt")) {
355 :     while (<IN>) {
356 :     chomp;
357 :     my @a=split /\t/;
358 :     my $id=shift(@a);
359 :     $subsystem{"kegg|$id"}=pop(@a);
360 :     push @{$classification->{"kegg|$id"}}, \@a;
361 :     }
362 :     }
363 :    
364 :    
365 :     my $tab=[];
366 :     # find out what families our proteins are in
367 :     map {
368 :     my $prot=$_;
369 :     map {
370 :     my $fam=$_;
371 :     if ($fam =~ /^fig/) {
372 :     my %ss;
373 :     map {$ss{$_->[0]}++} ($fig->subsystems_for_role($fig->family_function($fam)));
374 :     map {my $ss=$_; push @$tab, [$prot, @{$fig->subsystem_classification($ss)}, $ss, $fam, $fig->family_function($fam)]} keys %ss;
375 :     }
376 :     else {
377 :     map {push @$tab, [$prot, @{$_}, $subsystem{$fam}, $fam, $fig->family_function($fam)]} @{$classification->{$fam}}
378 :     }
379 :     } grep {/^fig/ || /^kegg/} $fig->families_for_protein($prot);
380 :     } $cgi->param('proteins');
381 :    
382 :     my $col_hdrs=['Protein', ['Classification', 'th colspan=2'], 'Subsystem', 'Family ID', 'Family Function'];
383 :     push @$html, &HTML::make_table($col_hdrs, $tab, "Families"), "\n",
384 :     }
385 : overbeek 1.17
386 :     ## Based on request from Ross:
387 :     # Subject: Re: fig.pl
388 :     # Date: October 4, 2005 6:21:00 AM PDT
389 :     # From: Ross@theFIG.info
390 :     # To: raedwards@gmail.com
391 :     #
392 :     #Rob,
393 :     #
394 :     #It seems to me that you got that right, and the function is certainly at the
395 :     #core of what is needed. I have been thinking about what I would want with
396 :     #protein families,
397 :     #and it goes something like this:
398 :     #
399 :     #1. Given a protein FIG1, you can get the set of proteins with the same CID
400 :     #(call it CID1). Call this set EQUIV, since it is really a set of IDs that are
401 :     #equivalent.
402 :     #
403 :     #2. From the set of IDs in EQUIV, you can get the set of protein families (from
404 :     #all sources) that contain the IDs in EQUIV. This gives a table
405 :     #
406 :     #
407 :     # [,ID,Function,Family,FamilyFunction]
408 :     #
409 :     # All of the table entries describe a family containing CID1.
410 :     #
411 :     #3. From this table you select two Families to be compared (e.g., one KEGG
412 :     #family vs a FIG family). This ends the first part -- selecting the precise
413 :     #two
414 :     # families to be compared. Each of the two families should be thought of
415 :     #as [CID,ID,Family].
416 :     #
417 :     #4. The comparison of SET1 and SET2 uses essentially the function you
418 :     #implemented. You need to form three sets:
419 :     #
420 :     # the intersection of SET1 and SET2
421 :     # SET1 - SET2
422 :     # SET2 - SET1
423 :     #
424 :     # You may or may not wish to display each of the three sets. The user
425 :     #should be able to select which. When you think
426 :     # of one of these sets, it is useful to think of
427 :     #{CID,Family,Set-of-CIDs}. That is, it is not just a set of CIDs; it should be
428 :     #viewed as a
429 :     # set of CIDs from a specific family that was chosen because it
430 :     #contained a specific CID.
431 :     #
432 :     #5. When displaying a set of proteins from a given family, you start with
433 :     #(CID,Family,Set-of-CIDs). Each line should contain
434 :     #
435 :     # 1. A single CID from the Set-of-CIDs (call this CID2).
436 :     #
437 :     # 2. A count of the number of sources that place both CID1 and CID2
438 :     #in the same family (note that this is not a count of the families that include
439 :     #both CID1 and CID2)
440 :     #
441 :     # 3. For each source a "Y" or space indicating whether or not the
442 :     #source placed CID1 and CID2 into the same family (i.e., whether or not there
443 :     # is at least one family from the source that contains both
444 :     #CID1 and CID2).
445 :     #
446 :     #That is what I think should be done. Can we discuss it?
447 :     #
448 :    
449 :    
450 :    
451 :     sub set_of_equivs {
452 : overbeek 1.22 ($fig, $cgi, $html)=@_;
453 : overbeek 1.17 foreach my $peg ($cgi->param('prot')) {
454 :     my $cid=$fig->prot_to_cid($peg);
455 :     my @equiv=$fig->cid_to_prots($cid);
456 :     my $tab=[];
457 :     my $allfams;
458 : overbeek 1.19
459 : overbeek 1.21 #begin the html so we can add hidden things
460 :     push @$html, $cgi->start_form(-method=>'get'), $cgi->p("For protein <b>$peg</b>, which has the unique ID <b>$cid</b>, this is the EQUIV set.");
461 :     push @$html, $cgi->hidden(-name=>'querycid', -value=>$fig->prot_to_cid($peg));
462 :    
463 : overbeek 1.19 # this block will make a table with all IDs of all proteins in CID.
464 :     # Ross doesn't want that, so we don't use it
465 :     if (0)
466 :     {
467 :     map
468 :     {
469 :     my $id=$_;
470 :     map
471 :     {
472 :     push @$tab, [$fig->prot_to_cid($id), $id, &functionate($id), $_, $fig->family_function($_)];
473 :     $allfams->{$_}="$_ : " . $fig->family_function($_);
474 :     } $fig->families_for_protein($id);
475 :     } @equiv;
476 :     }
477 :     else
478 :     {
479 :     # instead we just show our query protein
480 :     map
481 :     {
482 :     my $ffunc=$fig->family_function($_) || " &nbsp; ";
483 :     push @$tab, [$fig->prot_to_cid($peg), $peg, &functionate($peg), $_, $ffunc, $fig->ext_sz_family($_)];
484 : overbeek 1.17 $allfams->{$_}="$_ : " . $fig->family_function($_);
485 : overbeek 1.19 } $fig->families_for_protein($peg);
486 :     }
487 :    
488 : overbeek 1.17
489 :     $tab=&HTML::merge_table_rows($tab, {3=>1, 4=>1});
490 : overbeek 1.22 my $radbut={
491 :     "1not2"=>"In family one and NOT family two\n",
492 :     "2not1"=>"In family two and NOT family one\n",
493 :     "1and2"=>"In both families\n"
494 :     };
495 :     my $col_hdrs=['CID', 'Protein', 'Function of Proteins', 'Family', 'Family Function', 'Ext. IDs In Family'];
496 : overbeek 1.21 push @$html, &HTML::make_table($col_hdrs, $tab, ""), "\n", $cgi->p("To differentiate families in this table, please choose two families:"),
497 : overbeek 1.22 "Family 1: &nbsp; ", $cgi->popup_menu(-name=>"family1", -values=>[sort {$a cmp $b} keys %$allfams], -labels=>$allfams, -default=>[sort {$a cmp $b} keys %$allfams]->[0]),
498 :     " <br /> Family 2: &nbsp; ", $cgi->popup_menu(-name=>"family2", -values=>[sort {$a cmp $b} keys %$allfams], -labels=>$allfams, -default=>[sort {$a cmp $b} keys %$allfams]->[1]),
499 : overbeek 1.18 $cgi->p("Show proteins:<br /><ul>\n",
500 : overbeek 1.22 $cgi->radio_group(-name=>"diff", -values=>[keys %$radbut], -labels=>$radbut, -rows=>3),
501 :     "</ul>\n"),
502 :     $cgi->hidden(-name=>'prot', -value=>$peg),
503 : overbeek 1.17 $cgi->submit(-name=>"differentiate", -value=>"Differentiate these families"), $cgi->reset, $cgi->end_form();
504 :     }
505 :     }
506 :    
507 : overbeek 1.22 # $cgi->p("I am most interested in proteins from ", &choose_focus($cgi), " that are missing from these families\n"),
508 :    
509 : overbeek 1.17 sub differentiate {
510 : overbeek 1.22 ($fig, $cgi, $html)=@_;
511 :    
512 :     #my $focus=$cgi->param('focus') or 'all'; # these are the things that we are interested in
513 :     #undef $focus if ($focus eq "all");
514 :     my $focus=$cgi->param('family2');
515 :     $focus =~ s/\|.*//;
516 :    
517 :     my ($fam_id1, $fam_id2)=($cgi->param('family1'), $cgi->param('family2'));
518 :     if ($fam_id1 eq $fam_id2)
519 :     {
520 :     push @$html, "<h2 style='color: red'>Please choose two different protein families</h2>";
521 :     return;
522 :     }
523 :     my ($fam1, $fam2)=([$fig->ids_in_family($fam_id1)], [$fig->ids_in_family($fam_id2)]);
524 :    
525 :    
526 :     # figure out all the families we know about
527 :     my %families;
528 :     #map
529 :     #{
530 :     # my $cid=$_;
531 :     # map
532 :     # {
533 :     # $families{$_}=1;
534 :     # }
535 :     # $fig->in_family($cid);
536 :     #} @{&set_utilities::union($fam1, $fam2)};
537 :    
538 :     map {$families{$_}=1} $fig->in_family($cgi->param('querycid'));
539 :    
540 :     my @families=sort {$a cmp $b} grep {!/$fam_id1/} grep {!/$fam_id2/} keys %families;
541 :     unshift @families, ($fam_id1, $fam_id2);
542 :     my @source=@families;
543 :     map {/^(.*?)\|/; $_=$1} @source;
544 :    
545 :     # now figure out all the external IDs in those families
546 :     my $extids;
547 :     map
548 :     {
549 :     my $fam=$_;
550 :     map
551 :     {
552 :     $extids->{$_}->{$fam}=1;
553 :     } $fig->ext_ids_in_family($fam);
554 :     } @families;
555 :    
556 :     # finally generate the table. Note that there are three different arrays that we operate on depending on the user input
557 :     # but it really only changes which set algorith we use. Each array is handled identically.
558 :     my $tab;
559 :     map
560 :     {
561 :     my $cid=$_;
562 :     my $row=["<a href='proteinfamilies.cgi?prot=$cid'>$cid</a>"];
563 : overbeek 1.23 my $seen; my $mismatchcolor;
564 : overbeek 1.22 map
565 :     {
566 :     my $prot=$_;
567 :     map
568 :     {
569 :     # add the protein info to the cell in the table if it this family has that protein. Note that we have seen it, and increment the column counter
570 :     # this if is if the protein that we are looking at is in the family for this column then add it
571 :    
572 :     # if the protein is not added, we want to know if it has the same start characters as the family (i.e. from the same source), and note that.
573 :     ($extids->{$prot}->{$families[$_]}) ?
574 : overbeek 1.23 eval {$seen->{$prot}=1; $row->[$_+1] .= &protein_link($prot) . "<br />" } :
575 :     ($prot =~ /^$source[$_]/) ? eval
576 : overbeek 1.24 {$mismatchcolor->{$_+1}=1; $row->[$_+1] .= &protein_link($prot) . "<br />" } :
577 : overbeek 1.22 1;
578 :     } (0 .. $#families);
579 :     } sort $fig->cid_to_prots($cid);
580 :     unless ($#$row == $#families+1) {$#$row=$#families+1}
581 :    
582 :     #$row->[1] = [join("<br />", map {&protein_link($_)} grep {/^$focus/} grep {!$seen->{$_}} $fig->cid_to_prots($cid)), "td style='background-color: #FF3366'"];
583 :     #$row->[1] = [join("<br />", map {&protein_link($_)} grep {/^$focus/} grep {!$mismatch->{$_}} grep {!$seen->{$_}} $fig->cid_to_prots($cid)), "td style='background-color: #FF3366'"];
584 :    
585 :     map {$row->[$_] = [$row->[$_], "td style='background-color: #FF3366'"]} keys %$mismatchcolor;
586 :     map {$row->[$_] = " &nbsp; " unless ($row->[$_])} (0 .. $#$row);
587 :     #push @$tab, $row if ($row->[1]->[0]);
588 :    
589 :     # if we want to show everything do so, otherwise only show the rows where there is a missing protein
590 :     if ($cgi->param('show') eq "all")
591 :     {
592 :     push @$tab, $row;
593 :     }
594 :     elsif ($row->[2] ne " &nbsp; ")
595 :     {
596 :     push @$tab, $row;
597 :     }
598 :     }
599 :     ($cgi->param("diff") eq "1and2") ? @{&set_utilities::intersection($fam1, $fam2)} :
600 :     ($cgi->param("diff") eq "1not2") ? @{&set_utilities::set_diff($fam1, $fam2)} :
601 :     ($cgi->param("diff") eq "2not1") ? @{&set_utilities::set_diff($fam2, $fam1)} : ();
602 :    
603 :    
604 :     my $title="Proteins in ";
605 :     ($cgi->param("diff") eq "1and2") ? $title.=$fig->family_function($fam_id1). " ($fam_id1) AND in " . $fig->family_function($fam_id2). " ($fam_id2)\n" :
606 :     ($cgi->param("diff") eq "1not2") ? $title.=$fig->family_function($fam_id1). " ($fam_id1) BUT NOT in " . $fig->family_function($fam_id2). " ($fam_id2)\n" :
607 :     ($cgi->param("diff") eq "2not1") ? $title.=$fig->family_function($fam_id2). " ($fam_id2) BUT NOT in " . $fig->family_function($fam_id1). " ($fam_id1)\n" : 1;
608 :    
609 :     # add the count of the number of members in the family
610 :     #my @colcounts=map {$column->{$_}} (1 .. $#families+2); $colcounts[0] = [$colcounts[0], "td style='background-color: #FF3366'"];
611 :     #my @sz=map {$fig->ext_sz_family($_)} @families;
612 :     #unshift @$tab, [["# Ext. IDs in fam", "th "], ['', "td style='background-color: #FF3366'"], @sz];
613 :     #unshift @$tab, [["Ext. IDs shown", "th "], @colcounts];
614 :    
615 :     # remove empty columns from the table
616 :     #for (my $y=$#families+2; $y>3; $y--)
617 :     #{
618 :     # do it backwards so the splice still works, and stop at 3 since the first four columns are the ones we need
619 :     #next if ($column->{$y});
620 :     #splice(@families, $y-2, 1);
621 :     #map {splice(@$_, $y, 1)} @$tab;
622 :     #}
623 :    
624 :    
625 :     #my @headers=map {$_ = "$_: <a href='proteinfamilies.cgi?family=$families[$_]'>$source[$_]</a>"} (0..$#families);
626 :     my @headers=@families;
627 :     map {$_ = "<a href='proteinfamilies.cgi?family=$_'>$_</a>"} @headers;
628 :     #my $colhdrs=["CID", ["Proteins not in families", "th style='background-color: #FF3366'"], @headers];
629 :     my $colhdrs=["CID", @headers];
630 :     push @$html, HTML::make_table($colhdrs, $tab, $title);
631 :     }
632 :    
633 :    
634 :     sub old_differentiate {
635 : overbeek 1.17 my ($fig, $cgi, $html)=@_;
636 :    
637 :     my $focus=$cgi->param('focus') or 'all'; # these are the things that we are interested in
638 :     undef $focus if ($focus eq "all");
639 :    
640 :     my ($fam_id1, $fam_id2)=($cgi->param('family1'), $cgi->param('family2'));
641 :     if ($fam_id1 eq $fam_id2) {
642 :     push @$html, "<h2 style='color: red'>Please choose two different protein families</h2>";
643 :     return;
644 :     }
645 :     my ($fam1, $fam2)=([$fig->ids_in_family($fam_id1)], [$fig->ids_in_family($fam_id2)]);
646 : overbeek 1.18
647 :     if ($cgi->param("1not2")) {
648 :     my $tab;
649 :     map {
650 :     my $cid=$_;
651 :     map {push @$tab, [$cid, $_, &functionate($_)]} sort grep {/^$focus/} $fig->cid_to_prots($cid)
652 :     } sort @{&set_utilities::set_diff($fam1, $fam2)};
653 :    
654 :     $tab=&HTML::merge_table_rows($tab);
655 :     push @$html, $cgi->h3("Proteins that are in $fam_id1 (" , $fig->family_function($fam_id1) , ") and not in $fam_id2 (" ,
656 :     $fig->family_function($fam_id2) . ")\n"), &HTML::make_table(["CID", "Protein ID", "Function"], $tab, "");
657 :     }
658 :    
659 :    
660 :     if ($cgi->param("2not1")) {
661 : overbeek 1.22
662 :     ###########NOTICE THIS REDIRECT!!!
663 :     ($fam_id1, $fam_id2)=($fam_id2, $fam_id1);
664 :     ($fam1, $fam2)=($fam2, $fam1);
665 :     $cgi->param("1not2c", "1");
666 :     }
667 :     if (0) {
668 : overbeek 1.18 my $tab;
669 :     map {
670 :     my $cid=$_;
671 :     map {push @$tab, [$cid, $_, &functionate($_)]} sort grep {/^$focus/} $fig->cid_to_prots($cid)
672 :     } sort @{&set_utilities::set_diff($fam2, $fam1)};
673 :    
674 :     $tab=&HTML::merge_table_rows($tab);
675 :     push @$html, $cgi->hr, $cgi->h3("Proteins that are in $fam_id2 (" , $fig->family_function($fam_id2) , ") and not in $fam_id1 (" ,
676 :     $fig->family_function($fam_id1) . ")\n"), &HTML::make_table(["CID", "Protein ID", "Function"], $tab, "");
677 :     }
678 :    
679 :     if ($cgi->param("1and2")) {
680 :     my $tab;
681 : overbeek 1.21 my @evidence;
682 : overbeek 1.18 map {
683 :     my $cid=$_;
684 : overbeek 1.21 #push @evidence, &compare_two_cids($cgi->param('querycid'), $cid);
685 :     push @evidence, &IHatePfams($cgi->param('querycid'), $cid);
686 : overbeek 1.18 map {push @$tab, [$cid, $_, &functionate($_)]} sort grep {/^$focus/} $fig->cid_to_prots($cid)
687 :     } @{&set_utilities::intersection($fam1, $fam2)};
688 :    
689 :     $tab=&HTML::merge_table_rows($tab);
690 :     push @$html, $cgi->hr, $cgi->h3("Proteins that are in both $fam_id1 (" , $fig->family_function($fam_id1) , ") and in $fam_id2 (" ,
691 : overbeek 1.21 $fig->family_function($fam_id2) . ")\n"), &HTML::make_table(["CID", "Protein ID", "Function"], $tab, ""),
692 :     $cgi->hr, $cgi->h3("Evidence for the connections between proteins and the families they are in"),
693 :     $cgi->p("These tables show the connections between the families. In the first row of each table is the initial family that you chose, and in the second row is the comparator. The first column is the CID and the remaining columns are the protein families that these two CIDs are in. In each cell are the external IDs that are (a) in the CIDs per the row headers AND (b) in the families per column headers. Thus, if you are here from <a href=\"http://listeria.uchicago.edu/dev/FIG/proteinfamilies.cgi?querycid=965112&family1=fig%7CPF002033&family2=mcl%7CORTHOMCL676&focus=all&1and2=on&differentiate=Differentiate+these+families&.cgifields=1not2&.cgifields=2not1&.cgifields=1and2\">this link</a> cog|thrB is in the family cog|COG0083 and has the ID 965112 and sp|P00547 has the ID 965112 and is in the families pfam|PB000121, pfam|PB007585, pfam|PF00288.12, pir|PIRSF000676, sp|PS00627, and tigr|TIGR00191"),
694 :     @evidence;
695 :     }
696 :    
697 :     if ($cgi->param("1and2b")) {
698 :     # figure out all the families we know about
699 :     my %families;
700 :     map
701 :     {
702 :     my $cid=$_;
703 :     map
704 :     {
705 :     $families{$_}=1;
706 :     }
707 :     $fig->in_family($cid);
708 :     } @{&set_utilities::intersection($fam1, $fam2)};
709 :     my @families=sort {$a cmp $b} keys %families;
710 :    
711 :     # now figure out all the external IDs in those families
712 :     my $extids;
713 :     map
714 :     {
715 :     my $fam=$_;
716 :     map
717 :     {
718 :     $extids->{$_}->{$fam}=1;
719 :     } $fig->ext_ids_in_family($fam);
720 :     } @families;
721 :    
722 :     my $tab;
723 :     map
724 :     {
725 :     my $cid=$_;
726 :     map
727 :     {
728 :     my $prot=$_;
729 :     my $row=[$cid, $prot, &functionate($prot)];
730 :     map {($extids->{$prot}->{$_}) ? push @$row, $prot : push @$row, " &nbsp; "} @families;
731 :     push @$tab, $row;
732 :     } sort grep {/^$focus/} $fig->cid_to_prots($cid)
733 :     } @{&set_utilities::intersection($fam1, $fam2)};
734 :    
735 :     my $skip;
736 :     map {$skip->{$_+4}=1} (0 .. $#families);
737 :     $tab=&HTML::merge_table_rows($tab, $skip);
738 :    
739 : overbeek 1.22 map {$_ = "<a href='proteinfamilies.cgi?family=$_'>$_</a>"} @families;
740 : overbeek 1.21 my $colhdrs=["CID", "Protein", "Function", @families];
741 :     my $title="Proteins in ". $fig->family_function($fam_id1). " ($fam_id1) AND in " . $fig->family_function($fam_id2). " ($fam_id2)\n";#$fam_id1, $fam_id2
742 :     push @$html, HTML::make_table($colhdrs, $tab, $title);
743 :     }
744 :    
745 :     if ($cgi->param("1and2c")) {
746 :     # figure out all the families we know about
747 :     my %families;
748 :     map
749 :     {
750 :     my $cid=$_;
751 :     map
752 :     {
753 :     $families{$_}=1;
754 :     }
755 :     $fig->in_family($cid);
756 :     } @{&set_utilities::intersection($fam1, $fam2)};
757 :     my @families=sort {$a cmp $b} grep {!/$fam_id1/} grep {!/$fam_id2/} keys %families;
758 :     unshift @families, ($fam_id1, $fam_id2);
759 :    
760 :     # now figure out all the external IDs in those families
761 :     my $extids;
762 :     map
763 :     {
764 :     my $fam=$_;
765 :     map
766 :     {
767 :     $extids->{$_}->{$fam}=1;
768 :     } $fig->ext_ids_in_family($fam);
769 :     } @families;
770 :    
771 :     my $tab;
772 :     map
773 :     {
774 :     my $cid=$_;
775 :     my $row=[$cid];
776 :     map
777 :     {
778 :     my $prot=$_;
779 :     map {($extids->{$prot}->{$families[$_]}) ? $row->[$_+1] = $prot : 1 } (0 .. $#families);
780 :     } sort grep {/^$focus/} $fig->cid_to_prots($cid);
781 :     unless ($#$row == $#families+1) {$#$row=$#families+1}
782 :     push @$tab, $row;
783 :     } @{&set_utilities::intersection($fam1, $fam2)};
784 :    
785 :     $tab=&HTML::merge_table_rows($tab);
786 : overbeek 1.22
787 :     map {$_ = "<a href='proteinfamilies.cgi?family=$_'>$_</a>"} @families;
788 : overbeek 1.21 my $colhdrs=["CID", @families];
789 :     my $title="Proteins in ". $fig->family_function($fam_id1). " ($fam_id1) AND in " . $fig->family_function($fam_id2). " ($fam_id2)\n";#$fam_id1, $fam_id2
790 :     push @$html, HTML::make_table($colhdrs, $tab, $title);
791 :     }
792 :    
793 :     if ($cgi->param("1not2c")) {
794 :     # figure out all the families we know about
795 :     my %families;
796 :     map
797 :     {
798 :     my $cid=$_;
799 :     map
800 :     {
801 :     $families{$_}=1;
802 :     }
803 :     $fig->in_family($cid);
804 :     } @{&set_utilities::set_diff($fam1, $fam2)};
805 :     my @families=sort {$a cmp $b} grep {!/$fam_id1/} grep {!/$fam_id2/} keys %families;
806 :     unshift @families, ($fam_id1, $fam_id2);
807 :    
808 :     # now figure out all the external IDs in those families
809 :     my $extids;
810 :     map
811 :     {
812 :     my $fam=$_;
813 :     map
814 :     {
815 :     $extids->{$_}->{$fam}=1;
816 :     } $fig->ext_ids_in_family($fam);
817 :     } @families;
818 :    
819 :     my $tab;
820 :     map
821 :     {
822 :     my $cid=$_;
823 : overbeek 1.22 my $row=["<a href='proteinfamilies.cgi?prot=$cid'>CID $cid</a>"];
824 :     my $seen;
825 : overbeek 1.21 map
826 :     {
827 :     my $prot=$_;
828 : overbeek 1.22 my $column;
829 :     map
830 :     {
831 :     ($extids->{$prot}->{$families[$_]}) ?
832 :     eval {$seen->{$prot}=1; $row->[$_+2] = &protein_link($prot) ; $column->{$_+2}++} :
833 :     1
834 :     } (0 .. $#families);
835 : overbeek 1.21 } sort grep {/^$focus/} $fig->cid_to_prots($cid);
836 : overbeek 1.22 unless ($#$row == $#families+2) {$#$row=$#families+2}
837 :    
838 :     $row->[1] = [join("<br />", map {&protein_link($_)} grep {!$seen->{$_}} $fig->cid_to_prots($cid)), "td style='background-color: #FF3366'"];
839 :    
840 : overbeek 1.21 push @$tab, $row;
841 :     } @{&set_utilities::set_diff($fam1, $fam2)};
842 : overbeek 1.22
843 : overbeek 1.21 $tab=&HTML::merge_table_rows($tab);
844 :    
845 : overbeek 1.22 map {$_ = "<a href='proteinfamilies.cgi?family=$_'>$_</a>"} @families;
846 :     my $colhdrs=["CID", "<span style='background-color: #FF3366'>Proteins not one of the families listed</span>", @families];
847 :     my $title="Proteins in ". $fig->family_function($fam_id1). " ($fam_id1) BUT NOT in " . $fig->family_function($fam_id2). " ($fam_id2)\n";#$fam_id1, $fam_id2
848 : overbeek 1.21 push @$html, HTML::make_table($colhdrs, $tab, $title);
849 : overbeek 1.18 }
850 : overbeek 1.21
851 :    
852 : overbeek 1.18 }
853 :    
854 :    
855 :    
856 :     sub functionate {
857 :     my $peg=shift;
858 :     return scalar($fig->function_of($peg));
859 : overbeek 1.17 }
860 : overbeek 1.21
861 :    
862 :     sub compare_two_cids {
863 :     my ($id1, $id2)=@_;
864 :     # which families is ID in and why?
865 :     # going via the external IDs, although I am not convinced we need to do this
866 :     my $family;
867 :     map {
868 :     my $id=$_;
869 :     map
870 :     {
871 :     map {$family->{$_}->{$id}++} $fig->ext_family_for_id($_)
872 :     } $fig->cid_to_prots($id);
873 :     } ($id1, $id2);
874 :    
875 :     # find the families that have both id1 and id2
876 :     my $tab;
877 :     map
878 :     {
879 :     push @$tab, [$_, $family->{$_}->{$id1}, $family->{$_}->{$id2}] if ( $family->{$_}->{$id1} && $family->{$_}->{$id2} );
880 :     } keys %$family;
881 :    
882 :     return &HTML::make_table(["Family", "Cnx to $id1", "Cnx to $id2"], $tab, "Connections (cnx) between proteins and families"), $cgi->p("\n");
883 :     }
884 :    
885 :    
886 :     sub IHatePfams {
887 :     my ($id1, $id2)=@_;
888 :     # id1 and id2 are cids
889 :    
890 :     # what families are the cids in?
891 :     my %families;
892 :     map {$families{$_}=1} ($fig->in_family($id1), $fig->in_family($id2));
893 :     my @families=sort {$a cmp $b} keys %families;
894 :    
895 :     # what external IDs are the cids
896 :     my %external;
897 :     map {$external{$_}=$id1} $fig->cid_to_prots($id1);
898 :     map {$external{$_}=$id2} $fig->cid_to_prots($id2);
899 :    
900 :    
901 :     my $tab=[];
902 :     foreach my $cid ($id1, $id2)
903 :     {
904 :     my $row=[$cid];
905 :     foreach my $fam (@families) {
906 :     # which proteins are in $fam
907 :     my $cell;
908 :     foreach my $id ($fig->ext_ids_in_family($fam))
909 :     {
910 :     $cell .= "$id<br>" if ($external{$id} eq $cid);
911 :     }
912 :     $cell = " &nbsp; " unless ($cell);
913 :     push @$row, $cell;
914 :     }
915 :     push @$tab, $row;
916 :     }
917 :    
918 :     &HTML::make_table(["CID", @families], $tab, "Proteins in families");
919 :     }
920 :    
921 :    
922 : overbeek 1.22
923 :     sub protein_link {
924 :     my $p=shift;
925 :     my %proteinbase=(
926 :     "fig" => "protein.cgi?user=$user&prot=fig|",
927 :     "cog" => "http://www.ncbi.nlm.nih.gov/COG/old/palox.cgi?",
928 :     "sp" => "http://www.expasy.org/uniprot/",
929 :     "tr" => "http://www.expasy.org/uniprot/",
930 :     "kegg" => "http://www.genome.jp/dbget-bin/www_bget?",
931 :     );
932 :    
933 :     map {($p =~ /^$_/) ? eval {$p =~ s/^(.*?)\|//; $p = "<a href='$proteinbase{$_}$p'>$1|$p</a>"} : 1;} keys %proteinbase;
934 :     return $p;
935 :     }
936 :    
937 : overbeek 1.21
938 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3