[Bio] / FigWebServices / proteinfamilies.cgi Repository:
ViewVC logotype

Annotation of /FigWebServices/proteinfamilies.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (view) (download)

1 : redwards 1.1 # -*- perl -*-
2 :    
3 :     =pod
4 :    
5 :     =head1 proteinfamilies.cgi
6 :    
7 : redwards 1.3 A base web interface for getting information about protein families in and out. Initially we are going to make a 3 (or 4) column table of protein, family and other proteins.
8 : redwards 1.1
9 : redwards 1.7 PLEASE NOTE: Do not attempt to read or understand this code. Please leave now. It is a complete mess because it is very experimental and we are trying stuff out. none of it will work. The exit is this way ---->
10 :    
11 : redwards 1.1 =cut
12 :    
13 :     use strict;
14 :     use FIG;
15 :     use HTML;
16 :     use raelib;
17 :     use CGI;
18 :     my $cgi=new CGI;
19 :     use LWP::Simple qw(!head); # see the caveat in perldoc LWP about importing two head methods.
20 :    
21 :     my $fig;
22 :     eval {
23 :     $fig = new FIG;
24 :     };
25 :    
26 :     if ($@ ne "")
27 :     {
28 :     my $err = $@;
29 :    
30 :     my(@html);
31 :    
32 :     push(@html, $cgi->p("Error connecting to SEED database."));
33 :     if ($err =~ /Could not connect to DBI:.*could not connect to server/)
34 :     {
35 :     push(@html, $cgi->p("Could not connect to relational database of type $FIG_Config::dbms named $FIG_Config::db on port $FIG_Config::dbport."));
36 :     }
37 :     else
38 :     {
39 :     push(@html, $cgi->pre($err));
40 :     }
41 :     &HTML::show_page($cgi, \@html, 1);
42 :     exit;
43 :     }
44 :    
45 :     my $html = [];
46 :     my $user = $cgi->param('user');
47 :    
48 :     unshift(@$html, "<TITLE>The SEED - Global Protein Families </TITLE>\n");
49 :    
50 : redwards 1.2 my %proteinbase=(
51 :     "fig" => "/FIG/protein.cgi?user=$user&prot=fig|",
52 :     "cog" => "http://www.ncbi.nlm.nih.gov/COG/old/palox.cgi?",
53 :     "sp" => "http://www.expasy.org/uniprot/",
54 :     "tr" => "http://www.expasy.org/uniprot/",
55 :     "kegg" => "http://www.genome.jp/dbget-bin/www_bget?",
56 :     );
57 :    
58 :    
59 :    
60 :     if ($cgi->param('Show Proteins In Each Family'))
61 : redwards 1.1 {
62 : redwards 1.2 my @needed=grep {$cgi->param($_)} $cgi->param("allfams");
63 :     $cgi->param(-name=>'family', -value=>\@needed);
64 :     &show_family($fig,$cgi,$html);
65 :     }
66 :     elsif ($cgi->param("Combine Families With And"))
67 :     {
68 :     &combine_families($fig,$cgi,$html);
69 :     }
70 : redwards 1.5 elsif ($cgi->param('proteins_between_two'))
71 :     {
72 :     &proteins_between_two($fig, $cgi, $html);
73 :     }
74 : redwards 1.2 elsif ($cgi->param('Compare FIG Functions'))
75 :     {
76 :     my @needed=grep {$cgi->param($_)} $cgi->param("allfams");
77 :     $cgi->param(-name=>'family', -value=>\@needed);
78 :     &compare_functions($fig,$cgi,$html);
79 :     }
80 : redwards 1.4 elsif ($cgi->param('Suggest')) {
81 :     &suggest_proteins($fig,$cgi,$html);
82 :     }
83 : redwards 1.7 elsif ($cgi->param('extend_family')) {
84 :     &extend_family($fig,$cgi,$html);
85 :     }
86 : redwards 1.2 elsif ($cgi->param('family'))
87 :     {
88 :     &show_family($fig,$cgi,$html);
89 : redwards 1.1 }
90 :     elsif ($cgi->param('prot'))
91 :     {
92 : redwards 1.2 &show_protein($fig,$cgi,$html);
93 : redwards 1.1 }
94 :     else
95 :     {
96 :     &show_initial($fig,$cgi,$html);
97 :     }
98 :    
99 :     &HTML::show_page($cgi,$html,1);
100 :     exit;
101 :    
102 :    
103 :     sub show_initial {
104 :     my ($fig,$cgi,$html)=@_;
105 :     # generate a blank page
106 : redwards 1.2 push @$html,
107 :     "<h2>Protein Families</h2>\n",
108 :     "<p>Please enter a protein ID . You will recieve a list of all the families that protein is in. \n",
109 :     "You can use a FIG ID such as fig|83333.1.peg.3, or an ID from SwissProt, KEGG, NCBI, and others.</p>",
110 :     $cgi->start_form(),
111 :     "Please enter a protein id: ", $cgi->textfield(-name=>"prot", -size=>40), "<br>",
112 :     "<p>Alternately, you can enter a family. Please enter a family name in the format pir|PIRSF001547 or fig|PF002363.</p>",
113 :     "Please enter a family id: ", $cgi->textfield(-name=>"family", -size=>40), "<br>",
114 :     $cgi->submit, $cgi->reset, $cgi->end_form;
115 : redwards 1.1 return $html;
116 :     }
117 :    
118 : redwards 1.2 sub show_protein {
119 : redwards 1.1 my ($fig,$cgi,$html)=@_;
120 : redwards 1.2 foreach my $peg ($cgi->param('prot')) {
121 :     my @families=$fig->families_for_protein($peg);
122 :     unless (@families)
123 :     {
124 :     push @$html, "<h2 style='color: red'>Sorry, $peg is not in any protein families</h2>";
125 :     return;
126 :     }
127 :    
128 :     my $tab=[];
129 :     my $self=$cgi->url;
130 :     foreach my $fam (@families) {
131 :     push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $fig->sz_family($fam), $cgi->checkbox(-name=>$fam, -label=>'')];
132 : redwards 1.1 }
133 : redwards 1.2
134 :     my $col_hdrs=['Family ID', 'Family Function', 'Number of Unique Proteins in Family', 'Choose Family'];
135 :     push @$html, "<h2>Families for $peg</h2>\n",
136 :     $cgi->start_form,
137 :     "<p>$peg is in the following ", scalar(@families), " families. Please choose one or more families using the checkboxes</p>\n",
138 :     &HTML::make_table($col_hdrs, $tab, "Families for $peg"), "\n",
139 : redwards 1.7 $cgi->submit('Show Proteins In Each Family'),
140 : redwards 1.6 $cgi->submit(-name=>'proteins_between_two', -value=>"Compare two or more families for missing"), "<br>\n",
141 : redwards 1.7 $cgi->submit(-name=>'extend_family', -value=>"Extend and contract family"),
142 : redwards 1.4 $cgi->submit("Combine Families With And"), $cgi->submit("Compare FIG Functions"),
143 :     $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>"allfams", -value=>\@families), "\n",
144 : redwards 1.2 $cgi->reset, $cgi->end_form;
145 : redwards 1.1 }
146 :     }
147 : redwards 1.7
148 :     # this was deleted from above
149 :     #$cgi->submit(-name=>"Suggest", -value=>"Suggest Additional families for this protein"),
150 :    
151 : redwards 1.2 sub show_family {
152 : redwards 1.1 my ($fig,$cgi,$html)=@_;
153 : redwards 1.2 foreach my $fam ($cgi->param('family')) {
154 : redwards 1.4 my @cids=sort {$a <=> $b} $fig->ids_in_family($fam);
155 : redwards 1.2 my $tab=[];
156 : redwards 1.3 my $col_hdrs=['Cluster ID', 'Polypeptides with same amino acid sequence'];
157 : redwards 1.4 foreach my $cid (@cids) {
158 :     my @pegs=$fig->cid_to_prots($cid);
159 : redwards 1.2 foreach my $p (@pegs) {
160 :     foreach my $k (keys %proteinbase) {
161 :     if ($p =~ /^$k/) {$p =~ s/^(.*?)\|//; $p = "<a href='$proteinbase{$k}$p'>$1|$p</a>"}
162 : redwards 1.1 }
163 :     }
164 : redwards 1.4 push @$tab, [$cid, (join ", ", (@pegs))];
165 : redwards 1.1 }
166 : redwards 1.2
167 :     push @$html, "<h2>$fam Family</h2>\n",
168 :     "<p>The family $fam has the function ", $fig->family_function($fam), ", and contains ", $fig->sz_family($fam), " proteins, as shown in the table below.<br>",
169 : redwards 1.3 "Each of the sequences with a given ID have the same amino acid sequence, and hence are the same polypeptide, ",
170 :     "even though they may come from different organisms.</p>",
171 : redwards 1.2 "<p>The links will take you to the respective databases for each of the other protein families.\n</p>",
172 :     $cgi->start_form,
173 :     &HTML::make_table($col_hdrs, $tab, "Proteins in " . $fig->family_function($fam) . " ($fam)"),
174 : redwards 1.4 $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>'family', -value=>"$fam"), $cgi->submit("Compare FIG Functions"),
175 : redwards 1.2 $cgi->end_form;
176 : redwards 1.1 }
177 :     }
178 :    
179 : redwards 1.2 sub combine_families {
180 : redwards 1.1 my ($fig,$cgi,$html)=@_;
181 : redwards 1.4 # first find all the families and all the cids in those families
182 :     my $cids; my @families; my $allprots;
183 : redwards 1.2 foreach my $f (grep {$cgi->param($_)} $cgi->param("allfams"))
184 :     {
185 :     push @families, $f;
186 : redwards 1.4 foreach my $e ($fig->ids_in_family($f)) {$cids->{$e}->{$f}=1; $allprots++}
187 : redwards 1.1 }
188 : redwards 1.3
189 : redwards 1.4 # now figure out those cids that are in all families
190 : redwards 1.2 # we are going to do this with a boolean switch rather than just counting occurences
191 :     my @wanted;
192 : redwards 1.4 foreach my $cid (keys %$cids) {
193 : redwards 1.2 my $keep=1;
194 : redwards 1.4 foreach my $f (@families) {undef $keep unless ($cids->{$cid}->{$f})}
195 :     push @wanted, $cid if ($keep);
196 : redwards 1.2 }
197 :    
198 :     my $tab=[];
199 : redwards 1.3 my $col_hdrs=['Cluster ID', 'Polypeptides with same amino acid sequence'];
200 : redwards 1.4 foreach my $cid (sort {$a <=> $b} @wanted) {
201 :     my @pegs=$fig->cid_to_prots($cid);
202 : redwards 1.2 foreach my $p (@pegs) {
203 :     foreach my $k (keys %proteinbase) {
204 :     if ($p =~ /^$k/) {$p =~ s/^(.*?)\|//; $p = "<a href='$proteinbase{$k}$p'>$1|$p</a>"}
205 :     }
206 : redwards 1.1 }
207 : redwards 1.4 push @$tab, [$cid, (join ", ", (@pegs))];
208 : redwards 1.1 }
209 : redwards 1.2
210 :    
211 :     push @$html, "<h2>Proteins in ", (join ", ", (@families)), " families</h2>\n",
212 : redwards 1.3 "<h3>Summary</h3>\n<p>There were $allprots proteins in the ", scalar(@families), " families that you selected.\n<br>",
213 : redwards 1.4 "Out of a those proteins, there were ", scalar(keys %$cids),
214 : redwards 1.3 " unique proteins, and ", scalar(@wanted), " proteins are present in the ",
215 :     scalar(@families), " families.</p>\n",
216 :     "Each of the sequences with a given ID have the same amino acid sequence, and hence are the same polypeptide, even though they may come from different organisms.</p>",
217 : redwards 1.2 "<p>The links will take you to the respective databases for each of the other protein families.\n</p>",
218 :     $cgi->start_form,
219 :     &HTML::make_table($col_hdrs, $tab, "Proteins in " . (join ", ", (@families)) . " families"),
220 :     $cgi->end_form;
221 : redwards 1.1 }
222 :    
223 : redwards 1.2 sub compare_functions {
224 :     my ($fig,$cgi,$html)=@_;
225 :     foreach my $fam ($cgi->param('family')) {
226 : redwards 1.4 my @cids=sort {$a <=> $b} $fig->ids_in_family($fam);
227 : redwards 1.2 my $tab=[];
228 :     my $col_hdrs=['FIG ID', 'Genome', 'Assigned Function'];
229 : redwards 1.4 foreach my $cid (@cids) {
230 :     foreach my $peg (grep {/^fig/} $fig->cid_to_prots($cid)) {
231 : redwards 1.2 my $p=$peg;
232 :     $p =~ s/fig\|//;
233 :     $p = "<a href='$proteinbase{'fig'}$p'>$peg</a>";
234 :     push @$tab, [$p, ($fig->genus_species($fig->genome_of($peg))), scalar($fig->function_of($peg))];
235 :     }
236 :     }
237 :    
238 :     push @$html, "<h2>Functions in $fam Family</h2>\n",
239 :     "<p>The family $fam has the function ", $fig->family_function($fam), ".\n",
240 :     "The functions shown here are the functions of just the proteins in the SEED database from this family</p>",
241 :     $cgi->start_form,
242 :     &HTML::make_table($col_hdrs, $tab, "Proteins in " . $fig->family_function($fam) . " ($fam)"),
243 : redwards 1.4 $cgi->hidden(-name=>'prot'), $cgi->hidden(-name=>'family', -value=>"$fam"), $cgi->submit("Compare FIG Functions"),
244 : redwards 1.2 $cgi->end_form;
245 :     }
246 :     }
247 : redwards 1.1
248 : redwards 1.4 sub suggest_proteins {
249 :     my ($fig,$cgi,$html)=@_;
250 :     my $prot=$cgi->param('prot');
251 :     my @families=$fig->families_for_protein($prot);
252 :     my $knownfamily;
253 :     map {$knownfamily->{$_}=1} @families;
254 :    
255 :     print STDERR "Looking through ", scalar(@families), " families\n";
256 :     my $famcount=1;my $time=time;
257 :     my $content;
258 : redwards 1.5 my $row;
259 : redwards 1.4 foreach my $fam (@families) {
260 :     print STDERR "$famcount in ", time-$time, "\n"; $time=time; $famcount++;
261 :    
262 : redwards 1.5 push @$row, [$fam, 'td style="background-color: grey'];
263 :    
264 : redwards 1.4 my @newprots=$fig->proteins_in_family($fam);
265 : redwards 1.5 push @$row, scalar(@newprots);
266 :    
267 : redwards 1.4 print STDERR "$fam has ", scalar(@newprots), "\n";
268 :     foreach my $p (@newprots) {
269 :     my @newfamilies=$fig->families_for_protein($p);
270 :     foreach my $nf (@newfamilies) {
271 :     next if ($knownfamily->{$nf});
272 :     push @{$content->{$nf}}, $p;
273 :     }
274 :     }
275 :     }
276 :    
277 :     my $tab=[];
278 :     @$tab=map {[$_, $fig->family_function($_), scalar(@{$content->{$_}})]} keys %$content;
279 : redwards 1.5 my $col_hdrs=["Family ID", "Family function", "Number of proteins in family"];
280 : redwards 1.4
281 :     push @$html, $cgi->p("$prot is in families ", join ",", @families),
282 :     $cgi->p("These are the other families that $prot should also appear in:"),
283 :     &HTML::make_table($col_hdrs, $tab, "Other Families"), "\n";
284 :     }
285 : redwards 1.5
286 :    
287 :     sub proteins_between_two {
288 :     my ($fig,$cgi,$html)=@_;
289 :     # first find all the families and all the cids in those families
290 :     my $cids; my @families; my $allprots;
291 :     foreach my $f (grep {$cgi->param($_)} $cgi->param("allfams"))
292 :     {
293 :     push @families, $f;
294 :     foreach my $e ($fig->ids_in_family($f)) {$cids->{$e}->{$f}=1; $allprots++}
295 :     }
296 :    
297 :     # now figure out those cids that are in all families
298 :     # we are going to do this with a boolean switch rather than just counting occurences
299 :     my @wanted; my @missing;
300 :     foreach my $cid (keys %$cids) {
301 :     my $keep=1;
302 :     foreach my $f (@families) {
303 :     unless ($cids->{$cid}->{$f}) {
304 :     undef $keep;
305 :     push @missing, $cid;
306 :     }
307 :     }
308 :     push @wanted, $cid if ($keep);
309 :     }
310 :    
311 :     # make some HTML header
312 :     push @$html, "<h2>Comparison of different protein families</h2><ul>\n";
313 :     foreach my $f (sort @families) {push @$html, "<li>", $fig->family_function($f), " ($f) </li>\n"}
314 :     push @$html, "</ul><p>These are the proteins that are unique to one or other of these families:</p>";
315 :    
316 :     foreach my $f (sort @families) {
317 :    
318 :     my $tab=[];
319 :     foreach my $p (sort {$a <=> $b} @missing) {
320 :     next unless ($cids->{$p}->{$f});
321 :     push @$tab, [$p, (join ", ", $fig->cid_to_prots($p))];
322 :     }
323 :    
324 :     my $list;
325 :     foreach my $x (@families) {next if ($f eq $x); $list .= "<li>". $fig->family_function($x). " ($x)</li>\n"}
326 :     push @$html, $cgi->p, "<div style='font-weight: bolder'>Proteins that are in <br><center style='font-weight: bigger'>",
327 :     $fig->family_function($f), " ($f)</center><br>\nthat are not in <ul>$list</ul>\n</div>\n",
328 :     &HTML::make_table(["Cluster ID", "Other Proteins"], $tab, " &nbsp; ");
329 :     }
330 :     }
331 : redwards 1.4
332 : redwards 1.7
333 :     sub extend_family {
334 :     my ($fig,$cgi,$html)=@_;
335 :     # here are the questions:
336 :     # 1. Given a column in a spreadsheet:
337 :     # 2. Here are the proteins in that column
338 :     # 3. For each protein, here are the families that they are in. How many families are unique and how many families is every protein in?
339 :     # if we start with a column of 10 proteins, and nine of them are all in the same families and one is not, we want to exclude the one and keep the nine.
340 :     # so we recommend that a protein be removed from a family.
341 :     # 4. For each of the families that are good, which proteins are there in some/most of the families that are not in the column that we are looking at
342 :     # 5. For each of the families that are good, which proteins are only in one of those families and not in any others?
343 :    
344 :     # Note that column == family, But start with fig and then allow a replace ID feature like before.
345 :    
346 :     #my $focus=$cgi->param('focus') or "fig"; # these are the things that we are interested in
347 :     my $focus='fig';
348 :     push @$html, "<h2>Testing: Limited to fig</h2>\n";
349 :     foreach my $col (grep {$cgi->param($_)} $cgi->param("allfams"))
350 :     {
351 :     # $col is the column in the spreadsheet. This is really a family, but to visualize and code this I am doing it in a FIG-centric way
352 :     my %proteins_in_col;
353 :     map {$proteins_in_col{$_}=1} $fig->ids_in_family($col);
354 :    
355 :     # @proteins are the proteins in that column, although these are cids and not fids at the moment
356 :     my $familycount;
357 :     foreach my $prot (keys %proteins_in_col) {
358 :     foreach my $fam ($fig->in_family($prot)) {
359 :     $familycount->{$fam}++;
360 :     }
361 :     }
362 :    
363 :     my $tab=[];
364 :     foreach my $f (sort {$familycount->{$b} <=> $familycount->{$a}} keys %$familycount) {
365 :     next if ($f eq $col);
366 :     # It seems that $sz_family is not right
367 :     my @all=$fig->ids_in_family($f);
368 :     #my @allmissing=grep {!$proteins_in_col{$_}} ($fig->ids_in_family($f)); # note this is done in two separate lines so I can get the value of @allmissing
369 :     my @allmissing=grep {!$proteins_in_col{$_}} @all;
370 :     print STDERR "ALL: ", scalar(@all), " ALLMISSING: ", scalar(@allmissing), " IN COL: ", scalar(keys %proteins_in_col), "\n";
371 :     my @missing=
372 :     map {"<a href=\'/FIG/protein.cgi?user=$user&prot=$_'>$_</a>"}
373 :     grep {/^$focus/}
374 :     map {$fig->cid_to_prots($_)} @allmissing;
375 :     #grep {!$proteins_in_col{$_}}
376 :     #($fig->ids_in_family($f));
377 :    
378 :     my $prots_by_fam=join ", ", @missing;
379 :     push @$tab, [$f, scalar(@all), $fig->family_function($f), $familycount->{$f}, scalar(@allmissing), scalar(@missing), $prots_by_fam];
380 :     }
381 :    
382 :     my $col_hdrs=["Family ID", "Size of Family", "Family Function",
383 :     "Number of proteins in $col that are also in the family in column 1",
384 :     "Number of proteins in the family in column 1 that are not in $col",
385 :     "Number of proteins in the family in column 1 that are not in $col and begin with $focus",
386 :     "Proteins that are in the family in column 1 that are not"];
387 :     push @$html, $cgi->p, "<h3>Your chosen family was $col that has the function ", $fig->family_function($col), " and has ",
388 :     $fig->sz_family($col), " proteins</h3>",
389 :     $cgi->p("\nThese are the proteins that should also be in $col. Or maybe not.\n"), &HTML::make_table($col_hdrs, $tab, ' &nbsp; ');
390 :     }
391 :     }
392 :    
393 :    
394 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3