[Bio] / FigWebServices / proteinfamilies.cgi Repository:
ViewVC logotype

Annotation of /FigWebServices/proteinfamilies.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.16 - (view) (download)

1 : redwards 1.1 # -*- perl -*-
2 :    
3 :     =pod
4 :    
5 :     =head1 proteinfamilies.cgi
6 :    
7 : redwards 1.3 A base web interface for getting information about protein families in and out. Initially we are going to make a 3 (or 4) column table of protein, family and other proteins.
8 : redwards 1.1
9 : redwards 1.7 PLEASE NOTE: Do not attempt to read or understand this code. Please leave now. It is a complete mess because it is very experimental and we are trying stuff out. none of it will work. The exit is this way ---->
10 :    
11 : redwards 1.1 =cut
12 :    
13 :     use strict;
14 :     use FIG;
15 :     use HTML;
16 :     use raelib;
17 :     use CGI;
18 : overbeek 1.11 use CGI::Carp qw(fatalsToBrowser);
19 : redwards 1.1 my $cgi=new CGI;
20 :     use LWP::Simple qw(!head); # see the caveat in perldoc LWP about importing two head methods.
21 :    
22 :     my $fig;
23 :     eval {
24 :     $fig = new FIG;
25 :     };
26 :    
27 :     if ($@ ne "")
28 :     {
29 :     my $err = $@;
30 :    
31 :     my(@html);
32 :    
33 :     push(@html, $cgi->p("Error connecting to SEED database."));
34 :     if ($err =~ /Could not connect to DBI:.*could not connect to server/)
35 :     {
36 :     push(@html, $cgi->p("Could not connect to relational database of type $FIG_Config::dbms named $FIG_Config::db on port $FIG_Config::dbport."));
37 :     }
38 :     else
39 :     {
40 :     push(@html, $cgi->pre($err));
41 :     }
42 :     &HTML::show_page($cgi, \@html, 1);
43 :     exit;
44 :     }
45 :    
46 :     my $html = [];
47 :     my $user = $cgi->param('user');
48 :    
49 :     unshift(@$html, "<TITLE>The SEED - Global Protein Families </TITLE>\n");
50 :    
51 : redwards 1.2 my %proteinbase=(
52 :     "fig" => "/FIG/protein.cgi?user=$user&prot=fig|",
53 :     "cog" => "http://www.ncbi.nlm.nih.gov/COG/old/palox.cgi?",
54 :     "sp" => "http://www.expasy.org/uniprot/",
55 :     "tr" => "http://www.expasy.org/uniprot/",
56 :     "kegg" => "http://www.genome.jp/dbget-bin/www_bget?",
57 :     );
58 :    
59 :    
60 :    
61 :     if ($cgi->param('Show Proteins In Each Family'))
62 : redwards 1.1 {
63 : redwards 1.2 my @needed=grep {$cgi->param($_)} $cgi->param("allfams");
64 :     $cgi->param(-name=>'family', -value=>\@needed);
65 :     &show_family($fig,$cgi,$html);
66 :     }
67 : overbeek 1.11 elsif ($cgi->param('analyse_family')) {
68 : overbeek 1.12 # finding what is there is the same as findingh what is missing you just need one extra !
69 :     # these two things call the same method.
70 :     &analyse_family($fig,$cgi,$html, 0);
71 : redwards 1.7 }
72 : redwards 1.10 elsif ($cgi->param('reverse_analyse_family')) {
73 : overbeek 1.12 &analyse_family($fig,$cgi,$html, 1);
74 : redwards 1.10 }
75 : redwards 1.2 elsif ($cgi->param('family'))
76 :     {
77 :     &show_family($fig,$cgi,$html);
78 : redwards 1.1 }
79 :     elsif ($cgi->param('prot'))
80 :     {
81 : redwards 1.2 &show_protein($fig,$cgi,$html);
82 : redwards 1.1 }
83 : overbeek 1.14 elsif ($cgi->param('fig2kegg'))
84 :     {
85 :     &comparefig2kegg($fig,$cgi,$html);
86 :     }
87 : redwards 1.1 else
88 :     {
89 :     &show_initial($fig,$cgi,$html);
90 :     }
91 :    
92 :     &HTML::show_page($cgi,$html,1);
93 :     exit;
94 :    
95 :    
96 :     sub show_initial {
97 :     my ($fig,$cgi,$html)=@_;
98 :     # generate a blank page
99 : redwards 1.2 push @$html,
100 :     "<h2>Protein Families</h2>\n",
101 :     "<p>Please enter a protein ID . You will recieve a list of all the families that protein is in. \n",
102 :     "You can use a FIG ID such as fig|83333.1.peg.3, or an ID from SwissProt, KEGG, NCBI, and others.</p>",
103 :     $cgi->start_form(),
104 :     "Please enter a protein id: ", $cgi->textfield(-name=>"prot", -size=>40), "<br>",
105 :     "<p>Alternately, you can enter a family. Please enter a family name in the format pir|PIRSF001547 or fig|PF002363.</p>",
106 :     "Please enter a family id: ", $cgi->textfield(-name=>"family", -size=>40), "<br>",
107 :     $cgi->submit, $cgi->reset, $cgi->end_form;
108 : redwards 1.1 return $html;
109 :     }
110 :    
111 : redwards 1.7
112 : redwards 1.2 sub show_family {
113 : redwards 1.1 my ($fig,$cgi,$html)=@_;
114 : redwards 1.2 foreach my $fam ($cgi->param('family')) {
115 : redwards 1.4 my @cids=sort {$a <=> $b} $fig->ids_in_family($fam);
116 : redwards 1.2 my $tab=[];
117 : redwards 1.3 my $col_hdrs=['Cluster ID', 'Polypeptides with same amino acid sequence'];
118 : redwards 1.4 foreach my $cid (@cids) {
119 :     my @pegs=$fig->cid_to_prots($cid);
120 : redwards 1.2 foreach my $p (@pegs) {
121 :     foreach my $k (keys %proteinbase) {
122 :     if ($p =~ /^$k/) {$p =~ s/^(.*?)\|//; $p = "<a href='$proteinbase{$k}$p'>$1|$p</a>"}
123 : redwards 1.1 }
124 :     }
125 : redwards 1.4 push @$tab, [$cid, (join ", ", (@pegs))];
126 : redwards 1.1 }
127 : redwards 1.2
128 :     push @$html, "<h2>$fam Family</h2>\n",
129 :     "<p>The family $fam has the function ", $fig->family_function($fam), ", and contains ", $fig->sz_family($fam), " proteins, as shown in the table below.<br>",
130 : redwards 1.3 "Each of the sequences with a given ID have the same amino acid sequence, and hence are the same polypeptide, ",
131 :     "even though they may come from different organisms.</p>",
132 : redwards 1.2 "<p>The links will take you to the respective databases for each of the other protein families.\n</p>",
133 :     $cgi->start_form,
134 :     &HTML::make_table($col_hdrs, $tab, "Proteins in " . $fig->family_function($fam) . " ($fam)"),
135 : overbeek 1.13 $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>'family', -value=>"$fam"),
136 : overbeek 1.12 $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
137 :     $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
138 : redwards 1.2 $cgi->end_form;
139 : redwards 1.1 }
140 :     }
141 :    
142 : overbeek 1.11 sub show_protein {
143 : redwards 1.1 my ($fig,$cgi,$html)=@_;
144 : overbeek 1.11 foreach my $peg ($cgi->param('prot')) {
145 :     my @families=$fig->families_for_protein($peg);
146 :     unless (@families)
147 :     {
148 :     push @$html, "<h2 style='color: red'>Sorry, $peg is not in any protein families</h2>";
149 :     return;
150 : redwards 1.1 }
151 :    
152 : redwards 1.2 my $tab=[];
153 : overbeek 1.11 my $self=$cgi->url;
154 :     foreach my $fam (@families) {
155 :     my %idcount;
156 :     my $noprots=scalar(map {$idcount{$_}=1} $fig->ids_in_family($fam));
157 :     #push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $fig->sz_family($fam), $cgi->checkbox(-name=>$fam, -label=>'')];
158 :     push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $noprots, $cgi->checkbox(-name=>$fam, -label=>'')];
159 : redwards 1.2 }
160 : overbeek 1.11
161 :     my $col_hdrs=['Family ID', 'Family Function', 'Number of IDs in Family', 'Choose Family'];
162 :     push @$html, "<h2>Families for $peg</h2>\n",
163 : redwards 1.2 $cgi->start_form,
164 : overbeek 1.11 "<p>$peg is in the following ", scalar(@families), " families. Please choose one or more families using the checkboxes</p>\n",
165 :     &HTML::make_table($col_hdrs, $tab, "Families for $peg"), "\n",
166 :     $cgi->submit('Show Proteins In Each Family'),
167 :     $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
168 :     $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
169 :     $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>"allfams", -value=>\@families), "\n",
170 :     $cgi->reset, $cgi->end_form;
171 : redwards 1.2 }
172 :     }
173 : redwards 1.10
174 :    
175 : overbeek 1.12
176 :    
177 : redwards 1.10 sub analyse_family {
178 : overbeek 1.12 my ($fig,$cgi,$html, $reverse)=@_;
179 : redwards 1.10 # here are the questions:
180 :     # 1. Given a column in a spreadsheet:
181 :     # 2. Here are the proteins in that column
182 :     # 3. For each protein, here are the families that they are in. How many families are unique and how many families is every protein in?
183 :     # if we start with a column of 10 proteins, and nine of them are all in the same families and one is not, we want to exclude the one and keep the nine.
184 :     # so we recommend that a protein be removed from a family.
185 :     # 4. For each of the families that are good, which proteins are there in some/most of the families that are not in the column that we are looking at
186 :     # 5. For each of the families that are good, which proteins are only in one of those families and not in any others?
187 :    
188 :     # Note that column == family, But start with fig and then allow a replace ID feature like before.
189 :    
190 : overbeek 1.15 my $focus=$cgi->param('focus') or 'all'; # these are the things that we are interested in
191 :     undef $focus if ($focus eq "all");
192 : overbeek 1.12
193 :     my @cols;
194 :     if ($cgi->param("allfams")) {@cols=grep {$cgi->param($_)} $cgi->param("allfams")}
195 :     elsif ($cgi->param("family")) {push @cols, $cgi->param('family')}
196 :     else {die "No families declared!"}
197 :    
198 :     foreach my $col (@cols)
199 : redwards 1.10 {
200 :     # $col is the column in the spreadsheet. This is really a family, but to visualize and code this I am doing it in a FIG-centric way
201 : overbeek 1.12 my $proteins_in_col;
202 :     map {$proteins_in_col->{$_}=1} $fig->ids_in_family($col);
203 : redwards 1.10
204 :     # @proteins are the proteins in that column, although these are cids and not fids at the moment
205 :     my $familycount;
206 : overbeek 1.12 foreach my $prot (keys %$proteins_in_col) {
207 : redwards 1.10 foreach my $fam ($fig->in_family($prot)) {
208 :     $familycount->{$fam}++;
209 :     }
210 :     }
211 :    
212 :     my $count_of;
213 :     my $fams;
214 : overbeek 1.12 foreach my $f (sort {$familycount->{$b} <=> $familycount->{$a}} keys %$familycount)
215 :     {
216 :     if ($reverse) {($fams, $count_of)=&ids_missing_from_fam($fig, $f, $focus, $proteins_in_col, $fams, $count_of)}
217 :     else {($fams, $count_of)=&ids_are_in_fam($fig, $f, $focus, $proteins_in_col, $fams, $count_of)}
218 :     }
219 : redwards 1.10
220 :     # create a list of families that we know about
221 : overbeek 1.11 my @fams=grep {!/$col/} sort {scalar(keys %{$fams->{$b}}) <=> scalar(keys %{$fams->{$a}})} keys %$fams;
222 :     unshift @fams, $col;
223 :    
224 : overbeek 1.15 my $tab=[[["Number of proteins in family", "th colspan=4"], map {scalar(keys %{$fams->{$_}})} @fams]];
225 : redwards 1.10
226 : overbeek 1.11 my @fids;
227 :     if ($cgi->param('sort') eq "genome")
228 :     {
229 :     @fids=sort {$fig->genome_of($a) <=> $fig->genome_of($b)} keys %$count_of;
230 :     }
231 : overbeek 1.15 elsif ($cgi->param('sort') eq "cid")
232 :     {
233 :     @fids=sort {$fig->prot_to_cid($a) <=> $fig->prot_to_cid($b)} keys %$count_of;
234 :     }
235 : overbeek 1.11 else
236 :     {
237 :     @fids=sort {scalar(keys %{$count_of->{$b}}) <=> scalar(keys %{$count_of->{$a}})} keys %$count_of;
238 :     }
239 :    
240 :     my $rowcount;
241 :     foreach my $fid (@fids)
242 :     {
243 : overbeek 1.15 my @row=(++$rowcount, $fig->prot_to_cid($fid), $fid, scalar(keys %{$count_of->{$fid}}));
244 : overbeek 1.11
245 :     foreach my $fam (@fams) {
246 :     $count_of->{$fid}->{$fam} ? push @row, ["Y", "td style='background-color: lightgrey; text-align: center'"] : push @row, " &nbsp ";
247 : redwards 1.10 }
248 :     push @$tab, \@row;
249 :     }
250 : overbeek 1.12
251 : overbeek 1.15 push @$html, $cgi->start_form(), $cgi->p("Limit the display to proteins from ", &choose_focus($cgi), "\n"), $cgi->p("Sort the order by ", &choose_sort($cgi),"\n");
252 : overbeek 1.12 if ($reverse) {
253 :     push @$html, $cgi->p("These are proteins that ARE NOT in ", $fig->family_function($col), " ($col) but are in other families that have proteins in this family.");
254 :     } else {
255 :     push @$html, $cgi->p("These are proteins that ARE in ", $fig->family_function($col), " ($col) and are in other families that have proteins in this family.");
256 :     }
257 :    
258 : redwards 1.16 # merge cells in the table
259 :     my $skip;
260 :     map {$skip->{$_}=1} (0, 2 .. 40); # ignore everything except column 2
261 :     $tab=&HTML::merge_table_rows($tab, $skip);
262 :    
263 : overbeek 1.12 push @$html,
264 :     $cgi->hidden(-name=>"family", -value=>@cols), $cgi->hidden("prot"), $cgi->hidden(-name=>"user"),
265 : overbeek 1.11 $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
266 :     $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
267 : overbeek 1.15 &HTML::make_table(["Count", "Unique ID", "Protein ID", "Number of fams protein is in", @fams], $tab,' &nbsp; ');
268 : redwards 1.10 }
269 :     }
270 :    
271 :    
272 : overbeek 1.12 sub ids_are_in_fam {
273 :     my ($fig, $f, $focus, $proteins_in_col, $fams, $count_of)=@_;
274 :     # It seems that $sz_family is not right
275 :     map {$fams->{$f}->{$_}++; $count_of->{$_}->{$f}++}
276 :     grep {/^$focus/}
277 :     map {$fig->cid_to_prots($_)}
278 :     grep {$proteins_in_col->{$_}}
279 :     ($fig->ids_in_family($f));
280 :     return ($fams, $count_of);
281 :     }
282 : redwards 1.10
283 : overbeek 1.12 sub ids_missing_from_fam {
284 :     my ($fig, $f, $focus, $proteins_in_col, $fams, $count_of)=@_;
285 : redwards 1.10 # It seems that $sz_family is not right
286 : overbeek 1.11 map {$fams->{$f}->{$_}++; $count_of->{$_}->{$f}++}
287 : redwards 1.10 grep {/^$focus/}
288 :     map {$fig->cid_to_prots($_)}
289 : overbeek 1.12 grep {!$proteins_in_col->{$_}}
290 : redwards 1.10 ($fig->ids_in_family($f));
291 : overbeek 1.12 return ($fams, $count_of);
292 :     }
293 :    
294 : overbeek 1.11
295 : redwards 1.10
296 : overbeek 1.11 sub choose_focus {
297 :     my ($cgi)=@_;
298 :     my %choices=(
299 : overbeek 1.15 "all" => "All",
300 : overbeek 1.11 "fig" => "FIGfams",
301 :     "tigr" => "TIGRfams",
302 :     "pfam" => "PFAM",
303 :     "sp" => "SwissProt",
304 :     "kegg" => "KEGG",
305 :     "pir" => "PIR SuperFams",
306 :     "mcl" => "MCL",
307 :     "cog" => "COG",
308 :     );
309 :    
310 : overbeek 1.15 my $default = $cgi->param("focus"); unless ($default) {$default="all"}
311 : overbeek 1.11
312 :     return $cgi->popup_menu(
313 :     -name => "focus",
314 : overbeek 1.15 -values => [sort {$choices{$a} cmp $choices{$b}} keys %choices],
315 :     -labels => \%choices,
316 :     -default => $default,
317 :     );
318 :     }
319 :    
320 :     sub choose_sort {
321 :     my ($cgi)=@_;
322 :     my %choices=(
323 :     "none" => "Number of Proteins",
324 :     "genome" => "Genome",
325 :     "cid" => "Unique ID",
326 :     );
327 :    
328 :     my $default = $cgi->param("sort"); unless ($default) {$default="none"}
329 :    
330 :     return $cgi->popup_menu(
331 :     -name => "sort",
332 :     -values => [sort {$choices{$a} cmp $choices{$b}} keys %choices],
333 : overbeek 1.11 -labels => \%choices,
334 :     -default => $default,
335 :     );
336 :     }
337 : redwards 1.10
338 : redwards 1.7
339 : overbeek 1.14 sub comparefig2kegg {
340 :     my ($fig,$cgi,$html)=@_;
341 :    
342 :     my $classification; my %subsystem;
343 :     # read classification from kegg file
344 :     if (open(IN, "$FIG_Config::global/ProteinFamilies/kegg_classificaation.txt")) {
345 :     while (<IN>) {
346 :     chomp;
347 :     my @a=split /\t/;
348 :     my $id=shift(@a);
349 :     $subsystem{"kegg|$id"}=pop(@a);
350 :     push @{$classification->{"kegg|$id"}}, \@a;
351 :     }
352 :     }
353 :    
354 :    
355 :     my $tab=[];
356 :     # find out what families our proteins are in
357 :     map {
358 :     my $prot=$_;
359 :     map {
360 :     my $fam=$_;
361 :     if ($fam =~ /^fig/) {
362 :     my %ss;
363 :     map {$ss{$_->[0]}++} ($fig->subsystems_for_role($fig->family_function($fam)));
364 :     map {my $ss=$_; push @$tab, [$prot, @{$fig->subsystem_classification($ss)}, $ss, $fam, $fig->family_function($fam)]} keys %ss;
365 :     }
366 :     else {
367 :     map {push @$tab, [$prot, @{$_}, $subsystem{$fam}, $fam, $fig->family_function($fam)]} @{$classification->{$fam}}
368 :     }
369 :     } grep {/^fig/ || /^kegg/} $fig->families_for_protein($prot);
370 :     } $cgi->param('proteins');
371 :    
372 :     my $col_hdrs=['Protein', ['Classification', 'th colspan=2'], 'Subsystem', 'Family ID', 'Family Function'];
373 :     push @$html, &HTML::make_table($col_hdrs, $tab, "Families"), "\n",
374 :     }
375 :    
376 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3