[Bio] / FigWebServices / proteinfamilies.cgi Repository:
ViewVC logotype

Annotation of /FigWebServices/proteinfamilies.cgi

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.14 - (view) (download)

1 : redwards 1.1 # -*- perl -*-
2 :    
3 :     =pod
4 :    
5 :     =head1 proteinfamilies.cgi
6 :    
7 : redwards 1.3 A base web interface for getting information about protein families in and out. Initially we are going to make a 3 (or 4) column table of protein, family and other proteins.
8 : redwards 1.1
9 : redwards 1.7 PLEASE NOTE: Do not attempt to read or understand this code. Please leave now. It is a complete mess because it is very experimental and we are trying stuff out. none of it will work. The exit is this way ---->
10 :    
11 : redwards 1.1 =cut
12 :    
13 :     use strict;
14 :     use FIG;
15 :     use HTML;
16 :     use raelib;
17 :     use CGI;
18 : overbeek 1.11 use CGI::Carp qw(fatalsToBrowser);
19 : redwards 1.1 my $cgi=new CGI;
20 :     use LWP::Simple qw(!head); # see the caveat in perldoc LWP about importing two head methods.
21 :    
22 :     my $fig;
23 :     eval {
24 :     $fig = new FIG;
25 :     };
26 :    
27 :     if ($@ ne "")
28 :     {
29 :     my $err = $@;
30 :    
31 :     my(@html);
32 :    
33 :     push(@html, $cgi->p("Error connecting to SEED database."));
34 :     if ($err =~ /Could not connect to DBI:.*could not connect to server/)
35 :     {
36 :     push(@html, $cgi->p("Could not connect to relational database of type $FIG_Config::dbms named $FIG_Config::db on port $FIG_Config::dbport."));
37 :     }
38 :     else
39 :     {
40 :     push(@html, $cgi->pre($err));
41 :     }
42 :     &HTML::show_page($cgi, \@html, 1);
43 :     exit;
44 :     }
45 :    
46 :     my $html = [];
47 :     my $user = $cgi->param('user');
48 :    
49 :     unshift(@$html, "<TITLE>The SEED - Global Protein Families </TITLE>\n");
50 :    
51 : redwards 1.2 my %proteinbase=(
52 :     "fig" => "/FIG/protein.cgi?user=$user&prot=fig|",
53 :     "cog" => "http://www.ncbi.nlm.nih.gov/COG/old/palox.cgi?",
54 :     "sp" => "http://www.expasy.org/uniprot/",
55 :     "tr" => "http://www.expasy.org/uniprot/",
56 :     "kegg" => "http://www.genome.jp/dbget-bin/www_bget?",
57 :     );
58 :    
59 :    
60 :    
61 :     if ($cgi->param('Show Proteins In Each Family'))
62 : redwards 1.1 {
63 : redwards 1.2 my @needed=grep {$cgi->param($_)} $cgi->param("allfams");
64 :     $cgi->param(-name=>'family', -value=>\@needed);
65 :     &show_family($fig,$cgi,$html);
66 :     }
67 : overbeek 1.11 elsif ($cgi->param('analyse_family')) {
68 : overbeek 1.12 # finding what is there is the same as findingh what is missing you just need one extra !
69 :     # these two things call the same method.
70 :     &analyse_family($fig,$cgi,$html, 0);
71 : redwards 1.7 }
72 : redwards 1.10 elsif ($cgi->param('reverse_analyse_family')) {
73 : overbeek 1.12 &analyse_family($fig,$cgi,$html, 1);
74 : redwards 1.10 }
75 : redwards 1.2 elsif ($cgi->param('family'))
76 :     {
77 :     &show_family($fig,$cgi,$html);
78 : redwards 1.1 }
79 :     elsif ($cgi->param('prot'))
80 :     {
81 : redwards 1.2 &show_protein($fig,$cgi,$html);
82 : redwards 1.1 }
83 : overbeek 1.14 elsif ($cgi->param('fig2kegg'))
84 :     {
85 :     &comparefig2kegg($fig,$cgi,$html);
86 :     }
87 : redwards 1.1 else
88 :     {
89 :     &show_initial($fig,$cgi,$html);
90 :     }
91 :    
92 :     &HTML::show_page($cgi,$html,1);
93 :     exit;
94 :    
95 :    
96 :     sub show_initial {
97 :     my ($fig,$cgi,$html)=@_;
98 :     # generate a blank page
99 : redwards 1.2 push @$html,
100 :     "<h2>Protein Families</h2>\n",
101 :     "<p>Please enter a protein ID . You will recieve a list of all the families that protein is in. \n",
102 :     "You can use a FIG ID such as fig|83333.1.peg.3, or an ID from SwissProt, KEGG, NCBI, and others.</p>",
103 :     $cgi->start_form(),
104 :     "Please enter a protein id: ", $cgi->textfield(-name=>"prot", -size=>40), "<br>",
105 :     "<p>Alternately, you can enter a family. Please enter a family name in the format pir|PIRSF001547 or fig|PF002363.</p>",
106 :     "Please enter a family id: ", $cgi->textfield(-name=>"family", -size=>40), "<br>",
107 :     $cgi->submit, $cgi->reset, $cgi->end_form;
108 : redwards 1.1 return $html;
109 :     }
110 :    
111 : redwards 1.7
112 : redwards 1.2 sub show_family {
113 : redwards 1.1 my ($fig,$cgi,$html)=@_;
114 : redwards 1.2 foreach my $fam ($cgi->param('family')) {
115 : redwards 1.4 my @cids=sort {$a <=> $b} $fig->ids_in_family($fam);
116 : redwards 1.2 my $tab=[];
117 : redwards 1.3 my $col_hdrs=['Cluster ID', 'Polypeptides with same amino acid sequence'];
118 : redwards 1.4 foreach my $cid (@cids) {
119 :     my @pegs=$fig->cid_to_prots($cid);
120 : redwards 1.2 foreach my $p (@pegs) {
121 :     foreach my $k (keys %proteinbase) {
122 :     if ($p =~ /^$k/) {$p =~ s/^(.*?)\|//; $p = "<a href='$proteinbase{$k}$p'>$1|$p</a>"}
123 : redwards 1.1 }
124 :     }
125 : redwards 1.4 push @$tab, [$cid, (join ", ", (@pegs))];
126 : redwards 1.1 }
127 : redwards 1.2
128 :     push @$html, "<h2>$fam Family</h2>\n",
129 :     "<p>The family $fam has the function ", $fig->family_function($fam), ", and contains ", $fig->sz_family($fam), " proteins, as shown in the table below.<br>",
130 : redwards 1.3 "Each of the sequences with a given ID have the same amino acid sequence, and hence are the same polypeptide, ",
131 :     "even though they may come from different organisms.</p>",
132 : redwards 1.2 "<p>The links will take you to the respective databases for each of the other protein families.\n</p>",
133 :     $cgi->start_form,
134 :     &HTML::make_table($col_hdrs, $tab, "Proteins in " . $fig->family_function($fam) . " ($fam)"),
135 : overbeek 1.13 $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>'family', -value=>"$fam"),
136 : overbeek 1.12 $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
137 :     $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
138 : redwards 1.2 $cgi->end_form;
139 : redwards 1.1 }
140 :     }
141 :    
142 : overbeek 1.11 sub show_protein {
143 : redwards 1.1 my ($fig,$cgi,$html)=@_;
144 : overbeek 1.11 foreach my $peg ($cgi->param('prot')) {
145 :     my @families=$fig->families_for_protein($peg);
146 :     unless (@families)
147 :     {
148 :     push @$html, "<h2 style='color: red'>Sorry, $peg is not in any protein families</h2>";
149 :     return;
150 : redwards 1.1 }
151 :    
152 : redwards 1.2 my $tab=[];
153 : overbeek 1.11 my $self=$cgi->url;
154 :     foreach my $fam (@families) {
155 :     my %idcount;
156 :     my $noprots=scalar(map {$idcount{$_}=1} $fig->ids_in_family($fam));
157 :     #push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $fig->sz_family($fam), $cgi->checkbox(-name=>$fam, -label=>'')];
158 :     push @$tab, ["<a href='$self?family=$fam'>$fam</a>", $fig->family_function($fam), $noprots, $cgi->checkbox(-name=>$fam, -label=>'')];
159 : redwards 1.2 }
160 : overbeek 1.11
161 :     my $col_hdrs=['Family ID', 'Family Function', 'Number of IDs in Family', 'Choose Family'];
162 :     push @$html, "<h2>Families for $peg</h2>\n",
163 : redwards 1.2 $cgi->start_form,
164 : overbeek 1.11 "<p>$peg is in the following ", scalar(@families), " families. Please choose one or more families using the checkboxes</p>\n",
165 :     &HTML::make_table($col_hdrs, $tab, "Families for $peg"), "\n",
166 :     $cgi->submit('Show Proteins In Each Family'),
167 :     $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
168 :     $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
169 :     $cgi->hidden(-name=>'prot'),$cgi->hidden(-name=>"allfams", -value=>\@families), "\n",
170 :     $cgi->reset, $cgi->end_form;
171 : redwards 1.2 }
172 :     }
173 : redwards 1.10
174 :    
175 : overbeek 1.12
176 :    
177 : redwards 1.10 sub analyse_family {
178 : overbeek 1.12 my ($fig,$cgi,$html, $reverse)=@_;
179 : redwards 1.10 # here are the questions:
180 :     # 1. Given a column in a spreadsheet:
181 :     # 2. Here are the proteins in that column
182 :     # 3. For each protein, here are the families that they are in. How many families are unique and how many families is every protein in?
183 :     # if we start with a column of 10 proteins, and nine of them are all in the same families and one is not, we want to exclude the one and keep the nine.
184 :     # so we recommend that a protein be removed from a family.
185 :     # 4. For each of the families that are good, which proteins are there in some/most of the families that are not in the column that we are looking at
186 :     # 5. For each of the families that are good, which proteins are only in one of those families and not in any others?
187 :    
188 :     # Note that column == family, But start with fig and then allow a replace ID feature like before.
189 :    
190 : overbeek 1.14 my $focus=$cgi->param('focus') or 'fig'; # these are the things that we are interested in
191 : overbeek 1.12
192 :     my @cols;
193 :     if ($cgi->param("allfams")) {@cols=grep {$cgi->param($_)} $cgi->param("allfams")}
194 :     elsif ($cgi->param("family")) {push @cols, $cgi->param('family')}
195 :     else {die "No families declared!"}
196 :    
197 :     foreach my $col (@cols)
198 : redwards 1.10 {
199 :     # $col is the column in the spreadsheet. This is really a family, but to visualize and code this I am doing it in a FIG-centric way
200 : overbeek 1.12 my $proteins_in_col;
201 :     map {$proteins_in_col->{$_}=1} $fig->ids_in_family($col);
202 : redwards 1.10
203 :     # @proteins are the proteins in that column, although these are cids and not fids at the moment
204 :     my $familycount;
205 : overbeek 1.12 foreach my $prot (keys %$proteins_in_col) {
206 : redwards 1.10 foreach my $fam ($fig->in_family($prot)) {
207 :     $familycount->{$fam}++;
208 :     }
209 :     }
210 :    
211 :     my $count_of;
212 :     my $fams;
213 : overbeek 1.12 foreach my $f (sort {$familycount->{$b} <=> $familycount->{$a}} keys %$familycount)
214 :     {
215 :     if ($reverse) {($fams, $count_of)=&ids_missing_from_fam($fig, $f, $focus, $proteins_in_col, $fams, $count_of)}
216 :     else {($fams, $count_of)=&ids_are_in_fam($fig, $f, $focus, $proteins_in_col, $fams, $count_of)}
217 :     }
218 : redwards 1.10
219 :     # create a list of families that we know about
220 : overbeek 1.11 my @fams=grep {!/$col/} sort {scalar(keys %{$fams->{$b}}) <=> scalar(keys %{$fams->{$a}})} keys %$fams;
221 :     unshift @fams, $col;
222 :    
223 :     my $tab=[[["Number of proteins in family", "th colspan=3"], map {scalar(keys %{$fams->{$_}})} @fams]];
224 : redwards 1.10
225 : overbeek 1.11 my @fids;
226 :     if ($cgi->param('sort') eq "genome")
227 :     {
228 :     @fids=sort {$fig->genome_of($a) <=> $fig->genome_of($b)} keys %$count_of;
229 :     }
230 :     else
231 :     {
232 :     @fids=sort {scalar(keys %{$count_of->{$b}}) <=> scalar(keys %{$count_of->{$a}})} keys %$count_of;
233 :     }
234 :    
235 :     my $rowcount;
236 :     foreach my $fid (@fids)
237 :     {
238 :     my @row=(++$rowcount, $fid, scalar(keys %{$count_of->{$fid}}));
239 :    
240 :     foreach my $fam (@fams) {
241 :     $count_of->{$fid}->{$fam} ? push @row, ["Y", "td style='background-color: lightgrey; text-align: center'"] : push @row, " &nbsp ";
242 : redwards 1.10 }
243 :     push @$tab, \@row;
244 :     }
245 : overbeek 1.12
246 :     push @$html, $cgi->start_form(), "Limit the display to proteins from ", &choose_focus($cgi);
247 :     if ($reverse) {
248 :     push @$html, $cgi->p("These are proteins that ARE NOT in ", $fig->family_function($col), " ($col) but are in other families that have proteins in this family.");
249 :     } else {
250 :     push @$html, $cgi->p("These are proteins that ARE in ", $fig->family_function($col), " ($col) and are in other families that have proteins in this family.");
251 :     }
252 :    
253 :     push @$html,
254 :     $cgi->hidden(-name=>"family", -value=>@cols), $cgi->hidden("prot"), $cgi->hidden(-name=>"user"),
255 : overbeek 1.11 $cgi->submit(-name=>'analyse_family', -value=>"Show Proteins that are in family"),
256 :     $cgi->submit(-name=>'reverse_analyse_family', -value=>"Show Proteins that are NOT in family"),
257 :     &HTML::make_table(["Count", "Protein ID", "Number of fams protein is in", @fams], $tab,' &nbsp; ');
258 : redwards 1.10 }
259 :     }
260 :    
261 :    
262 : overbeek 1.12 sub ids_are_in_fam {
263 :     my ($fig, $f, $focus, $proteins_in_col, $fams, $count_of)=@_;
264 :     # It seems that $sz_family is not right
265 :     map {$fams->{$f}->{$_}++; $count_of->{$_}->{$f}++}
266 :     grep {/^$focus/}
267 :     map {$fig->cid_to_prots($_)}
268 :     grep {$proteins_in_col->{$_}}
269 :     ($fig->ids_in_family($f));
270 :     return ($fams, $count_of);
271 :     }
272 : redwards 1.10
273 : overbeek 1.12 sub ids_missing_from_fam {
274 :     my ($fig, $f, $focus, $proteins_in_col, $fams, $count_of)=@_;
275 : redwards 1.10 # It seems that $sz_family is not right
276 : overbeek 1.11 map {$fams->{$f}->{$_}++; $count_of->{$_}->{$f}++}
277 : redwards 1.10 grep {/^$focus/}
278 :     map {$fig->cid_to_prots($_)}
279 : overbeek 1.12 grep {!$proteins_in_col->{$_}}
280 : redwards 1.10 ($fig->ids_in_family($f));
281 : overbeek 1.12 return ($fams, $count_of);
282 :     }
283 :    
284 : overbeek 1.11
285 : redwards 1.10
286 : overbeek 1.11 sub choose_focus {
287 :     my ($cgi)=@_;
288 :     my %choices=(
289 :     "fig" => "FIGfams",
290 :     "tigr" => "TIGRfams",
291 :     "pfam" => "PFAM",
292 :     "sp" => "SwissProt",
293 :     "kegg" => "KEGG",
294 :     "pir" => "PIR SuperFams",
295 :     "mcl" => "MCL",
296 :     "cog" => "COG",
297 :     );
298 :    
299 :     my $default = $cgi->param("focus"); unless ($default) {$default="fig"}
300 :    
301 :     return $cgi->popup_menu(
302 :     -name => "focus",
303 :     -values => [keys %choices],
304 :     -labels => \%choices,
305 :     -default => $default,
306 :     );
307 :     }
308 : redwards 1.10
309 : redwards 1.7
310 : overbeek 1.14 sub comparefig2kegg {
311 :     my ($fig,$cgi,$html)=@_;
312 :    
313 :     my $classification; my %subsystem;
314 :     # read classification from kegg file
315 :     if (open(IN, "$FIG_Config::global/ProteinFamilies/kegg_classificaation.txt")) {
316 :     while (<IN>) {
317 :     chomp;
318 :     my @a=split /\t/;
319 :     my $id=shift(@a);
320 :     $subsystem{"kegg|$id"}=pop(@a);
321 :     push @{$classification->{"kegg|$id"}}, \@a;
322 :     }
323 :     }
324 :    
325 :    
326 :     my $tab=[];
327 :     # find out what families our proteins are in
328 :     map {
329 :     my $prot=$_;
330 :     map {
331 :     my $fam=$_;
332 :     if ($fam =~ /^fig/) {
333 :     my %ss;
334 :     map {$ss{$_->[0]}++} ($fig->subsystems_for_role($fig->family_function($fam)));
335 :     map {my $ss=$_; push @$tab, [$prot, @{$fig->subsystem_classification($ss)}, $ss, $fam, $fig->family_function($fam)]} keys %ss;
336 :     }
337 :     else {
338 :     map {push @$tab, [$prot, @{$_}, $subsystem{$fam}, $fam, $fig->family_function($fam)]} @{$classification->{$fam}}
339 :     }
340 :     } grep {/^fig/ || /^kegg/} $fig->families_for_protein($prot);
341 :     } $cgi->param('proteins');
342 :    
343 :     my $col_hdrs=['Protein', ['Classification', 'th colspan=2'], 'Subsystem', 'Family ID', 'Family Function'];
344 :     push @$html, &HTML::make_table($col_hdrs, $tab, "Families"), "\n",
345 :     }
346 :    
347 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3