Parent Directory
|
Revision Log
Revision 1.10 - (view) (download) (as text)
1 : | parrello | 1.1 | #!/usr/bin/perl -w |
2 : | |||
3 : | package SHSigGenes; | ||
4 : | |||
5 : | use strict; | ||
6 : | use Tracer; | ||
7 : | use SearchHelper; | ||
8 : | use CGI; | ||
9 : | use HTML; | ||
10 : | use Sprout; | ||
11 : | parrello | 1.10 | use Time::HiRes; |
12 : | parrello | 1.1 | |
13 : | our @ISA = qw(SearchHelper); | ||
14 : | |||
15 : | =head1 Gene Discrimination Feature Search Helper | ||
16 : | |||
17 : | =head2 Introduction | ||
18 : | |||
19 : | This search performs a signature genes comparison. The user selects two genome sets, | ||
20 : | and the search returns genes from a given genome which are common only in the first set | ||
21 : | and not in the second. If the second set is empty, the search will return genes from | ||
22 : | the given genome that are common in the first set. | ||
23 : | |||
24 : | Gene identity will be computed in this case using bidirectional best hits. If gene X | ||
25 : | from the given genome has a BBH in a specified genome Y, then it is said to occur | ||
26 : | in whatever set includes genome Y. A gene is considered I<common> if it occurs in a | ||
27 : | certain percentage of the genomes of the set. | ||
28 : | |||
29 : | This search has the following extra parameters. | ||
30 : | |||
31 : | =over 4 | ||
32 : | |||
33 : | =item given | ||
34 : | |||
35 : | The ID of the given genome. | ||
36 : | |||
37 : | =item target[] | ||
38 : | |||
39 : | The IDs of the genomes in the first (target) set. The given genome is | ||
40 : | automatically considered a part of this set, so it can never be empty. | ||
41 : | |||
42 : | =item exclusion[] | ||
43 : | |||
44 : | The IDs of the genomes in the second (exclusion) set. If this set is empty, then | ||
45 : | no genes will be considered common in set 2, causing all genes common in set 1 | ||
46 : | to be selected. | ||
47 : | |||
48 : | =item commonality | ||
49 : | |||
50 : | Minimum score for a gene to be considered common. The score is equal to the number | ||
51 : | of genomes containing a bidirectional best hit of the gene divided by the total | ||
52 : | number of genomes. The default is C<0.8>. A value of C<1> means a gene must have | ||
53 : | BBHs in all of the genomes to be considered common; a value of C<0> is invalid. | ||
54 : | |||
55 : | =item cutoff | ||
56 : | |||
57 : | Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>. | ||
58 : | |||
59 : | =back | ||
60 : | |||
61 : | =head2 Virtual Methods | ||
62 : | |||
63 : | =head3 Form | ||
64 : | |||
65 : | parrello | 1.7 | C<< my $html = $shelp->Form(); >> |
66 : | parrello | 1.1 | |
67 : | Generate the HTML for a form to request a new search. | ||
68 : | |||
69 : | =cut | ||
70 : | |||
71 : | sub Form { | ||
72 : | # Get the parameters. | ||
73 : | my ($self) = @_; | ||
74 : | # Get the CGI and sprout objects. | ||
75 : | my $cgi = $self->Q(); | ||
76 : | my $sprout = $self->DB(); | ||
77 : | # Start the form. | ||
78 : | my $retVal = $self->FormStart("Signature Genes"); | ||
79 : | # The bulk of this form will be two genome selection menus, one for the first | ||
80 : | # (target) set and one for the second (exclusion) set. Above these two controls | ||
81 : | # there is the selector for the given genome, the commonality and cutoff values, | ||
82 : | # and the submit button. Our first task, then, is to get the genome selection | ||
83 : | # menus. | ||
84 : | parrello | 1.5 | my $givenMenu = $self->NmpdrGenomeMenu('given', 0, [$cgi->param('given')]); |
85 : | parrello | 1.4 | my $targetMenu = $self->NmpdrGenomeMenu('target', 'multiple', [$cgi->param('target')], 10, 'exclusion'); |
86 : | my $excludeMenu = $self->NmpdrGenomeMenu('exclusion', 'multiple', [$cgi->param('exclusion')], 10, 'target'); | ||
87 : | parrello | 1.1 | # Get the default values to use for the commonality and cutoff controls. |
88 : | my $commonality = $cgi->param('commonality') || "0.8"; | ||
89 : | my $cutoff = $cgi->param('cutoff') || "1e-10"; | ||
90 : | parrello | 1.6 | my $statistical = $cgi->param('statistical') || 1; |
91 : | parrello | 1.9 | # Now we build the table rows. |
92 : | parrello | 1.1 | my @rows = (); |
93 : | parrello | 1.9 | # First we have the given genome. |
94 : | push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Reference Genome"), | ||
95 : | $cgi->td({colspan => 2}, $givenMenu)); | ||
96 : | # Now show the target and exclusion menus. | ||
97 : | push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Inclusion Genomes (Set 1)"), | ||
98 : | $cgi->td({colspan => 2}, $targetMenu)); | ||
99 : | push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"), | ||
100 : | $cgi->td({colspan => 2}, $excludeMenu)); | ||
101 : | # Next, the numeric parameters. | ||
102 : | parrello | 1.1 | push @rows, $cgi->Tr($cgi->td("Commonality"), |
103 : | $cgi->td($cgi->textfield(-name => 'commonality', | ||
104 : | -value => $commonality, | ||
105 : | parrello | 1.9 | -size => 5))), |
106 : | $cgi->Tr($cgi->td(), $cgi->td( | ||
107 : | parrello | 1.5 | $cgi->checkbox(-name => 'statistical', |
108 : | -checked => $statistical, | ||
109 : | -value => 1, | ||
110 : | -label => 'Use Statistical Algorithm'))); | ||
111 : | parrello | 1.1 | push @rows, $cgi->Tr($cgi->td("Cutoff"), |
112 : | $cgi->td($cgi->textfield(-name => 'cutoff', | ||
113 : | -value => $cutoff, | ||
114 : | -size => 5))); | ||
115 : | parrello | 1.9 | # Next, the feature filter rows. |
116 : | push @rows, $self->FeatureFilterRows(); | ||
117 : | # Finally, the submit button. | ||
118 : | parrello | 1.1 | push @rows, $self->SubmitRow(); |
119 : | # Create the table. | ||
120 : | $retVal .= $self->MakeTable(\@rows); | ||
121 : | # Close the form. | ||
122 : | $retVal .= $self->FormEnd(); | ||
123 : | # Return the result. | ||
124 : | return $retVal; | ||
125 : | } | ||
126 : | |||
127 : | =head3 Find | ||
128 : | |||
129 : | C<< my $resultCount = $shelp->Find(); >> | ||
130 : | |||
131 : | Conduct a search based on the current CGI query parameters. The search results will | ||
132 : | be written to the session cache file and the number of results will be | ||
133 : | returned. If the search parameters are invalid, a result count of C<undef> will be | ||
134 : | returned and a result message will be stored in this object describing the problem. | ||
135 : | |||
136 : | =cut | ||
137 : | |||
138 : | sub Find { | ||
139 : | # Get the parameters. | ||
140 : | my ($self) = @_; | ||
141 : | # Get the sprout and CGI query objects. | ||
142 : | my $cgi = $self->Q(); | ||
143 : | my $sprout = $self->DB(); | ||
144 : | parrello | 1.2 | # Declare the return variable. If it remains undefined, the caller will |
145 : | # assume there was an error. | ||
146 : | parrello | 1.1 | my $retVal; |
147 : | parrello | 1.10 | # Create the timers. |
148 : | my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0); | ||
149 : | parrello | 1.2 | # Validate the numeric parameters. |
150 : | my $commonality = $cgi->param('commonality'); | ||
151 : | my $cutoff = $cgi->param('cutoff'); | ||
152 : | if ($commonality !~ /^\s*\d(\.\d+)?\s*$/) { | ||
153 : | $self->SetMessage("Commonality value appears invalid, too big, negative, or not a number."); | ||
154 : | } elsif ($commonality <= 0 || $commonality > 1) { | ||
155 : | $self->SetMessage("Commonality cannot be 0 and cannot be greater than 1."); | ||
156 : | } elsif ($cutoff !~ /^\s*\d(.\d+)?(e\-\d+)?\s*$/) { | ||
157 : | $self->SetMessage("Cutoff must be an exponential number (e.g. \"1e-20\" or \"2.5e-11\"."); | ||
158 : | } elsif ($cutoff > 1) { | ||
159 : | $self->SetMessage("Cutoff cannot be greater than 1."); | ||
160 : | } else { | ||
161 : | # Now we need to gather and validate the genome sets. | ||
162 : | parrello | 1.3 | my ($givenGenomeID) = $self->GetGenomes('given'); |
163 : | my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target'); | ||
164 : | my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion'); | ||
165 : | parrello | 1.2 | # Insure the given genome is not in the exclusion set. |
166 : | if ($exclusionGenomes{$givenGenomeID}) { | ||
167 : | $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set."); | ||
168 : | } else { | ||
169 : | # Insure the given genome is in the target set. | ||
170 : | $targetGenomes{$givenGenomeID} = 1 | ||
171 : | } | ||
172 : | parrello | 1.5 | # Find out if we want to use a statistical analysis. |
173 : | my $statistical = $cgi->param('statistical') || 0; | ||
174 : | parrello | 1.2 | # Denote we have not yet found any genomes. |
175 : | $retVal = 0; | ||
176 : | parrello | 1.10 | # Compute the list of genomes of interest. |
177 : | my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes); | ||
178 : | # Get the BBH matrix. | ||
179 : | $saveTime = time(); | ||
180 : | my %bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes); | ||
181 : | $bbhTimer += time() - $saveTime; | ||
182 : | parrello | 1.4 | # Create a feature query object to loop through the chosen features of the given |
183 : | # genome. | ||
184 : | parrello | 1.10 | Trace("Creating feature query.") if T(3); |
185 : | $saveTime = time(); | ||
186 : | parrello | 1.4 | my $fquery = FeatureQuery->new($self, $givenGenomeID); |
187 : | parrello | 1.10 | $queryTimer += time() - $saveTime; |
188 : | parrello | 1.4 | # Get the sizes of the two sets. This information is useful in computing commonality. |
189 : | my $targetSetSize = scalar keys %targetGenomes; | ||
190 : | my $exclusionSetSize = scalar keys %exclusionGenomes; | ||
191 : | # Loop through the features. | ||
192 : | parrello | 1.10 | my $done = 0; |
193 : | while (! $done) { | ||
194 : | # Get the next feature. | ||
195 : | $saveTime = time(); | ||
196 : | my $record = $fquery->Fetch(); | ||
197 : | $queryTimer += time() - $saveTime; | ||
198 : | if (! $record) { | ||
199 : | $done = 1; | ||
200 : | } else { | ||
201 : | # Get the feature's ID. | ||
202 : | my $fid = $fquery->FID(); | ||
203 : | Trace("Processing feature $fid.") if T(4); | ||
204 : | # Get its list of BBHs. The list is actually a hash mapping each BBH to its | ||
205 : | # score. All we care about, however, are the BBHs themselves. | ||
206 : | my $bbhList = $bbhMatrix{$fid}; | ||
207 : | # We next wish to loop through the BBH IDs, counting how many are in each of the | ||
208 : | # sets. If a genome occurs twice, we only want to count the first occurrence, so | ||
209 : | # we have a hash of genomes we've already seen. | ||
210 : | my %alreadySeen = (); | ||
211 : | # Clear the exclusion count. | ||
212 : | my $exclusionCount = 0; | ||
213 : | # Denote that we're in our own genome. | ||
214 : | $alreadySeen{$givenGenomeID} = 1; | ||
215 : | my $targetCount = 1; | ||
216 : | # Loop through the BBHs. | ||
217 : | for my $bbhPeg (keys %{$bbhList}) { | ||
218 : | # Get the genome ID. We want to find out if this genome is new. | ||
219 : | my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg); | ||
220 : | if (! $alreadySeen{$genomeID}) { | ||
221 : | # It's new, so we check to see which set it's in. | ||
222 : | if ($targetGenomes{$genomeID}) { | ||
223 : | $targetCount++; | ||
224 : | } elsif ($exclusionGenomes{$genomeID}) { | ||
225 : | $exclusionCount++; | ||
226 : | } | ||
227 : | # Make sure we don't look at it again. | ||
228 : | $alreadySeen{$genomeID} = 1; | ||
229 : | } | ||
230 : | } | ||
231 : | # Create a variable to indicate whether or not we want to keep this feature. | ||
232 : | my $okFlag; | ||
233 : | # We need to see if we're using statistics or not. This only matters | ||
234 : | # for a two-set situation. | ||
235 : | if ($statistical && $exclusionSetSize > 0) { | ||
236 : | # This looks like it has something to do with variance computations, | ||
237 : | # but I'm not sure. | ||
238 : | my $targetNotCount = $targetSetSize - $targetCount; | ||
239 : | my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount; | ||
240 : | my $exclusionNotCount = $exclusionSetSize - $exclusionCount; | ||
241 : | my $exclusionSquare = $exclusionCount * $exclusionCount + $exclusionNotCount * $exclusionNotCount; | ||
242 : | my $mixed = $targetCount * $exclusionCount + $targetNotCount * $exclusionNotCount; | ||
243 : | my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare)); | ||
244 : | my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare)); | ||
245 : | # If the two differentials are greater than one, we keep this feature. | ||
246 : | $okFlag = ($inD + $outD > 1); | ||
247 : | } else { | ||
248 : | # Check to see if we're common in set 1 and not in set 2. | ||
249 : | if (IsCommon($targetCount, $targetSetSize, $commonality) && | ||
250 : | ! IsCommon($exclusionCount, $exclusionSetSize, $commonality)) { | ||
251 : | # We satisfy the criterion, so we put this feature to the output. | ||
252 : | $okFlag = 1; | ||
253 : | parrello | 1.4 | } |
254 : | } | ||
255 : | parrello | 1.10 | if ($okFlag) { |
256 : | # Put this feature to the output. | ||
257 : | $saveTime = time(); | ||
258 : | $self->PutFeature($fquery); | ||
259 : | $putTimer += time() - $saveTime; | ||
260 : | # Increase the result count. | ||
261 : | $retVal++; | ||
262 : | } | ||
263 : | # Check for a timer trace. We trace every 500 features. | ||
264 : | $loopCounter++; | ||
265 : | if (T(3) && $loopCounter % 500 == 0) { | ||
266 : | Trace("Time spent for $loopCounter features: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer."); | ||
267 : | parrello | 1.5 | } |
268 : | } | ||
269 : | parrello | 1.4 | } |
270 : | parrello | 1.5 | # Close the session file. |
271 : | parrello | 1.10 | $saveTime = time(); |
272 : | parrello | 1.5 | $self->CloseSession(); |
273 : | parrello | 1.10 | $putTimer += time() - $saveTime; |
274 : | parrello | 1.2 | } |
275 : | parrello | 1.10 | # Trace the timers. |
276 : | Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3); | ||
277 : | parrello | 1.1 | # Return the result count. |
278 : | return $retVal; | ||
279 : | } | ||
280 : | |||
281 : | =head3 Description | ||
282 : | |||
283 : | C<< my $htmlText = $shelp->Description(); >> | ||
284 : | |||
285 : | Return a description of this search. The description is used for the table of contents | ||
286 : | on the main search tools page. It may contain HTML, but it should be character-level, | ||
287 : | not block-level, since the description is going to appear in a list. | ||
288 : | |||
289 : | =cut | ||
290 : | |||
291 : | sub Description { | ||
292 : | # Get the parameters. | ||
293 : | my ($self) = @_; | ||
294 : | # Return the result. | ||
295 : | parrello | 1.8 | return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms."; |
296 : | parrello | 1.1 | } |
297 : | |||
298 : | parrello | 1.4 | =head2 Internal Utilities |
299 : | |||
300 : | =head3 IsCommon | ||
301 : | |||
302 : | C<< my $flag = SHSigGenes::IsCommon($count, $size, $commonality); >> | ||
303 : | |||
304 : | Return TRUE if a specified count indicates a gene is common in a specified set. | ||
305 : | Commonality is computed by dividing the count by the size of the set and | ||
306 : | comparing the result to the minimum commonality ratio. The one exception is | ||
307 : | if the set size is 0. In that case, this method always returns FALSE. | ||
308 : | |||
309 : | =over 4 | ||
310 : | |||
311 : | =item count | ||
312 : | |||
313 : | Number of elements of the set that have the relevant characteristic. | ||
314 : | |||
315 : | =item size | ||
316 : | |||
317 : | Total number of elements in the set. | ||
318 : | |||
319 : | =item commonality | ||
320 : | |||
321 : | Minimum count/size ratio for the characteristic to be considered common. | ||
322 : | |||
323 : | =item RETURN | ||
324 : | |||
325 : | Returns TRUE if the characteristic is common, else FALSE. | ||
326 : | |||
327 : | =back | ||
328 : | |||
329 : | =cut | ||
330 : | |||
331 : | sub IsCommon { | ||
332 : | # Get the parameters. | ||
333 : | my ($count, $size, $commonality) = @_; | ||
334 : | # Declare the return variable. | ||
335 : | my $retVal = 0; | ||
336 : | # Only procced if the size is positive. | ||
337 : | if ($size > 0) { | ||
338 : | $retVal = ($count/$size >= $commonality); | ||
339 : | } | ||
340 : | # Return the result. | ||
341 : | return $retVal; | ||
342 : | } | ||
343 : | |||
344 : | parrello | 1.1 | 1; |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |