[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Annotation of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.11 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SHSigGenes;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use SearchHelper;
8 :     use CGI;
9 :     use HTML;
10 :     use Sprout;
11 : parrello 1.10 use Time::HiRes;
12 : parrello 1.11 use FIGRules;
13 : parrello 1.1
14 :     our @ISA = qw(SearchHelper);
15 :    
16 :     =head1 Gene Discrimination Feature Search Helper
17 :    
18 :     =head2 Introduction
19 :    
20 :     This search performs a signature genes comparison. The user selects two genome sets,
21 :     and the search returns genes from a given genome which are common only in the first set
22 :     and not in the second. If the second set is empty, the search will return genes from
23 :     the given genome that are common in the first set.
24 :    
25 :     Gene identity will be computed in this case using bidirectional best hits. If gene X
26 :     from the given genome has a BBH in a specified genome Y, then it is said to occur
27 :     in whatever set includes genome Y. A gene is considered I<common> if it occurs in a
28 :     certain percentage of the genomes of the set.
29 :    
30 :     This search has the following extra parameters.
31 :    
32 :     =over 4
33 :    
34 :     =item given
35 :    
36 :     The ID of the given genome.
37 :    
38 :     =item target[]
39 :    
40 :     The IDs of the genomes in the first (target) set. The given genome is
41 :     automatically considered a part of this set, so it can never be empty.
42 :    
43 :     =item exclusion[]
44 :    
45 :     The IDs of the genomes in the second (exclusion) set. If this set is empty, then
46 :     no genes will be considered common in set 2, causing all genes common in set 1
47 :     to be selected.
48 :    
49 :     =item commonality
50 :    
51 :     Minimum score for a gene to be considered common. The score is equal to the number
52 :     of genomes containing a bidirectional best hit of the gene divided by the total
53 :     number of genomes. The default is C<0.8>. A value of C<1> means a gene must have
54 :     BBHs in all of the genomes to be considered common; a value of C<0> is invalid.
55 :    
56 :     =item cutoff
57 :    
58 :     Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
59 :    
60 : parrello 1.11 =item showMatch
61 :    
62 :     If TRUE, then all the genes in the target set that match the ones in the reference genome
63 :     will be shown in an extra column.
64 :    
65 : parrello 1.1 =back
66 :    
67 :     =head2 Virtual Methods
68 :    
69 :     =head3 Form
70 :    
71 : parrello 1.7 C<< my $html = $shelp->Form(); >>
72 : parrello 1.1
73 :     Generate the HTML for a form to request a new search.
74 :    
75 :     =cut
76 :    
77 :     sub Form {
78 :     # Get the parameters.
79 :     my ($self) = @_;
80 :     # Get the CGI and sprout objects.
81 :     my $cgi = $self->Q();
82 :     my $sprout = $self->DB();
83 :     # Start the form.
84 :     my $retVal = $self->FormStart("Signature Genes");
85 :     # The bulk of this form will be two genome selection menus, one for the first
86 :     # (target) set and one for the second (exclusion) set. Above these two controls
87 :     # there is the selector for the given genome, the commonality and cutoff values,
88 :     # and the submit button. Our first task, then, is to get the genome selection
89 :     # menus.
90 : parrello 1.5 my $givenMenu = $self->NmpdrGenomeMenu('given', 0, [$cgi->param('given')]);
91 : parrello 1.4 my $targetMenu = $self->NmpdrGenomeMenu('target', 'multiple', [$cgi->param('target')], 10, 'exclusion');
92 :     my $excludeMenu = $self->NmpdrGenomeMenu('exclusion', 'multiple', [$cgi->param('exclusion')], 10, 'target');
93 : parrello 1.1 # Get the default values to use for the commonality and cutoff controls.
94 :     my $commonality = $cgi->param('commonality') || "0.8";
95 :     my $cutoff = $cgi->param('cutoff') || "1e-10";
96 : parrello 1.6 my $statistical = $cgi->param('statistical') || 1;
97 : parrello 1.11 my $showMatch = $cgi->param('showMatch') || 0;
98 : parrello 1.9 # Now we build the table rows.
99 : parrello 1.1 my @rows = ();
100 : parrello 1.9 # First we have the given genome.
101 :     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Reference Genome"),
102 :     $cgi->td({colspan => 2}, $givenMenu));
103 :     # Now show the target and exclusion menus.
104 :     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Inclusion Genomes (Set 1)"),
105 :     $cgi->td({colspan => 2}, $targetMenu));
106 :     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),
107 :     $cgi->td({colspan => 2}, $excludeMenu));
108 :     # Next, the numeric parameters.
109 : parrello 1.1 push @rows, $cgi->Tr($cgi->td("Commonality"),
110 :     $cgi->td($cgi->textfield(-name => 'commonality',
111 :     -value => $commonality,
112 : parrello 1.9 -size => 5))),
113 : parrello 1.11 $cgi->Tr($cgi->td(), $cgi->td(join(" ",
114 : parrello 1.5 $cgi->checkbox(-name => 'statistical',
115 :     -checked => $statistical,
116 :     -value => 1,
117 : parrello 1.11 -label => 'Use Statistical Algorithm')),
118 :     $cgi->checkbox(-name => 'showMatch',
119 :     -checked => $showMatch,
120 :     -value => 1,
121 :     -label => 'Show Matching Genes')));
122 : parrello 1.1 push @rows, $cgi->Tr($cgi->td("Cutoff"),
123 :     $cgi->td($cgi->textfield(-name => 'cutoff',
124 :     -value => $cutoff,
125 :     -size => 5)));
126 : parrello 1.9 # Next, the feature filter rows.
127 :     push @rows, $self->FeatureFilterRows();
128 :     # Finally, the submit button.
129 : parrello 1.1 push @rows, $self->SubmitRow();
130 :     # Create the table.
131 :     $retVal .= $self->MakeTable(\@rows);
132 :     # Close the form.
133 :     $retVal .= $self->FormEnd();
134 :     # Return the result.
135 :     return $retVal;
136 :     }
137 :    
138 :     =head3 Find
139 :    
140 :     C<< my $resultCount = $shelp->Find(); >>
141 :    
142 :     Conduct a search based on the current CGI query parameters. The search results will
143 :     be written to the session cache file and the number of results will be
144 :     returned. If the search parameters are invalid, a result count of C<undef> will be
145 :     returned and a result message will be stored in this object describing the problem.
146 :    
147 :     =cut
148 :    
149 :     sub Find {
150 :     # Get the parameters.
151 :     my ($self) = @_;
152 :     # Get the sprout and CGI query objects.
153 :     my $cgi = $self->Q();
154 :     my $sprout = $self->DB();
155 : parrello 1.2 # Declare the return variable. If it remains undefined, the caller will
156 :     # assume there was an error.
157 : parrello 1.1 my $retVal;
158 : parrello 1.11 # Denote the extra columns go at the end.
159 :     $self->SetExtraPos(1);
160 : parrello 1.10 # Create the timers.
161 :     my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);
162 : parrello 1.2 # Validate the numeric parameters.
163 :     my $commonality = $cgi->param('commonality');
164 :     my $cutoff = $cgi->param('cutoff');
165 :     if ($commonality !~ /^\s*\d(\.\d+)?\s*$/) {
166 :     $self->SetMessage("Commonality value appears invalid, too big, negative, or not a number.");
167 :     } elsif ($commonality <= 0 || $commonality > 1) {
168 :     $self->SetMessage("Commonality cannot be 0 and cannot be greater than 1.");
169 :     } elsif ($cutoff !~ /^\s*\d(.\d+)?(e\-\d+)?\s*$/) {
170 :     $self->SetMessage("Cutoff must be an exponential number (e.g. \"1e-20\" or \"2.5e-11\".");
171 :     } elsif ($cutoff > 1) {
172 :     $self->SetMessage("Cutoff cannot be greater than 1.");
173 :     } else {
174 :     # Now we need to gather and validate the genome sets.
175 : parrello 1.3 my ($givenGenomeID) = $self->GetGenomes('given');
176 :     my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');
177 :     my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');
178 : parrello 1.2 # Insure the given genome is not in the exclusion set.
179 :     if ($exclusionGenomes{$givenGenomeID}) {
180 :     $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");
181 :     } else {
182 :     # Insure the given genome is in the target set.
183 :     $targetGenomes{$givenGenomeID} = 1
184 :     }
185 : parrello 1.5 # Find out if we want to use a statistical analysis.
186 : parrello 1.11 my $statistical = $cgi->param('statistical') || 1;
187 :     # Find out if we need to show matching genes.
188 :     my $showMatch = $cgi->param('showMatch') || 0;
189 : parrello 1.2 # Denote we have not yet found any genomes.
190 :     $retVal = 0;
191 : parrello 1.10 # Compute the list of genomes of interest.
192 :     my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);
193 :     # Get the BBH matrix.
194 :     $saveTime = time();
195 :     my %bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);
196 :     $bbhTimer += time() - $saveTime;
197 : parrello 1.4 # Create a feature query object to loop through the chosen features of the given
198 :     # genome.
199 : parrello 1.10 Trace("Creating feature query.") if T(3);
200 :     $saveTime = time();
201 : parrello 1.4 my $fquery = FeatureQuery->new($self, $givenGenomeID);
202 : parrello 1.10 $queryTimer += time() - $saveTime;
203 : parrello 1.4 # Get the sizes of the two sets. This information is useful in computing commonality.
204 :     my $targetSetSize = scalar keys %targetGenomes;
205 :     my $exclusionSetSize = scalar keys %exclusionGenomes;
206 :     # Loop through the features.
207 : parrello 1.10 my $done = 0;
208 :     while (! $done) {
209 :     # Get the next feature.
210 :     $saveTime = time();
211 :     my $record = $fquery->Fetch();
212 :     $queryTimer += time() - $saveTime;
213 :     if (! $record) {
214 :     $done = 1;
215 :     } else {
216 :     # Get the feature's ID.
217 :     my $fid = $fquery->FID();
218 :     Trace("Processing feature $fid.") if T(4);
219 :     # Get its list of BBHs. The list is actually a hash mapping each BBH to its
220 :     # score. All we care about, however, are the BBHs themselves.
221 :     my $bbhList = $bbhMatrix{$fid};
222 :     # We next wish to loop through the BBH IDs, counting how many are in each of the
223 :     # sets. If a genome occurs twice, we only want to count the first occurrence, so
224 : parrello 1.11 # we have a hash of genomes we've already seen. The hash will map each gene ID
225 :     # to 0, 1, or 2, depending on whether it was found in the reference genome,
226 :     # a target genome, or an exclusion genome.
227 : parrello 1.10 my %alreadySeen = ();
228 : parrello 1.11 # Save the matching genes in here.
229 :     my %genesMatching = ();
230 : parrello 1.10 # Clear the exclusion count.
231 :     my $exclusionCount = 0;
232 :     # Denote that we're in our own genome.
233 : parrello 1.11 $alreadySeen{$givenGenomeID} = 0;
234 : parrello 1.10 my $targetCount = 1;
235 :     # Loop through the BBHs.
236 :     for my $bbhPeg (keys %{$bbhList}) {
237 :     # Get the genome ID. We want to find out if this genome is new.
238 :     my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);
239 : parrello 1.11 if (! exists $alreadySeen{$genomeID}) {
240 : parrello 1.10 # It's new, so we check to see which set it's in.
241 :     if ($targetGenomes{$genomeID}) {
242 : parrello 1.11 # It's in the target set.
243 : parrello 1.10 $targetCount++;
244 : parrello 1.11 $alreadySeen{$genomeID} = 1;
245 : parrello 1.10 } elsif ($exclusionGenomes{$genomeID}) {
246 : parrello 1.11 # It's in the exclusion set.
247 : parrello 1.10 $exclusionCount++;
248 : parrello 1.11 $alreadySeen{$genomeID} = 2;
249 :     }
250 :     # Note that $alreadySeen{$genomeID} exists in the hash by this
251 :     # point. If it's 1, we need to save the current PEG.
252 :     if ($alreadySeen{$genomeID} == 1) {
253 :     $genesMatching{$bbhPeg} = 1;
254 : parrello 1.10 }
255 :     }
256 :     }
257 : parrello 1.11 # Create a variable to indicate whether or not we want to keep this feature and
258 :     # another for the score.
259 :     my ($okFlag, $score);
260 : parrello 1.10 # We need to see if we're using statistics or not. This only matters
261 :     # for a two-set situation.
262 :     if ($statistical && $exclusionSetSize > 0) {
263 : parrello 1.11 # This is the magic formula for choosing the differentiating genes. It looks like
264 :     # it has something to do with variance computations, but I'm not sure.
265 : parrello 1.10 my $targetNotCount = $targetSetSize - $targetCount;
266 :     my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
267 :     my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
268 :     my $exclusionSquare = $exclusionCount * $exclusionCount + $exclusionNotCount * $exclusionNotCount;
269 :     my $mixed = $targetCount * $exclusionCount + $targetNotCount * $exclusionNotCount;
270 :     my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
271 :     my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
272 :     # If the two differentials are greater than one, we keep this feature.
273 : parrello 1.11 $score = $inD + $outD;
274 :     $okFlag = ($score > 1);
275 : parrello 1.10 } else {
276 :     # Check to see if we're common in set 1 and not in set 2.
277 : parrello 1.11 my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
278 :     my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
279 :     if ($score1 && ! $score2) {
280 :     # We satisfy the criterion, so we put this feature to the output. The
281 :     # score is normalize to a range similar to the scores in the statistical
282 :     # method.
283 :     $score = $score1 + (1 - $score2);
284 : parrello 1.10 $okFlag = 1;
285 : parrello 1.4 }
286 :     }
287 : parrello 1.10 if ($okFlag) {
288 : parrello 1.11 # Put this feature to the output. We have one or two extra columns.
289 :     # First we store the score.
290 :     $fquery->AddExtraColumns(score => sprintf("%.3f",$score));
291 :     # Next we add the list of matching genes, but only if "showMatch" is specified.
292 :     if ($showMatch) {
293 :     # The matching genes are in the hash "genesMatching".
294 :     my @genes = sort { FIGRules::FIGCompare($a,$b) } keys %genesMatching;
295 :     # We need to linkify them.
296 :     my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
297 :     # Now add them as an extra column.
298 :     $fquery->AddExtraColumns(matches => $genesHTML);
299 :     }
300 : parrello 1.10 $saveTime = time();
301 :     $self->PutFeature($fquery);
302 :     $putTimer += time() - $saveTime;
303 :     # Increase the result count.
304 :     $retVal++;
305 :     }
306 :     # Check for a timer trace. We trace every 500 features.
307 :     $loopCounter++;
308 :     if (T(3) && $loopCounter % 500 == 0) {
309 :     Trace("Time spent for $loopCounter features: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.");
310 : parrello 1.5 }
311 :     }
312 : parrello 1.4 }
313 : parrello 1.5 # Close the session file.
314 : parrello 1.10 $saveTime = time();
315 : parrello 1.5 $self->CloseSession();
316 : parrello 1.10 $putTimer += time() - $saveTime;
317 : parrello 1.2 }
318 : parrello 1.10 # Trace the timers.
319 :     Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3);
320 : parrello 1.1 # Return the result count.
321 :     return $retVal;
322 :     }
323 :    
324 :     =head3 Description
325 :    
326 :     C<< my $htmlText = $shelp->Description(); >>
327 :    
328 :     Return a description of this search. The description is used for the table of contents
329 :     on the main search tools page. It may contain HTML, but it should be character-level,
330 :     not block-level, since the description is going to appear in a list.
331 :    
332 :     =cut
333 :    
334 :     sub Description {
335 :     # Get the parameters.
336 :     my ($self) = @_;
337 :     # Return the result.
338 : parrello 1.8 return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";
339 : parrello 1.1 }
340 :    
341 : parrello 1.4 =head2 Internal Utilities
342 :    
343 :     =head3 IsCommon
344 :    
345 : parrello 1.11 C<< my $score = SHSigGenes::IsCommon($count, $size, $commonality); >>
346 : parrello 1.4
347 : parrello 1.11 Return the match score if a specified count indicates a gene is common in a specified set
348 :     and 0 otherwise. Commonality is computed by dividing the count by the size of the set and
349 : parrello 1.4 comparing the result to the minimum commonality ratio. The one exception is
350 : parrello 1.11 if the set size is 0. In that case, this method always returns 0.
351 : parrello 1.4
352 :     =over 4
353 :    
354 :     =item count
355 :    
356 :     Number of elements of the set that have the relevant characteristic.
357 :    
358 :     =item size
359 :    
360 :     Total number of elements in the set.
361 :    
362 :     =item commonality
363 :    
364 :     Minimum count/size ratio for the characteristic to be considered common.
365 :    
366 :     =item RETURN
367 :    
368 :     Returns TRUE if the characteristic is common, else FALSE.
369 :    
370 :     =back
371 :    
372 :     =cut
373 :    
374 :     sub IsCommon {
375 :     # Get the parameters.
376 :     my ($count, $size, $commonality) = @_;
377 :     # Declare the return variable.
378 :     my $retVal = 0;
379 :     # Only procced if the size is positive.
380 :     if ($size > 0) {
381 : parrello 1.11 # Compute the commonality.
382 :     $retVal = $count/$size;
383 :     # If it's too small, clear it.
384 :     if ($retVal < $commonality) {
385 :     $retVal = 0;
386 :     }
387 : parrello 1.4 }
388 :     # Return the result.
389 :     return $retVal;
390 :     }
391 :    
392 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3