[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Annotation of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.16 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SHSigGenes;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use CGI;
8 :     use HTML;
9 :     use Sprout;
10 : parrello 1.10 use Time::HiRes;
11 : parrello 1.11 use FIGRules;
12 : parrello 1.16 use RHFeatures;
13 :     use base 'SearchHelper';
14 : parrello 1.1
15 :     =head1 Gene Discrimination Feature Search Helper
16 :    
17 :     =head2 Introduction
18 :    
19 :     This search performs a signature genes comparison. The user selects two genome sets,
20 :     and the search returns genes from a given genome which are common only in the first set
21 :     and not in the second. If the second set is empty, the search will return genes from
22 :     the given genome that are common in the first set.
23 :    
24 :     Gene identity will be computed in this case using bidirectional best hits. If gene X
25 :     from the given genome has a BBH in a specified genome Y, then it is said to occur
26 :     in whatever set includes genome Y. A gene is considered I<common> if it occurs in a
27 :     certain percentage of the genomes of the set.
28 :    
29 :     This search has the following extra parameters.
30 :    
31 :     =over 4
32 :    
33 :     =item given
34 :    
35 :     The ID of the given genome.
36 :    
37 :     =item target[]
38 :    
39 :     The IDs of the genomes in the first (target) set. The given genome is
40 :     automatically considered a part of this set, so it can never be empty.
41 :    
42 :     =item exclusion[]
43 :    
44 :     The IDs of the genomes in the second (exclusion) set. If this set is empty, then
45 :     no genes will be considered common in set 2, causing all genes common in set 1
46 :     to be selected.
47 :    
48 :     =item commonality
49 :    
50 :     Minimum score for a gene to be considered common. The score is equal to the number
51 :     of genomes containing a bidirectional best hit of the gene divided by the total
52 :     number of genomes. The default is C<0.8>. A value of C<1> means a gene must have
53 :     BBHs in all of the genomes to be considered common; a value of C<0> is invalid.
54 :    
55 :     =item cutoff
56 :    
57 :     Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
58 :    
59 : parrello 1.11 =item showMatch
60 :    
61 :     If TRUE, then all the genes in the target set that match the ones in the reference genome
62 :     will be shown in an extra column.
63 :    
64 : parrello 1.1 =back
65 :    
66 :     =head2 Virtual Methods
67 :    
68 :     =head3 Form
69 :    
70 : parrello 1.7 C<< my $html = $shelp->Form(); >>
71 : parrello 1.1
72 :     Generate the HTML for a form to request a new search.
73 :    
74 :     =cut
75 :    
sub Form {
    # Build the HTML for the signature-genes search form and return it as a
    # single string. The only argument is the search helper object itself.
    my ($self) = @_;
    # Get the CGI query object, which supplies both the current parameter
    # values and the HTML-generation methods.
    my $cgi = $self->Q();
    # NOTE(review): the database handle is not used anywhere below. The call is
    # retained only because this file does not show whether DB() has side
    # effects that later calls depend on.
    my $sprout = $self->DB();
    # Start the form.
    my $retVal = $self->FormStart("Signature Genes");
    # The bulk of this form is two genome selection menus, one for the first
    # (target) set and one for the second (exclusion) set. Above them is the
    # selector for the reference genome. Each multi-select menu is told the name
    # of the other one via its last argument.
    my $givenMenu = $self->NmpdrGenomeMenu('given', 0, [$cgi->param('given')]);
    my $targetMenu = $self->NmpdrGenomeMenu('target', 'multiple', [$cgi->param('target')], 10, 'exclusion');
    my $excludeMenu = $self->NmpdrGenomeMenu('exclusion', 'multiple', [$cgi->param('exclusion')], 10, 'target');
    # Get the values to show in the tuning controls, falling back to the
    # defaults when a parameter is absent (or false).
    my $commonality = $cgi->param('commonality') || "0.8";
    my $cutoff = $cgi->param('cutoff') || "1e-10";
    # The statistical algorithm is on by default; the other options are off.
    my $statistical = $cgi->param('statistical') || 1;
    my $showMatch = $cgi->param('showMatch') || 0;
    my $useSims = $cgi->param('useSims') || 0;
    # Now we build the table rows.
    my @rows = ();
    # First we have the given (reference) genome.
    push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Reference Genome"),
                         $cgi->td({colspan => 2}, $givenMenu));
    # Now show the target and exclusion menus.
    push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Inclusion Genomes (Set 1)"),
                         $cgi->td({colspan => 2}, $targetMenu));
    push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),
                         $cgi->td({colspan => 2}, $excludeMenu));
    # Next, the tuning parameters: the commonality threshold...
    push @rows, $cgi->Tr($cgi->td("Commonality"),
                         $cgi->td($cgi->textfield(-name => 'commonality',
                                                  -value => $commonality,
                                                  -size => 5)));
    # ...the algorithm check boxes, each followed by its hint...
    push @rows, $cgi->Tr($cgi->td(), $cgi->td(join(" ",
                         $cgi->checkbox(-name => 'statistical',
                                        -checked => $statistical,
                                        -value => 1,
                                        -label => 'Use Statistical Algorithm') .
                         SearchHelper::Hint("When two sets of genomes are specified, check this " .
                                            "box to use a statistical algorithm designed " .
                                            "specifically to choose differentiating genes. " .
                                            "This box has no effect when looking for genes " .
                                            "in common."),
                         $cgi->checkbox(-name => 'useSims',
                                        -checked => $useSims,
                                        -value => 1,
                                        -label => 'Use Similarities') .
                         SearchHelper::Hint("Normally, Bidirectional Best Hits are used to " .
                                            "find matching genes. Check this box to use " .
                                            "similarities instead."))));
    # ...the option to list the matching genes in the results...
    push @rows, $cgi->Tr($cgi->td(), $cgi->td(join(" ",
                         $cgi->checkbox(-name => 'showMatch',
                                        -checked => $showMatch,
                                        -value => 1,
                                        -label => 'Show Matching Genes') .
                         SearchHelper::Hint("Check this button to display the genes matching " .
                                            "each gene displayed in the results."))));
    # ...and the match cutoff.
    push @rows, $cgi->Tr($cgi->td("Cutoff"),
                         $cgi->td($cgi->textfield(-name => 'cutoff',
                                                  -value => $cutoff,
                                                  -size => 5)));
    # Next, the feature filter rows supplied by the feature result helper.
    push @rows, RHFeatures::WordSearchRow($self);
    push @rows, RHFeatures::FeatureFilterFormRows($self);
    # Finally, the submit button.
    push @rows, $self->SubmitRow();
    # Create the table.
    $retVal .= $self->MakeTable(\@rows);
    # Close the form.
    $retVal .= $self->FormEnd();
    # Return the result.
    return $retVal;
}
154 :    
155 :     =head3 Find
156 :    
157 :     C<< my $resultCount = $shelp->Find(); >>
158 :    
159 :     Conduct a search based on the current CGI query parameters. The search results will
160 :     be written to the session cache file and the number of results will be
161 :     returned. If the search parameters are invalid, a result count of C<undef> will be
162 :     returned and a result message will be stored in this object describing the problem.
163 :    
164 :     =cut
165 :    
sub Find {
    # Run the signature-genes search described by the current CGI parameters.
    # Returns the number of features written to the session, or undef if a
    # parameter was invalid (in which case a message has been stored in this
    # object via SetMessage).
    my ($self) = @_;
    # Get the sprout and CGI query objects.
    my $cgi = $self->Q();
    my $sprout = $self->DB();
    # Declare the return variable. If it remains undefined, the caller will
    # assume there was an error.
    my $retVal;
    # Create the timers.
    my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);
    # Get the numeric parameters. A parameter that was not supplied at all gets
    # its documented default, which also avoids matching against undef below.
    my $commonality = $cgi->param('commonality');
    my $cutoff = $cgi->param('cutoff');
    $commonality = "0.8" if ! defined $commonality;
    $cutoff = "1e-10" if ! defined $cutoff;
    # Validate the numeric parameters.
    if ($commonality !~ /^\s*\d(\.\d+)?\s*$/) {
        $self->SetMessage("Commonality value appears invalid, too big, negative, or not a number.");
    } elsif ($commonality <= 0 || $commonality > 1) {
        $self->SetMessage("Commonality cannot be 0 and cannot be greater than 1.");
    } elsif ($cutoff !~ /^\s*\d(\.\d+)?(e\-\d+)?\s*$/) {
        # The decimal point must be escaped here; a bare "." would accept any
        # character between the digits (e.g. "1x5").
        $self->SetMessage("Cutoff must be an exponential number (e.g. \"1e-20\" or \"2.5e-11\").");
    } elsif ($cutoff > 1) {
        $self->SetMessage("Cutoff cannot be greater than 1.");
    } else {
        # Get the result helper.
        my $rhelp = RHFeatures->new($self);
        # Set up the default columns.
        $self->DefaultColumns($rhelp);
        # Add the score at the end.
        $rhelp->AddExtraColumn(score => undef, title => 'Score', style => 'rightAlign', download => 'num');
        # Find out if we need to show matching genes.
        my $showMatch = $cgi->param('showMatch') || 0;
        # If we do, add a column for them at the front.
        if ($showMatch) {
            $rhelp->AddExtraColumn(matches => 0, title => 'Matches', style => 'leftAlign', download => 'list');
        }
        # Only proceed if the filtering parameters are valid.
        if ($rhelp->Valid()) {
            # Start the output session.
            $self->OpenSession($rhelp);
            # Now we need to gather and validate the genome sets.
            $self->PrintLine("Gathering the target genomes. ");
            my ($givenGenomeID) = $self->GetGenomes('given');
            my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');
            $self->PrintLine("Gathering the exclusion genomes. ");
            my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');
            $self->PrintLine("Validating the genome sets.<br />");
            # Insure the given genome is not in the exclusion set.
            if ($exclusionGenomes{$givenGenomeID}) {
                # This is a parameter error: we leave the return value undefined
                # and skip the search entirely.
                $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");
            } else {
                # Insure the given genome is in the target set.
                $targetGenomes{$givenGenomeID} = 1;
                # Find out if we want to use a statistical analysis. The check box
                # submits a value only when it is checked, so an absent parameter
                # means the user wants the plain commonality comparison.
                my $statistical = ($cgi->param('statistical') ? 1 : 0);
                # Denote we have not yet found any features.
                $retVal = 0;
                # Compute the list of genomes of interest.
                my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);
                # Get the peg matrix. It maps each feature of the given genome to
                # a hash of matching feature IDs.
                Trace("Requesting matrix.") if T(3);
                $saveTime = time();
                my %bbhMatrix;
                if (! $cgi->param('useSims')) {
                    # Here we are using BBHs, which are fast enough to do in one gulp.
                    $self->PrintLine("Requesting bidirectional best hits. ");
                    %bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);
                } else {
                    # Here we are using similarities, which are much more complicated.
                    $self->PrintLine("Requesting similarities.<br />");
                    # Create a filtering hash for the results. We only want to keep
                    # PEGs in the specified target and exclusion genomes.
                    my %keepGenomes = map { $_ => 1 } @allGenomes;
                    # Loop through the given genome's features.
                    my @features = $sprout->FeaturesOf($givenGenomeID);
                    for my $fid (@features) {
                        $self->PrintLine("Retrieving similarities for $fid. ");
                        # Get this feature's similarities.
                        my $simList = $sprout->Sims($fid, 1000, $cutoff, 'fig');
                        my $simCount = scalar @{$simList};
                        $self->PrintLine("Raw similarity count: $simCount. ");
                        # Create the matrix hash for this feature.
                        $bbhMatrix{$fid} = {};
                        # Now we need to filter out the similarities that don't land
                        # on a genome of interest.
                        $simCount = 0;
                        for my $sim (@{$simList}) {
                            # Insure this similarity lands on a genome of interest.
                            my $genomeID2 = $sprout->GenomeOf($sim->id2);
                            if ($keepGenomes{$genomeID2}) {
                                # Here we're keeping the similarity, so we put it in
                                # this feature's hash.
                                $bbhMatrix{$fid}->{$sim->id2} = $sim->psc;
                                $simCount++;
                            }
                        }
                        $self->PrintLine("Similarities retained: $simCount.<br />");
                    }
                }
                $bbhTimer += time() - $saveTime;
                $self->PrintLine("Time to build matrix: $bbhTimer seconds.<br />");
                Trace("Matrix built.") if T(3);
                # Create a feature query object to loop through the chosen features
                # of the given genome.
                Trace("Creating feature query.") if T(3);
                $saveTime = time();
                my $fquery = $rhelp->GetQuery($givenGenomeID);
                $queryTimer += time() - $saveTime;
                # Get the sizes of the two sets. This information is useful in
                # computing commonality.
                my $targetSetSize = scalar keys %targetGenomes;
                my $exclusionSetSize = scalar keys %exclusionGenomes;
                # Loop through the features.
                my $done = 0;
                while (! $done) {
                    # Get the next feature.
                    $saveTime = time();
                    my $record = $rhelp->Fetch($fquery);
                    $queryTimer += time() - $saveTime;
                    if (! $record) {
                        $done = 1;
                    } else {
                        # Get the feature's ID.
                        my $fid = $record->PrimaryValue('Feature(id)');
                        Trace("Checking feature $fid.") if T(4);
                        $self->PrintLine("Checking feature $fid.<br />");
                        # Get its matching genes: a hash whose keys are the matched
                        # gene IDs. Only the keys are used here. A feature with no
                        # entry in the matrix simply has no matches.
                        my $bbhList = $bbhMatrix{$fid} || {};
                        # We next wish to loop through the matched genes, counting
                        # how many genomes of each set they hit. If a genome occurs
                        # twice, we only want to count the first occurrence, so we
                        # keep a hash of genomes already seen. It maps each genome
                        # ID to 0, 1, or 2, depending on whether it is the reference
                        # genome, a target genome, or an exclusion genome.
                        my %alreadySeen = ();
                        # Save the matching genes in here.
                        my %genesMatching = ();
                        # Clear the exclusion count.
                        my $exclusionCount = 0;
                        # Denote that we're in our own genome, which is always part
                        # of the target set.
                        $alreadySeen{$givenGenomeID} = 0;
                        my $targetCount = 1;
                        # Loop through the BBHs/Sims.
                        for my $bbhPeg (keys %{$bbhList}) {
                            # Get the genome ID. We want to find out if this genome
                            # is new.
                            my $genomeID = $sprout->GenomeOf($bbhPeg);
                            if (! exists $alreadySeen{$genomeID}) {
                                # It's new, so we check to see which set it's in.
                                if ($targetGenomes{$genomeID}) {
                                    # It's in the target set, so we count it and
                                    # remember the gene as a match.
                                    $targetCount++;
                                    $alreadySeen{$genomeID} = 1;
                                    $genesMatching{$bbhPeg} = 1;
                                } elsif ($exclusionGenomes{$genomeID}) {
                                    # It's in the exclusion set.
                                    $exclusionCount++;
                                    $alreadySeen{$genomeID} = 2;
                                }
                                # A genome in neither set is ignored and is not
                                # recorded in the hash.
                            }
                        }
                        # Create a variable to indicate whether or not we want to
                        # keep this feature and another for the score.
                        my ($okFlag, $score);
                        # We need to see if we're using statistics or not. This only
                        # matters for a two-set situation.
                        if ($statistical && $exclusionSetSize > 0) {
                            # This is the magic formula for choosing the
                            # differentiating genes. It looks like it has something
                            # to do with variance computations, but I'm not sure.
                            my $targetNotCount = $targetSetSize - $targetCount;
                            my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
                            my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
                            my $exclusionSquare = $exclusionCount * $exclusionCount + $exclusionNotCount * $exclusionNotCount;
                            my $mixed = $targetCount * $exclusionCount + $targetNotCount * $exclusionNotCount;
                            my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
                            my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
                            # If the sum of the two differentials is greater than
                            # one, we keep this feature.
                            $score = $inD + $outD;
                            $okFlag = ($score > 1);
                            # Subtract 1 from the score so it looks like the
                            # commonality score.
                            $score -= 1.0;
                        } else {
                            # Check to see if we're common in set 1 and not in set 2.
                            my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
                            my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
                            if ($score1 && ! $score2) {
                                # We satisfy the criterion, so we put this feature to
                                # the output. The score is essentially $score1, since
                                # $score2 is zero.
                                $score = $score1;
                                $okFlag = 1;
                            }
                        }
                        if ($okFlag) {
                            # Put this feature to the output. We have one or two
                            # extra columns. First we store the score.
                            $rhelp->PutExtraColumns(score => sprintf("%0.3f",$score));
                            # Next we add the list of matching genes, but only if
                            # "showMatch" is specified.
                            if ($showMatch) {
                                # The matching genes are in the hash "genesMatching".
                                my @genes = sort { FIGRules::FIGCompare($a,$b) } keys %genesMatching;
                                # We need to linkify them.
                                my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
                                # Now add them as an extra column.
                                $rhelp->PutExtraColumns(matches => $genesHTML);
                            }
                            # Compute a sort key from the feature data and the score.
                            # The score is inverted so that higher scores yield
                            # smaller key values.
                            my $sort = $rhelp->SortKey($record, sprintf("%0.3f", 1 - $score));
                            # Output the feature.
                            $saveTime = time();
                            $rhelp->PutData($sort, $fid, $record);
                            $putTimer += time() - $saveTime;
                            # Increase the result count.
                            $retVal++;
                        }
                        # Check for a timer trace. We trace every 500 features.
                        $loopCounter++;
                        if (T(3) && $loopCounter % 500 == 0) {
                            Trace("Time spent for $loopCounter features: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.");
                        }
                    }
                }
            }
            # Close the session file. This happens whether or not the genome sets
            # were valid, because the session has already been opened.
            $saveTime = time();
            $self->CloseSession();
            $putTimer += time() - $saveTime;
        }
    }
    # Trace the timers.
    Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3);
    # Return the result count.
    return $retVal;
}
398 :    
399 :     =head3 Description
400 :    
401 :     C<< my $htmlText = $shelp->Description(); >>
402 :    
403 :     Return a description of this search. The description is used for the table of contents
404 :     on the main search tools page. It may contain HTML, but it should be character-level,
405 :     not block-level, since the description is going to appear in a list.
406 :    
407 :     =cut
408 :    
sub Description {
    # The helper object is accepted for interface compatibility only; the
    # description is a fixed piece of character-level text.
    my $self = shift;
    my $summary = "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";
    return $summary;
}
415 :    
416 : parrello 1.16 =head3 SearchTitle
417 : parrello 1.15
418 : parrello 1.16 C<< my $titleHtml = $shelp->SearchTitle(); >>
419 : parrello 1.15
420 : parrello 1.16 Return the display title for this search. The display title appears above the search results.
421 :     If no result is returned, no title will be displayed. The result should be an html string
422 :     that can be legally put inside a block tag such as C<h3> or C<p>.
423 : parrello 1.15
424 :     =cut
425 :    
sub SearchTitle {
    # Build the title shown above the search results from the query
    # parameters of this helper's CGI object.
    my $self = shift;
    my $query = $self->Q();
    # The matching method is BBHs unless similarities were requested.
    my $method = "Bidirectional Best Hits";
    if ($query->param('useSims')) {
        $method = "Similarities";
    }
    # With an exclusion set the search discriminates between two groups;
    # without one it simply finds genes in common.
    my $flavor = "Common";
    if ($query->param('exclusion')) {
        $flavor = "Discriminating";
    }
    # Assemble and return the title.
    return "$flavor Genes using $method";
}
437 :    
438 : parrello 1.4 =head2 Internal Utilities
439 :    
440 :     =head3 IsCommon
441 :    
442 : parrello 1.11 C<< my $score = SHSigGenes::IsCommon($count, $size, $commonality); >>
443 : parrello 1.4
444 : parrello 1.11 Return the match score if a specified count indicates a gene is common in a specified set
445 :     and 0 otherwise. Commonality is computed by dividing the count by the size of the set and
446 : parrello 1.4 comparing the result to the minimum commonality ratio. The one exception is
447 : parrello 1.11 if the set size is 0. In that case, this method always returns 0.
448 : parrello 1.4
449 :     =over 4
450 :    
451 :     =item count
452 :    
453 :     Number of elements of the set that have the relevant characteristic.
454 :    
455 :     =item size
456 :    
457 :     Total number of elements in the set.
458 :    
459 :     =item commonality
460 :    
461 :     Minimum count/size ratio for the characteristic to be considered common.
462 :    
463 :     =item RETURN
464 :    
465 :     Returns the count/size ratio if the characteristic is common, else 0.
466 :    
467 :     =back
468 :    
469 :     =cut
470 :    
sub IsCommon {
    # Arguments: the number of set members having the characteristic, the
    # total size of the set, and the minimum count/size ratio that counts
    # as common.
    my ($count, $size, $commonality) = @_;
    # A set with no members can never make anything common.
    return 0 unless $size > 0;
    # Otherwise the score is the fraction of the set that matched, provided
    # it reaches the threshold; below the threshold the score is zero.
    my $ratio = $count / $size;
    return ($ratio < $commonality ? 0 : $ratio);
}
488 :    
489 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3