[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Diff of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Fri Sep 29 15:10:05 2006 UTC revision 1.21, Thu Feb 5 07:24:09 2009 UTC
# Line 4  Line 4 
4    
5      use strict;      use strict;
6      use Tracer;      use Tracer;
7      use SearchHelper;      use CGI qw(-nosticky);
     use CGI;  
8      use HTML;      use HTML;
9      use Sprout;      use Sprout;
10        use Time::HiRes;
11      our @ISA = qw(SearchHelper);      use FIGRules;
12        use RHFeatures;
13        use base 'SearchHelper';
14    
15  =head1 Gene Discrimination Feature Search Helper  =head1 Gene Discrimination Feature Search Helper
16    
# Line 55  Line 56 
56    
57  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
58    
59    =item showMatch
60    
61    If TRUE, then all the genes in the target set that match the ones in the reference genome
62    will be shown in an extra column.
63    
64  =back  =back
65    
66  =head2 Virtual Methods  =head2 Virtual Methods
67    
68  =head3 Form  =head3 Form
69    
70  C<< my $html = $shelp->Include(); >>      my $html = $shelp->Form();
71    
72  Generate the HTML for a form to request a new search.  Generate the HTML for a form to request a new search.
73    
# Line 80  Line 86 
86      # there is the selector for the given genome, the commonality and cutoff values,      # there is the selector for the given genome, the commonality and cutoff values,
87      # and the submit button. Our first task, then, is to get the genome selection      # and the submit button. Our first task, then, is to get the genome selection
88      # menus.      # menus.
89      my $givenMenu   = $self->NmpdrGenomeMenu('given', 0, [$cgi->param('genome')]);      my $givenMenu   = $self->NmpdrGenomeMenu('given', 0, [$cgi->param('given')]);
90      my $targetMenu  = $self->NmpdrGenomeMenu('target', 'multiple', [$cgi->param('target')]);      my $targetMenu  = $self->NmpdrGenomeMenu('target', 'multiple', [$cgi->param('target')], 10, 'exclusion');
91      my $excludeMenu = $self->NmpdrGenomeMenu('exclusion', 'multiple', [$cgi->param('exclusion')]);      my $excludeMenu = $self->NmpdrGenomeMenu('exclusion', 'multiple', [$cgi->param('exclusion')], 10, 'target');
92      # Get the default values to use for the commonality and cutoff controls.      # Get the default values to use for the commonality and cutoff controls.
93      my $commonality = $cgi->param('commonality') || "0.8";      my $commonality = $cgi->param('commonality') || "0.8";
94      my $cutoff = $cgi->param('cutoff') || "1e-10";      my $cutoff = $cgi->param('cutoff') || "1e-10";
95      # Now we build the table rows. The top contains the two numeric parameters and      my $statistical = $cgi->param('statistical') || 1;
96      # the submit button.      my $showMatch = $cgi->param('showMatch') || 0;
97        my $useSims = $cgi->param('useSims') || 0;
98        my $pegsOnly = $cgi->param('pegsOnly') || 1;
99        # Now we build the table rows.
100      my @rows = ();      my @rows = ();
101      push @rows, $cgi->Tr($cgi->td("Commonality"),      # First we have the given genome.
102                           $cgi->td($cgi->textfield(-name => 'commonality',      push @rows, CGI::Tr(CGI::td({valign => "top"}, "Reference Genome"),
103                             CGI::td({colspan => 2}, $givenMenu));
104        # Now show the target and exclusion menus.
105        push @rows, CGI::Tr(CGI::td({valign => "top"}, "Inclusion Genomes (Set 1)"),
106                             CGI::td({colspan => 2}, $targetMenu));
107        push @rows, CGI::Tr(CGI::td({valign => "top"}, "Exclusion Genomes (Set 2)"),
108                             CGI::td({colspan => 2}, $excludeMenu));
109        # Next, the tuning parameters.
110        push @rows, CGI::Tr(CGI::td("Commonality"),
111                             CGI::td(CGI::textfield(-name => 'commonality',
112                                                    -value => $commonality,                                                    -value => $commonality,
113                                                    -size => 5)));                                                    -size => 5))),
114      push @rows, $cgi->Tr($cgi->td("Cutoff"),                  CGI::Tr(CGI::td(), CGI::td(join(" ",
115                           $cgi->td($cgi->textfield(-name => 'cutoff',                                    CGI::checkbox(-name => 'statistical',
116                                                     -checked => $statistical,
117                                                     -value => 1,
118                                                     -label => 'Use Statistical Algorithm') .
119                                      SearchHelper::Hint("SigGenes",
120                                                         "When two sets of genomes are specified, check this " .
121                                                         "box to use a statistical algorithm designed " .
122                                                         "specifically to choose differentiating genes. " .
123                                                         "This box has no effect when looking for genes " .
124                                                         "in common."),
125                                      CGI::checkbox(-name => 'useSims',
126                                                     -checked => $useSims,
127                                                     -value => 1,
128                                                     -label => 'Use Similarities') .
129                                      SearchHelper::Hint("SigGenes",
130                                                         "Normally, Bidirectional Best Hits are used to " .
131                                                         "find matching genes. Check this box to use " .
132                                                         "similarities instead.")))),
133                    CGI::Tr(CGI::td(), CGI::td(join(" ",
134                                      CGI::checkbox(-name => 'showMatch',
135                                                     -checked => $showMatch,
136                                                     -value => 1,
137                                                     -label => 'Show Matching Genes') .
138                                      SearchHelper::Hint("SigGenes",
139                                                         "Check this button to display the genes matching " .
140                                                         "each gene displayed in the results.")))),
141                    CGI::Tr(CGI::td("Cutoff"),
142                             CGI::td(CGI::textfield(-name => 'cutoff',
143                                                    -value => $cutoff,                                                    -value => $cutoff,
144                                                    -size => 5)));                                                    -size => 5)));
145        # Next, the feature filter rows.
146        push @rows, RHFeatures::WordSearchRow($self);
147        push @rows, RHFeatures::FeatureFilterFormRows($self);
148        # Finally, the submit button.
149      push @rows, $self->SubmitRow();      push @rows, $self->SubmitRow();
     # The next rows have the given genome and a feature filter.  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Given Genome"),  
                          $cgi->td({colspan => 2}, $givenMenu));  
     push @rows, $self->FeatureFilterRows();  
     # Now show the target and exclusion menus.  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Target Genomes (Set 1)"),  
                          $cgi->td({colspan => 2}, $targetMenu));  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),  
                          $cgi->td({colspan => 2}, $excludeMenu));  
150      # Create the table.      # Create the table.
151      $retVal .= $self->MakeTable(\@rows);      $retVal .= $self->MakeTable(\@rows);
152      # Close the form.      # Close the form.
# Line 117  Line 157 
157    
158  =head3 Find  =head3 Find
159    
160  C<< my $resultCount = $shelp->Find(); >>      my $resultCount = $shelp->Find();
161    
162  Conduct a search based on the current CGI query parameters. The search results will  Conduct a search based on the current CGI query parameters. The search results will
163  be written to the session cache file and the number of results will be  be written to the session cache file and the number of results will be
# Line 135  Line 175 
175      # Declare the return variable. If it remains undefined, the caller will      # Declare the return variable. If it remains undefined, the caller will
176      # assume there was an error.      # assume there was an error.
177      my $retVal;      my $retVal;
178        # Create the timers.
179        my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);
180      # Validate the numeric parameters.      # Validate the numeric parameters.
181      my $commonality = $cgi->param('commonality');      my $commonality = $cgi->param('commonality');
182      my $cutoff = $cgi->param('cutoff');      my $cutoff = $cgi->param('cutoff');
# Line 147  Line 189 
189      } elsif ($cutoff > 1) {      } elsif ($cutoff > 1) {
190          $self->SetMessage("Cutoff cannot be greater than 1.");          $self->SetMessage("Cutoff cannot be greater than 1.");
191      } else {      } else {
192            # Get the result helper.
193            my $rhelp = RHFeatures->new($self);
194            # Set up the default columns.
195            $self->DefaultColumns($rhelp);
196            # Add the score at the end.
197            $rhelp->AddExtraColumn(score => undef, title => 'Score', style => 'rightAlign', download => 'num');
198            # Find out if we need to show matching genes.
199            my $showMatch = $cgi->param('showMatch') || 0;
200            # If we do, add a column for them at the front.
201            if ($showMatch) {
202                $rhelp->AddExtraColumn(matches => 0, title => 'Matches', style => 'leftAlign', download => 'list');
203            }
204            # Only proceed if the filtering parameters are valid.
205            if ($rhelp->Valid()) {
206          # Now we need to gather and validate the genome sets.          # Now we need to gather and validate the genome sets.
207                $self->PrintLine("Gathering the target genomes.  ");
208          my ($givenGenomeID) = $self->GetGenomes('given');          my ($givenGenomeID) = $self->GetGenomes('given');
209                Trace("Given genome is $givenGenomeID.") if T(3);
210          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');
211                Trace("Target genomes are " . join(", ", sort keys %targetGenomes) . ".") if T(3);
212                $self->PrintLine("Gathering the exclusion genomes.  ");
213          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');
214                Trace("Exclusion genomes are " . join(", ", sort keys %exclusionGenomes) . ".") if T(3);
215                $self->PrintLine("Validating the genome sets.<br />");
216          # Insure the given genome is not in the exclusion set.          # Insure the given genome is not in the exclusion set.
217          if ($exclusionGenomes{$givenGenomeID}) {          if ($exclusionGenomes{$givenGenomeID}) {
218              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");
219          } else {          } else {
220                    # Start the output session.
221                    $self->OpenSession($rhelp);
222              # Insure the given genome is in the target set.              # Insure the given genome is in the target set.
223              $targetGenomes{$givenGenomeID} = 1                  $targetGenomes{$givenGenomeID} = 1;
224          }                  Trace("$givenGenomeID added to target set.") if T(3);
225                    # Find out if we want to use a statistical analysis.
226                    my $statistical = $cgi->param('statistical') || 1;
227          # Denote we have not yet found any genomes.          # Denote we have not yet found any genomes.
228          $retVal = 0;          $retVal = 0;
229          #TODO: find stuff                  # Compute the list of genomes of interest.
230                    my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);
231                    # Get the peg matrix.
232                    Trace("Requesting matrix.") if T(3);
233                    $saveTime = time();
234                    my $bbhMatrix;
235                    if (! $cgi->param('useSims')) {
236                        # Here we are using BBHs, which are fast enough to do in one gulp.
237                        $self->PrintLine("Requesting bidirectional best hits.  ");
238                        $bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);
239                    } else {
240                        # Here we are using similarities, which are much more complicated.
241                        $self->PrintLine("Requesting similarities.<br />");
242                        # Create a filtering matrix for the results. We only want to keep PEGs in the
243                        # specified target and exclusion genomes.
244                        my %keepGenomes = map { $_ => 1 } @allGenomes;
245                        # Loop through the given genome's features.
246                        my @features = $sprout->FeaturesOf($givenGenomeID);
247                        for my $fid (@features) {
248                            $self->PrintLine("Retrieving similarities for $fid.  ");
249                            # Get this feature's similarities.
250                            my $simList = $sprout->Sims($fid, 1000, $cutoff, 'fig');
251                            my $simCount = scalar @{$simList};
252                            $self->PrintLine("Raw similarity count: $simCount.  ");
253                            # Create the matrix hash for this feature.
254                            $bbhMatrix->{$fid} = {};
255                            # Now we need to filter out the similarities that don't land on the target genome.
256                            $simCount = 0;
257                            for my $sim (@{$simList}) {
258                                # Insure this similarity lands on a target genome.
259                                my $genomeID2 = $sprout->GenomeOf($sim->id2);
260                                if ($keepGenomes{$genomeID2}) {
261                                    # Here we're keeping the similarity, so we put it in this feature's hash.
262                                    $bbhMatrix->{$fid}->{$sim->id2} = $sim->psc;
263                                    $simCount++;
264                                }
265                            }
266                            $self->PrintLine("Similarities retained: $simCount.<br />");
267                        }
268                    }
269                    $bbhTimer += time() - $saveTime;
270                    $self->PrintLine("Time to build matrix: $bbhTimer seconds.<br />");
271                    Trace("Matrix built.") if T(3);
272                    # Create a feature query object to loop through the chosen features of the given
273                    # genome.
274                    Trace("Creating feature query.") if T(3);
275                    $saveTime = time();
276                    my $fquery = $rhelp->GetQuery($givenGenomeID);
277                    $queryTimer += time() - $saveTime;
278                    # Get the sizes of the two sets. This information is useful in computing commonality.
279                    my $targetSetSize = scalar keys %targetGenomes;
280                    my $exclusionSetSize = scalar keys %exclusionGenomes;
281                    # Loop through the features.
282                    my $done = 0;
283                    while (! $done) {
284                        # Get the next feature.
285                        $saveTime = time();
286                        my $record = $rhelp->Fetch($fquery);
287                        $queryTimer += time() - $saveTime;
288                        if (! $record) {
289                            $done = 1;
290                        } else {
291                            # Get the feature's ID.
292                            my $fid = $record->PrimaryValue('Feature(id)');
293                            Trace("Checking feature $fid.") if T(4);
294                            $self->PrintLine("Checking feature $fid.<br />");
295                            # Get its list of matching genes. The list is actually a hash mapping each matched gene to its
296                            # score. All we care about, however, are the matches themselves.
297                            my $bbhList = $bbhMatrix->{$fid};
298                            # We next wish to loop through the BBH IDs, counting how many are in each of the
299                            # sets. If a genome occurs twice, we only want to count the first occurrence, so
300                            # we have a hash of genomes we've already seen. The hash will map each gene ID
301                            # to 0, 1, or 2, depending on whether it was found in the reference genome,
302                            # a target genome, or an exclusion genome.
303                            my %alreadySeen = ();
304                            # Save the matching genes in here.
305                            my %genesMatching = ();
306                            # Clear the exclusion count.
307                            my $exclusionCount = 0;
308                            # Denote that we're in our own genome.
309                            $alreadySeen{$givenGenomeID} = 0;
310                            my $targetCount = 1;
311                            # Loop through the BBHs/Sims.
312                            for my $bbhPeg (keys %{$bbhList}) {
313                                # Get the genome ID. We want to find out if this genome is new.
314                                my $genomeID = $sprout->GenomeOf($bbhPeg);
315                                if (! exists $alreadySeen{$genomeID}) {
316                                    # It's new, so we check to see which set it's in.
317                                    if ($targetGenomes{$genomeID}) {
318                                        # It's in the target set.
319                                        $targetCount++;
320                                        $alreadySeen{$genomeID} = 1;
321                                    } elsif ($exclusionGenomes{$genomeID}) {
322                                        # It's in the exclusion set.
323                                        $exclusionCount++;
324                                        $alreadySeen{$genomeID} = 2;
325                                    }
326                                    # Note that $alreadySeen{$genomeID} exists in the hash by this
327                                    # point. If it's 1, we need to save the current PEG.
328                                    if ($alreadySeen{$genomeID} == 1) {
329                                        $genesMatching{$bbhPeg} = 1;
330                                    }
331                                }
332      }      }
333                            # Create a variable to indicate whether or not we want to keep this feature and
334                            # another for the score.
335                            my ($okFlag, $score);
336                            # We need to see if we're using statistics or not. This only matters
337                            # for a two-set situation.
338                            if ($statistical && $exclusionSetSize > 0) {
339                                # This is the magic formula for choosing the differentiating genes. It looks like
340                                # it has something to do with variance computations, but I'm not sure.
341                                my $targetNotCount = $targetSetSize - $targetCount;
342                                my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
343                                my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
344                                my $exclusionSquare = $exclusionCount * $exclusionCount + $exclusionNotCount * $exclusionNotCount;
345                                my $mixed = $targetCount * $exclusionCount + $targetNotCount * $exclusionNotCount;
346                                my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
347                                my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
348                                # If the two differentials are greater than one, we keep this feature.
349                                $score = $inD + $outD;
350                                $okFlag = ($score > 1);
351                                # Subtract 1 from the score so it looks like the commonality score.
352                                $score -= 1.0;
353                            } else {
354                                # Check to see if we're common in set 1 and not in set 2.
355                                my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
356                                my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
357                                if ($score1 && ! $score2) {
358                                    # We satisfy the criterion, so we put this feature to the output. The
359                                    # score is essentially $score1, since $score2 is zero.
360                                    $score = $score1;
361                                    $okFlag = 1;
362                                }
363                            }
364                            if ($okFlag) {
365                                # Put this feature to the output. We have one or two extra columns.
366                                # First we store the score.
367                                $rhelp->PutExtraColumns(score => sprintf("%0.3f",$score));
368                                # Next we add the list of matching genes, but only if "showMatch" is specified.
369                                if ($showMatch) {
370                                    # The matching genes are in the hash "genesMatching".
371                                    my @genes = sort { FIGRules::FIGCompare($a,$b) } keys %genesMatching;
372                                    # We need to linkify them.
373                                    my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
374                                    # Now add them as an extra column.
375                                    $rhelp->PutExtraColumns(matches => $genesHTML);
376                                }
377                                # Compute a sort key from the feature data and the score.
378                                my $sort = $rhelp->SortKey($record, sprintf("%0.3f", 1 - $score));
379                                # Output the feature.
380                                $saveTime = time();
381                                $rhelp->PutData($sort, $fid, $record);
382                                $putTimer += time() - $saveTime;
383                                # Increase the result count.
384                                $retVal++;
385                            }
386                            # Check for a timer trace. We trace every 500 features.
387                            $loopCounter++;
388                            if (T(3) && $loopCounter % 500 == 0) {
389                                Trace("Time spent for $loopCounter features: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.");
390                            }
391                        }
392                    }
393                    # Close the session file.
394                    $saveTime = time();
395                    $self->CloseSession();
396                    $putTimer += time() - $saveTime;
397                }
398            }
399        }
400        # Trace the timers.
401        Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3);
402      # Return the result count.      # Return the result count.
403      return $retVal;      return $retVal;
404  }  }
405    
406  =head3 Description  =head3 Description
407    
408  C<< my $htmlText = $shelp->Description(); >>      my $htmlText = $shelp->Description();
409    
410  Return a description of this search. The description is used for the table of contents  Return a description of this search. The description is used for the table of contents
411  on the main search tools page. It may contain HTML, but it should be character-level,  on the main search tools page. It may contain HTML, but it should be character-level,
# Line 180  Line 417 
417      # Get the parameters.      # Get the parameters.
418      my ($self) = @_;      my ($self) = @_;
419      # Return the result.      # Return the result.
420      return "Search for features that are common to a group of organisms or that discriminate between two groups of organisms.";      return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";
421    }
422    
423    =head3 SearchTitle
424    
425        my $titleHtml = $shelp->SearchTitle();
426    
427    Return the display title for this search. The display title appears above the search results.
428    If no result is returned, no title will be displayed. The result should be an html string
429    that can be legally put inside a block tag such as C<h3> or C<p>.
430    
431    =cut
432    
433    sub SearchTitle {
434        # Get the parameters.
435        my ($self) = @_;
436        # Compute the title. We extract the relevant clues from the query parameters.
437        my $cgi = $self->Q();
438        my $type = ($cgi->param('useSims') ? "Similarities" : "Bidirectional Best Hits");
439        my $style = ($cgi->param('exclusion') ? "Discriminating" : "Common");
440        my $retVal = "$style Genes using $type";
441        # Return it.
442        return $retVal;
443    }
444    
445    =head2 Internal Utilities
446    
447    =head3 IsCommon
448    
449        my $score = SHSigGenes::IsCommon($count, $size, $commonality);
450    
451    Return the match score if a specified count indicates a gene is common in a specified set
452    and 0 otherwise. Commonality is computed by dividing the count by the size of the set and
453    comparing the result to the minimum commonality ratio. The one exception is
454    if the set size is 0. In that case, this method always returns 0.
455    
456    =over 4
457    
458    =item count
459    
460    Number of elements of the set that have the relevant characteristic.
461    
462    =item size
463    
464    Total number of elements in the set.
465    
466    =item commonality
467    
468    Minimum count/size ratio for the characteristic to be considered common.
469    
470    =item RETURN
471    
472    Returns TRUE if the characteristic is common, else FALSE.
473    
474    =back
475    
476    =cut
477    
478    sub IsCommon {
479        # Get the parameters.
480        my ($count, $size, $commonality) = @_;
481        # Declare the return variable.
482        my $retVal = 0;
483        # Only procced if the size is positive.
484        if ($size > 0) {
485            # Compute the commonality.
486            $retVal = $count/$size;
487            # If it's too small, clear it.
488            if ($retVal < $commonality) {
489                $retVal = 0;
490            }
491        }
492        # Return the result.
493        return $retVal;
494  }  }
495    
496  1;  1;

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.21

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3