[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Diff of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.10, Wed Dec 6 03:36:04 2006 UTC revision 1.18, Thu Dec 6 14:58:03 2007 UTC
# Line 4  Line 4 
4    
5      use strict;      use strict;
6      use Tracer;      use Tracer;
     use SearchHelper;  
7      use CGI;      use CGI;
8      use HTML;      use HTML;
9      use Sprout;      use Sprout;
10      use Time::HiRes;      use Time::HiRes;
11        use FIGRules;
12      our @ISA = qw(SearchHelper);      use RHFeatures;
13        use base 'SearchHelper';
14    
15  =head1 Gene Discrimination Feature Search Helper  =head1 Gene Discrimination Feature Search Helper
16    
# Line 56  Line 56 
56    
57  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
58    
59    =item showMatch
60    
61    If TRUE, then all the genes in the target set that match the ones in the reference genome
62    will be shown in an extra column.
63    
64  =back  =back
65    
66  =head2 Virtual Methods  =head2 Virtual Methods
67    
68  =head3 Form  =head3 Form
69    
70  C<< my $html = $shelp->Form(); >>      my $html = $shelp->Form();
71    
72  Generate the HTML for a form to request a new search.  Generate the HTML for a form to request a new search.
73    
# Line 88  Line 93 
93      my $commonality = $cgi->param('commonality') || "0.8";      my $commonality = $cgi->param('commonality') || "0.8";
94      my $cutoff = $cgi->param('cutoff') || "1e-10";      my $cutoff = $cgi->param('cutoff') || "1e-10";
95      my $statistical = $cgi->param('statistical') || 1;      my $statistical = $cgi->param('statistical') || 1;
96        my $showMatch = $cgi->param('showMatch') || 0;
97        my $useSims = $cgi->param('useSims') || 0;
98        my $pegsOnly = $cgi->param('pegsOnly') || 1;
99      # Now we build the table rows.      # Now we build the table rows.
100      my @rows = ();      my @rows = ();
101      # First we have the given genome.      # First we have the given genome.
# Line 98  Line 106 
106                           $cgi->td({colspan => 2}, $targetMenu));                           $cgi->td({colspan => 2}, $targetMenu));
107      push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),      push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),
108                           $cgi->td({colspan => 2}, $excludeMenu));                           $cgi->td({colspan => 2}, $excludeMenu));
109      # Next, the numeric parameters.      # Next, the tuning parameters.
110      push @rows, $cgi->Tr($cgi->td("Commonality"),      push @rows, $cgi->Tr($cgi->td("Commonality"),
111                           $cgi->td($cgi->textfield(-name => 'commonality',                           $cgi->td($cgi->textfield(-name => 'commonality',
112                                                    -value => $commonality,                                                    -value => $commonality,
113                                                    -size => 5))),                                                    -size => 5))),
114                  $cgi->Tr($cgi->td(), $cgi->td(                  $cgi->Tr($cgi->td(), $cgi->td(join(" ",
115                                    $cgi->checkbox(-name => 'statistical',                                    $cgi->checkbox(-name => 'statistical',
116                                                   -checked => $statistical,                                                   -checked => $statistical,
117                                                   -value => 1,                                                   -value => 1,
118                                                   -label => 'Use Statistical Algorithm')));                                                   -label => 'Use Statistical Algorithm') .
119      push @rows, $cgi->Tr($cgi->td("Cutoff"),                                    SearchHelper::Hint("SigGenes",
120                                                         "When two sets of genomees are specified, check this " .
121                                                         "box to use a statistical algorithm designed " .
122                                                         "specifically to choose differentiating genes. " .
123                                                         "This box has no effect when looking for genes " .
124                                                         "in common."),
125                                      $cgi->checkbox(-name => 'useSims',
126                                                     -checked => $useSims,
127                                                     -value => 1,
128                                                     -label => 'Use Similarities') .
129                                      SearchHelper::Hint("SigGenes",
130                                                         "Normally, Bidirectional Best Hits are used to " .
131                                                         "find matching genes. Check this box to use " .
132                                                         "similarities instead.")))),
133                    $cgi->Tr($cgi->td(), $cgi->td(join(" ",
134                                      $cgi->checkbox(-name => 'showMatch',
135                                                     -checked => $showMatch,
136                                                     -value => 1,
137                                                     -label => 'Show Matching Genes') .
138                                      SearchHelper::Hint("SigGenes",
139                                                         "Check this button to display the genes matching " .
140                                                         "each gene displayed in the results.")))),
141                    $cgi->Tr($cgi->td("Cutoff"),
142                           $cgi->td($cgi->textfield(-name => 'cutoff',                           $cgi->td($cgi->textfield(-name => 'cutoff',
143                                                    -value => $cutoff,                                                    -value => $cutoff,
144                                                    -size => 5)));                                                    -size => 5)));
145      # Next, the feature filter rows.      # Next, the feature filter rows.
146      push @rows, $self->FeatureFilterRows();      push @rows, RHFeatures::WordSearchRow($self);
147        push @rows, RHFeatures::FeatureFilterFormRows($self);
148      # Finally, the submit button.      # Finally, the submit button.
149      push @rows, $self->SubmitRow();      push @rows, $self->SubmitRow();
150      # Create the table.      # Create the table.
# Line 126  Line 157 
157    
158  =head3 Find  =head3 Find
159    
160  C<< my $resultCount = $shelp->Find(); >>      my $resultCount = $shelp->Find();
161    
162  Conduct a search based on the current CGI query parameters. The search results will  Conduct a search based on the current CGI query parameters. The search results will
163  be written to the session cache file and the number of results will be  be written to the session cache file and the number of results will be
# Line 158  Line 189 
189      } elsif ($cutoff > 1) {      } elsif ($cutoff > 1) {
190          $self->SetMessage("Cutoff cannot be greater than 1.");          $self->SetMessage("Cutoff cannot be greater than 1.");
191      } else {      } else {
192            # Get the result helper.
193            my $rhelp = RHFeatures->new($self);
194            # Set up the default columns.
195            $self->DefaultColumns($rhelp);
196            # Add the score at the end.
197            $rhelp->AddExtraColumn(score => undef, title => 'Score', style => 'rightAlign', download => 'num');
198            # Find out if we need to show matching genes.
199            my $showMatch = $cgi->param('showMatch') || 0;
200            # If we do, add a column for them at the front.
201            if ($showMatch) {
202                $rhelp->AddExtraColumn(matches => 0, title => 'Matches', style => 'leftAlign', download => 'list');
203            }
204            # Only proceed if the filtering parameters are valid.
205            if ($rhelp->Valid()) {
206                # Start the output session.
207                $self->OpenSession($rhelp);
208          # Now we need to gather and validate the genome sets.          # Now we need to gather and validate the genome sets.
209                $self->PrintLine("Gathering the target genomes.  ");
210          my ($givenGenomeID) = $self->GetGenomes('given');          my ($givenGenomeID) = $self->GetGenomes('given');
211          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');
212                $self->PrintLine("Gathering the exclusion genomes.  ");
213          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');
214                $self->PrintLine("Validating the genome sets.<br />");
215          # Insure the given genome is not in the exclusion set.          # Insure the given genome is not in the exclusion set.
216          if ($exclusionGenomes{$givenGenomeID}) {          if ($exclusionGenomes{$givenGenomeID}) {
217              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");
218          } else {          } else {
219              # Insure the given genome is in the target set.              # Insure the given genome is in the target set.
220              $targetGenomes{$givenGenomeID} = 1                  $targetGenomes{$givenGenomeID} = 1;
221          }          }
222          # Find out if we want to use a statistical analysis.          # Find out if we want to use a statistical analysis.
223          my $statistical = $cgi->param('statistical') || 0;              my $statistical = $cgi->param('statistical') || 1;
224          # Denote we have not yet found any genomes.          # Denote we have not yet found any genomes.
225          $retVal = 0;          $retVal = 0;
226          # Compute the list of genomes of interest.          # Compute the list of genomes of interest.
227          my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);          my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);
228          # Get the BBH matrix.              # Get the peg matrix.
229                Trace("Requesting matrix.") if T(3);
230          $saveTime = time();          $saveTime = time();
231          my %bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);              my %bbhMatrix;
232                if (! $cgi->param('useSims')) {
233                    # Here we are using BBHs, which are fast enough to do in one gulp.
234                    $self->PrintLine("Requesting bidirectional best hits.  ");
235                    %bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);
236                } else {
237                    # Here we are using similarities, which are much more complicated.
238                    $self->PrintLine("Requesting similarities.<br />");
239                    # Create a filtering matrix for the results. We only want to keep PEGs in the
240                    # specified target and exclusion genomes.
241                    my %keepGenomes = map { $_ => 1 } @allGenomes;
242                    # Loop through the given genome's features.
243                    my @features = $sprout->FeaturesOf($givenGenomeID);
244                    for my $fid (@features) {
245                        $self->PrintLine("Retrieving similarities for $fid.  ");
246                        # Get this feature's similarities.
247                        my $simList = $sprout->Sims($fid, 1000, $cutoff, 'fig');
248                        my $simCount = scalar @{$simList};
249                        $self->PrintLine("Raw similarity count: $simCount.  ");
250                        # Create the matrix hash for this feature.
251                        $bbhMatrix{$fid} = {};
252                        # Now we need to filter out the similarities that don't land on the target genome.
253                        $simCount = 0;
254                        for my $sim (@{$simList}) {
255                            # Insure this similarity lands on a target genome.
256                            my $genomeID2 = $sprout->GenomeOf($sim->id2);
257                            if ($keepGenomes{$genomeID2}) {
258                                # Here we're keeping the similarity, so we put it in this feature's hash.
259                                $bbhMatrix{$fid}->{$sim->id2} = $sim->psc;
260                                $simCount++;
261                            }
262                        }
263                        $self->PrintLine("Similarities retained: $simCount.<br />");
264                    }
265                }
266          $bbhTimer += time() - $saveTime;          $bbhTimer += time() - $saveTime;
267                $self->PrintLine("Time to build matrix: $bbhTimer seconds.<br />");
268                Trace("Matrix built.") if T(3);
269          # Create a feature query object to loop through the chosen features of the given          # Create a feature query object to loop through the chosen features of the given
270          # genome.          # genome.
271          Trace("Creating feature query.") if T(3);          Trace("Creating feature query.") if T(3);
272          $saveTime = time();          $saveTime = time();
273          my $fquery = FeatureQuery->new($self, $givenGenomeID);              my $fquery = $rhelp->GetQuery($givenGenomeID);
274          $queryTimer += time() - $saveTime;          $queryTimer += time() - $saveTime;
275          # Get the sizes of the two sets. This information is useful in computing commonality.          # Get the sizes of the two sets. This information is useful in computing commonality.
276          my $targetSetSize = scalar keys %targetGenomes;          my $targetSetSize = scalar keys %targetGenomes;
# Line 193  Line 280 
280          while (! $done) {          while (! $done) {
281              # Get the next feature.              # Get the next feature.
282              $saveTime = time();              $saveTime = time();
283              my $record = $fquery->Fetch();                  my $record = $rhelp->Fetch($fquery);
284              $queryTimer += time() - $saveTime;              $queryTimer += time() - $saveTime;
285              if (! $record) {              if (! $record) {
286                  $done = 1;                  $done = 1;
287              } else {              } else {
288                  # Get the feature's ID.                  # Get the feature's ID.
289                  my $fid = $fquery->FID();                      my $fid = $record->PrimaryValue('Feature(id)');
290                  Trace("Processing feature $fid.") if T(4);                      Trace("Checking feature $fid.") if T(4);
291                  # Get its list of BBHs. The list is actually a hash mapping each BBH to its                      $self->PrintLine("Checking feature $fid.<br />");
292                  # score. All we care about, however, are the BBHs themselves.                      # Get its list of matching genes. The list is actually a hash mapping each matched gene to its
293                        # score. All we care about, however, are the matches themselves.
294                  my $bbhList = $bbhMatrix{$fid};                  my $bbhList = $bbhMatrix{$fid};
295                  # We next wish to loop through the BBH IDs, counting how many are in each of the                  # We next wish to loop through the BBH IDs, counting how many are in each of the
296                  # sets. If a genome occurs twice, we only want to count the first occurrence, so                  # sets. If a genome occurs twice, we only want to count the first occurrence, so
297                  # we have a hash of genomes we've already seen.                      # we have a hash of genomes we've already seen. The hash will map each gene ID
298                        # to 0, 1, or 2, depending on whether it was found in the reference genome,
299                        # a target genome, or an exclusion genome.
300                  my %alreadySeen = ();                  my %alreadySeen = ();
301                        # Save the matching genes in here.
302                        my %genesMatching = ();
303                  # Clear the exclusion count.                  # Clear the exclusion count.
304                  my $exclusionCount = 0;                  my $exclusionCount = 0;
305                  # Denote that we're in our own genome.                  # Denote that we're in our own genome.
306                  $alreadySeen{$givenGenomeID} = 1;                      $alreadySeen{$givenGenomeID} = 0;
307                  my $targetCount = 1;                  my $targetCount = 1;
308                  # Loop through the BBHs.                      # Loop through the BBHs/Sims.
309                  for my $bbhPeg (keys %{$bbhList}) {                  for my $bbhPeg (keys %{$bbhList}) {
310                      # Get the genome ID. We want to find out if this genome is new.                      # Get the genome ID. We want to find out if this genome is new.
311                      my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);                          my $genomeID = $sprout->GenomeOf($bbhPeg);
312                      if (! $alreadySeen{$genomeID}) {                          if (! exists $alreadySeen{$genomeID}) {
313                          # It's new, so we check to see which set it's in.                          # It's new, so we check to see which set it's in.
314                          if ($targetGenomes{$genomeID}) {                          if ($targetGenomes{$genomeID}) {
315                                    # It's in the target set.
316                              $targetCount++;                              $targetCount++;
317                                    $alreadySeen{$genomeID} = 1;
318                          } elsif ($exclusionGenomes{$genomeID}) {                          } elsif ($exclusionGenomes{$genomeID}) {
319                                    # It's in the exclusion set.
320                              $exclusionCount++;                              $exclusionCount++;
321                                    $alreadySeen{$genomeID} = 2;
322                                }
323                                # Note that $alreadySeen{$genomeID} exists in the hash by this
324                                # point. If it's 1, we need to save the current PEG.
325                                if ($alreadySeen{$genomeID} == 1) {
326                                    $genesMatching{$bbhPeg} = 1;
327                          }                          }
                         # Make sure we don't look at it again.  
                         $alreadySeen{$genomeID} = 1;  
328                      }                      }
329                  }                  }
330                  # Create a variable to indicate whether or not we want to keep this feature.                      # Create a variable to indicate whether or not we want to keep this feature and
331                  my $okFlag;                      # another for the score.
332                        my ($okFlag, $score);
333                  # We need to see if we're using statistics or not. This only matters                  # We need to see if we're using statistics or not. This only matters
334                  # for a two-set situation.                  # for a two-set situation.
335                  if ($statistical && $exclusionSetSize > 0) {                  if ($statistical && $exclusionSetSize > 0) {
336                      # This looks like it has something to do with variance computations,                          # This is the magic formula for choosing the differentiating genes. It looks like
337                      # but I'm not sure.                          # it has something to do with variance computations, but I'm not sure.
338                      my $targetNotCount = $targetSetSize - $targetCount;                      my $targetNotCount = $targetSetSize - $targetCount;
339                      my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;                      my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
340                      my $exclusionNotCount = $exclusionSetSize - $exclusionCount;                      my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
# Line 243  Line 343 
343                      my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));                      my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
344                      my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));                      my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
345                      # If the two differentials are greater than one, we keep this feature.                      # If the two differentials are greater than one, we keep this feature.
346                      $okFlag = ($inD + $outD > 1);                          $score = $inD + $outD;
347                            $okFlag = ($score > 1);
348                            # Subtract 1 from the score so it looks like the commonality score.
349                            $score -= 1.0;
350                  } else {                  } else {
351                      # Check to see if we're common in set 1 and not in set 2.                      # Check to see if we're common in set 1 and not in set 2.
352                      if (IsCommon($targetCount, $targetSetSize, $commonality) &&                          my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
353                          ! IsCommon($exclusionCount, $exclusionSetSize, $commonality)) {                          my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
354                          # We satisfy the criterion, so we put this feature to the output.                          if ($score1 && ! $score2) {
355                                # We satisfy the criterion, so we put this feature to the output. The
356                                # score is essentially $score1, since $score2 is zero.
357                                $score = $score1;
358                          $okFlag = 1;                          $okFlag = 1;
359                      }                      }
360                  }                  }
361                  if ($okFlag) {                  if ($okFlag) {
362                      # Put this feature to the output.                          # Put this feature to the output. We have one or two extra columns.
363                            # First we store the score.
364                            $rhelp->PutExtraColumns(score => sprintf("%0.3f",$score));
365                            # Next we add the list of matching genes, but only if "showMatch" is specified.
366                            if ($showMatch) {
367                                # The matching genes are in the hash "genesMatching".
368                                my @genes = sort { FIGRules::FIGCompare($a,$b) } keys %genesMatching;
369                                # We need to linkify them.
370                                my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
371                                # Now add them as an extra column.
372                                $rhelp->PutExtraColumns(matches => $genesHTML);
373                            }
374                            # Compute a sort key from the feature data and the score.
375                            my $sort = $rhelp->SortKey($record, sprintf("%0.3f", 1 - $score));
376                            # Output the feature.
377                      $saveTime = time();                      $saveTime = time();
378                      $self->PutFeature($fquery);                          $rhelp->PutData($sort, $fid, $record);
379                      $putTimer += time() - $saveTime;                      $putTimer += time() - $saveTime;
380                      # Increase the result count.                      # Increase the result count.
381                      $retVal++;                      $retVal++;
# Line 272  Line 392 
392          $self->CloseSession();          $self->CloseSession();
393          $putTimer += time() - $saveTime;          $putTimer += time() - $saveTime;
394      }      }
395        }
396      # Trace the timers.      # Trace the timers.
397      Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3);      Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3);
398      # Return the result count.      # Return the result count.
# Line 280  Line 401 
401    
402  =head3 Description  =head3 Description
403    
404  C<< my $htmlText = $shelp->Description(); >>      my $htmlText = $shelp->Description();
405    
406  Return a description of this search. The description is used for the table of contents  Return a description of this search. The description is used for the table of contents
407  on the main search tools page. It may contain HTML, but it should be character-level,  on the main search tools page. It may contain HTML, but it should be character-level,
# Line 295  Line 416 
416      return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";      return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";
417  }  }
418    
419    =head3 SearchTitle
420    
421        my $titleHtml = $shelp->SearchTitle();
422    
423    Return the display title for this search. The display title appears above the search results.
424    If no result is returned, no title will be displayed. The result should be an html string
425    that can be legally put inside a block tag such as C<h3> or C<p>.
426    
427    =cut
428    
429    sub SearchTitle {
430        # Get the parameters.
431        my ($self) = @_;
432        # Compute the title. We extract the relevant clues from the query parameters.
433        my $cgi = $self->Q();
434        my $type = ($cgi->param('useSims') ? "Similarities" : "Bidirectional Best Hits");
435        my $style = ($cgi->param('exclusion') ? "Discriminating" : "Common");
436        my $retVal = "$style Genes using $type";
437        # Return it.
438        return $retVal;
439    }
440    
441  =head2 Internal Utilities  =head2 Internal Utilities
442    
443  =head3 IsCommon  =head3 IsCommon
444    
445  C<< my $flag = SHSigGenes::IsCommon($count, $size, $commonality); >>      my $score = SHSigGenes::IsCommon($count, $size, $commonality);
446    
447  Return TRUE if a specified count indicates a gene is common in a specified set.  Return the match score if a specified count indicates a gene is common in a specified set
448  Commonality is computed by dividing the count by the size of the set and  and 0 otherwise. Commonality is computed by dividing the count by the size of the set and
449  comparing the result to the minimum commonality ratio. The one exception is  comparing the result to the minimum commonality ratio. The one exception is
450  if the set size is 0. In that case, this method always returns FALSE.  if the set size is 0. In that case, this method always returns 0.
451    
452  =over 4  =over 4
453    
# Line 335  Line 478 
478      my $retVal = 0;      my $retVal = 0;
479      # Only procced if the size is positive.      # Only procced if the size is positive.
480      if ($size > 0) {      if ($size > 0) {
481          $retVal = ($count/$size >= $commonality);          # Compute the commonality.
482            $retVal = $count/$size;
483            # If it's too small, clear it.
484            if ($retVal < $commonality) {
485                $retVal = 0;
486            }
487      }      }
488      # Return the result.      # Return the result.
489      return $retVal;      return $retVal;

Legend:
Removed from v.1.10  
changed lines
  Added in v.1.18

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3