[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Diff of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.8, Thu Nov 16 22:58:28 2006 UTC revision 1.22, Mon Mar 16 00:24:23 2009 UTC
# Line 4  Line 4 
4    
5      use strict;      use strict;
6      use Tracer;      use Tracer;
7      use SearchHelper;      use CGI qw(-nosticky);
     use CGI;  
8      use HTML;      use HTML;
9      use Sprout;      use Sprout;
10        use Time::HiRes;
11      our @ISA = qw(SearchHelper);      use FIGRules;
12        use RHFeatures;
13        use base 'SearchHelper';
14    
15  =head1 Gene Discrimination Feature Search Helper  =head1 Gene Discrimination Feature Search Helper
16    
# Line 55  Line 56 
56    
57  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
58    
59    =item showMatch
60    
61    If TRUE, then all the genes in the target set that match the ones in the reference genome
62    will be shown in an extra column.
63    
64  =back  =back
65    
66  =head2 Virtual Methods  =head2 Virtual Methods
67    
68  =head3 Form  =head3 Form
69    
70  C<< my $html = $shelp->Form(); >>      my $html = $shelp->Form();
71    
72  Generate the HTML for a form to request a new search.  Generate the HTML for a form to request a new search.
73    
# Line 87  Line 93 
93      my $commonality = $cgi->param('commonality') || "0.8";      my $commonality = $cgi->param('commonality') || "0.8";
94      my $cutoff = $cgi->param('cutoff') || "1e-10";      my $cutoff = $cgi->param('cutoff') || "1e-10";
95      my $statistical = $cgi->param('statistical') || 1;      my $statistical = $cgi->param('statistical') || 1;
96      # Now we build the table rows. The top contains the two numeric parameters and      my $showMatch = $cgi->param('showMatch') || 0;
97      # the submit button.      my $useSims = $cgi->param('useSims') || 0;
98        my $pegsOnly = $cgi->param('pegsOnly') || 1;
99        # Now we build the table rows.
100      my @rows = ();      my @rows = ();
101      push @rows, $cgi->Tr($cgi->td("Commonality"),      # First we have the given genome.
102                           $cgi->td($cgi->textfield(-name => 'commonality',      push @rows, CGI::Tr(CGI::td({valign => "top"}, "Reference Genome"),
103                             CGI::td({colspan => 2}, $givenMenu));
104        # Now show the target and exclusion menus.
105        push @rows, CGI::Tr(CGI::td({valign => "top"}, "Inclusion Genomes (Set 1)"),
106                             CGI::td({colspan => 2}, $targetMenu));
107        push @rows, CGI::Tr(CGI::td({valign => "top"}, "Exclusion Genomes (Set 2)"),
108                             CGI::td({colspan => 2}, $excludeMenu));
109        # Next, the tuning parameters.
110        push @rows, CGI::Tr(CGI::td("Commonality"),
111                             CGI::td(CGI::textfield(-name => 'commonality',
112                                                    -value => $commonality,                                                    -value => $commonality,
113                                                    -size => 5) . "&nbsp;" .                                                    -size => 5))),
114                                    $cgi->checkbox(-name => 'statistical',                  CGI::Tr(CGI::td(), CGI::td(join(" ",
115                                      CGI::checkbox(-name => 'statistical',
116                                                   -checked => $statistical,                                                   -checked => $statistical,
117                                                   -value => 1,                                                   -value => 1,
118                                                   -label => 'Use Statistical Algorithm')));                                                   -label => 'Use Statistical Algorithm') .
119      push @rows, $cgi->Tr($cgi->td("Cutoff"),                                    SearchHelper::Hint("SigGenes", 24),
120                           $cgi->td($cgi->textfield(-name => 'cutoff',                                    CGI::checkbox(-name => 'useSims',
121                                                     -checked => $useSims,
122                                                     -value => 1,
123                                                     -label => 'Use Similarities') .
124                                      SearchHelper::Hint("SigGenes", 25)))),
125                    CGI::Tr(CGI::td(), CGI::td(join(" ",
126                                      CGI::checkbox(-name => 'showMatch',
127                                                     -checked => $showMatch,
128                                                     -value => 1,
129                                                     -label => 'Show Matching Genes') .
130                                      SearchHelper::Hint("SigGenes", 26)))),
131                    CGI::Tr(CGI::td("Cutoff"),
132                             CGI::td(CGI::textfield(-name => 'cutoff',
133                                                    -value => $cutoff,                                                    -value => $cutoff,
134                                                    -size => 5)));                                                    -size => 5)));
135        # Next, the feature filter rows.
136        push @rows, RHFeatures::WordSearchRow($self);
137        push @rows, RHFeatures::FeatureFilterFormRows($self);
138        # Finally, the submit button.
139      push @rows, $self->SubmitRow();      push @rows, $self->SubmitRow();
     # The next rows have the given genome and a feature filter.  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Given Genome"),  
                          $cgi->td({colspan => 2}, $givenMenu));  
     push @rows, $self->FeatureFilterRows();  
     # Now show the target and exclusion menus.  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Target Genomes (Set 1)"),  
                          $cgi->td({colspan => 2}, $targetMenu));  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),  
                          $cgi->td({colspan => 2}, $excludeMenu));  
140      # Create the table.      # Create the table.
141      $retVal .= $self->MakeTable(\@rows);      $retVal .= $self->MakeTable(\@rows);
142      # Close the form.      # Close the form.
# Line 122  Line 147 
147    
148  =head3 Find  =head3 Find
149    
150  C<< my $resultCount = $shelp->Find(); >>      my $resultCount = $shelp->Find();
151    
152  Conduct a search based on the current CGI query parameters. The search results will  Conduct a search based on the current CGI query parameters. The search results will
153  be written to the session cache file and the number of results will be  be written to the session cache file and the number of results will be
# Line 140  Line 165 
165      # Declare the return variable. If it remains undefined, the caller will      # Declare the return variable. If it remains undefined, the caller will
166      # assume there was an error.      # assume there was an error.
167      my $retVal;      my $retVal;
168        # Create the timers.
169        my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);
170      # Validate the numeric parameters.      # Validate the numeric parameters.
171      my $commonality = $cgi->param('commonality');      my $commonality = $cgi->param('commonality');
172      my $cutoff = $cgi->param('cutoff');      my $cutoff = $cgi->param('cutoff');
# Line 152  Line 179 
179      } elsif ($cutoff > 1) {      } elsif ($cutoff > 1) {
180          $self->SetMessage("Cutoff cannot be greater than 1.");          $self->SetMessage("Cutoff cannot be greater than 1.");
181      } else {      } else {
182            # Get the result helper.
183            my $rhelp = RHFeatures->new($self);
184            # Set up the default columns.
185            $self->DefaultColumns($rhelp);
186            # Add the score at the end.
187            $rhelp->AddExtraColumn(score => undef, title => 'Score', style => 'rightAlign', download => 'num');
188            # Find out if we need to show matching genes.
189            my $showMatch = $cgi->param('showMatch') || 0;
190            # If we do, add a column for them at the front.
191            if ($showMatch) {
192                $rhelp->AddExtraColumn(matches => 0, title => 'Matches', style => 'leftAlign', download => 'list');
193            }
194            # Only proceed if the filtering parameters are valid.
195            if ($rhelp->Valid()) {
196          # Now we need to gather and validate the genome sets.          # Now we need to gather and validate the genome sets.
197                $self->PrintLine("Gathering the target genomes.  ");
198          my ($givenGenomeID) = $self->GetGenomes('given');          my ($givenGenomeID) = $self->GetGenomes('given');
199                Trace("Given genome is $givenGenomeID.") if T(3);
200          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');
201                Trace("Target genomes are " . join(", ", sort keys %targetGenomes) . ".") if T(3);
202                $self->PrintLine("Gathering the exclusion genomes.  ");
203          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');
204                Trace("Exclusion genomes are " . join(", ", sort keys %exclusionGenomes) . ".") if T(3);
205                $self->PrintLine("Validating the genome sets.<br />");
206          # Insure the given genome is not in the exclusion set.          # Insure the given genome is not in the exclusion set.
207          if ($exclusionGenomes{$givenGenomeID}) {          if ($exclusionGenomes{$givenGenomeID}) {
208              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");
209          } else {          } else {
210                    # Start the output session.
211                    $self->OpenSession($rhelp);
212              # Insure the given genome is in the target set.              # Insure the given genome is in the target set.
213              $targetGenomes{$givenGenomeID} = 1                  $targetGenomes{$givenGenomeID} = 1;
214          }                  Trace("$givenGenomeID added to target set.") if T(3);
215          # Find out if we want to use a statistical analysis.          # Find out if we want to use a statistical analysis.
216          my $statistical = $cgi->param('statistical') || 0;                  my $statistical = $cgi->param('statistical') || 1;
217          # Denote we have not yet found any genomes.          # Denote we have not yet found any genomes.
218          $retVal = 0;          $retVal = 0;
219                    # Compute the list of genomes of interest.
220                    my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);
221                    # Get the peg matrix.
222                    Trace("Requesting matrix.") if T(3);
223                    $saveTime = time();
224                    my $bbhMatrix;
225                    if (! $cgi->param('useSims')) {
226                        # Here we are using BBHs, which are fast enough to do in one gulp.
227                        $self->PrintLine("Requesting bidirectional best hits.  ");
228                        $bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);
229                    } else {
230                        # Here we are using similarities, which are much more complicated.
231                        $self->PrintLine("Requesting similarities.<br />");
232                        # Create a filtering matrix for the results. We only want to keep PEGs in the
233                        # specified target and exclusion genomes.
234                        my %keepGenomes = map { $_ => 1 } @allGenomes;
235                        # Loop through the given genome's features.
236                        my @features = $sprout->FeaturesOf($givenGenomeID);
237                        for my $fid (@features) {
238                            $self->PrintLine("Retrieving similarities for $fid.  ");
239                            # Get this feature's similarities.
240                            my $simList = $sprout->Sims($fid, 1000, $cutoff, 'fig');
241                            my $simCount = scalar @{$simList};
242                            $self->PrintLine("Raw similarity count: $simCount.  ");
243                            # Create the matrix hash for this feature.
244                            $bbhMatrix->{$fid} = {};
245                            # Now we need to filter out the similarities that don't land on the target genome.
246                            $simCount = 0;
247                            for my $sim (@{$simList}) {
248                                # Insure this similarity lands on a target genome.
249                                my $genomeID2 = $sprout->GenomeOf($sim->id2);
250                                if ($keepGenomes{$genomeID2}) {
251                                    # Here we're keeping the similarity, so we put it in this feature's hash.
252                                    $bbhMatrix->{$fid}->{$sim->id2} = $sim->psc;
253                                    $simCount++;
254                                }
255                            }
256                            $self->PrintLine("Similarities retained: $simCount.<br />");
257                        }
258                    }
259                    $bbhTimer += time() - $saveTime;
260                    $self->PrintLine("Time to build matrix: $bbhTimer seconds.<br />");
261                    Trace("Matrix built.") if T(3);
262          # Create a feature query object to loop through the chosen features of the given          # Create a feature query object to loop through the chosen features of the given
263          # genome.          # genome.
264          my $fquery = FeatureQuery->new($self, $givenGenomeID);                  Trace("Creating feature query.") if T(3);
265                    $saveTime = time();
266                    my $fquery = $rhelp->GetQuery($givenGenomeID);
267                    $queryTimer += time() - $saveTime;
268          # Get the sizes of the two sets. This information is useful in computing commonality.          # Get the sizes of the two sets. This information is useful in computing commonality.
269          my $targetSetSize = scalar keys %targetGenomes;          my $targetSetSize = scalar keys %targetGenomes;
270          my $exclusionSetSize = scalar keys %exclusionGenomes;          my $exclusionSetSize = scalar keys %exclusionGenomes;
271          # Loop through the features.          # Loop through the features.
272          while (my $record = $fquery->Fetch()) {                  my $done = 0;
273                    while (! $done) {
274                        # Get the next feature.
275                        $saveTime = time();
276                        my $record = $rhelp->Fetch($fquery);
277                        $queryTimer += time() - $saveTime;
278                        if (! $record) {
279                            $done = 1;
280                        } else {
281              # Get the feature's ID.              # Get the feature's ID.
282              my $fid = $fquery->FID();                          my $fid = $record->PrimaryValue('Feature(id)');
283              # Request its list of BBHs. The list is actually a hash mapping each BBH to its                          Trace("Checking feature $fid.") if T(4);
284              # score. All we care about, however, are the BBHs themselves.                          $self->PrintLine("Checking feature $fid.<br />");
285              my %bbhList = $sprout->LowBBHs($fid, $cutoff);                          # Get its list of matching genes. The list is actually a hash mapping each matched gene to its
286                            # score. All we care about, however, are the matches themselves.
287                            my $bbhList = $bbhMatrix->{$fid};
288              # We next wish to loop through the BBH IDs, counting how many are in each of the              # We next wish to loop through the BBH IDs, counting how many are in each of the
289              # sets. If a genome occurs twice, we only want to count the first occurrence, so              # sets. If a genome occurs twice, we only want to count the first occurrence, so
290              # we have a hash of genomes we've already seen.                          # we have a hash of genomes we've already seen. The hash will map each gene ID
291                            # to 0, 1, or 2, depending on whether it was found in the reference genome,
292                            # a target genome, or an exclusion genome.
293              my %alreadySeen = ();              my %alreadySeen = ();
294              # Clear the counts.                          # Save the matching genes in here.
295              my ($targetCount, $exclusionCount) = (0, 0);                          my %genesMatching = ();
296              # Loop through the BBHs.                          # Clear the exclusion count.
297              for my $bbhPeg (keys %bbhList) {                          my $exclusionCount = 0;
298                            # Denote that we're in our own genome.
299                            $alreadySeen{$givenGenomeID} = 0;
300                            my $targetCount = 1;
301                            # Loop through the BBHs/Sims.
302                            for my $bbhPeg (keys %{$bbhList}) {
303                  # Get the genome ID. We want to find out if this genome is new.                  # Get the genome ID. We want to find out if this genome is new.
304                  my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);                              my $genomeID = $sprout->GenomeOf($bbhPeg);
305                  if (! $alreadySeen{$genomeID}) {                              if (! exists $alreadySeen{$genomeID}) {
306                      # It's new, so we check to see which set it's in.                      # It's new, so we check to see which set it's in.
307                      if ($targetGenomes{$genomeID}) {                      if ($targetGenomes{$genomeID}) {
308                                        # It's in the target set.
309                          $targetCount++;                          $targetCount++;
310                                        $alreadySeen{$genomeID} = 1;
311                      } elsif ($exclusionGenomes{$genomeID}) {                      } elsif ($exclusionGenomes{$genomeID}) {
312                                        # It's in the exclusion set.
313                          $exclusionCount++;                          $exclusionCount++;
314                                        $alreadySeen{$genomeID} = 2;
315                      }                      }
316                      # Make sure we don't look at it again.                                  # Note that $alreadySeen{$genomeID} exists in the hash by this
317                      $alreadySeen{$genomeID} = 1;                                  # point. If it's 1, we need to save the current PEG.
318                                    if ($alreadySeen{$genomeID} == 1) {
319                                        $genesMatching{$bbhPeg} = 1;
320                  }                  }
321              }              }
322              # Create a variable to indicate whether or not we want to keep this feature.                          }
323              my $okFlag;                          # Create a variable to indicate whether or not we want to keep this feature and
324                            # another for the score.
325                            my ($okFlag, $score);
326              # We need to see if we're using statistics or not. This only matters              # We need to see if we're using statistics or not. This only matters
327              # for a two-set situation.              # for a two-set situation.
328              if ($statistical && $exclusionSetSize > 0) {              if ($statistical && $exclusionSetSize > 0) {
329                  # This looks like it has something to do with variance computations,                              # This is the magic formula for choosing the differentiating genes. It looks like
330                  # but I'm not sure.                              # it has something to do with variance computations, but I'm not sure.
331                  my $targetNotCount = $targetSetSize - $targetCount;                  my $targetNotCount = $targetSetSize - $targetCount;
332                  my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;                  my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
333                  my $exclusionNotCount = $exclusionSetSize - $exclusionCount;                  my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
# Line 216  Line 336 
336                  my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));                  my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
337                  my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));                  my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
338                  # If the two differentials are greater than one, we keep this feature.                  # If the two differentials are greater than one, we keep this feature.
339                  $okFlag = ($inD + $outD > 1);                              $score = $inD + $outD;
340                                $okFlag = ($score > 1);
341                                # Subtract 1 from the score so it looks like the commonality score.
342                                $score -= 1.0;
343              } else {              } else {
344                  # Check to see if we're common in set 1 and not in set 2.                  # Check to see if we're common in set 1 and not in set 2.
345                  if (IsCommon($targetCount, $targetSetSize, $commonality) &&                              my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
346                      ! IsCommon($exclusionCount, $exclusionSetSize, $commonality)) {                              my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
347                      # We satisfy the criterion, so we put this feature to the output.                              if ($score1 && ! $score2) {
348                                    # We satisfy the criterion, so we put this feature to the output. The
349                                    # score is essentially $score1, since $score2 is zero.
350                                    $score = $score1;
351                      $okFlag = 1;                      $okFlag = 1;
352                  }                  }
353              }              }
354              if ($okFlag) {              if ($okFlag) {
355                  # Put this feature to the output.                              # Put this feature to the output. We have one or two extra columns.
356                  $self->PutFeature($fquery);                              # First we store the score.
357                                $rhelp->PutExtraColumns(score => sprintf("%0.3f",$score));
358                                # Next we add the list of matching genes, but only if "showMatch" is specified.
359                                if ($showMatch) {
360                                    # The matching genes are in the hash "genesMatching".
361                                    my @genes = sort { FIGRules::FIGCompare($a,$b) } keys %genesMatching;
362                                    # We need to linkify them.
363                                    my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
364                                    # Now add them as an extra column.
365                                    $rhelp->PutExtraColumns(matches => $genesHTML);
366                                }
367                                # Compute a sort key from the feature data and the score.
368                                my $sort = $rhelp->SortKey($record, sprintf("%0.3f", 1 - $score));
369                                # Output the feature.
370                                $saveTime = time();
371                                $rhelp->PutData($sort, $fid, $record);
372                                $putTimer += time() - $saveTime;
373                  # Increase the result count.                  # Increase the result count.
374                  $retVal++;                  $retVal++;
375              }              }
376                            # Check for a timer trace. We trace every 500 features.
377                            $loopCounter++;
378                            if (T(3) && $loopCounter % 500 == 0) {
379                                Trace("Time spent for $loopCounter features: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.");
380                            }
381                        }
382          }          }
383          # Close the session file.          # Close the session file.
384                    $saveTime = time();
385          $self->CloseSession();          $self->CloseSession();
386                    $putTimer += time() - $saveTime;
387                }
388      }      }
389        }
390        # Trace the timers.
391        Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3);
392      # Return the result count.      # Return the result count.
393      return $retVal;      return $retVal;
394  }  }
395    
396  =head3 Description  =head3 Description
397    
398  C<< my $htmlText = $shelp->Description(); >>      my $htmlText = $shelp->Description();
399    
400  Return a description of this search. The description is used for the table of contents  Return a description of this search. The description is used for the table of contents
401  on the main search tools page. It may contain HTML, but it should be character-level,  on the main search tools page. It may contain HTML, but it should be character-level,
# Line 256  Line 410 
410      return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";      return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";
411  }  }
412    
413    =head3 SearchTitle
414    
415        my $titleHtml = $shelp->SearchTitle();
416    
417    Return the display title for this search. The display title appears above the search results.
418    If no result is returned, no title will be displayed. The result should be an html string
419    that can be legally put inside a block tag such as C<h3> or C<p>.
420    
421    =cut
422    
423    sub SearchTitle {
424        # Get the parameters.
425        my ($self) = @_;
426        # Compute the title. We extract the relevant clues from the query parameters.
427        my $cgi = $self->Q();
428        my $type = ($cgi->param('useSims') ? "Similarities" : "Bidirectional Best Hits");
429        my $style = ($cgi->param('exclusion') ? "Discriminating" : "Common");
430        my $retVal = "$style Genes using $type";
431        # Return it.
432        return $retVal;
433    }
434    
435  =head2 Internal Utilities  =head2 Internal Utilities
436    
437  =head3 IsCommon  =head3 IsCommon
438    
439  C<< my $flag = SHSigGenes::IsCommon($count, $size, $commonality); >>      my $score = SHSigGenes::IsCommon($count, $size, $commonality);
440    
441  Return TRUE if a specified count indicates a gene is common in a specified set.  Return the match score if a specified count indicates a gene is common in a specified set
442  Commonality is computed by dividing the count by the size of the set and  and 0 otherwise. Commonality is computed by dividing the count by the size of the set and
443  comparing the result to the minimum commonality ratio. The one exception is  comparing the result to the minimum commonality ratio. The one exception is
444  if the set size is 0. In that case, this method always returns FALSE.  if the set size is 0. In that case, this method always returns 0.
445    
446  =over 4  =over 4
447    
# Line 296  Line 472 
472      my $retVal = 0;      my $retVal = 0;
473      # Only procced if the size is positive.      # Only procced if the size is positive.
474      if ($size > 0) {      if ($size > 0) {
475          $retVal = ($count/$size >= $commonality);          # Compute the commonality.
476            $retVal = $count/$size;
477            # If it's too small, clear it.
478            if ($retVal < $commonality) {
479                $retVal = 0;
480            }
481      }      }
482      # Return the result.      # Return the result.
483      return $retVal;      return $retVal;

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.22

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3