[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Diff of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.11, Wed Feb 21 13:20:37 2007 UTC revision 1.21, Thu Feb 5 07:24:09 2009 UTC
# Line 4  Line 4 
4    
5      use strict;      use strict;
6      use Tracer;      use Tracer;
7      use SearchHelper;      use CGI qw(-nosticky);
     use CGI;  
8      use HTML;      use HTML;
9      use Sprout;      use Sprout;
10      use Time::HiRes;      use Time::HiRes;
11      use FIGRules;      use FIGRules;
12        use RHFeatures;
13      our @ISA = qw(SearchHelper);      use base 'SearchHelper';
14    
15  =head1 Gene Discrimination Feature Search Helper  =head1 Gene Discrimination Feature Search Helper
16    
# Line 68  Line 67 
67    
68  =head3 Form  =head3 Form
69    
70  C<< my $html = $shelp->Form(); >>      my $html = $shelp->Form();
71    
72  Generate the HTML for a form to request a new search.  Generate the HTML for a form to request a new search.
73    
# Line 95  Line 94 
94      my $cutoff = $cgi->param('cutoff') || "1e-10";      my $cutoff = $cgi->param('cutoff') || "1e-10";
95      my $statistical = $cgi->param('statistical') || 1;      my $statistical = $cgi->param('statistical') || 1;
96      my $showMatch = $cgi->param('showMatch') || 0;      my $showMatch = $cgi->param('showMatch') || 0;
97        my $useSims = $cgi->param('useSims') || 0;
98        my $pegsOnly = $cgi->param('pegsOnly') || 1;
99      # Now we build the table rows.      # Now we build the table rows.
100      my @rows = ();      my @rows = ();
101      # First we have the given genome.      # First we have the given genome.
102      push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Reference Genome"),      push @rows, CGI::Tr(CGI::td({valign => "top"}, "Reference Genome"),
103                           $cgi->td({colspan => 2}, $givenMenu));                           CGI::td({colspan => 2}, $givenMenu));
104      # Now show the target and exclusion menus.      # Now show the target and exclusion menus.
105      push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Inclusion Genomes (Set 1)"),      push @rows, CGI::Tr(CGI::td({valign => "top"}, "Inclusion Genomes (Set 1)"),
106                           $cgi->td({colspan => 2}, $targetMenu));                           CGI::td({colspan => 2}, $targetMenu));
107      push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),      push @rows, CGI::Tr(CGI::td({valign => "top"}, "Exclusion Genomes (Set 2)"),
108                           $cgi->td({colspan => 2}, $excludeMenu));                           CGI::td({colspan => 2}, $excludeMenu));
109      # Next, the numeric parameters.      # Next, the tuning parameters.
110      push @rows, $cgi->Tr($cgi->td("Commonality"),      push @rows, CGI::Tr(CGI::td("Commonality"),
111                           $cgi->td($cgi->textfield(-name => 'commonality',                           CGI::td(CGI::textfield(-name => 'commonality',
112                                                    -value => $commonality,                                                    -value => $commonality,
113                                                    -size => 5))),                                                    -size => 5))),
114                  $cgi->Tr($cgi->td(), $cgi->td(join(" ",                  CGI::Tr(CGI::td(), CGI::td(join(" ",
115                                    $cgi->checkbox(-name => 'statistical',                                    CGI::checkbox(-name => 'statistical',
116                                                   -checked => $statistical,                                                   -checked => $statistical,
117                                                   -value => 1,                                                   -value => 1,
118                                                   -label => 'Use Statistical Algorithm')),                                                   -label => 'Use Statistical Algorithm') .
119                                    $cgi->checkbox(-name => 'showMatch',                                    SearchHelper::Hint("SigGenes",
120                                                         "When two sets of genomes are specified, check this " .
121                                                         "box to use a statistical algorithm designed " .
122                                                         "specifically to choose differentiating genes. " .
123                                                         "This box has no effect when looking for genes " .
124                                                         "in common."),
125                                      CGI::checkbox(-name => 'useSims',
126                                                     -checked => $useSims,
127                                                     -value => 1,
128                                                     -label => 'Use Similarities') .
129                                      SearchHelper::Hint("SigGenes",
130                                                         "Normally, Bidirectional Best Hits are used to " .
131                                                         "find matching genes. Check this box to use " .
132                                                         "similarities instead.")))),
133                    CGI::Tr(CGI::td(), CGI::td(join(" ",
134                                      CGI::checkbox(-name => 'showMatch',
135                                                   -checked => $showMatch,                                                   -checked => $showMatch,
136                                                   -value => 1,                                                   -value => 1,
137                                                   -label => 'Show Matching Genes')));                                                   -label => 'Show Matching Genes') .
138      push @rows, $cgi->Tr($cgi->td("Cutoff"),                                    SearchHelper::Hint("SigGenes",
139                           $cgi->td($cgi->textfield(-name => 'cutoff',                                                       "Check this button to display the genes matching " .
140                                                         "each gene displayed in the results.")))),
141                    CGI::Tr(CGI::td("Cutoff"),
142                             CGI::td(CGI::textfield(-name => 'cutoff',
143                                                    -value => $cutoff,                                                    -value => $cutoff,
144                                                    -size => 5)));                                                    -size => 5)));
145      # Next, the feature filter rows.      # Next, the feature filter rows.
146      push @rows, $self->FeatureFilterRows();      push @rows, RHFeatures::WordSearchRow($self);
147        push @rows, RHFeatures::FeatureFilterFormRows($self);
148      # Finally, the submit button.      # Finally, the submit button.
149      push @rows, $self->SubmitRow();      push @rows, $self->SubmitRow();
150      # Create the table.      # Create the table.
# Line 137  Line 157 
157    
158  =head3 Find  =head3 Find
159    
160  C<< my $resultCount = $shelp->Find(); >>      my $resultCount = $shelp->Find();
161    
162  Conduct a search based on the current CGI query parameters. The search results will  Conduct a search based on the current CGI query parameters. The search results will
163  be written to the session cache file and the number of results will be  be written to the session cache file and the number of results will be
# Line 155  Line 175 
175      # Declare the return variable. If it remains undefined, the caller will      # Declare the return variable. If it remains undefined, the caller will
176      # assume there was an error.      # assume there was an error.
177      my $retVal;      my $retVal;
     # Denote the extra columns go at the end.  
     $self->SetExtraPos(1);  
178      # Create the timers.      # Create the timers.
179      my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);      my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);
180      # Validate the numeric parameters.      # Validate the numeric parameters.
# Line 171  Line 189 
189      } elsif ($cutoff > 1) {      } elsif ($cutoff > 1) {
190          $self->SetMessage("Cutoff cannot be greater than 1.");          $self->SetMessage("Cutoff cannot be greater than 1.");
191      } else {      } else {
192            # Get the result helper.
193            my $rhelp = RHFeatures->new($self);
194            # Set up the default columns.
195            $self->DefaultColumns($rhelp);
196            # Add the score at the end.
197            $rhelp->AddExtraColumn(score => undef, title => 'Score', style => 'rightAlign', download => 'num');
198            # Find out if we need to show matching genes.
199            my $showMatch = $cgi->param('showMatch') || 0;
200            # If we do, add a column for them at the front.
201            if ($showMatch) {
202                $rhelp->AddExtraColumn(matches => 0, title => 'Matches', style => 'leftAlign', download => 'list');
203            }
204            # Only proceed if the filtering parameters are valid.
205            if ($rhelp->Valid()) {
206          # Now we need to gather and validate the genome sets.          # Now we need to gather and validate the genome sets.
207                $self->PrintLine("Gathering the target genomes.  ");
208          my ($givenGenomeID) = $self->GetGenomes('given');          my ($givenGenomeID) = $self->GetGenomes('given');
209                Trace("Given genome is $givenGenomeID.") if T(3);
210          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');
211                Trace("Target genomes are " . join(", ", sort keys %targetGenomes) . ".") if T(3);
212                $self->PrintLine("Gathering the exclusion genomes.  ");
213          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');
214                Trace("Exclusion genomes are " . join(", ", sort keys %exclusionGenomes) . ".") if T(3);
215                $self->PrintLine("Validating the genome sets.<br />");
216          # Insure the given genome is not in the exclusion set.          # Insure the given genome is not in the exclusion set.
217          if ($exclusionGenomes{$givenGenomeID}) {          if ($exclusionGenomes{$givenGenomeID}) {
218              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");
219          } else {          } else {
220                    # Start the output session.
221                    $self->OpenSession($rhelp);
222              # Insure the given genome is in the target set.              # Insure the given genome is in the target set.
223              $targetGenomes{$givenGenomeID} = 1                  $targetGenomes{$givenGenomeID} = 1;
224          }                  Trace("$givenGenomeID added to target set.") if T(3);
225          # Find out if we want to use a statistical analysis.          # Find out if we want to use a statistical analysis.
226          my $statistical = $cgi->param('statistical') || 1;          my $statistical = $cgi->param('statistical') || 1;
         # Find out if we need to show matching genes.  
         my $showMatch = $cgi->param('showMatch') || 0;  
227          # Denote we have not yet found any genomes.          # Denote we have not yet found any genomes.
228          $retVal = 0;          $retVal = 0;
229          # Compute the list of genomes of interest.          # Compute the list of genomes of interest.
230          my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);          my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);
231          # Get the BBH matrix.                  # Get the peg matrix.
232                    Trace("Requesting matrix.") if T(3);
233          $saveTime = time();          $saveTime = time();
234          my %bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);                  my $bbhMatrix;
235                    if (! $cgi->param('useSims')) {
236                        # Here we are using BBHs, which are fast enough to do in one gulp.
237                        $self->PrintLine("Requesting bidirectional best hits.  ");
238                        $bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);
239                    } else {
240                        # Here we are using similarities, which are much more complicated.
241                        $self->PrintLine("Requesting similarities.<br />");
242                        # Create a filtering matrix for the results. We only want to keep PEGs in the
243                        # specified target and exclusion genomes.
244                        my %keepGenomes = map { $_ => 1 } @allGenomes;
245                        # Loop through the given genome's features.
246                        my @features = $sprout->FeaturesOf($givenGenomeID);
247                        for my $fid (@features) {
248                            $self->PrintLine("Retrieving similarities for $fid.  ");
249                            # Get this feature's similarities.
250                            my $simList = $sprout->Sims($fid, 1000, $cutoff, 'fig');
251                            my $simCount = scalar @{$simList};
252                            $self->PrintLine("Raw similarity count: $simCount.  ");
253                            # Create the matrix hash for this feature.
254                            $bbhMatrix->{$fid} = {};
255                            # Now we need to filter out the similarities that don't land on the target genome.
256                            $simCount = 0;
257                            for my $sim (@{$simList}) {
258                                # Insure this similarity lands on a target genome.
259                                my $genomeID2 = $sprout->GenomeOf($sim->id2);
260                                if ($keepGenomes{$genomeID2}) {
261                                    # Here we're keeping the similarity, so we put it in this feature's hash.
262                                    $bbhMatrix->{$fid}->{$sim->id2} = $sim->psc;
263                                    $simCount++;
264                                }
265                            }
266                            $self->PrintLine("Similarities retained: $simCount.<br />");
267                        }
268                    }
269          $bbhTimer += time() - $saveTime;          $bbhTimer += time() - $saveTime;
270                    $self->PrintLine("Time to build matrix: $bbhTimer seconds.<br />");
271                    Trace("Matrix built.") if T(3);
272          # Create a feature query object to loop through the chosen features of the given          # Create a feature query object to loop through the chosen features of the given
273          # genome.          # genome.
274          Trace("Creating feature query.") if T(3);          Trace("Creating feature query.") if T(3);
275          $saveTime = time();          $saveTime = time();
276          my $fquery = FeatureQuery->new($self, $givenGenomeID);                  my $fquery = $rhelp->GetQuery($givenGenomeID);
277          $queryTimer += time() - $saveTime;          $queryTimer += time() - $saveTime;
278          # Get the sizes of the two sets. This information is useful in computing commonality.          # Get the sizes of the two sets. This information is useful in computing commonality.
279          my $targetSetSize = scalar keys %targetGenomes;          my $targetSetSize = scalar keys %targetGenomes;
# Line 208  Line 283 
283          while (! $done) {          while (! $done) {
284              # Get the next feature.              # Get the next feature.
285              $saveTime = time();              $saveTime = time();
286              my $record = $fquery->Fetch();                      my $record = $rhelp->Fetch($fquery);
287              $queryTimer += time() - $saveTime;              $queryTimer += time() - $saveTime;
288              if (! $record) {              if (! $record) {
289                  $done = 1;                  $done = 1;
290              } else {              } else {
291                  # Get the feature's ID.                  # Get the feature's ID.
292                  my $fid = $fquery->FID();                          my $fid = $record->PrimaryValue('Feature(id)');
293                  Trace("Processing feature $fid.") if T(4);                          Trace("Checking feature $fid.") if T(4);
294                  # Get its list of BBHs. The list is actually a hash mapping each BBH to its                          $self->PrintLine("Checking feature $fid.<br />");
295                  # score. All we care about, however, are the BBHs themselves.                          # Get its list of matching genes. The list is actually a hash mapping each matched gene to its
296                  my $bbhList = $bbhMatrix{$fid};                          # score. All we care about, however, are the matches themselves.
297                            my $bbhList = $bbhMatrix->{$fid};
298                  # We next wish to loop through the BBH IDs, counting how many are in each of the                  # We next wish to loop through the BBH IDs, counting how many are in each of the
299                  # sets. If a genome occurs twice, we only want to count the first occurrence, so                  # sets. If a genome occurs twice, we only want to count the first occurrence, so
300                  # we have a hash of genomes we've already seen. The hash will map each gene ID                  # we have a hash of genomes we've already seen. The hash will map each gene ID
# Line 232  Line 308 
308                  # Denote that we're in our own genome.                  # Denote that we're in our own genome.
309                  $alreadySeen{$givenGenomeID} = 0;                  $alreadySeen{$givenGenomeID} = 0;
310                  my $targetCount = 1;                  my $targetCount = 1;
311                  # Loop through the BBHs.                          # Loop through the BBHs/Sims.
312                  for my $bbhPeg (keys %{$bbhList}) {                  for my $bbhPeg (keys %{$bbhList}) {
313                      # Get the genome ID. We want to find out if this genome is new.                      # Get the genome ID. We want to find out if this genome is new.
314                      my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);                              my $genomeID = $sprout->GenomeOf($bbhPeg);
315                      if (! exists $alreadySeen{$genomeID}) {                      if (! exists $alreadySeen{$genomeID}) {
316                          # It's new, so we check to see which set it's in.                          # It's new, so we check to see which set it's in.
317                          if ($targetGenomes{$genomeID}) {                          if ($targetGenomes{$genomeID}) {
# Line 272  Line 348 
348                      # If the two differentials are greater than one, we keep this feature.                      # If the two differentials are greater than one, we keep this feature.
349                      $score = $inD + $outD;                      $score = $inD + $outD;
350                      $okFlag = ($score > 1);                      $okFlag = ($score > 1);
351                                # Subtract 1 from the score so it looks like the commonality score.
352                                $score -= 1.0;
353                  } else {                  } else {
354                      # Check to see if we're common in set 1 and not in set 2.                      # Check to see if we're common in set 1 and not in set 2.
355                      my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);                      my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
356                      my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);                      my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
357                      if ($score1 && ! $score2) {                      if ($score1 && ! $score2) {
358                          # We satisfy the criterion, so we put this feature to the output. The                          # We satisfy the criterion, so we put this feature to the output. The
359                          # score is normalize to a range similar to the scores in the statistical                                  # score is essentially $score1, since $score2 is zero.
360                          # method.                                  $score = $score1;
                         $score = $score1 + (1 - $score2);  
361                          $okFlag = 1;                          $okFlag = 1;
362                      }                      }
363                  }                  }
364                  if ($okFlag) {                  if ($okFlag) {
365                      # Put this feature to the output. We have one or two extra columns.                      # Put this feature to the output. We have one or two extra columns.
366                      # First we store the score.                      # First we store the score.
367                      $fquery->AddExtraColumns(score => sprintf("%.3f",$score));                              $rhelp->PutExtraColumns(score => sprintf("%0.3f",$score));
368                      # Next we add the list of matching genes, but only if "showMatch" is specified.                      # Next we add the list of matching genes, but only if "showMatch" is specified.
369                      if ($showMatch) {                      if ($showMatch) {
370                          # The matching genes are in the hash "genesMatching".                          # The matching genes are in the hash "genesMatching".
# Line 295  Line 372 
372                          # We need to linkify them.                          # We need to linkify them.
373                          my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);                          my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
374                          # Now add them as an extra column.                          # Now add them as an extra column.
375                          $fquery->AddExtraColumns(matches => $genesHTML);                                  $rhelp->PutExtraColumns(matches => $genesHTML);
376                      }                      }
377                                # Compute a sort key from the feature data and the score.
378                                my $sort = $rhelp->SortKey($record, sprintf("%0.3f", 1 - $score));
379                                # Output the feature.
380                      $saveTime = time();                      $saveTime = time();
381                      $self->PutFeature($fquery);                              $rhelp->PutData($sort, $fid, $record);
382                      $putTimer += time() - $saveTime;                      $putTimer += time() - $saveTime;
383                      # Increase the result count.                      # Increase the result count.
384                      $retVal++;                      $retVal++;
# Line 315  Line 395 
395          $self->CloseSession();          $self->CloseSession();
396          $putTimer += time() - $saveTime;          $putTimer += time() - $saveTime;
397      }      }
398            }
399        }
400      # Trace the timers.      # Trace the timers.
401      Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3);      Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3);
402      # Return the result count.      # Return the result count.
# Line 323  Line 405 
405    
406  =head3 Description  =head3 Description
407    
408  C<< my $htmlText = $shelp->Description(); >>      my $htmlText = $shelp->Description();
409    
410  Return a description of this search. The description is used for the table of contents  Return a description of this search. The description is used for the table of contents
411  on the main search tools page. It may contain HTML, but it should be character-level,  on the main search tools page. It may contain HTML, but it should be character-level,
# Line 338  Line 420 
420      return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";      return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";
421  }  }
422    
423    =head3 SearchTitle
424    
425        my $titleHtml = $shelp->SearchTitle();
426    
427    Return the display title for this search. The display title appears above the search results.
428    If no result is returned, no title will be displayed. The result should be an html string
429    that can be legally put inside a block tag such as C<h3> or C<p>.
430    
431    =cut
432    
433    sub SearchTitle {
434        # Get the parameters.
435        my ($self) = @_;
436        # Compute the title. We extract the relevant clues from the query parameters.
437        my $cgi = $self->Q();
438        my $type = ($cgi->param('useSims') ? "Similarities" : "Bidirectional Best Hits");
439        my $style = ($cgi->param('exclusion') ? "Discriminating" : "Common");
440        my $retVal = "$style Genes using $type";
441        # Return it.
442        return $retVal;
443    }
444    
445  =head2 Internal Utilities  =head2 Internal Utilities
446    
447  =head3 IsCommon  =head3 IsCommon
448    
449  C<< my $score = SHSigGenes::IsCommon($count, $size, $commonality); >>      my $score = SHSigGenes::IsCommon($count, $size, $commonality);
450    
451  Return the match score if a specified count indicates a gene is common in a specified set  Return the match score if a specified count indicates a gene is common in a specified set
452  and 0 otherwise. Commonality is computed by dividing the count by the size of the set and  and 0 otherwise. Commonality is computed by dividing the count by the size of the set and

Legend:
Removed from v.1.11  
changed lines
  Added in v.1.21

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3