[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Diff of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.5, Sat Oct 7 13:20:35 2006 UTC revision 1.15, Thu May 17 23:44:21 2007 UTC
# Line 8  Line 8 
8      use CGI;      use CGI;
9      use HTML;      use HTML;
10      use Sprout;      use Sprout;
11        use Time::HiRes;
12        use FIGRules;
13    
14      our @ISA = qw(SearchHelper);      our @ISA = qw(SearchHelper);
15    
# Line 55  Line 57 
57    
58  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
59    
60    =item showMatch
61    
62    If TRUE, then all the genes in the target set that match the ones in the reference genome
63    will be shown in an extra column.
64    
65  =back  =back
66    
67  =head2 Virtual Methods  =head2 Virtual Methods
68    
69  =head3 Form  =head3 Form
70    
71  C<< my $html = $shelp->Include(); >>  C<< my $html = $shelp->Form(); >>
72    
73  Generate the HTML for a form to request a new search.  Generate the HTML for a form to request a new search.
74    
# Line 86  Line 93 
93      # Get the default values to use for the commonality and cutoff controls.      # Get the default values to use for the commonality and cutoff controls.
94      my $commonality = $cgi->param('commonality') || "0.8";      my $commonality = $cgi->param('commonality') || "0.8";
95      my $cutoff = $cgi->param('cutoff') || "1e-10";      my $cutoff = $cgi->param('cutoff') || "1e-10";
96      my $statistical = $cgi->param('statistical') || 0;      my $statistical = $cgi->param('statistical') || 1;
97      # Now we build the table rows. The top contains the two numeric parameters and      my $showMatch = $cgi->param('showMatch') || 0;
98      # the submit button.      my $useSims = $cgi->param('useSims') || 0;
99        my $pegsOnly = $cgi->param('pegsOnly') || 1;
100        # Now we build the table rows.
101      my @rows = ();      my @rows = ();
102        # First we have the given genome.
103        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Reference Genome"),
104                             $cgi->td({colspan => 2}, $givenMenu));
105        # Now show the target and exclusion menus.
106        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Inclusion Genomes (Set 1)"),
107                             $cgi->td({colspan => 2}, $targetMenu));
108        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),
109                             $cgi->td({colspan => 2}, $excludeMenu));
110        # Next, the tuning parameters.
111      push @rows, $cgi->Tr($cgi->td("Commonality"),      push @rows, $cgi->Tr($cgi->td("Commonality"),
112                           $cgi->td($cgi->textfield(-name => 'commonality',                           $cgi->td($cgi->textfield(-name => 'commonality',
113                                                    -value => $commonality,                                                    -value => $commonality,
114                                                    -size => 5) . "&nbsp;" .                                                    -size => 5))),
115                    $cgi->Tr($cgi->td(), $cgi->td(join(" ",
116                                    $cgi->checkbox(-name => 'statistical',                                    $cgi->checkbox(-name => 'statistical',
117                                                   -checked => $statistical,                                                   -checked => $statistical,
118                                                   -value => 1,                                                   -value => 1,
119                                                   -label => 'Use Statistical Algorithm')));                                                   -label => 'Use Statistical Algorithm'),
120      push @rows, $cgi->Tr($cgi->td("Cutoff"),                                    $cgi->checkbox(-name => 'useSims',
121                                                     -checked => $useSims,
122                                                     -value => 1,
123                                                     -label => 'Use Similarities')))),
124                    $cgi->Tr($cgi->td(), $cgi->td(join(" ",
125                                      $cgi->checkbox(-name => 'showMatch',
126                                                     -checked => $showMatch,
127                                                     -value => 1,
128                                                     -label => 'Show Matching Genes'),
129    #                                  $cgi->checkbox(-name => 'pegsOnly',
130    #                                                 -checked => $pegsOnly,
131    #                                                 -value => 1,
132    #                                                 -label => 'PEGs Only')
133                                      ))),
134                    $cgi->Tr($cgi->td("Cutoff"),
135                           $cgi->td($cgi->textfield(-name => 'cutoff',                           $cgi->td($cgi->textfield(-name => 'cutoff',
136                                                    -value => $cutoff,                                                    -value => $cutoff,
137                                                    -size => 5)));                                                    -size => 5)));
138      push @rows, $self->SubmitRow();      # Next, the feature filter rows.
     # The next rows have the given genome and a feature filter.  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Given Genome"),  
                          $cgi->td({colspan => 2}, $givenMenu));  
139      push @rows, $self->FeatureFilterRows();      push @rows, $self->FeatureFilterRows();
140      # Now show the target and exclusion menus.      # Finally, the submit button.
141      push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Target Genomes (Set 1)"),      push @rows, $self->SubmitRow();
                          $cgi->td({colspan => 2}, $targetMenu));  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),  
                          $cgi->td({colspan => 2}, $excludeMenu));  
142      # Create the table.      # Create the table.
143      $retVal .= $self->MakeTable(\@rows);      $retVal .= $self->MakeTable(\@rows);
144      # Close the form.      # Close the form.
# Line 140  Line 167 
167      # Declare the return variable. If it remains undefined, the caller will      # Declare the return variable. If it remains undefined, the caller will
168      # assume there was an error.      # assume there was an error.
169      my $retVal;      my $retVal;
170        # Denote the extra columns go at the end.
171        $self->SetExtraPos('score');
172        # Create the timers.
173        my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);
174      # Validate the numeric parameters.      # Validate the numeric parameters.
175      my $commonality = $cgi->param('commonality');      my $commonality = $cgi->param('commonality');
176      my $cutoff = $cgi->param('cutoff');      my $cutoff = $cgi->param('cutoff');
177        my $pegsOnly = $cgi->param('pegsOnly') || 1;
178      if ($commonality !~ /^\s*\d(\.\d+)?\s*$/) {      if ($commonality !~ /^\s*\d(\.\d+)?\s*$/) {
179          $self->SetMessage("Commonality value appears invalid, too big, negative, or not a number.");          $self->SetMessage("Commonality value appears invalid, too big, negative, or not a number.");
180      } elsif ($commonality <= 0 || $commonality > 1) {      } elsif ($commonality <= 0 || $commonality > 1) {
# Line 153  Line 185 
185          $self->SetMessage("Cutoff cannot be greater than 1.");          $self->SetMessage("Cutoff cannot be greater than 1.");
186      } else {      } else {
187          # Now we need to gather and validate the genome sets.          # Now we need to gather and validate the genome sets.
188            $self->PrintLine("Gathering the target genomes.  ");
189          my ($givenGenomeID) = $self->GetGenomes('given');          my ($givenGenomeID) = $self->GetGenomes('given');
190          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');
191            $self->PrintLine("Gathering the exclusion genomes.  ");
192          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');
193            $self->PrintLine("Validating the genome sets.<br />");
194          # Insure the given genome is not in the exclusion set.          # Insure the given genome is not in the exclusion set.
195          if ($exclusionGenomes{$givenGenomeID}) {          if ($exclusionGenomes{$givenGenomeID}) {
196              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");
# Line 164  Line 199 
199              $targetGenomes{$givenGenomeID} = 1              $targetGenomes{$givenGenomeID} = 1
200          }          }
201          # Find out if we want to use a statistical analysis.          # Find out if we want to use a statistical analysis.
202          my $statistical = $cgi->param('statistical') || 0;          my $statistical = $cgi->param('statistical') || 1;
203            # Find out if we need to show matching genes.
204            my $showMatch = $cgi->param('showMatch') || 0;
205          # Denote we have not yet found any genomes.          # Denote we have not yet found any genomes.
206          $retVal = 0;          $retVal = 0;
207            # Compute the list of genomes of interest.
208            my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);
209            # Set the parameter that indicates whether or not we're in PEGs-only mode.
210            my $pegMode = ($pegsOnly ? 'peg' : undef);
211            # Get the peg matrix.
212            Trace("Requesting matrix.") if T(3);
213            $saveTime = time();
214            my %bbhMatrix;
215            if (! $cgi->param('useSims')) {
216                # Here we are using BBHs, which are fast enough to do in one gulp.
217                $self->PrintLine("Requesting bidirectional best hits.  ");
218                %bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);
219            } else {
220                # Here we are using similarities, which is much more complicated.
221                $self->PrintLine("Requesting similarities.<br />");
222                # Create a filtering matrix for the results. We only want to keep PEGs in the
223                # specified target and exclusion genomes.
224                my %keepGenomes = map { $_ => 1 } @allGenomes;
225                # Loop through the given genome's features.
226                my @features = $sprout->FeaturesOf($givenGenomeID, $pegMode);
227                for my $fid (@features) {
228                    $self->PrintLine("Retrieving similarities for $fid.  ");
229                    # Get this feature's similarities.
230                    my $simList = $sprout->Sims($fid, 1000, $cutoff, 'fig');
231                    my $simCount = scalar @{$simList};
232                    $self->PrintLine("Raw similarity count: $simCount.  ");
233                    # Create the matrix hash for this feature.
234                    $bbhMatrix{$fid} = {};
235                    # Now we need to filter out the similarities that don't land on the target genome.
236                    $simCount = 0;
237                    for my $sim (@{$simList}) {
238                        # Insure this similarity lands on a target genome.
239                        my ($genomeID2) = FIGRules::ParseFeatureID($sim->id2);
240                        if ($keepGenomes{$genomeID2}) {
241                            # Here we're keeping the similarity, so we put it in this feature's hash.
242                            $bbhMatrix{$fid}->{$sim->id2} = $sim->psc;
243                            $simCount++;
244                        }
245                    }
246                    $self->PrintLine("Similarities retained: $simCount.<br />");
247                }
248            }
249            $bbhTimer += time() - $saveTime;
250            $self->PrintLine("Time to build matrix: $bbhTimer seconds.<br />");
251            Trace("Matrix built.") if T(3);
252          # Create a feature query object to loop through the chosen features of the given          # Create a feature query object to loop through the chosen features of the given
253          # genome.          # genome.
254            Trace("Creating feature query.") if T(3);
255            $saveTime = time();
256          my $fquery = FeatureQuery->new($self, $givenGenomeID);          my $fquery = FeatureQuery->new($self, $givenGenomeID);
257            $queryTimer += time() - $saveTime;
258          # Get the sizes of the two sets. This information is useful in computing commonality.          # Get the sizes of the two sets. This information is useful in computing commonality.
259          my $targetSetSize = scalar keys %targetGenomes;          my $targetSetSize = scalar keys %targetGenomes;
260          my $exclusionSetSize = scalar keys %exclusionGenomes;          my $exclusionSetSize = scalar keys %exclusionGenomes;
261          # Loop through the features.          # Loop through the features.
262          while (my $record = $fquery->Fetch()) {          my $done = 0;
263            while (! $done) {
264                # Get the next feature.
265                $saveTime = time();
266                my $record = $fquery->Fetch();
267                $queryTimer += time() - $saveTime;
268                if (! $record) {
269                    $done = 1;
270                } else {
271              # Get the feature's ID.              # Get the feature's ID.
272              my $fid = $fquery->FID();              my $fid = $fquery->FID();
273              # Request its list of BBHs. The list is actually a hash mapping each BBH to its                  # Insure we want to look at this feature.
274                    if ($fid =~ /\.peg\./ || ! $pegsOnly) {
275                        Trace("Checking feature $fid.") if T(4);
276                        $self->PrintLine("Checking feature $fid.<br />");
277                        # Get its list of BBHs. The list is actually a hash mapping each BBH to its
278              # score. All we care about, however, are the BBHs themselves.              # score. All we care about, however, are the BBHs themselves.
279              my %bbhList = $sprout->LowBBHs($fid, $cutoff);                      my $bbhList = $bbhMatrix{$fid};
280              # We next wish to loop through the BBH IDs, counting how many are in each of the              # We next wish to loop through the BBH IDs, counting how many are in each of the
281              # sets. If a genome occurs twice, we only want to count the first occurrence, so              # sets. If a genome occurs twice, we only want to count the first occurrence, so
282              # we have a hash of genomes we've already seen.                      # we have a hash of genomes we've already seen. The hash will map each gene ID
283                        # to 0, 1, or 2, depending on whether it was found in the reference genome,
284                        # a target genome, or an exclusion genome.
285              my %alreadySeen = ();              my %alreadySeen = ();
286              # Clear the counts.                      # Save the matching genes in here.
287              my ($targetCount, $exclusionCount) = (0, 0);                      my %genesMatching = ();
288              # Loop through the BBHs.                      # Clear the exclusion count.
289              for my $bbhPeg (keys %bbhList) {                      my $exclusionCount = 0;
290                        # Denote that we're in our own genome.
291                        $alreadySeen{$givenGenomeID} = 0;
292                        my $targetCount = 1;
293                        # Loop through the BBHs/Sims.
294                        for my $bbhPeg (keys %{$bbhList}) {
295                  # Get the genome ID. We want to find out if this genome is new.                  # Get the genome ID. We want to find out if this genome is new.
296                  my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);                  my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);
297                  if (! $alreadySeen{$genomeID}) {                          if (! exists $alreadySeen{$genomeID}) {
298                      # It's new, so we check to see which set it's in.                      # It's new, so we check to see which set it's in.
299                      if ($targetGenomes{$genomeID}) {                      if ($targetGenomes{$genomeID}) {
300                                    # It's in the target set.
301                          $targetCount++;                          $targetCount++;
302                                    $alreadySeen{$genomeID} = 1;
303                      } elsif ($exclusionGenomes{$genomeID}) {                      } elsif ($exclusionGenomes{$genomeID}) {
304                                    # It's in the exclusion set.
305                          $exclusionCount++;                          $exclusionCount++;
306                                    $alreadySeen{$genomeID} = 2;
307                                }
308                                # Note that $alreadySeen{$genomeID} exists in the hash by this
309                                # point. If it's 1, we need to save the current PEG.
310                                if ($alreadySeen{$genomeID} == 1) {
311                                    $genesMatching{$bbhPeg} = 1;
312                      }                      }
                     # Make sure we don't look at it again.  
                     $alreadySeen{$genomeID} = 1;  
313                  }                  }
314              }              }
315              # Create a variable to indicate whether or not we want to keep this feature.                      # Create a variable to indicate whether or not we want to keep this feature and
316              my $okFlag;                      # another for the score.
317                        my ($okFlag, $score);
318              # We need to see if we're using statistics or not. This only matters              # We need to see if we're using statistics or not. This only matters
319              # for a two-set situation.              # for a two-set situation.
320              if ($statistical && $exclusionSetSize > 0) {              if ($statistical && $exclusionSetSize > 0) {
321                  # This looks like it has something to do with variance computations,                          # This is the magic formula for choosing the differentiating genes. It looks like
322                  # but I'm not sure.                          # it has something to do with variance computations, but I'm not sure.
323                  my $targetNotCount = $targetSetSize - $targetCount;                  my $targetNotCount = $targetSetSize - $targetCount;
324                  my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;                  my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
325                  my $exclusionNotCount = $exclusionSetSize - $exclusionCount;                  my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
# Line 216  Line 328 
328                  my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));                  my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
329                  my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));                  my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
330                  # If the two differentials are greater than one, we keep this feature.                  # If the two differentials are greater than one, we keep this feature.
331                  $okFlag = ($inD + $outD > 1);                          $score = $inD + $outD;
332                            $okFlag = ($score > 1);
333                            # Subtract 1 from the score so it looks like the commonality score.
334                            $score -= 1.0;
335              } else {              } else {
336                  # Check to see if we're common in set 1 and not in set 2.                  # Check to see if we're common in set 1 and not in set 2.
337                  if (IsCommon($targetCount, $targetSetSize, $commonality) &&                          my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
338                      ! IsCommon($exclusionCount, $exclusionSetSize, $commonality)) {                          my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
339                      # We satisfy the criterion, so we put this feature to the output.                          if ($score1 && ! $score2) {
340                                # We satisfy the criterion, so we put this feature to the output. The
341                                # score is essentially $score1, since $score2 is zero.
342                                $score = $score1;
343                      $okFlag = 1;                      $okFlag = 1;
344                  }                  }
345              }              }
346              if ($okFlag) {              if ($okFlag) {
347                  # Put this feature to the output.                          # Put this feature to the output. We have one or two extra columns.
348                            # First we store the score.
349                            $fquery->AddExtraColumns(score => sprintf("%.3f",$score));
350                            # Next we add the list of matching genes, but only if "showMatch" is specified.
351                            if ($showMatch) {
352                                # The matching genes are in the hash "genesMatching".
353                                my @genes = sort { FIGRules::FIGCompare($a,$b) } keys %genesMatching;
354                                # We need to linkify them.
355                                my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
356                                # Now add them as an extra column.
357                                $fquery->AddExtraColumns(matches => $genesHTML);
358                            }
359                            $saveTime = time();
360                  $self->PutFeature($fquery);                  $self->PutFeature($fquery);
361                            $putTimer += time() - $saveTime;
362                  # Increase the result count.                  # Increase the result count.
363                  $retVal++;                  $retVal++;
364              }              }
365                        # Check for a timer trace. We trace every 500 features.
366                        $loopCounter++;
367                        if (T(3) && $loopCounter % 500 == 0) {
368                            Trace("Time spent for $loopCounter features: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.");
369                        }
370                    }
371                }
372          }          }
373          # Close the session file.          # Close the session file.
374            $saveTime = time();
375          $self->CloseSession();          $self->CloseSession();
376            $putTimer += time() - $saveTime;
377      }      }
378        # Trace the timers.
379        Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3);
380      # Return the result count.      # Return the result count.
381      return $retVal;      return $retVal;
382  }  }
# Line 253  Line 395 
395      # Get the parameters.      # Get the parameters.
396      my ($self) = @_;      my ($self) = @_;
397      # Return the result.      # Return the result.
398      return "Search for features that are common to a group of organisms or that discriminate between two groups of organisms.";      return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";
399    }
400    
401    =head3 SortKey
402    
403    C<< my $key = $shelp->SortKey($fdata); >>
404    
405    Return the sort key for the specified feature data. The default is to sort by feature name,
406    floating NMPDR organisms to the top. If a full-text search is used, then the default
407    sort is by relevance followed by feature name. This sort may be overridden by the
408    search class to provide fancier functionality. This method is called by
409    B<PutFeature>, so it is only used for feature searches. A non-feature search
410    would presumably have its own sort logic.
411    
412    =over 4
413    
414    =item record
415    
416    The C<FeatureData> containing the current feature.
417    
418    =item RETURN
419    
420    Returns a key field that can be used to sort this row in among the results.
421    
422    =back
423    
424    =cut
425    
426    sub SortKey {
427        # Get the parameters.
428        my ($self, $fdata) = @_;
429        # Get the score.
430        my $retVal = $fdata->GetExtraColumn('score');
431        # Invert it to create a sort with the high score first.
432        $retVal = sprintf("%0.3f", 1 - $retVal);
433        Trace("Sort key for " . $fdata->FID() . " is $retVal.") if T(4);
434        # Return the result.
435        return $retVal;
436  }  }
437    
438    
439    
440  =head2 Internal Utilities  =head2 Internal Utilities
441    
442  =head3 IsCommon  =head3 IsCommon
443    
444  C<< my $flag = SHSigGenes::IsCommon($count, $size, $commonality); >>  C<< my $score = SHSigGenes::IsCommon($count, $size, $commonality); >>
445    
446  Return TRUE if a specified count indicates a gene is common in a specified set.  Return the match score if a specified count indicates a gene is common in a specified set
447  Commonality is computed by dividing the count by the size of the set and  and 0 otherwise. Commonality is computed by dividing the count by the size of the set and
448  comparing the result to the minimum commonality ratio. The one exception is  comparing the result to the minimum commonality ratio. The one exception is
449  if the set size is 0. In that case, this method always returns FALSE.  if the set size is 0. In that case, this method always returns 0.
450    
451  =over 4  =over 4
452    
# Line 296  Line 477 
477      my $retVal = 0;      my $retVal = 0;
478      # Only procced if the size is positive.      # Only procced if the size is positive.
479      if ($size > 0) {      if ($size > 0) {
480          $retVal = ($count/$size >= $commonality);          # Compute the commonality.
481            $retVal = $count/$size;
482            # If it's too small, clear it.
483            if ($retVal < $commonality) {
484                $retVal = 0;
485            }
486      }      }
487      # Return the result.      # Return the result.
488      return $retVal;      return $retVal;

Legend:
Removed from v.1.5  
changed lines
  Added in v.1.15

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3