--- SHSigGenes.pm 2006/12/06 03:36:04 1.10
+++ SHSigGenes.pm 2007/02/21 13:20:37 1.11
@@ -9,6 +9,7 @@
use HTML;
use Sprout;
use Time::HiRes;
+ use FIGRules;
our @ISA = qw(SearchHelper);
@@ -56,6 +57,11 @@
Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
+=item showMatch
+
+If TRUE, then all the genes in the target set that match the ones in the reference genome
+will be shown in an extra column.
+
=back
=head2 Virtual Methods
@@ -88,6 +94,7 @@
my $commonality = $cgi->param('commonality') || "0.8";
my $cutoff = $cgi->param('cutoff') || "1e-10";
my $statistical = $cgi->param('statistical') || 1;
+ my $showMatch = $cgi->param('showMatch') || 0;
# Now we build the table rows.
my @rows = ();
# First we have the given genome.
@@ -103,11 +110,15 @@
$cgi->td($cgi->textfield(-name => 'commonality',
-value => $commonality,
-size => 5))),
- $cgi->Tr($cgi->td(), $cgi->td(
+ $cgi->Tr($cgi->td(), $cgi->td(join(" ",
$cgi->checkbox(-name => 'statistical',
-checked => $statistical,
-value => 1,
- -label => 'Use Statistical Algorithm')));
+ -label => 'Use Statistical Algorithm')),
+ $cgi->checkbox(-name => 'showMatch',
+ -checked => $showMatch,
+ -value => 1,
+ -label => 'Show Matching Genes')));
push @rows, $cgi->Tr($cgi->td("Cutoff"),
$cgi->td($cgi->textfield(-name => 'cutoff',
-value => $cutoff,
@@ -144,6 +155,8 @@
# Declare the return variable. If it remains undefined, the caller will
# assume there was an error.
my $retVal;
+ # Denote the extra columns go at the end.
+ $self->SetExtraPos(1);
# Create the timers.
my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);
# Validate the numeric parameters.
@@ -170,7 +183,9 @@
$targetGenomes{$givenGenomeID} = 1
}
# Find out if we want to use a statistical analysis.
- my $statistical = $cgi->param('statistical') || 0;
+ my $statistical = $cgi->param('statistical') || 1;
+ # Find out if we need to show matching genes.
+ my $showMatch = $cgi->param('showMatch') || 0;
# Denote we have not yet found any genomes.
$retVal = 0;
# Compute the list of genomes of interest.
@@ -206,35 +221,47 @@
my $bbhList = $bbhMatrix{$fid};
# We next wish to loop through the BBH IDs, counting how many are in each of the
# sets. If a genome occurs twice, we only want to count the first occurrence, so
- # we have a hash of genomes we've already seen.
+ # we have a hash of genomes we've already seen. The hash will map each gene ID
+ # to 0, 1, or 2, depending on whether it was found in the reference genome,
+ # a target genome, or an exclusion genome.
my %alreadySeen = ();
+ # Save the matching genes in here.
+ my %genesMatching = ();
# Clear the exclusion count.
my $exclusionCount = 0;
# Denote that we're in our own genome.
- $alreadySeen{$givenGenomeID} = 1;
+ $alreadySeen{$givenGenomeID} = 0;
my $targetCount = 1;
# Loop through the BBHs.
for my $bbhPeg (keys %{$bbhList}) {
# Get the genome ID. We want to find out if this genome is new.
my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);
- if (! $alreadySeen{$genomeID}) {
+ if (! exists $alreadySeen{$genomeID}) {
# It's new, so we check to see which set it's in.
if ($targetGenomes{$genomeID}) {
+ # It's in the target set.
$targetCount++;
+ $alreadySeen{$genomeID} = 1;
} elsif ($exclusionGenomes{$genomeID}) {
+ # It's in the exclusion set.
$exclusionCount++;
+ $alreadySeen{$genomeID} = 2;
+ }
+ # Note that $alreadySeen{$genomeID} exists in the hash by this
+ # point. If it's 1, we need to save the current PEG.
+ if ($alreadySeen{$genomeID} == 1) {
+ $genesMatching{$bbhPeg} = 1;
}
- # Make sure we don't look at it again.
- $alreadySeen{$genomeID} = 1;
}
}
- # Create a variable to indicate whether or not we want to keep this feature.
- my $okFlag;
+ # Create a variable to indicate whether or not we want to keep this feature and
+ # another for the score.
+ my ($okFlag, $score);
# We need to see if we're using statistics or not. This only matters
# for a two-set situation.
if ($statistical && $exclusionSetSize > 0) {
- # This looks like it has something to do with variance computations,
- # but I'm not sure.
+ # This is the magic formula for choosing the differentiating genes. It looks like
+ # it has something to do with variance computations, but I'm not sure.
my $targetNotCount = $targetSetSize - $targetCount;
my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
@@ -243,17 +270,33 @@
my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
# If the two differentials are greater than one, we keep this feature.
- $okFlag = ($inD + $outD > 1);
+ $score = $inD + $outD;
+ $okFlag = ($score > 1);
} else {
# Check to see if we're common in set 1 and not in set 2.
- if (IsCommon($targetCount, $targetSetSize, $commonality) &&
- ! IsCommon($exclusionCount, $exclusionSetSize, $commonality)) {
- # We satisfy the criterion, so we put this feature to the output.
+ my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
+ my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
+ if ($score1 && ! $score2) {
+ # We satisfy the criterion, so we put this feature to the output. The
+ # score is normalize to a range similar to the scores in the statistical
+ # method.
+ $score = $score1 + (1 - $score2);
$okFlag = 1;
}
}
if ($okFlag) {
- # Put this feature to the output.
+ # Put this feature to the output. We have one or two extra columns.
+ # First we store the score.
+ $fquery->AddExtraColumns(score => sprintf("%.3f",$score));
+ # Next we add the list of matching genes, but only if "showMatch" is specified.
+ if ($showMatch) {
+ # The matching genes are in the hash "genesMatching".
+ my @genes = sort { FIGRules::FIGCompare($a,$b) } keys %genesMatching;
+ # We need to linkify them.
+ my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
+ # Now add them as an extra column.
+ $fquery->AddExtraColumns(matches => $genesHTML);
+ }
$saveTime = time();
$self->PutFeature($fquery);
$putTimer += time() - $saveTime;
@@ -299,12 +342,12 @@
=head3 IsCommon
-C<< my $flag = SHSigGenes::IsCommon($count, $size, $commonality); >>
+C<< my $score = SHSigGenes::IsCommon($count, $size, $commonality); >>
-Return TRUE if a specified count indicates a gene is common in a specified set.
-Commonality is computed by dividing the count by the size of the set and
+Return the match score if a specified count indicates a gene is common in a specified set
+and 0 otherwise. Commonality is computed by dividing the count by the size of the set and
comparing the result to the minimum commonality ratio. The one exception is
-if the set size is 0. In that case, this method always returns FALSE.
+if the set size is 0. In that case, this method always returns 0.
=over 4
@@ -335,7 +378,12 @@
my $retVal = 0;
# Only procced if the size is positive.
if ($size > 0) {
- $retVal = ($count/$size >= $commonality);
+ # Compute the commonality.
+ $retVal = $count/$size;
+ # If it's too small, clear it.
+ if ($retVal < $commonality) {
+ $retVal = 0;
+ }
}
# Return the result.
return $retVal;