[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Diff of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.10, Wed Dec 6 03:36:04 2006 UTC revision 1.11, Wed Feb 21 13:20:37 2007 UTC
# Line 9  Line 9 
9      use HTML;      use HTML;
10      use Sprout;      use Sprout;
11      use Time::HiRes;      use Time::HiRes;
12        use FIGRules;
13    
14      our @ISA = qw(SearchHelper);      our @ISA = qw(SearchHelper);
15    
# Line 56  Line 57 
57    
58  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
59    
60    =item showMatch
61    
62    If TRUE, then all the genes in the target set that match the ones in the reference genome
63    will be shown in an extra column.
64    
65  =back  =back
66    
67  =head2 Virtual Methods  =head2 Virtual Methods
# Line 88  Line 94 
94      my $commonality = $cgi->param('commonality') || "0.8";      my $commonality = $cgi->param('commonality') || "0.8";
95      my $cutoff = $cgi->param('cutoff') || "1e-10";      my $cutoff = $cgi->param('cutoff') || "1e-10";
96      my $statistical = $cgi->param('statistical') || 1;      my $statistical = $cgi->param('statistical') || 1;
97        my $showMatch = $cgi->param('showMatch') || 0;
98      # Now we build the table rows.      # Now we build the table rows.
99      my @rows = ();      my @rows = ();
100      # First we have the given genome.      # First we have the given genome.
# Line 103  Line 110 
110                           $cgi->td($cgi->textfield(-name => 'commonality',                           $cgi->td($cgi->textfield(-name => 'commonality',
111                                                    -value => $commonality,                                                    -value => $commonality,
112                                                    -size => 5))),                                                    -size => 5))),
113                  $cgi->Tr($cgi->td(), $cgi->td(                  $cgi->Tr($cgi->td(), $cgi->td(join(" ",
114                                    $cgi->checkbox(-name => 'statistical',                                    $cgi->checkbox(-name => 'statistical',
115                                                   -checked => $statistical,                                                   -checked => $statistical,
116                                                   -value => 1,                                                   -value => 1,
117                                                   -label => 'Use Statistical Algorithm')));                                                   -label => 'Use Statistical Algorithm')),
118                                      $cgi->checkbox(-name => 'showMatch',
119                                                     -checked => $showMatch,
120                                                     -value => 1,
121                                                     -label => 'Show Matching Genes')));
122      push @rows, $cgi->Tr($cgi->td("Cutoff"),      push @rows, $cgi->Tr($cgi->td("Cutoff"),
123                           $cgi->td($cgi->textfield(-name => 'cutoff',                           $cgi->td($cgi->textfield(-name => 'cutoff',
124                                                    -value => $cutoff,                                                    -value => $cutoff,
# Line 144  Line 155 
155      # Declare the return variable. If it remains undefined, the caller will      # Declare the return variable. If it remains undefined, the caller will
156      # assume there was an error.      # assume there was an error.
157      my $retVal;      my $retVal;
158        # Denote the extra columns go at the end.
159        $self->SetExtraPos(1);
160      # Create the timers.      # Create the timers.
161      my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);      my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);
162      # Validate the numeric parameters.      # Validate the numeric parameters.
# Line 170  Line 183 
183              $targetGenomes{$givenGenomeID} = 1              $targetGenomes{$givenGenomeID} = 1
184          }          }
185          # Find out if we want to use a statistical analysis.          # Find out if we want to use a statistical analysis.
186          my $statistical = $cgi->param('statistical') || 0;          my $statistical = $cgi->param('statistical') || 1;
187            # Find out if we need to show matching genes.
188            my $showMatch = $cgi->param('showMatch') || 0;
189          # Denote we have not yet found any genomes.          # Denote we have not yet found any genomes.
190          $retVal = 0;          $retVal = 0;
191          # Compute the list of genomes of interest.          # Compute the list of genomes of interest.
# Line 206  Line 221 
221                  my $bbhList = $bbhMatrix{$fid};                  my $bbhList = $bbhMatrix{$fid};
222                  # We next wish to loop through the BBH IDs, counting how many are in each of the                  # We next wish to loop through the BBH IDs, counting how many are in each of the
223                  # sets. If a genome occurs twice, we only want to count the first occurrence, so                  # sets. If a genome occurs twice, we only want to count the first occurrence, so
224                  # we have a hash of genomes we've already seen.                  # we have a hash of genomes we've already seen. The hash will map each gene ID
225                    # to 0, 1, or 2, depending on whether it was found in the reference genome,
226                    # a target genome, or an exclusion genome.
227                  my %alreadySeen = ();                  my %alreadySeen = ();
228                    # Save the matching genes in here.
229                    my %genesMatching = ();
230                  # Clear the exclusion count.                  # Clear the exclusion count.
231                  my $exclusionCount = 0;                  my $exclusionCount = 0;
232                  # Denote that we're in our own genome.                  # Denote that we're in our own genome.
233                  $alreadySeen{$givenGenomeID} = 1;                  $alreadySeen{$givenGenomeID} = 0;
234                  my $targetCount = 1;                  my $targetCount = 1;
235                  # Loop through the BBHs.                  # Loop through the BBHs.
236                  for my $bbhPeg (keys %{$bbhList}) {                  for my $bbhPeg (keys %{$bbhList}) {
237                      # Get the genome ID. We want to find out if this genome is new.                      # Get the genome ID. We want to find out if this genome is new.
238                      my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);                      my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);
239                      if (! $alreadySeen{$genomeID}) {                      if (! exists $alreadySeen{$genomeID}) {
240                          # It's new, so we check to see which set it's in.                          # It's new, so we check to see which set it's in.
241                          if ($targetGenomes{$genomeID}) {                          if ($targetGenomes{$genomeID}) {
242                                # It's in the target set.
243                              $targetCount++;                              $targetCount++;
244                                $alreadySeen{$genomeID} = 1;
245                          } elsif ($exclusionGenomes{$genomeID}) {                          } elsif ($exclusionGenomes{$genomeID}) {
246                                # It's in the exclusion set.
247                              $exclusionCount++;                              $exclusionCount++;
248                                $alreadySeen{$genomeID} = 2;
249                            }
250                            # Note that $alreadySeen{$genomeID} exists in the hash by this
251                            # point. If it's 1, we need to save the current PEG.
252                            if ($alreadySeen{$genomeID} == 1) {
253                                $genesMatching{$bbhPeg} = 1;
254                          }                          }
                         # Make sure we don't look at it again.  
                         $alreadySeen{$genomeID} = 1;  
255                      }                      }
256                  }                  }
257                  # Create a variable to indicate whether or not we want to keep this feature.                  # Create a variable to indicate whether or not we want to keep this feature and
258                  my $okFlag;                  # another for the score.
259                    my ($okFlag, $score);
260                  # We need to see if we're using statistics or not. This only matters                  # We need to see if we're using statistics or not. This only matters
261                  # for a two-set situation.                  # for a two-set situation.
262                  if ($statistical && $exclusionSetSize > 0) {                  if ($statistical && $exclusionSetSize > 0) {
263                      # This looks like it has something to do with variance computations,                      # This is the magic formula for choosing the differentiating genes. It looks like
264                      # but I'm not sure.                      # it has something to do with variance computations, but I'm not sure.
265                      my $targetNotCount = $targetSetSize - $targetCount;                      my $targetNotCount = $targetSetSize - $targetCount;
266                      my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;                      my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
267                      my $exclusionNotCount = $exclusionSetSize - $exclusionCount;                      my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
# Line 243  Line 270 
270                      my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));                      my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
271                      my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));                      my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
272                      # If the two differentials are greater than one, we keep this feature.                      # If the two differentials are greater than one, we keep this feature.
273                      $okFlag = ($inD + $outD > 1);                      $score = $inD + $outD;
274                        $okFlag = ($score > 1);
275                  } else {                  } else {
276                      # Check to see if we're common in set 1 and not in set 2.                      # Check to see if we're common in set 1 and not in set 2.
277                      if (IsCommon($targetCount, $targetSetSize, $commonality) &&                      my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
278                          ! IsCommon($exclusionCount, $exclusionSetSize, $commonality)) {                      my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
279                          # We satisfy the criterion, so we put this feature to the output.                      if ($score1 && ! $score2) {
280                            # We satisfy the criterion, so we put this feature to the output. The
281                            # score is normalize to a range similar to the scores in the statistical
282                            # method.
283                            $score = $score1 + (1 - $score2);
284                          $okFlag = 1;                          $okFlag = 1;
285                      }                      }
286                  }                  }
287                  if ($okFlag) {                  if ($okFlag) {
288                      # Put this feature to the output.                      # Put this feature to the output. We have one or two extra columns.
289                        # First we store the score.
290                        $fquery->AddExtraColumns(score => sprintf("%.3f",$score));
291                        # Next we add the list of matching genes, but only if "showMatch" is specified.
292                        if ($showMatch) {
293                            # The matching genes are in the hash "genesMatching".
294                            my @genes = sort { FIGRules::FIGCompare($a,$b) } keys %genesMatching;
295                            # We need to linkify them.
296                            my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
297                            # Now add them as an extra column.
298                            $fquery->AddExtraColumns(matches => $genesHTML);
299                        }
300                      $saveTime = time();                      $saveTime = time();
301                      $self->PutFeature($fquery);                      $self->PutFeature($fquery);
302                      $putTimer += time() - $saveTime;                      $putTimer += time() - $saveTime;
# Line 299  Line 342 
342    
343  =head3 IsCommon  =head3 IsCommon
344    
345  C<< my $flag = SHSigGenes::IsCommon($count, $size, $commonality); >>  C<< my $score = SHSigGenes::IsCommon($count, $size, $commonality); >>
346    
347  Return TRUE if a specified count indicates a gene is common in a specified set.  Return the match score if a specified count indicates a gene is common in a specified set
348  Commonality is computed by dividing the count by the size of the set and  and 0 otherwise. Commonality is computed by dividing the count by the size of the set and
349  comparing the result to the minimum commonality ratio. The one exception is  comparing the result to the minimum commonality ratio. The one exception is
350  if the set size is 0. In that case, this method always returns FALSE.  if the set size is 0. In that case, this method always returns 0.
351    
352  =over 4  =over 4
353    
# Line 335  Line 378 
378      my $retVal = 0;      my $retVal = 0;
379      # Only procced if the size is positive.      # Only procced if the size is positive.
380      if ($size > 0) {      if ($size > 0) {
381          $retVal = ($count/$size >= $commonality);          # Compute the commonality.
382            $retVal = $count/$size;
383            # If it's too small, clear it.
384            if ($retVal < $commonality) {
385                $retVal = 0;
386            }
387      }      }
388      # Return the result.      # Return the result.
389      return $retVal;      return $retVal;

Legend:
Removed from v.1.10  
changed lines
  Added in v.1.11

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3