[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Diff of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.8, Thu Nov 16 22:58:28 2006 UTC revision 1.11, Wed Feb 21 13:20:37 2007 UTC
# Line 8  Line 8 
8      use CGI;      use CGI;
9      use HTML;      use HTML;
10      use Sprout;      use Sprout;
11        use Time::HiRes;
12        use FIGRules;
13    
14      our @ISA = qw(SearchHelper);      our @ISA = qw(SearchHelper);
15    
# Line 55  Line 57 
57    
58  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
59    
60    =item showMatch
61    
62    If TRUE, then all the genes in the target set that match the ones in the reference genome
63    will be shown in an extra column.
64    
65  =back  =back
66    
67  =head2 Virtual Methods  =head2 Virtual Methods
# Line 87  Line 94 
94      my $commonality = $cgi->param('commonality') || "0.8";      my $commonality = $cgi->param('commonality') || "0.8";
95      my $cutoff = $cgi->param('cutoff') || "1e-10";      my $cutoff = $cgi->param('cutoff') || "1e-10";
96      my $statistical = $cgi->param('statistical') || 1;      my $statistical = $cgi->param('statistical') || 1;
97      # Now we build the table rows. The top contains the two numeric parameters and      my $showMatch = $cgi->param('showMatch') || 0;
98      # the submit button.      # Now we build the table rows.
99      my @rows = ();      my @rows = ();
100        # First we have the given genome.
101        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Reference Genome"),
102                             $cgi->td({colspan => 2}, $givenMenu));
103        # Now show the target and exclusion menus.
104        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Inclusion Genomes (Set 1)"),
105                             $cgi->td({colspan => 2}, $targetMenu));
106        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),
107                             $cgi->td({colspan => 2}, $excludeMenu));
108        # Next, the numeric parameters.
109      push @rows, $cgi->Tr($cgi->td("Commonality"),      push @rows, $cgi->Tr($cgi->td("Commonality"),
110                           $cgi->td($cgi->textfield(-name => 'commonality',                           $cgi->td($cgi->textfield(-name => 'commonality',
111                                                    -value => $commonality,                                                    -value => $commonality,
112                                                    -size => 5) . "&nbsp;" .                                                    -size => 5))),
113                    $cgi->Tr($cgi->td(), $cgi->td(join(" ",
114                                    $cgi->checkbox(-name => 'statistical',                                    $cgi->checkbox(-name => 'statistical',
115                                                   -checked => $statistical,                                                   -checked => $statistical,
116                                                   -value => 1,                                                   -value => 1,
117                                                   -label => 'Use Statistical Algorithm')));                                                   -label => 'Use Statistical Algorithm')),
118                                      $cgi->checkbox(-name => 'showMatch',
119                                                     -checked => $showMatch,
120                                                     -value => 1,
121                                                     -label => 'Show Matching Genes')));
122      push @rows, $cgi->Tr($cgi->td("Cutoff"),      push @rows, $cgi->Tr($cgi->td("Cutoff"),
123                           $cgi->td($cgi->textfield(-name => 'cutoff',                           $cgi->td($cgi->textfield(-name => 'cutoff',
124                                                    -value => $cutoff,                                                    -value => $cutoff,
125                                                    -size => 5)));                                                    -size => 5)));
126      push @rows, $self->SubmitRow();      # Next, the feature filter rows.
     # The next rows have the given genome and a feature filter.  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Given Genome"),  
                          $cgi->td({colspan => 2}, $givenMenu));  
127      push @rows, $self->FeatureFilterRows();      push @rows, $self->FeatureFilterRows();
128      # Now show the target and exclusion menus.      # Finally, the submit button.
129      push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Target Genomes (Set 1)"),      push @rows, $self->SubmitRow();
                          $cgi->td({colspan => 2}, $targetMenu));  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),  
                          $cgi->td({colspan => 2}, $excludeMenu));  
130      # Create the table.      # Create the table.
131      $retVal .= $self->MakeTable(\@rows);      $retVal .= $self->MakeTable(\@rows);
132      # Close the form.      # Close the form.
# Line 140  Line 155 
155      # Declare the return variable. If it remains undefined, the caller will      # Declare the return variable. If it remains undefined, the caller will
156      # assume there was an error.      # assume there was an error.
157      my $retVal;      my $retVal;
158        # Denote the extra columns go at the end.
159        $self->SetExtraPos(1);
160        # Create the timers.
161        my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);
162      # Validate the numeric parameters.      # Validate the numeric parameters.
163      my $commonality = $cgi->param('commonality');      my $commonality = $cgi->param('commonality');
164      my $cutoff = $cgi->param('cutoff');      my $cutoff = $cgi->param('cutoff');
# Line 164  Line 183 
183              $targetGenomes{$givenGenomeID} = 1              $targetGenomes{$givenGenomeID} = 1
184          }          }
185          # Find out if we want to use a statistical analysis.          # Find out if we want to use a statistical analysis.
186          my $statistical = $cgi->param('statistical') || 0;          my $statistical = $cgi->param('statistical') || 1;
187            # Find out if we need to show matching genes.
188            my $showMatch = $cgi->param('showMatch') || 0;
189          # Denote we have not yet found any genomes.          # Denote we have not yet found any genomes.
190          $retVal = 0;          $retVal = 0;
191            # Compute the list of genomes of interest.
192            my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);
193            # Get the BBH matrix.
194            $saveTime = time();
195            my %bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);
196            $bbhTimer += time() - $saveTime;
197          # Create a feature query object to loop through the chosen features of the given          # Create a feature query object to loop through the chosen features of the given
198          # genome.          # genome.
199            Trace("Creating feature query.") if T(3);
200            $saveTime = time();
201          my $fquery = FeatureQuery->new($self, $givenGenomeID);          my $fquery = FeatureQuery->new($self, $givenGenomeID);
202            $queryTimer += time() - $saveTime;
203          # Get the sizes of the two sets. This information is useful in computing commonality.          # Get the sizes of the two sets. This information is useful in computing commonality.
204          my $targetSetSize = scalar keys %targetGenomes;          my $targetSetSize = scalar keys %targetGenomes;
205          my $exclusionSetSize = scalar keys %exclusionGenomes;          my $exclusionSetSize = scalar keys %exclusionGenomes;
206          # Loop through the features.          # Loop through the features.
207          while (my $record = $fquery->Fetch()) {          my $done = 0;
208            while (! $done) {
209                # Get the next feature.
210                $saveTime = time();
211                my $record = $fquery->Fetch();
212                $queryTimer += time() - $saveTime;
213                if (! $record) {
214                    $done = 1;
215                } else {
216              # Get the feature's ID.              # Get the feature's ID.
217              my $fid = $fquery->FID();              my $fid = $fquery->FID();
218              # Request its list of BBHs. The list is actually a hash mapping each BBH to its                  Trace("Processing feature $fid.") if T(4);
219                    # Get its list of BBHs. The list is actually a hash mapping each BBH to its
220              # score. All we care about, however, are the BBHs themselves.              # score. All we care about, however, are the BBHs themselves.
221              my %bbhList = $sprout->LowBBHs($fid, $cutoff);                  my $bbhList = $bbhMatrix{$fid};
222              # We next wish to loop through the BBH IDs, counting how many are in each of the              # We next wish to loop through the BBH IDs, counting how many are in each of the
223              # sets. If a genome occurs twice, we only want to count the first occurrence, so              # sets. If a genome occurs twice, we only want to count the first occurrence, so
224              # we have a hash of genomes we've already seen.                  # we have a hash of genomes we've already seen. The hash will map each gene ID
225                    # to 0, 1, or 2, depending on whether it was found in the reference genome,
226                    # a target genome, or an exclusion genome.
227              my %alreadySeen = ();              my %alreadySeen = ();
228              # Clear the counts.                  # Save the matching genes in here.
229              my ($targetCount, $exclusionCount) = (0, 0);                  my %genesMatching = ();
230                    # Clear the exclusion count.
231                    my $exclusionCount = 0;
232                    # Denote that we're in our own genome.
233                    $alreadySeen{$givenGenomeID} = 0;
234                    my $targetCount = 1;
235              # Loop through the BBHs.              # Loop through the BBHs.
236              for my $bbhPeg (keys %bbhList) {                  for my $bbhPeg (keys %{$bbhList}) {
237                  # Get the genome ID. We want to find out if this genome is new.                  # Get the genome ID. We want to find out if this genome is new.
238                  my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);                  my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);
239                  if (! $alreadySeen{$genomeID}) {                      if (! exists $alreadySeen{$genomeID}) {
240                      # It's new, so we check to see which set it's in.                      # It's new, so we check to see which set it's in.
241                      if ($targetGenomes{$genomeID}) {                      if ($targetGenomes{$genomeID}) {
242                                # It's in the target set.
243                          $targetCount++;                          $targetCount++;
244                                $alreadySeen{$genomeID} = 1;
245                      } elsif ($exclusionGenomes{$genomeID}) {                      } elsif ($exclusionGenomes{$genomeID}) {
246                                # It's in the exclusion set.
247                          $exclusionCount++;                          $exclusionCount++;
248                                $alreadySeen{$genomeID} = 2;
249                      }                      }
250                      # Make sure we don't look at it again.                          # Note that $alreadySeen{$genomeID} exists in the hash by this
251                      $alreadySeen{$genomeID} = 1;                          # point. If it's 1, we need to save the current PEG.
252                            if ($alreadySeen{$genomeID} == 1) {
253                                $genesMatching{$bbhPeg} = 1;
254                  }                  }
255              }              }
256              # Create a variable to indicate whether or not we want to keep this feature.                  }
257              my $okFlag;                  # Create a variable to indicate whether or not we want to keep this feature and
258                    # another for the score.
259                    my ($okFlag, $score);
260              # We need to see if we're using statistics or not. This only matters              # We need to see if we're using statistics or not. This only matters
261              # for a two-set situation.              # for a two-set situation.
262              if ($statistical && $exclusionSetSize > 0) {              if ($statistical && $exclusionSetSize > 0) {
263                  # This looks like it has something to do with variance computations,                      # This is the magic formula for choosing the differentiating genes. It looks like
264                  # but I'm not sure.                      # it has something to do with variance computations, but I'm not sure.
265                  my $targetNotCount = $targetSetSize - $targetCount;                  my $targetNotCount = $targetSetSize - $targetCount;
266                  my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;                  my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
267                  my $exclusionNotCount = $exclusionSetSize - $exclusionCount;                  my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
# Line 216  Line 270 
270                  my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));                  my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
271                  my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));                  my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
272                  # If the two differentials are greater than one, we keep this feature.                  # If the two differentials are greater than one, we keep this feature.
273                  $okFlag = ($inD + $outD > 1);                      $score = $inD + $outD;
274                        $okFlag = ($score > 1);
275              } else {              } else {
276                  # Check to see if we're common in set 1 and not in set 2.                  # Check to see if we're common in set 1 and not in set 2.
277                  if (IsCommon($targetCount, $targetSetSize, $commonality) &&                      my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
278                      ! IsCommon($exclusionCount, $exclusionSetSize, $commonality)) {                      my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
279                      # We satisfy the criterion, so we put this feature to the output.                      if ($score1 && ! $score2) {
280                            # We satisfy the criterion, so we put this feature to the output. The
281                            # score is normalize to a range similar to the scores in the statistical
282                            # method.
283                            $score = $score1 + (1 - $score2);
284                      $okFlag = 1;                      $okFlag = 1;
285                  }                  }
286              }              }
287              if ($okFlag) {              if ($okFlag) {
288                  # Put this feature to the output.                      # Put this feature to the output. We have one or two extra columns.
289                        # First we store the score.
290                        $fquery->AddExtraColumns(score => sprintf("%.3f",$score));
291                        # Next we add the list of matching genes, but only if "showMatch" is specified.
292                        if ($showMatch) {
293                            # The matching genes are in the hash "genesMatching".
294                            my @genes = sort { FIGRules::FIGCompare($a,$b) } keys %genesMatching;
295                            # We need to linkify them.
296                            my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
297                            # Now add them as an extra column.
298                            $fquery->AddExtraColumns(matches => $genesHTML);
299                        }
300                        $saveTime = time();
301                  $self->PutFeature($fquery);                  $self->PutFeature($fquery);
302                        $putTimer += time() - $saveTime;
303                  # Increase the result count.                  # Increase the result count.
304                  $retVal++;                  $retVal++;
305              }              }
306                    # Check for a timer trace. We trace every 500 features.
307                    $loopCounter++;
308                    if (T(3) && $loopCounter % 500 == 0) {
309                        Trace("Time spent for $loopCounter features: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.");
310                    }
311                }
312          }          }
313          # Close the session file.          # Close the session file.
314            $saveTime = time();
315          $self->CloseSession();          $self->CloseSession();
316            $putTimer += time() - $saveTime;
317      }      }
318        # Trace the timers.
319        Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3);
320      # Return the result count.      # Return the result count.
321      return $retVal;      return $retVal;
322  }  }
# Line 260  Line 342 
342    
343  =head3 IsCommon  =head3 IsCommon
344    
345  C<< my $flag = SHSigGenes::IsCommon($count, $size, $commonality); >>  C<< my $score = SHSigGenes::IsCommon($count, $size, $commonality); >>
346    
347  Return TRUE if a specified count indicates a gene is common in a specified set.  Return the match score if a specified count indicates a gene is common in a specified set
348  Commonality is computed by dividing the count by the size of the set and  and 0 otherwise. Commonality is computed by dividing the count by the size of the set and
349  comparing the result to the minimum commonality ratio. The one exception is  comparing the result to the minimum commonality ratio. The one exception is
350  if the set size is 0. In that case, this method always returns FALSE.  if the set size is 0. In that case, this method always returns 0.
351    
352  =over 4  =over 4
353    
# Line 296  Line 378 
378      my $retVal = 0;      my $retVal = 0;
379      # Only procced if the size is positive.      # Only procced if the size is positive.
380      if ($size > 0) {      if ($size > 0) {
381          $retVal = ($count/$size >= $commonality);          # Compute the commonality.
382            $retVal = $count/$size;
383            # If it's too small, clear it.
384            if ($retVal < $commonality) {
385                $retVal = 0;
386            }
387      }      }
388      # Return the result.      # Return the result.
389      return $retVal;      return $retVal;

Legend:
Removed from v.1.8  
changed lines
  Added in v.1.11

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3