[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Diff of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.10, Wed Dec 6 03:36:04 2006 UTC revision 1.12, Tue Apr 10 06:10:02 2007 UTC
# Line 9  Line 9 
9      use HTML;      use HTML;
10      use Sprout;      use Sprout;
11      use Time::HiRes;      use Time::HiRes;
12        use FIGRules;
13    
14      our @ISA = qw(SearchHelper);      our @ISA = qw(SearchHelper);
15    
# Line 56  Line 57 
57    
58  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
59    
60    =item showMatch
61    
62    If TRUE, then all the genes in the target set that match the ones in the reference genome
63    will be shown in an extra column.
64    
65  =back  =back
66    
67  =head2 Virtual Methods  =head2 Virtual Methods
# Line 88  Line 94 
94      my $commonality = $cgi->param('commonality') || "0.8";      my $commonality = $cgi->param('commonality') || "0.8";
95      my $cutoff = $cgi->param('cutoff') || "1e-10";      my $cutoff = $cgi->param('cutoff') || "1e-10";
96      my $statistical = $cgi->param('statistical') || 1;      my $statistical = $cgi->param('statistical') || 1;
97        my $showMatch = $cgi->param('showMatch') || 0;
98        my $useSims = $cgi->param('useSims') || 0;
99        my $pegsOnly = $cgi->param('pegsOnly') || 0;
100      # Now we build the table rows.      # Now we build the table rows.
101      my @rows = ();      my @rows = ();
102      # First we have the given genome.      # First we have the given genome.
# Line 98  Line 107 
107                           $cgi->td({colspan => 2}, $targetMenu));                           $cgi->td({colspan => 2}, $targetMenu));
108      push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),      push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),
109                           $cgi->td({colspan => 2}, $excludeMenu));                           $cgi->td({colspan => 2}, $excludeMenu));
110      # Next, the numeric parameters.      # Next, the tuning parameters.
111      push @rows, $cgi->Tr($cgi->td("Commonality"),      push @rows, $cgi->Tr($cgi->td("Commonality"),
112                           $cgi->td($cgi->textfield(-name => 'commonality',                           $cgi->td($cgi->textfield(-name => 'commonality',
113                                                    -value => $commonality,                                                    -value => $commonality,
114                                                    -size => 5))),                                                    -size => 5))),
115                  $cgi->Tr($cgi->td(), $cgi->td(                  $cgi->Tr($cgi->td(), $cgi->td(join(" ",
116                                    $cgi->checkbox(-name => 'statistical',                                    $cgi->checkbox(-name => 'statistical',
117                                                   -checked => $statistical,                                                   -checked => $statistical,
118                                                   -value => 1,                                                   -value => 1,
119                                                   -label => 'Use Statistical Algorithm')));                                                   -label => 'Use Statistical Algorithm'),
120      push @rows, $cgi->Tr($cgi->td("Cutoff"),                                    $cgi->checkbox(-name => 'useSims',
121                                                     -checked => $useSims,
122                                                     -value => 1,
123                                                     -label => 'Use Similarities')))),
124                    $cgi->Tr($cgi->td(), $cgi->td(join(" ",
125                                      $cgi->checkbox(-name => 'showMatch',
126                                                     -checked => $showMatch,
127                                                     -value => 1,
128                                                     -label => 'Show Matching Genes'),
129                                      $cgi->checkbox(-name => 'pegsOnly',
130                                                     -checked => $pegsOnly,
131                                                     -value => 1,
132                                                     -label => 'PEGs Only')))),
133                    $cgi->Tr($cgi->td("Cutoff"),
134                           $cgi->td($cgi->textfield(-name => 'cutoff',                           $cgi->td($cgi->textfield(-name => 'cutoff',
135                                                    -value => $cutoff,                                                    -value => $cutoff,
136                                                    -size => 5)));                                                    -size => 5)));
# Line 144  Line 166 
166      # Declare the return variable. If it remains undefined, the caller will      # Declare the return variable. If it remains undefined, the caller will
167      # assume there was an error.      # assume there was an error.
168      my $retVal;      my $retVal;
169        # Denote the extra columns go at the end.
170        $self->SetExtraPos(1);
171      # Create the timers.      # Create the timers.
172      my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);      my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);
173      # Validate the numeric parameters.      # Validate the numeric parameters.
174      my $commonality = $cgi->param('commonality');      my $commonality = $cgi->param('commonality');
175      my $cutoff = $cgi->param('cutoff');      my $cutoff = $cgi->param('cutoff');
176        my $pegsOnly = $cgi->param('pegsOnly');
177      if ($commonality !~ /^\s*\d(\.\d+)?\s*$/) {      if ($commonality !~ /^\s*\d(\.\d+)?\s*$/) {
178          $self->SetMessage("Commonality value appears invalid, too big, negative, or not a number.");          $self->SetMessage("Commonality value appears invalid, too big, negative, or not a number.");
179      } elsif ($commonality <= 0 || $commonality > 1) {      } elsif ($commonality <= 0 || $commonality > 1) {
# Line 159  Line 184 
184          $self->SetMessage("Cutoff cannot be greater than 1.");          $self->SetMessage("Cutoff cannot be greater than 1.");
185      } else {      } else {
186          # Now we need to gather and validate the genome sets.          # Now we need to gather and validate the genome sets.
187            $self->PrintLine("Gathering the target genomes.  ");
188          my ($givenGenomeID) = $self->GetGenomes('given');          my ($givenGenomeID) = $self->GetGenomes('given');
189          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');
190            $self->PrintLine("Gathering the exclusion genomes.  ");
191          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');
192            $self->PrintLine("Validating the genome sets.<br />");
193          # Insure the given genome is not in the exclusion set.          # Insure the given genome is not in the exclusion set.
194          if ($exclusionGenomes{$givenGenomeID}) {          if ($exclusionGenomes{$givenGenomeID}) {
195              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");
# Line 170  Line 198 
198              $targetGenomes{$givenGenomeID} = 1              $targetGenomes{$givenGenomeID} = 1
199          }          }
200          # Find out if we want to use a statistical analysis.          # Find out if we want to use a statistical analysis.
201          my $statistical = $cgi->param('statistical') || 0;          my $statistical = $cgi->param('statistical') || 1;
202            # Find out if we need to show matching genes.
203            my $showMatch = $cgi->param('showMatch') || 0;
204          # Denote we have not yet found any genomes.          # Denote we have not yet found any genomes.
205          $retVal = 0;          $retVal = 0;
206          # Compute the list of genomes of interest.          # Compute the list of genomes of interest.
207          my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);          my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);
208          # Get the BBH matrix.          # Set the parameter that indicates whether or not we're in PEGs-only mode.
209            my $pegMode = ($pegsOnly ? 'peg' : undef);
210            # Get the peg matrix.
211            Trace("Requesting matrix.") if T(3);
212          $saveTime = time();          $saveTime = time();
213          my %bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);          my %bbhMatrix;
214            if (! $cgi->param('useSims')) {
215                # Here we are using BBHs, which are fast enough to do in one gulp.
216                $self->PrintLine("Requesting bidirectional best hits.  ");
217                %bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);
218            } else {
219                # Here we are using similarities, which is much more complicated.
220                $self->PrintLine("Requesting similarities.<br />");
221                # Create a filtering matrix for the results. We only want to keep PEGs in the
222                # specified target and exclusion genomes.
223                my %keepGenomes = map { $_ => 1 } @allGenomes;
224                # Loop through the given genome's features.
225                my @features = $sprout->FeaturesOf($givenGenomeID, $pegMode);
226                for my $fid (@features) {
227                    $self->PrintLine("Retrieving similarities for $fid.  ");
228                    # Get this feature's similarities.
229                    my $simList = $sprout->Sims($fid, 1000, $cutoff, 'fig');
230                    my $simCount = scalar @{$simList};
231                    $self->PrintLine("Raw similarity count: $simCount.  ");
232                    # Create the matrix hash for this feature.
233                    $bbhMatrix{$fid} = {};
234                    # Now we need to filter out the similarities that don't land on the target genome.
235                    $simCount = 0;
236                    for my $sim (@{$simList}) {
237                        # Insure this similarity lands on a target genome.
238                        my ($genomeID2) = FIGRules::ParseFeatureID($sim->id2);
239                        if ($keepGenomes{$genomeID2}) {
240                            # Here we're keeping the similarity, so we put it in this feature's hash.
241                            $bbhMatrix{$fid}->{$sim->id2} = $sim->psc;
242                            $simCount++;
243                        }
244                    }
245                    $self->PrintLine("Similarities retained: $simCount.<br />");
246                }
247            }
248          $bbhTimer += time() - $saveTime;          $bbhTimer += time() - $saveTime;
249            $self->PrintLine("Time to build matrix: $bbhTimer seconds.<br />");
250            Trace("Matrix built.") if T(3);
251          # Create a feature query object to loop through the chosen features of the given          # Create a feature query object to loop through the chosen features of the given
252          # genome.          # genome.
253          Trace("Creating feature query.") if T(3);          Trace("Creating feature query.") if T(3);
# Line 200  Line 269 
269              } else {              } else {
270                  # Get the feature's ID.                  # Get the feature's ID.
271                  my $fid = $fquery->FID();                  my $fid = $fquery->FID();
272                  Trace("Processing feature $fid.") if T(4);                  # Insure we want to look at this feature.
273                    if ($fid =~ /\.peg\./ || ! $pegsOnly) {
274                        Trace("Checking feature $fid.") if T(4);
275                        $self->PrintLine("Checking feature $fid.<br />");
276                  # Get its list of BBHs. The list is actually a hash mapping each BBH to its                  # Get its list of BBHs. The list is actually a hash mapping each BBH to its
277                  # score. All we care about, however, are the BBHs themselves.                  # score. All we care about, however, are the BBHs themselves.
278                  my $bbhList = $bbhMatrix{$fid};                  my $bbhList = $bbhMatrix{$fid};
279                  # We next wish to loop through the BBH IDs, counting how many are in each of the                  # We next wish to loop through the BBH IDs, counting how many are in each of the
280                  # sets. If a genome occurs twice, we only want to count the first occurrence, so                  # sets. If a genome occurs twice, we only want to count the first occurrence, so
281                  # we have a hash of genomes we've already seen.                      # we have a hash of genomes we've already seen. The hash will map each gene ID
282                        # to 0, 1, or 2, depending on whether it was found in the reference genome,
283                        # a target genome, or an exclusion genome.
284                  my %alreadySeen = ();                  my %alreadySeen = ();
285                        # Save the matching genes in here.
286                        my %genesMatching = ();
287                  # Clear the exclusion count.                  # Clear the exclusion count.
288                  my $exclusionCount = 0;                  my $exclusionCount = 0;
289                  # Denote that we're in our own genome.                  # Denote that we're in our own genome.
290                  $alreadySeen{$givenGenomeID} = 1;                      $alreadySeen{$givenGenomeID} = 0;
291                  my $targetCount = 1;                  my $targetCount = 1;
292                  # Loop through the BBHs.                      # Loop through the BBHs/Sims.
293                  for my $bbhPeg (keys %{$bbhList}) {                  for my $bbhPeg (keys %{$bbhList}) {
294                      # Get the genome ID. We want to find out if this genome is new.                      # Get the genome ID. We want to find out if this genome is new.
295                      my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);                      my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);
296                      if (! $alreadySeen{$genomeID}) {                          if (! exists $alreadySeen{$genomeID}) {
297                          # It's new, so we check to see which set it's in.                          # It's new, so we check to see which set it's in.
298                          if ($targetGenomes{$genomeID}) {                          if ($targetGenomes{$genomeID}) {
299                                    # It's in the target set.
300                              $targetCount++;                              $targetCount++;
301                                    $alreadySeen{$genomeID} = 1;
302                          } elsif ($exclusionGenomes{$genomeID}) {                          } elsif ($exclusionGenomes{$genomeID}) {
303                                    # It's in the exclusion set.
304                              $exclusionCount++;                              $exclusionCount++;
305                                    $alreadySeen{$genomeID} = 2;
306                                }
307                                # Note that $alreadySeen{$genomeID} exists in the hash by this
308                                # point. If it's 1, we need to save the current PEG.
309                                if ($alreadySeen{$genomeID} == 1) {
310                                    $genesMatching{$bbhPeg} = 1;
311                          }                          }
                         # Make sure we don't look at it again.  
                         $alreadySeen{$genomeID} = 1;  
312                      }                      }
313                  }                  }
314                  # Create a variable to indicate whether or not we want to keep this feature.                      # Create a variable to indicate whether or not we want to keep this feature and
315                  my $okFlag;                      # another for the score.
316                        my ($okFlag, $score);
317                  # We need to see if we're using statistics or not. This only matters                  # We need to see if we're using statistics or not. This only matters
318                  # for a two-set situation.                  # for a two-set situation.
319                  if ($statistical && $exclusionSetSize > 0) {                  if ($statistical && $exclusionSetSize > 0) {
320                      # This looks like it has something to do with variance computations,                          # This is the magic formula for choosing the differentiating genes. It looks like
321                      # but I'm not sure.                          # it has something to do with variance computations, but I'm not sure.
322                      my $targetNotCount = $targetSetSize - $targetCount;                      my $targetNotCount = $targetSetSize - $targetCount;
323                      my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;                      my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
324                      my $exclusionNotCount = $exclusionSetSize - $exclusionCount;                      my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
# Line 243  Line 327 
327                      my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));                      my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
328                      my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));                      my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
329                      # If the two differentials are greater than one, we keep this feature.                      # If the two differentials are greater than one, we keep this feature.
330                      $okFlag = ($inD + $outD > 1);                          $score = $inD + $outD;
331                            $okFlag = ($score > 1);
332                            # Subtract 1 from the score so it looks like the commonality score.
333                            $score -= 1.0;
334                  } else {                  } else {
335                      # Check to see if we're common in set 1 and not in set 2.                      # Check to see if we're common in set 1 and not in set 2.
336                      if (IsCommon($targetCount, $targetSetSize, $commonality) &&                          my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
337                          ! IsCommon($exclusionCount, $exclusionSetSize, $commonality)) {                          my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
338                          # We satisfy the criterion, so we put this feature to the output.                          if ($score1 && ! $score2) {
339                                # We satisfy the criterion, so we put this feature to the output. The
340                                # score is essentially $score1, since $score2 is zero.
341                                $score = $score1;
342                          $okFlag = 1;                          $okFlag = 1;
343                      }                      }
344                  }                  }
345                  if ($okFlag) {                  if ($okFlag) {
346                      # Put this feature to the output.                          # Put this feature to the output. We have one or two extra columns.
347                            # First we store the score.
348                            $fquery->AddExtraColumns(score => sprintf("%.3f",$score));
349                            # Next we add the list of matching genes, but only if "showMatch" is specified.
350                            if ($showMatch) {
351                                # The matching genes are in the hash "genesMatching".
352                                my @genes = sort { FIGRules::FIGCompare($a,$b) } keys %genesMatching;
353                                # We need to linkify them.
354                                my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
355                                # Now add them as an extra column.
356                                $fquery->AddExtraColumns(matches => $genesHTML);
357                            }
358                      $saveTime = time();                      $saveTime = time();
359                      $self->PutFeature($fquery);                      $self->PutFeature($fquery);
360                      $putTimer += time() - $saveTime;                      $putTimer += time() - $saveTime;
# Line 267  Line 368 
368                  }                  }
369              }              }
370          }          }
371            }
372          # Close the session file.          # Close the session file.
373          $saveTime = time();          $saveTime = time();
374          $self->CloseSession();          $self->CloseSession();
# Line 299  Line 401 
401    
402  =head3 IsCommon  =head3 IsCommon
403    
404  C<< my $flag = SHSigGenes::IsCommon($count, $size, $commonality); >>  C<< my $score = SHSigGenes::IsCommon($count, $size, $commonality); >>
405    
406  Return TRUE if a specified count indicates a gene is common in a specified set.  Return the match score if a specified count indicates a gene is common in a specified set
407  Commonality is computed by dividing the count by the size of the set and  and 0 otherwise. Commonality is computed by dividing the count by the size of the set and
408  comparing the result to the minimum commonality ratio. The one exception is  comparing the result to the minimum commonality ratio. The one exception is
409  if the set size is 0. In that case, this method always returns FALSE.  if the set size is 0. In that case, this method always returns 0.
410    
411  =over 4  =over 4
412    
# Line 335  Line 437 
437      my $retVal = 0;      my $retVal = 0;
438      # Only procced if the size is positive.      # Only procced if the size is positive.
439      if ($size > 0) {      if ($size > 0) {
440          $retVal = ($count/$size >= $commonality);          # Compute the commonality.
441            $retVal = $count/$size;
442            # If it's too small, clear it.
443            if ($retVal < $commonality) {
444                $retVal = 0;
445            }
446      }      }
447      # Return the result.      # Return the result.
448      return $retVal;      return $retVal;

Legend:
Removed from v.1.10  
changed lines
  Added in v.1.12

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3