[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Diff of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.3, Fri Sep 29 15:10:05 2006 UTC revision 1.16, Mon Jul 16 20:04:51 2007 UTC
# Line 4  Line 4 
4    
5      use strict;      use strict;
6      use Tracer;      use Tracer;
     use SearchHelper;  
7      use CGI;      use CGI;
8      use HTML;      use HTML;
9      use Sprout;      use Sprout;
10        use Time::HiRes;
11      our @ISA = qw(SearchHelper);      use FIGRules;
12        use RHFeatures;
13        use base 'SearchHelper';
14    
15  =head1 Gene Discrimination Feature Search Helper  =head1 Gene Discrimination Feature Search Helper
16    
# Line 55  Line 56 
56    
57  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
58    
59    =item showMatch
60    
61    If TRUE, then all the genes in the target set that match the ones in the reference genome
62    will be shown in an extra column.
63    
64  =back  =back
65    
66  =head2 Virtual Methods  =head2 Virtual Methods
67    
68  =head3 Form  =head3 Form
69    
70  C<< my $html = $shelp->Include(); >>  C<< my $html = $shelp->Form(); >>
71    
72  Generate the HTML for a form to request a new search.  Generate the HTML for a form to request a new search.
73    
# Line 80  Line 86 
86      # there is the selector for the given genome, the commonality and cutoff values,      # there is the selector for the given genome, the commonality and cutoff values,
87      # and the submit button. Our first task, then, is to get the genome selection      # and the submit button. Our first task, then, is to get the genome selection
88      # menus.      # menus.
89      my $givenMenu   = $self->NmpdrGenomeMenu('given', 0, [$cgi->param('genome')]);      my $givenMenu   = $self->NmpdrGenomeMenu('given', 0, [$cgi->param('given')]);
90      my $targetMenu  = $self->NmpdrGenomeMenu('target', 'multiple', [$cgi->param('target')]);      my $targetMenu  = $self->NmpdrGenomeMenu('target', 'multiple', [$cgi->param('target')], 10, 'exclusion');
91      my $excludeMenu = $self->NmpdrGenomeMenu('exclusion', 'multiple', [$cgi->param('exclusion')]);      my $excludeMenu = $self->NmpdrGenomeMenu('exclusion', 'multiple', [$cgi->param('exclusion')], 10, 'target');
92      # Get the default values to use for the commonality and cutoff controls.      # Get the default values to use for the commonality and cutoff controls.
93      my $commonality = $cgi->param('commonality') || "0.8";      my $commonality = $cgi->param('commonality') || "0.8";
94      my $cutoff = $cgi->param('cutoff') || "1e-10";      my $cutoff = $cgi->param('cutoff') || "1e-10";
95      # Now we build the table rows. The top contains the two numeric parameters and      my $statistical = $cgi->param('statistical') || 1;
96      # the submit button.      my $showMatch = $cgi->param('showMatch') || 0;
97        my $useSims = $cgi->param('useSims') || 0;
98        my $pegsOnly = $cgi->param('pegsOnly') || 1;
99        # Now we build the table rows.
100      my @rows = ();      my @rows = ();
101        # First we have the given genome.
102        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Reference Genome"),
103                             $cgi->td({colspan => 2}, $givenMenu));
104        # Now show the target and exclusion menus.
105        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Inclusion Genomes (Set 1)"),
106                             $cgi->td({colspan => 2}, $targetMenu));
107        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),
108                             $cgi->td({colspan => 2}, $excludeMenu));
109        # Next, the tuning parameters.
110      push @rows, $cgi->Tr($cgi->td("Commonality"),      push @rows, $cgi->Tr($cgi->td("Commonality"),
111                           $cgi->td($cgi->textfield(-name => 'commonality',                           $cgi->td($cgi->textfield(-name => 'commonality',
112                                                    -value => $commonality,                                                    -value => $commonality,
113                                                    -size => 5)));                                                    -size => 5))),
114      push @rows, $cgi->Tr($cgi->td("Cutoff"),                  $cgi->Tr($cgi->td(), $cgi->td(join(" ",
115                                      $cgi->checkbox(-name => 'statistical',
116                                                     -checked => $statistical,
117                                                     -value => 1,
118                                                     -label => 'Use Statistical Algorithm') .
119                                      SearchHelper::Hint("When two sets of genomees are specified, check this " .
120                                                         "box to use a statistical algorithm designed " .
121                                                         "specifically to choose differentiating genes. " .
122                                                         "This box has no effect when looking for genes " .
123                                                         "in common."),
124                                      $cgi->checkbox(-name => 'useSims',
125                                                     -checked => $useSims,
126                                                     -value => 1,
127                                                     -label => 'Use Similarities') .
128                                      SearchHelper::Hint("Normally, Bidirectional Best Hits are used to " .
129                                                         "find matching genes. Check this box to use " .
130                                                         "similarities instead.")))),
131                    $cgi->Tr($cgi->td(), $cgi->td(join(" ",
132                                      $cgi->checkbox(-name => 'showMatch',
133                                                     -checked => $showMatch,
134                                                     -value => 1,
135                                                     -label => 'Show Matching Genes') .
136                                      SearchHelper::Hint("Check this button to display the genes matching " .
137                                                         "each gene displayed in the results.")))),
138                    $cgi->Tr($cgi->td("Cutoff"),
139                           $cgi->td($cgi->textfield(-name => 'cutoff',                           $cgi->td($cgi->textfield(-name => 'cutoff',
140                                                    -value => $cutoff,                                                    -value => $cutoff,
141                                                    -size => 5)));                                                    -size => 5)));
142        # Next, the feature filter rows.
143        push @rows, RHFeatures::WordSearchRow($self);
144        push @rows, RHFeatures::FeatureFilterFormRows($self);
145        # Finally, the submit button.
146      push @rows, $self->SubmitRow();      push @rows, $self->SubmitRow();
     # The next rows have the given genome and a feature filter.  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Given Genome"),  
                          $cgi->td({colspan => 2}, $givenMenu));  
     push @rows, $self->FeatureFilterRows();  
     # Now show the target and exclusion menus.  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Target Genomes (Set 1)"),  
                          $cgi->td({colspan => 2}, $targetMenu));  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),  
                          $cgi->td({colspan => 2}, $excludeMenu));  
147      # Create the table.      # Create the table.
148      $retVal .= $self->MakeTable(\@rows);      $retVal .= $self->MakeTable(\@rows);
149      # Close the form.      # Close the form.
# Line 135  Line 172 
172      # Declare the return variable. If it remains undefined, the caller will      # Declare the return variable. If it remains undefined, the caller will
173      # assume there was an error.      # assume there was an error.
174      my $retVal;      my $retVal;
175        # Create the timers.
176        my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);
177      # Validate the numeric parameters.      # Validate the numeric parameters.
178      my $commonality = $cgi->param('commonality');      my $commonality = $cgi->param('commonality');
179      my $cutoff = $cgi->param('cutoff');      my $cutoff = $cgi->param('cutoff');
# Line 147  Line 186 
186      } elsif ($cutoff > 1) {      } elsif ($cutoff > 1) {
187          $self->SetMessage("Cutoff cannot be greater than 1.");          $self->SetMessage("Cutoff cannot be greater than 1.");
188      } else {      } else {
189            # Get the result helper.
190            my $rhelp = RHFeatures->new($self);
191            # Set up the default columns.
192            $self->DefaultColumns($rhelp);
193            # Add the score at the end.
194            $rhelp->AddExtraColumn(score => undef, title => 'Score', style => 'rightAlign', download => 'num');
195            # Find out if we need to show matching genes.
196            my $showMatch = $cgi->param('showMatch') || 0;
197            # If we do, add a column for them at the front.
198            if ($showMatch) {
199                $rhelp->AddExtraColumn(matches => 0, title => 'Matches', style => 'leftAlign', download => 'list');
200            }
201            # Only proceed if the filtering parameters are valid.
202            if ($rhelp->Valid()) {
203                # Start the output session.
204                $self->OpenSession($rhelp);
205          # Now we need to gather and validate the genome sets.          # Now we need to gather and validate the genome sets.
206                $self->PrintLine("Gathering the target genomes.  ");
207          my ($givenGenomeID) = $self->GetGenomes('given');          my ($givenGenomeID) = $self->GetGenomes('given');
208          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');          my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');
209                $self->PrintLine("Gathering the exclusion genomes.  ");
210          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');          my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');
211                $self->PrintLine("Validating the genome sets.<br />");
212          # Insure the given genome is not in the exclusion set.          # Insure the given genome is not in the exclusion set.
213          if ($exclusionGenomes{$givenGenomeID}) {          if ($exclusionGenomes{$givenGenomeID}) {
214              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");              $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");
215          } else {          } else {
216              # Insure the given genome is in the target set.              # Insure the given genome is in the target set.
217              $targetGenomes{$givenGenomeID} = 1                  $targetGenomes{$givenGenomeID} = 1;
218          }          }
219                # Find out if we want to use a statistical analysis.
220                my $statistical = $cgi->param('statistical') || 1;
221          # Denote we have not yet found any genomes.          # Denote we have not yet found any genomes.
222          $retVal = 0;          $retVal = 0;
223          #TODO: find stuff              # Compute the list of genomes of interest.
224                my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);
225                # Get the peg matrix.
226                Trace("Requesting matrix.") if T(3);
227                $saveTime = time();
228                my %bbhMatrix;
229                if (! $cgi->param('useSims')) {
230                    # Here we are using BBHs, which are fast enough to do in one gulp.
231                    $self->PrintLine("Requesting bidirectional best hits.  ");
232                    %bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);
233                } else {
234                    # Here we are using similarities, which are much more complicated.
235                    $self->PrintLine("Requesting similarities.<br />");
236                    # Create a filtering matrix for the results. We only want to keep PEGs in the
237                    # specified target and exclusion genomes.
238                    my %keepGenomes = map { $_ => 1 } @allGenomes;
239                    # Loop through the given genome's features.
240                    my @features = $sprout->FeaturesOf($givenGenomeID);
241                    for my $fid (@features) {
242                        $self->PrintLine("Retrieving similarities for $fid.  ");
243                        # Get this feature's similarities.
244                        my $simList = $sprout->Sims($fid, 1000, $cutoff, 'fig');
245                        my $simCount = scalar @{$simList};
246                        $self->PrintLine("Raw similarity count: $simCount.  ");
247                        # Create the matrix hash for this feature.
248                        $bbhMatrix{$fid} = {};
249                        # Now we need to filter out the similarities that don't land on the target genome.
250                        $simCount = 0;
251                        for my $sim (@{$simList}) {
252                            # Insure this similarity lands on a target genome.
253                            my $genomeID2 = $sprout->GenomeOf($sim->id2);
254                            if ($keepGenomes{$genomeID2}) {
255                                # Here we're keeping the similarity, so we put it in this feature's hash.
256                                $bbhMatrix{$fid}->{$sim->id2} = $sim->psc;
257                                $simCount++;
258                            }
259                        }
260                        $self->PrintLine("Similarities retained: $simCount.<br />");
261      }      }
262                }
263                $bbhTimer += time() - $saveTime;
264                $self->PrintLine("Time to build matrix: $bbhTimer seconds.<br />");
265                Trace("Matrix built.") if T(3);
266                # Create a feature query object to loop through the chosen features of the given
267                # genome.
268                Trace("Creating feature query.") if T(3);
269                $saveTime = time();
270                my $fquery = $rhelp->GetQuery($givenGenomeID);
271                $queryTimer += time() - $saveTime;
272                # Get the sizes of the two sets. This information is useful in computing commonality.
273                my $targetSetSize = scalar keys %targetGenomes;
274                my $exclusionSetSize = scalar keys %exclusionGenomes;
275                # Loop through the features.
276                my $done = 0;
277                while (! $done) {
278                    # Get the next feature.
279                    $saveTime = time();
280                    my $record = $rhelp->Fetch($fquery);
281                    $queryTimer += time() - $saveTime;
282                    if (! $record) {
283                        $done = 1;
284                    } else {
285                        # Get the feature's ID.
286                        my $fid = $record->PrimaryValue('Feature(id)');
287                        Trace("Checking feature $fid.") if T(4);
288                        $self->PrintLine("Checking feature $fid.<br />");
289                        # Get its list of matching genes. The list is actually a hash mapping each matched gene to its
290                        # score. All we care about, however, are the matches themselves.
291                        my $bbhList = $bbhMatrix{$fid};
292                        # We next wish to loop through the BBH IDs, counting how many are in each of the
293                        # sets. If a genome occurs twice, we only want to count the first occurrence, so
294                        # we have a hash of genomes we've already seen. The hash will map each gene ID
295                        # to 0, 1, or 2, depending on whether it was found in the reference genome,
296                        # a target genome, or an exclusion genome.
297                        my %alreadySeen = ();
298                        # Save the matching genes in here.
299                        my %genesMatching = ();
300                        # Clear the exclusion count.
301                        my $exclusionCount = 0;
302                        # Denote that we're in our own genome.
303                        $alreadySeen{$givenGenomeID} = 0;
304                        my $targetCount = 1;
305                        # Loop through the BBHs/Sims.
306                        for my $bbhPeg (keys %{$bbhList}) {
307                            # Get the genome ID. We want to find out if this genome is new.
308                            my $genomeID = $sprout->GenomeOf($bbhPeg);
309                            if (! exists $alreadySeen{$genomeID}) {
310                                # It's new, so we check to see which set it's in.
311                                if ($targetGenomes{$genomeID}) {
312                                    # It's in the target set.
313                                    $targetCount++;
314                                    $alreadySeen{$genomeID} = 1;
315                                } elsif ($exclusionGenomes{$genomeID}) {
316                                    # It's in the exclusion set.
317                                    $exclusionCount++;
318                                    $alreadySeen{$genomeID} = 2;
319                                }
320                                # Note that $alreadySeen{$genomeID} exists in the hash by this
321                                # point. If it's 1, we need to save the current PEG.
322                                if ($alreadySeen{$genomeID} == 1) {
323                                    $genesMatching{$bbhPeg} = 1;
324                                }
325                            }
326                        }
327                        # Create a variable to indicate whether or not we want to keep this feature and
328                        # another for the score.
329                        my ($okFlag, $score);
330                        # We need to see if we're using statistics or not. This only matters
331                        # for a two-set situation.
332                        if ($statistical && $exclusionSetSize > 0) {
333                            # This is the magic formula for choosing the differentiating genes. It looks like
334                            # it has something to do with variance computations, but I'm not sure.
335                            my $targetNotCount = $targetSetSize - $targetCount;
336                            my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
337                            my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
338                            my $exclusionSquare = $exclusionCount * $exclusionCount + $exclusionNotCount * $exclusionNotCount;
339                            my $mixed = $targetCount * $exclusionCount + $targetNotCount * $exclusionNotCount;
340                            my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
341                            my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
342                            # If the two differentials are greater than one, we keep this feature.
343                            $score = $inD + $outD;
344                            $okFlag = ($score > 1);
345                            # Subtract 1 from the score so it looks like the commonality score.
346                            $score -= 1.0;
347                        } else {
348                            # Check to see if we're common in set 1 and not in set 2.
349                            my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
350                            my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
351                            if ($score1 && ! $score2) {
352                                # We satisfy the criterion, so we put this feature to the output. The
353                                # score is essentially $score1, since $score2 is zero.
354                                $score = $score1;
355                                $okFlag = 1;
356                            }
357                        }
358                        if ($okFlag) {
359                            # Put this feature to the output. We have one or two extra columns.
360                            # First we store the score.
361                            $rhelp->PutExtraColumns(score => sprintf("%0.3f",$score));
362                            # Next we add the list of matching genes, but only if "showMatch" is specified.
363                            if ($showMatch) {
364                                # The matching genes are in the hash "genesMatching".
365                                my @genes = sort { FIGRules::FIGCompare($a,$b) } keys %genesMatching;
366                                # We need to linkify them.
367                                my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
368                                # Now add them as an extra column.
369                                $rhelp->PutExtraColumns(matches => $genesHTML);
370                            }
371                            # Compute a sort key from the feature data and the score.
372                            my $sort = $rhelp->SortKey($record, sprintf("%0.3f", 1 - $score));
373                            # Output the feature.
374                            $saveTime = time();
375                            $rhelp->PutData($sort, $fid, $record);
376                            $putTimer += time() - $saveTime;
377                            # Increase the result count.
378                            $retVal++;
379                        }
380                        # Check for a timer trace. We trace every 500 features.
381                        $loopCounter++;
382                        if (T(3) && $loopCounter % 500 == 0) {
383                            Trace("Time spent for $loopCounter features: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.");
384                        }
385                    }
386                }
387                # Close the session file.
388                $saveTime = time();
389                $self->CloseSession();
390                $putTimer += time() - $saveTime;
391            }
392        }
393        # Trace the timers.
394        Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3);
395      # Return the result count.      # Return the result count.
396      return $retVal;      return $retVal;
397  }  }
# Line 180  Line 410 
410      # Get the parameters.      # Get the parameters.
411      my ($self) = @_;      my ($self) = @_;
412      # Return the result.      # Return the result.
413      return "Search for features that are common to a group of organisms or that discriminate between two groups of organisms.";      return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";
414    }
415    
416    =head3 SearchTitle
417    
418    C<< my $titleHtml = $shelp->SearchTitle(); >>
419    
420    Return the display title for this search. The display title appears above the search results.
421    If no result is returned, no title will be displayed. The result should be an html string
422    that can be legally put inside a block tag such as C<h3> or C<p>.
423    
424    =cut
425    
426    sub SearchTitle {
427        # Get the parameters.
428        my ($self) = @_;
429        # Compute the title. We extract the relevant clues from the query parameters.
430        my $cgi = $self->Q();
431        my $type = ($cgi->param('useSims') ? "Similarities" : "Bidirectional Best Hits");
432        my $style = ($cgi->param('exclusion') ? "Discriminating" : "Common");
433        my $retVal = "$style Genes using $type";
434        # Return it.
435        return $retVal;
436    }
437    
438    =head2 Internal Utilities
439    
440    =head3 IsCommon
441    
442    C<< my $score = SHSigGenes::IsCommon($count, $size, $commonality); >>
443    
444    Return the match score if a specified count indicates a gene is common in a specified set
445    and 0 otherwise. Commonality is computed by dividing the count by the size of the set and
446    comparing the result to the minimum commonality ratio. The one exception is
447    if the set size is 0. In that case, this method always returns 0.
448    
449    =over 4
450    
451    =item count
452    
453    Number of elements of the set that have the relevant characteristic.
454    
455    =item size
456    
457    Total number of elements in the set.
458    
459    =item commonality
460    
461    Minimum count/size ratio for the characteristic to be considered common.
462    
463    =item RETURN
464    
465    Returns TRUE if the characteristic is common, else FALSE.
466    
467    =back
468    
469    =cut
470    
471    sub IsCommon {
472        # Get the parameters.
473        my ($count, $size, $commonality) = @_;
474        # Declare the return variable.
475        my $retVal = 0;
476        # Only procced if the size is positive.
477        if ($size > 0) {
478            # Compute the commonality.
479            $retVal = $count/$size;
480            # If it's too small, clear it.
481            if ($retVal < $commonality) {
482                $retVal = 0;
483            }
484        }
485        # Return the result.
486        return $retVal;
487  }  }
488    
489  1;  1;

Legend:
Removed from v.1.3  
changed lines
  Added in v.1.16

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3