[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Diff of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Tue Sep 26 13:46:03 2006 UTC revision 1.11, Wed Feb 21 13:20:37 2007 UTC
# Line 8  Line 8 
8      use CGI;      use CGI;
9      use HTML;      use HTML;
10      use Sprout;      use Sprout;
11        use Time::HiRes;
12        use FIGRules;
13    
14      our @ISA = qw(SearchHelper);      our @ISA = qw(SearchHelper);
15    
# Line 55  Line 57 
57    
58  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
59    
60    =item showMatch
61    
62    If TRUE, then all the genes in the target set that match the ones in the reference genome
63    will be shown in an extra column.
64    
65  =back  =back
66    
67  =head2 Virtual Methods  =head2 Virtual Methods
68    
69  =head3 Form  =head3 Form
70    
71  C<< my $html = $shelp->Include(); >>  C<< my $html = $shelp->Form(); >>
72    
73  Generate the HTML for a form to request a new search.  Generate the HTML for a form to request a new search.
74    
# Line 80  Line 87 
87      # there is the selector for the given genome, the commonality and cutoff values,      # there is the selector for the given genome, the commonality and cutoff values,
88      # and the submit button. Our first task, then, is to get the genome selection      # and the submit button. Our first task, then, is to get the genome selection
89      # menus.      # menus.
90      my $givenMenu   = $self->NmpdrGenomeMenu('given', { size => 1 }, [$cgi->param('genome')]);      my $givenMenu   = $self->NmpdrGenomeMenu('given', 0, [$cgi->param('given')]);
91      my $targetMenu  = $self->NmpdrGenomeMenu('target', { size => 10, multiple => 'multiple' },      my $targetMenu  = $self->NmpdrGenomeMenu('target', 'multiple', [$cgi->param('target')], 10, 'exclusion');
92                                               [$cgi->param('target')]);      my $excludeMenu = $self->NmpdrGenomeMenu('exclusion', 'multiple', [$cgi->param('exclusion')], 10, 'target');
     my $excludeMenu = $self->NmpdrGenomeMenu('exclusion', { size => 10, multiple => 'multiple' },  
                                              [$cgi->param('exclusion')]);  
93      # Get the default values to use for the commonality and cutoff controls.      # Get the default values to use for the commonality and cutoff controls.
94      my $commonality = $cgi->param('commonality') || "0.8";      my $commonality = $cgi->param('commonality') || "0.8";
95      my $cutoff = $cgi->param('cutoff') || "1e-10";      my $cutoff = $cgi->param('cutoff') || "1e-10";
96      # Now we build the table rows. The top contains the two numeric parameters and      my $statistical = $cgi->param('statistical') || 1;
97      # the submit button.      my $showMatch = $cgi->param('showMatch') || 0;
98        # Now we build the table rows.
99      my @rows = ();      my @rows = ();
100        # First we have the given genome.
101        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Reference Genome"),
102                             $cgi->td({colspan => 2}, $givenMenu));
103        # Now show the target and exclusion menus.
104        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Inclusion Genomes (Set 1)"),
105                             $cgi->td({colspan => 2}, $targetMenu));
106        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),
107                             $cgi->td({colspan => 2}, $excludeMenu));
108        # Next, the numeric parameters.
109      push @rows, $cgi->Tr($cgi->td("Commonality"),      push @rows, $cgi->Tr($cgi->td("Commonality"),
110                           $cgi->td($cgi->textfield(-name => 'commonality',                           $cgi->td($cgi->textfield(-name => 'commonality',
111                                                    -value => $commonality,                                                    -value => $commonality,
112                                                    -size => 5)));                                                    -size => 5))),
113                    $cgi->Tr($cgi->td(), $cgi->td(join(" ",
114                                      $cgi->checkbox(-name => 'statistical',
115                                                     -checked => $statistical,
116                                                     -value => 1,
117                                                     -label => 'Use Statistical Algorithm')),
118                                      $cgi->checkbox(-name => 'showMatch',
119                                                     -checked => $showMatch,
120                                                     -value => 1,
121                                                     -label => 'Show Matching Genes')));
122      push @rows, $cgi->Tr($cgi->td("Cutoff"),      push @rows, $cgi->Tr($cgi->td("Cutoff"),
123                           $cgi->td($cgi->textfield(-name => 'cutoff',                           $cgi->td($cgi->textfield(-name => 'cutoff',
124                                                    -value => $cutoff,                                                    -value => $cutoff,
125                                                    -size => 5)));                                                    -size => 5)));
126        # Next, the feature filter rows.
127        push @rows, $self->FeatureFilterRows();
128        # Finally, the submit button.
129      push @rows, $self->SubmitRow();      push @rows, $self->SubmitRow();
     # The next rows have the three genome menus.  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Given Genome"),  
                          $cgi->td({colspan => 2}, $givenMenu));  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Target Genomes (Set 1)"),  
                          $cgi->td({colspan => 2}, $targetMenu));  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),  
                          $cgi->td({colspan => 2}, $excludeMenu));  
130      # Create the table.      # Create the table.
131      $retVal .= $self->MakeTable(\@rows);      $retVal .= $self->MakeTable(\@rows);
132      # Close the form.      # Close the form.
# Line 132  Line 152 
152      # Get the sprout and CGI query objects.      # Get the sprout and CGI query objects.
153      my $cgi = $self->Q();      my $cgi = $self->Q();
154      my $sprout = $self->DB();      my $sprout = $self->DB();
155      # Declare the return variable.      # Declare the return variable. If it remains undefined, the caller will
156        # assume there was an error.
157      my $retVal;      my $retVal;
158        # Denote the extra columns go at the end.
159      # TODO: find stuff      $self->SetExtraPos(1);
160        # Create the timers.
161        my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);
162        # Validate the numeric parameters.
163        my $commonality = $cgi->param('commonality');
164        my $cutoff = $cgi->param('cutoff');
165        if ($commonality !~ /^\s*\d(\.\d+)?\s*$/) {
166            $self->SetMessage("Commonality value appears invalid, too big, negative, or not a number.");
167        } elsif ($commonality <= 0 || $commonality > 1) {
168            $self->SetMessage("Commonality cannot be 0 and cannot be greater than 1.");
169        } elsif ($cutoff !~ /^\s*\d(.\d+)?(e\-\d+)?\s*$/) {
170            $self->SetMessage("Cutoff must be an exponential number (e.g. \"1e-20\" or \"2.5e-11\".");
171        } elsif ($cutoff > 1) {
172            $self->SetMessage("Cutoff cannot be greater than 1.");
173        } else {
174            # Now we need to gather and validate the genome sets.
175            my ($givenGenomeID) = $self->GetGenomes('given');
176            my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');
177            my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');
178            # Insure the given genome is not in the exclusion set.
179            if ($exclusionGenomes{$givenGenomeID}) {
180                $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");
181            } else {
182                # Insure the given genome is in the target set.
183                $targetGenomes{$givenGenomeID} = 1
184            }
185            # Find out if we want to use a statistical analysis.
186            my $statistical = $cgi->param('statistical') || 1;
187            # Find out if we need to show matching genes.
188            my $showMatch = $cgi->param('showMatch') || 0;
189            # Denote we have not yet found any genomes.
190            $retVal = 0;
191            # Compute the list of genomes of interest.
192            my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);
193            # Get the BBH matrix.
194            $saveTime = time();
195            my %bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);
196            $bbhTimer += time() - $saveTime;
197            # Create a feature query object to loop through the chosen features of the given
198            # genome.
199            Trace("Creating feature query.") if T(3);
200            $saveTime = time();
201            my $fquery = FeatureQuery->new($self, $givenGenomeID);
202            $queryTimer += time() - $saveTime;
203            # Get the sizes of the two sets. This information is useful in computing commonality.
204            my $targetSetSize = scalar keys %targetGenomes;
205            my $exclusionSetSize = scalar keys %exclusionGenomes;
206            # Loop through the features.
207            my $done = 0;
208            while (! $done) {
209                # Get the next feature.
210                $saveTime = time();
211                my $record = $fquery->Fetch();
212                $queryTimer += time() - $saveTime;
213                if (! $record) {
214                    $done = 1;
215                } else {
216                    # Get the feature's ID.
217                    my $fid = $fquery->FID();
218                    Trace("Processing feature $fid.") if T(4);
219                    # Get its list of BBHs. The list is actually a hash mapping each BBH to its
220                    # score. All we care about, however, are the BBHs themselves.
221                    my $bbhList = $bbhMatrix{$fid};
222                    # We next wish to loop through the BBH IDs, counting how many are in each of the
223                    # sets. If a genome occurs twice, we only want to count the first occurrence, so
224                    # we have a hash of genomes we've already seen. The hash will map each gene ID
225                    # to 0, 1, or 2, depending on whether it was found in the reference genome,
226                    # a target genome, or an exclusion genome.
227                    my %alreadySeen = ();
228                    # Save the matching genes in here.
229                    my %genesMatching = ();
230                    # Clear the exclusion count.
231                    my $exclusionCount = 0;
232                    # Denote that we're in our own genome.
233                    $alreadySeen{$givenGenomeID} = 0;
234                    my $targetCount = 1;
235                    # Loop through the BBHs.
236                    for my $bbhPeg (keys %{$bbhList}) {
237                        # Get the genome ID. We want to find out if this genome is new.
238                        my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);
239                        if (! exists $alreadySeen{$genomeID}) {
240                            # It's new, so we check to see which set it's in.
241                            if ($targetGenomes{$genomeID}) {
242                                # It's in the target set.
243                                $targetCount++;
244                                $alreadySeen{$genomeID} = 1;
245                            } elsif ($exclusionGenomes{$genomeID}) {
246                                # It's in the exclusion set.
247                                $exclusionCount++;
248                                $alreadySeen{$genomeID} = 2;
249                            }
250                            # Note that $alreadySeen{$genomeID} exists in the hash by this
251                            # point. If it's 1, we need to save the current PEG.
252                            if ($alreadySeen{$genomeID} == 1) {
253                                $genesMatching{$bbhPeg} = 1;
254                            }
255                        }
256                    }
257                    # Create a variable to indicate whether or not we want to keep this feature and
258                    # another for the score.
259                    my ($okFlag, $score);
260                    # We need to see if we're using statistics or not. This only matters
261                    # for a two-set situation.
262                    if ($statistical && $exclusionSetSize > 0) {
263                        # This is the magic formula for choosing the differentiating genes. It looks like
264                        # it has something to do with variance computations, but I'm not sure.
265                        my $targetNotCount = $targetSetSize - $targetCount;
266                        my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
267                        my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
268                        my $exclusionSquare = $exclusionCount * $exclusionCount + $exclusionNotCount * $exclusionNotCount;
269                        my $mixed = $targetCount * $exclusionCount + $targetNotCount * $exclusionNotCount;
270                        my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
271                        my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
272                        # If the two differentials are greater than one, we keep this feature.
273                        $score = $inD + $outD;
274                        $okFlag = ($score > 1);
275                    } else {
276                        # Check to see if we're common in set 1 and not in set 2.
277                        my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
278                        my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
279                        if ($score1 && ! $score2) {
280                            # We satisfy the criterion, so we put this feature to the output. The
281                            # score is normalize to a range similar to the scores in the statistical
282                            # method.
283                            $score = $score1 + (1 - $score2);
284                            $okFlag = 1;
285                        }
286                    }
287                    if ($okFlag) {
288                        # Put this feature to the output. We have one or two extra columns.
289                        # First we store the score.
290                        $fquery->AddExtraColumns(score => sprintf("%.3f",$score));
291                        # Next we add the list of matching genes, but only if "showMatch" is specified.
292                        if ($showMatch) {
293                            # The matching genes are in the hash "genesMatching".
294                            my @genes = sort { FIGRules::FIGCompare($a,$b) } keys %genesMatching;
295                            # We need to linkify them.
296                            my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
297                            # Now add them as an extra column.
298                            $fquery->AddExtraColumns(matches => $genesHTML);
299                        }
300                        $saveTime = time();
301                        $self->PutFeature($fquery);
302                        $putTimer += time() - $saveTime;
303                        # Increase the result count.
304                        $retVal++;
305                    }
306                    # Check for a timer trace. We trace every 500 features.
307                    $loopCounter++;
308                    if (T(3) && $loopCounter % 500 == 0) {
309                        Trace("Time spent for $loopCounter features: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.");
310                    }
311                }
312            }
313            # Close the session file.
314            $saveTime = time();
315            $self->CloseSession();
316            $putTimer += time() - $saveTime;
317        }
318        # Trace the timers.
319        Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3);
320      # Return the result count.      # Return the result count.
321      return $retVal;      return $retVal;
322  }  }
# Line 155  Line 335 
335      # Get the parameters.      # Get the parameters.
336      my ($self) = @_;      my ($self) = @_;
337      # Return the result.      # Return the result.
338      return "Search for features that common to a group of organisms or that discriminate between two groups of organisms.";      return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";
339    }
340    
341    =head2 Internal Utilities
342    
343    =head3 IsCommon
344    
345    C<< my $score = SHSigGenes::IsCommon($count, $size, $commonality); >>
346    
347    Return the match score if a specified count indicates a gene is common in a specified set
348    and 0 otherwise. Commonality is computed by dividing the count by the size of the set and
349    comparing the result to the minimum commonality ratio. The one exception is
350    if the set size is 0. In that case, this method always returns 0.
351    
352    =over 4
353    
354    =item count
355    
356    Number of elements of the set that have the relevant characteristic.
357    
358    =item size
359    
360    Total number of elements in the set.
361    
362    =item commonality
363    
364    Minimum count/size ratio for the characteristic to be considered common.
365    
366    =item RETURN
367    
368    Returns TRUE if the characteristic is common, else FALSE.
369    
370    =back
371    
372    =cut
373    
374    sub IsCommon {
375        # Get the parameters.
376        my ($count, $size, $commonality) = @_;
377        # Declare the return variable.
378        my $retVal = 0;
379        # Only procced if the size is positive.
380        if ($size > 0) {
381            # Compute the commonality.
382            $retVal = $count/$size;
383            # If it's too small, clear it.
384            if ($retVal < $commonality) {
385                $retVal = 0;
386            }
387        }
388        # Return the result.
389        return $retVal;
390  }  }
391    
392  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.11

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3