[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Diff of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.1, Tue Sep 26 13:46:03 2006 UTC revision 1.12, Tue Apr 10 06:10:02 2007 UTC
# Line 8  Line 8 
8      use CGI;      use CGI;
9      use HTML;      use HTML;
10      use Sprout;      use Sprout;
11        use Time::HiRes;
12        use FIGRules;
13    
14      our @ISA = qw(SearchHelper);      our @ISA = qw(SearchHelper);
15    
# Line 55  Line 57 
57    
58  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.  Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
59    
60    =item showMatch
61    
62    If TRUE, then all the genes in the target set that match the ones in the reference genome
63    will be shown in an extra column.
64    
65  =back  =back
66    
67  =head2 Virtual Methods  =head2 Virtual Methods
68    
69  =head3 Form  =head3 Form
70    
71  C<< my $html = $shelp->Include(); >>  C<< my $html = $shelp->Form(); >>
72    
73  Generate the HTML for a form to request a new search.  Generate the HTML for a form to request a new search.
74    
# Line 80  Line 87 
87      # there is the selector for the given genome, the commonality and cutoff values,      # there is the selector for the given genome, the commonality and cutoff values,
88      # and the submit button. Our first task, then, is to get the genome selection      # and the submit button. Our first task, then, is to get the genome selection
89      # menus.      # menus.
90      my $givenMenu   = $self->NmpdrGenomeMenu('given', { size => 1 }, [$cgi->param('genome')]);      my $givenMenu   = $self->NmpdrGenomeMenu('given', 0, [$cgi->param('given')]);
91      my $targetMenu  = $self->NmpdrGenomeMenu('target', { size => 10, multiple => 'multiple' },      my $targetMenu  = $self->NmpdrGenomeMenu('target', 'multiple', [$cgi->param('target')], 10, 'exclusion');
92                                               [$cgi->param('target')]);      my $excludeMenu = $self->NmpdrGenomeMenu('exclusion', 'multiple', [$cgi->param('exclusion')], 10, 'target');
     my $excludeMenu = $self->NmpdrGenomeMenu('exclusion', { size => 10, multiple => 'multiple' },  
                                              [$cgi->param('exclusion')]);  
93      # Get the default values to use for the commonality and cutoff controls.      # Get the default values to use for the commonality and cutoff controls.
94      my $commonality = $cgi->param('commonality') || "0.8";      my $commonality = $cgi->param('commonality') || "0.8";
95      my $cutoff = $cgi->param('cutoff') || "1e-10";      my $cutoff = $cgi->param('cutoff') || "1e-10";
96      # Now we build the table rows. The top contains the two numeric parameters and      my $statistical = $cgi->param('statistical') || 1;
97      # the submit button.      my $showMatch = $cgi->param('showMatch') || 0;
98        my $useSims = $cgi->param('useSims') || 0;
99        my $pegsOnly = $cgi->param('pegsOnly') || 0;
100        # Now we build the table rows.
101      my @rows = ();      my @rows = ();
102        # First we have the given genome.
103        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Reference Genome"),
104                             $cgi->td({colspan => 2}, $givenMenu));
105        # Now show the target and exclusion menus.
106        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Inclusion Genomes (Set 1)"),
107                             $cgi->td({colspan => 2}, $targetMenu));
108        push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),
109                             $cgi->td({colspan => 2}, $excludeMenu));
110        # Next, the tuning parameters.
111      push @rows, $cgi->Tr($cgi->td("Commonality"),      push @rows, $cgi->Tr($cgi->td("Commonality"),
112                           $cgi->td($cgi->textfield(-name => 'commonality',                           $cgi->td($cgi->textfield(-name => 'commonality',
113                                                    -value => $commonality,                                                    -value => $commonality,
114                                                    -size => 5)));                                                    -size => 5))),
115      push @rows, $cgi->Tr($cgi->td("Cutoff"),                  $cgi->Tr($cgi->td(), $cgi->td(join(" ",
116                                      $cgi->checkbox(-name => 'statistical',
117                                                     -checked => $statistical,
118                                                     -value => 1,
119                                                     -label => 'Use Statistical Algorithm'),
120                                      $cgi->checkbox(-name => 'useSims',
121                                                     -checked => $useSims,
122                                                     -value => 1,
123                                                     -label => 'Use Similarities')))),
124                    $cgi->Tr($cgi->td(), $cgi->td(join(" ",
125                                      $cgi->checkbox(-name => 'showMatch',
126                                                     -checked => $showMatch,
127                                                     -value => 1,
128                                                     -label => 'Show Matching Genes'),
129                                      $cgi->checkbox(-name => 'pegsOnly',
130                                                     -checked => $pegsOnly,
131                                                     -value => 1,
132                                                     -label => 'PEGs Only')))),
133                    $cgi->Tr($cgi->td("Cutoff"),
134                           $cgi->td($cgi->textfield(-name => 'cutoff',                           $cgi->td($cgi->textfield(-name => 'cutoff',
135                                                    -value => $cutoff,                                                    -value => $cutoff,
136                                                    -size => 5)));                                                    -size => 5)));
137        # Next, the feature filter rows.
138        push @rows, $self->FeatureFilterRows();
139        # Finally, the submit button.
140      push @rows, $self->SubmitRow();      push @rows, $self->SubmitRow();
     # The next rows have the three genome menus.  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Given Genome"),  
                          $cgi->td({colspan => 2}, $givenMenu));  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Target Genomes (Set 1)"),  
                          $cgi->td({colspan => 2}, $targetMenu));  
     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),  
                          $cgi->td({colspan => 2}, $excludeMenu));  
141      # Create the table.      # Create the table.
142      $retVal .= $self->MakeTable(\@rows);      $retVal .= $self->MakeTable(\@rows);
143      # Close the form.      # Close the form.
# Line 132  Line 163 
163      # Get the sprout and CGI query objects.      # Get the sprout and CGI query objects.
164      my $cgi = $self->Q();      my $cgi = $self->Q();
165      my $sprout = $self->DB();      my $sprout = $self->DB();
166      # Declare the return variable.      # Declare the return variable. If it remains undefined, the caller will
167        # assume there was an error.
168      my $retVal;      my $retVal;
169        # Denote the extra columns go at the end.
170      # TODO: find stuff      $self->SetExtraPos(1);
171        # Create the timers.
172        my ($saveTime, $loopCounter, $bbhTimer, $putTimer, $queryTimer) = (0, 0, 0, 0, 0);
173        # Validate the numeric parameters.
174        my $commonality = $cgi->param('commonality');
175        my $cutoff = $cgi->param('cutoff');
176        my $pegsOnly = $cgi->param('pegsOnly');
177        if ($commonality !~ /^\s*\d(\.\d+)?\s*$/) {
178            $self->SetMessage("Commonality value appears invalid, too big, negative, or not a number.");
179        } elsif ($commonality <= 0 || $commonality > 1) {
180            $self->SetMessage("Commonality cannot be 0 and cannot be greater than 1.");
181        } elsif ($cutoff !~ /^\s*\d(.\d+)?(e\-\d+)?\s*$/) {
182            $self->SetMessage("Cutoff must be an exponential number (e.g. \"1e-20\" or \"2.5e-11\".");
183        } elsif ($cutoff > 1) {
184            $self->SetMessage("Cutoff cannot be greater than 1.");
185        } else {
186            # Now we need to gather and validate the genome sets.
187            $self->PrintLine("Gathering the target genomes.  ");
188            my ($givenGenomeID) = $self->GetGenomes('given');
189            my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');
190            $self->PrintLine("Gathering the exclusion genomes.  ");
191            my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');
192            $self->PrintLine("Validating the genome sets.<br />");
193            # Insure the given genome is not in the exclusion set.
194            if ($exclusionGenomes{$givenGenomeID}) {
195                $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");
196            } else {
197                # Insure the given genome is in the target set.
198                $targetGenomes{$givenGenomeID} = 1
199            }
200            # Find out if we want to use a statistical analysis.
201            my $statistical = $cgi->param('statistical') || 1;
202            # Find out if we need to show matching genes.
203            my $showMatch = $cgi->param('showMatch') || 0;
204            # Denote we have not yet found any genomes.
205            $retVal = 0;
206            # Compute the list of genomes of interest.
207            my @allGenomes = (keys %exclusionGenomes, keys %targetGenomes);
208            # Set the parameter that indicates whether or not we're in PEGs-only mode.
209            my $pegMode = ($pegsOnly ? 'peg' : undef);
210            # Get the peg matrix.
211            Trace("Requesting matrix.") if T(3);
212            $saveTime = time();
213            my %bbhMatrix;
214            if (! $cgi->param('useSims')) {
215                # Here we are using BBHs, which are fast enough to do in one gulp.
216                $self->PrintLine("Requesting bidirectional best hits.  ");
217                %bbhMatrix = $sprout->BBHMatrix($givenGenomeID, $cutoff, @allGenomes);
218            } else {
219                # Here we are using similarities, which is much more complicated.
220                $self->PrintLine("Requesting similarities.<br />");
221                # Create a filtering matrix for the results. We only want to keep PEGs in the
222                # specified target and exclusion genomes.
223                my %keepGenomes = map { $_ => 1 } @allGenomes;
224                # Loop through the given genome's features.
225                my @features = $sprout->FeaturesOf($givenGenomeID, $pegMode);
226                for my $fid (@features) {
227                    $self->PrintLine("Retrieving similarities for $fid.  ");
228                    # Get this feature's similarities.
229                    my $simList = $sprout->Sims($fid, 1000, $cutoff, 'fig');
230                    my $simCount = scalar @{$simList};
231                    $self->PrintLine("Raw similarity count: $simCount.  ");
232                    # Create the matrix hash for this feature.
233                    $bbhMatrix{$fid} = {};
234                    # Now we need to filter out the similarities that don't land on the target genome.
235                    $simCount = 0;
236                    for my $sim (@{$simList}) {
237                        # Insure this similarity lands on a target genome.
238                        my ($genomeID2) = FIGRules::ParseFeatureID($sim->id2);
239                        if ($keepGenomes{$genomeID2}) {
240                            # Here we're keeping the similarity, so we put it in this feature's hash.
241                            $bbhMatrix{$fid}->{$sim->id2} = $sim->psc;
242                            $simCount++;
243                        }
244                    }
245                    $self->PrintLine("Similarities retained: $simCount.<br />");
246                }
247            }
248            $bbhTimer += time() - $saveTime;
249            $self->PrintLine("Time to build matrix: $bbhTimer seconds.<br />");
250            Trace("Matrix built.") if T(3);
251            # Create a feature query object to loop through the chosen features of the given
252            # genome.
253            Trace("Creating feature query.") if T(3);
254            $saveTime = time();
255            my $fquery = FeatureQuery->new($self, $givenGenomeID);
256            $queryTimer += time() - $saveTime;
257            # Get the sizes of the two sets. This information is useful in computing commonality.
258            my $targetSetSize = scalar keys %targetGenomes;
259            my $exclusionSetSize = scalar keys %exclusionGenomes;
260            # Loop through the features.
261            my $done = 0;
262            while (! $done) {
263                # Get the next feature.
264                $saveTime = time();
265                my $record = $fquery->Fetch();
266                $queryTimer += time() - $saveTime;
267                if (! $record) {
268                    $done = 1;
269                } else {
270                    # Get the feature's ID.
271                    my $fid = $fquery->FID();
272                    # Insure we want to look at this feature.
273                    if ($fid =~ /\.peg\./ || ! $pegsOnly) {
274                        Trace("Checking feature $fid.") if T(4);
275                        $self->PrintLine("Checking feature $fid.<br />");
276                        # Get its list of BBHs. The list is actually a hash mapping each BBH to its
277                        # score. All we care about, however, are the BBHs themselves.
278                        my $bbhList = $bbhMatrix{$fid};
279                        # We next wish to loop through the BBH IDs, counting how many are in each of the
280                        # sets. If a genome occurs twice, we only want to count the first occurrence, so
281                        # we have a hash of genomes we've already seen. The hash will map each gene ID
282                        # to 0, 1, or 2, depending on whether it was found in the reference genome,
283                        # a target genome, or an exclusion genome.
284                        my %alreadySeen = ();
285                        # Save the matching genes in here.
286                        my %genesMatching = ();
287                        # Clear the exclusion count.
288                        my $exclusionCount = 0;
289                        # Denote that we're in our own genome.
290                        $alreadySeen{$givenGenomeID} = 0;
291                        my $targetCount = 1;
292                        # Loop through the BBHs/Sims.
293                        for my $bbhPeg (keys %{$bbhList}) {
294                            # Get the genome ID. We want to find out if this genome is new.
295                            my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);
296                            if (! exists $alreadySeen{$genomeID}) {
297                                # It's new, so we check to see which set it's in.
298                                if ($targetGenomes{$genomeID}) {
299                                    # It's in the target set.
300                                    $targetCount++;
301                                    $alreadySeen{$genomeID} = 1;
302                                } elsif ($exclusionGenomes{$genomeID}) {
303                                    # It's in the exclusion set.
304                                    $exclusionCount++;
305                                    $alreadySeen{$genomeID} = 2;
306                                }
307                                # Note that $alreadySeen{$genomeID} exists in the hash by this
308                                # point. If it's 1, we need to save the current PEG.
309                                if ($alreadySeen{$genomeID} == 1) {
310                                    $genesMatching{$bbhPeg} = 1;
311                                }
312                            }
313                        }
314                        # Create a variable to indicate whether or not we want to keep this feature and
315                        # another for the score.
316                        my ($okFlag, $score);
317                        # We need to see if we're using statistics or not. This only matters
318                        # for a two-set situation.
319                        if ($statistical && $exclusionSetSize > 0) {
320                            # This is the magic formula for choosing the differentiating genes. It looks like
321                            # it has something to do with variance computations, but I'm not sure.
322                            my $targetNotCount = $targetSetSize - $targetCount;
323                            my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
324                            my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
325                            my $exclusionSquare = $exclusionCount * $exclusionCount + $exclusionNotCount * $exclusionNotCount;
326                            my $mixed = $targetCount * $exclusionCount + $targetNotCount * $exclusionNotCount;
327                            my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
328                            my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
329                            # If the two differentials are greater than one, we keep this feature.
330                            $score = $inD + $outD;
331                            $okFlag = ($score > 1);
332                            # Subtract 1 from the score so it looks like the commonality score.
333                            $score -= 1.0;
334                        } else {
335                            # Check to see if we're common in set 1 and not in set 2.
336                            my $score1 = IsCommon($targetCount, $targetSetSize, $commonality);
337                            my $score2 = IsCommon($exclusionCount, $exclusionSetSize, $commonality);
338                            if ($score1 && ! $score2) {
339                                # We satisfy the criterion, so we put this feature to the output. The
340                                # score is essentially $score1, since $score2 is zero.
341                                $score = $score1;
342                                $okFlag = 1;
343                            }
344                        }
345                        if ($okFlag) {
346                            # Put this feature to the output. We have one or two extra columns.
347                            # First we store the score.
348                            $fquery->AddExtraColumns(score => sprintf("%.3f",$score));
349                            # Next we add the list of matching genes, but only if "showMatch" is specified.
350                            if ($showMatch) {
351                                # The matching genes are in the hash "genesMatching".
352                                my @genes = sort { FIGRules::FIGCompare($a,$b) } keys %genesMatching;
353                                # We need to linkify them.
354                                my $genesHTML = join(", ", map { HTML::fid_link($cgi, $_) } @genes);
355                                # Now add them as an extra column.
356                                $fquery->AddExtraColumns(matches => $genesHTML);
357                            }
358                            $saveTime = time();
359                            $self->PutFeature($fquery);
360                            $putTimer += time() - $saveTime;
361                            # Increase the result count.
362                            $retVal++;
363                        }
364                        # Check for a timer trace. We trace every 500 features.
365                        $loopCounter++;
366                        if (T(3) && $loopCounter % 500 == 0) {
367                            Trace("Time spent for $loopCounter features: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.");
368                        }
369                    }
370                }
371            }
372            # Close the session file.
373            $saveTime = time();
374            $self->CloseSession();
375            $putTimer += time() - $saveTime;
376        }
377        # Trace the timers.
378        Trace("Time spent: Put = $putTimer, Query = $queryTimer, BBH = $bbhTimer.") if T(3);
379      # Return the result count.      # Return the result count.
380      return $retVal;      return $retVal;
381  }  }
# Line 155  Line 394 
394      # Get the parameters.      # Get the parameters.
395      my ($self) = @_;      my ($self) = @_;
396      # Return the result.      # Return the result.
397      return "Search for features that common to a group of organisms or that discriminate between two groups of organisms.";      return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";
398    }
399    
400    =head2 Internal Utilities
401    
402    =head3 IsCommon
403    
404    C<< my $score = SHSigGenes::IsCommon($count, $size, $commonality); >>
405    
406    Return the match score if a specified count indicates a gene is common in a specified set
407    and 0 otherwise. Commonality is computed by dividing the count by the size of the set and
408    comparing the result to the minimum commonality ratio. The one exception is
409    if the set size is 0. In that case, this method always returns 0.
410    
411    =over 4
412    
413    =item count
414    
415    Number of elements of the set that have the relevant characteristic.
416    
417    =item size
418    
419    Total number of elements in the set.
420    
421    =item commonality
422    
423    Minimum count/size ratio for the characteristic to be considered common.
424    
425    =item RETURN
426    
427    Returns TRUE if the characteristic is common, else FALSE.
428    
429    =back
430    
431    =cut
432    
433    sub IsCommon {
434        # Get the parameters.
435        my ($count, $size, $commonality) = @_;
436        # Declare the return variable.
437        my $retVal = 0;
438        # Only procced if the size is positive.
439        if ($size > 0) {
440            # Compute the commonality.
441            $retVal = $count/$size;
442            # If it's too small, clear it.
443            if ($retVal < $commonality) {
444                $retVal = 0;
445            }
446        }
447        # Return the result.
448        return $retVal;
449  }  }
450    
451  1;  1;

Legend:
Removed from v.1.1  
changed lines
  Added in v.1.12

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3