[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

View of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (download) (as text) (annotate)
Sun Oct 8 01:50:06 2006 UTC (12 years, 7 months ago) by parrello
Branch: MAIN
Changes since 1.5: +1 -1 lines
Changed the default for using statistical analysis.

#!/usr/bin/perl -w

package SHSigGenes;

    use strict;
    use Tracer;
    use SearchHelper;
    use CGI;
    use HTML;
    use Sprout;

    our @ISA = qw(SearchHelper);

=head1 Gene Discrimination Feature Search Helper

=head2 Introduction

This search performs a signature genes comparison. The user selects two genome sets,
and the search returns genes from a given genome which are common only in the first set
and not in the second. If the second set is empty, the search will return genes from
the given genome that are common in the first set.

Gene identity will be computed in this case using bidirectional best hits. If gene X
from the given genome has a BBH in a specified genome Y, then it is said to occur
in whatever set includes genome Y. A gene is considered I<common> if it occurs in a
certain percentage of the genomes of the set.

This search has the following extra parameters.

=over 4

=item given

The ID of the given genome.

=item target[]

The IDs of the genomes in the first (target) set. The given genome is
automatically considered a part of this set, so it can never be empty.

=item exclusion[]

The IDs of the genomes in the second (exclusion) set. If this set is empty, then
no genes will be considered common in set 2, causing all genes common in set 1
to be selected.

=item commonality

Minimum score for a gene to be considered common. The score is equal to the number
of genomes containing a bidirectional best hit of the gene divided by the total
number of genomes. The default is C<0.8>. A value of C<1> means a gene must have
BBHs in all of the genomes to be considered common; a value of C<0> is invalid.

=item cutoff

Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.

=back

=head2 Virtual Methods

=head3 Form

C<< my $html = $shelp->Include(); >>

Generate the HTML for a form to request a new search.

=cut

sub Form {
    # Get the parameters.
    my ($self) = @_;
    # Get the CGI and sprout objects.
    my $cgi = $self->Q();
    my $sprout = $self->DB();
    # Start the form.
    my $retVal = $self->FormStart("Signature Genes");
    # The bulk of this form will be two genome selection menus, one for the first
    # (target) set and one for the second (exclusion) set. Above these two controls
    # there is the selector for the given genome, the commonality and cutoff values,
    # and the submit button. Our first task, then, is to get the genome selection
    # menus.
    my $givenMenu   = $self->NmpdrGenomeMenu('given', 0, [$cgi->param('given')]);
    my $targetMenu  = $self->NmpdrGenomeMenu('target', 'multiple', [$cgi->param('target')], 10, 'exclusion');
    my $excludeMenu = $self->NmpdrGenomeMenu('exclusion', 'multiple', [$cgi->param('exclusion')], 10, 'target');
    # Get the default values to use for the commonality and cutoff controls.
    my $commonality = $cgi->param('commonality') || "0.8";
    my $cutoff = $cgi->param('cutoff') || "1e-10";
    my $statistical = $cgi->param('statistical') || 1;
    # Now we build the table rows. The top contains the two numeric parameters and
    # the submit button.
    my @rows = ();
    push @rows, $cgi->Tr($cgi->td("Commonality"),
                         $cgi->td($cgi->textfield(-name => 'commonality',
                                                  -value => $commonality,
                                                  -size => 5) . "&nbsp;" .
                                  $cgi->checkbox(-name => 'statistical',
                                                 -checked => $statistical,
                                                 -value => 1,
                                                 -label => 'Use Statistical Algorithm')));
    push @rows, $cgi->Tr($cgi->td("Cutoff"),
                         $cgi->td($cgi->textfield(-name => 'cutoff',
                                                  -value => $cutoff,
                                                  -size => 5)));
    push @rows, $self->SubmitRow();
    # The next rows have the given genome and a feature filter.
    push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Given Genome"),
                         $cgi->td({colspan => 2}, $givenMenu));
    push @rows, $self->FeatureFilterRows();
    # Now show the target and exclusion menus.
    push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Target Genomes (Set 1)"),
                         $cgi->td({colspan => 2}, $targetMenu));
    push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),
                         $cgi->td({colspan => 2}, $excludeMenu));
    # Create the table.
    $retVal .= $self->MakeTable(\@rows);
    # Close the form.
    $retVal .= $self->FormEnd();
    # Return the result.
    return $retVal;
}

=head3 Find

C<< my $resultCount = $shelp->Find(); >>

Conduct a search based on the current CGI query parameters. The search results will
be written to the session cache file and the number of results will be
returned. If the search parameters are invalid, a result count of C<undef> will be
returned and a result message will be stored in this object describing the problem.

=cut

sub Find {
    # Get the parameters.
    my ($self) = @_;
    # Get the sprout and CGI query objects.
    my $cgi = $self->Q();
    my $sprout = $self->DB();
    # Declare the return variable. If it remains undefined, the caller will
    # assume there was an error.
    my $retVal;
    # Validate the numeric parameters.
    my $commonality = $cgi->param('commonality');
    my $cutoff = $cgi->param('cutoff');
    if ($commonality !~ /^\s*\d(\.\d+)?\s*$/) {
        $self->SetMessage("Commonality value appears invalid, too big, negative, or not a number.");
    } elsif ($commonality <= 0 || $commonality > 1) {
        $self->SetMessage("Commonality cannot be 0 and cannot be greater than 1.");
    } elsif ($cutoff !~ /^\s*\d(.\d+)?(e\-\d+)?\s*$/) {
        $self->SetMessage("Cutoff must be an exponential number (e.g. \"1e-20\" or \"2.5e-11\".");
    } elsif ($cutoff > 1) {
        $self->SetMessage("Cutoff cannot be greater than 1.");
    } else {
        # Now we need to gather and validate the genome sets.
        my ($givenGenomeID) = $self->GetGenomes('given');
        my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');
        my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');
        # Insure the given genome is not in the exclusion set.
        if ($exclusionGenomes{$givenGenomeID}) {
            $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");
        } else {
            # Insure the given genome is in the target set.
            $targetGenomes{$givenGenomeID} = 1
        }
        # Find out if we want to use a statistical analysis.
        my $statistical = $cgi->param('statistical') || 0;
        # Denote we have not yet found any genomes.
        $retVal = 0;
        # Create a feature query object to loop through the chosen features of the given
        # genome.
        my $fquery = FeatureQuery->new($self, $givenGenomeID);
        # Get the sizes of the two sets. This information is useful in computing commonality.
        my $targetSetSize = scalar keys %targetGenomes;
        my $exclusionSetSize = scalar keys %exclusionGenomes;
        # Loop through the features.
        while (my $record = $fquery->Fetch()) {
            # Get the feature's ID.
            my $fid = $fquery->FID();
            # Request its list of BBHs. The list is actually a hash mapping each BBH to its
            # score. All we care about, however, are the BBHs themselves.
            my %bbhList = $sprout->LowBBHs($fid, $cutoff);
            # We next wish to loop through the BBH IDs, counting how many are in each of the
            # sets. If a genome occurs twice, we only want to count the first occurrence, so
            # we have a hash of genomes we've already seen.
            my %alreadySeen = ();
            # Clear the counts.
            my ($targetCount, $exclusionCount) = (0, 0);
            # Loop through the BBHs.
            for my $bbhPeg (keys %bbhList) {
                # Get the genome ID. We want to find out if this genome is new.
                my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);
                if (! $alreadySeen{$genomeID}) {
                    # It's new, so we check to see which set it's in.
                    if ($targetGenomes{$genomeID}) {
                        $targetCount++;
                    } elsif ($exclusionGenomes{$genomeID}) {
                        $exclusionCount++;
                    }
                    # Make sure we don't look at it again.
                    $alreadySeen{$genomeID} = 1;
                }
            }
            # Create a variable to indicate whether or not we want to keep this feature.
            my $okFlag;
            # We need to see if we're using statistics or not. This only matters
            # for a two-set situation.
            if ($statistical && $exclusionSetSize > 0) {
                # This looks like it has something to do with variance computations,
                # but I'm not sure.
                my $targetNotCount = $targetSetSize - $targetCount;
                my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
                my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
                my $exclusionSquare = $exclusionCount * $exclusionCount + $exclusionNotCount * $exclusionNotCount;
                my $mixed = $targetCount * $exclusionCount + $targetNotCount * $exclusionNotCount;
                my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
                my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
                # If the two differentials are greater than one, we keep this feature.
                $okFlag = ($inD + $outD > 1);
            } else {
                # Check to see if we're common in set 1 and not in set 2.
                if (IsCommon($targetCount, $targetSetSize, $commonality) &&
                    ! IsCommon($exclusionCount, $exclusionSetSize, $commonality)) {
                    # We satisfy the criterion, so we put this feature to the output.
                    $okFlag = 1;
                }
            }
            if ($okFlag) {
                # Put this feature to the output.
                $self->PutFeature($fquery);
                # Increase the result count.
                $retVal++;
            }
        }
        # Close the session file.
        $self->CloseSession();
    }
    # Return the result count.
    return $retVal;
}

=head3 Description

C<< my $htmlText = $shelp->Description(); >>

Return a description of this search. The description is used for the table of contents
on the main search tools page. It may contain HTML, but it should be character-level,
not block-level, since the description is going to appear in a list.

=cut

sub Description {
    # Get the parameters.
    my ($self) = @_;
    # Return the result.
    return "Search for features that are common to a group of organisms or that discriminate between two groups of organisms.";
}

=head2 Internal Utilities

=head3 IsCommon

C<< my $flag = SHSigGenes::IsCommon($count, $size, $commonality); >>

Return TRUE if a specified count indicates a gene is common in a specified set.
Commonality is computed by dividing the count by the size of the set and
comparing the result to the minimum commonality ratio. The one exception is
if the set size is 0. In that case, this method always returns FALSE.

=over 4

=item count

Number of elements of the set that have the relevant characteristic.

=item size

Total number of elements in the set.

=item commonality

Minimum count/size ratio for the characteristic to be considered common.

=item RETURN

Returns TRUE if the characteristic is common, else FALSE.

=back

=cut

sub IsCommon {
    # Get the parameters.
    my ($count, $size, $commonality) = @_;
    # Declare the return variable.
    my $retVal = 0;
    # Only procced if the size is positive.
    if ($size > 0) {
        $retVal = ($count/$size >= $commonality);
    }
    # Return the result.
    return $retVal;
}

1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3