[Bio] / Sprout / SHSigGenes.pm Repository:
ViewVC logotype

Annotation of /Sprout/SHSigGenes.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SHSigGenes;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use SearchHelper;
8 :     use CGI;
9 :     use HTML;
10 :     use Sprout;
11 :    
12 :     our @ISA = qw(SearchHelper);
13 :    
14 :     =head1 Gene Discrimination Feature Search Helper
15 :    
16 :     =head2 Introduction
17 :    
18 :     This search performs a signature genes comparison. The user selects two genome sets,
19 :     and the search returns genes from a given genome which are common only in the first set
20 :     and not in the second. If the second set is empty, the search will return genes from
21 :     the given genome that are common in the first set.
22 :    
23 :     Gene identity will be computed in this case using bidirectional best hits. If gene X
24 :     from the given genome has a BBH in a specified genome Y, then it is said to occur
25 :     in whatever set includes genome Y. A gene is considered I<common> if it occurs in a
26 :     certain percentage of the genomes of the set.
27 :    
28 :     This search has the following extra parameters.
29 :    
30 :     =over 4
31 :    
32 :     =item given
33 :    
34 :     The ID of the given genome.
35 :    
36 :     =item target[]
37 :    
38 :     The IDs of the genomes in the first (target) set. The given genome is
39 :     automatically considered a part of this set, so it can never be empty.
40 :    
41 :     =item exclusion[]
42 :    
43 :     The IDs of the genomes in the second (exclusion) set. If this set is empty, then
44 :     no genes will be considered common in set 2, causing all genes common in set 1
45 :     to be selected.
46 :    
47 :     =item commonality
48 :    
49 :     Minimum score for a gene to be considered common. The score is equal to the number
50 :     of genomes containing a bidirectional best hit of the gene divided by the total
51 :     number of genomes. The default is C<0.8>. A value of C<1> means a gene must have
52 :     BBHs in all of the genomes to be considered common; a value of C<0> is invalid.
53 :    
54 :     =item cutoff
55 :    
56 :     Maximum match difference for a BBH hit to be considered valid. The default is C<1e-10>.
57 :    
58 :     =back
59 :    
60 :     =head2 Virtual Methods
61 :    
62 :     =head3 Form
63 :    
64 : parrello 1.7 C<< my $html = $shelp->Form(); >>
65 : parrello 1.1
66 :     Generate the HTML for a form to request a new search.
67 :    
68 :     =cut
69 :    
70 :     sub Form {
71 :     # Get the parameters.
72 :     my ($self) = @_;
73 :     # Get the CGI and sprout objects.
74 :     my $cgi = $self->Q();
75 :     my $sprout = $self->DB();
76 :     # Start the form.
77 :     my $retVal = $self->FormStart("Signature Genes");
78 :     # The bulk of this form will be two genome selection menus, one for the first
79 :     # (target) set and one for the second (exclusion) set. Above these two controls
80 :     # there is the selector for the given genome, the commonality and cutoff values,
81 :     # and the submit button. Our first task, then, is to get the genome selection
82 :     # menus.
83 : parrello 1.5 my $givenMenu = $self->NmpdrGenomeMenu('given', 0, [$cgi->param('given')]);
84 : parrello 1.4 my $targetMenu = $self->NmpdrGenomeMenu('target', 'multiple', [$cgi->param('target')], 10, 'exclusion');
85 :     my $excludeMenu = $self->NmpdrGenomeMenu('exclusion', 'multiple', [$cgi->param('exclusion')], 10, 'target');
86 : parrello 1.1 # Get the default values to use for the commonality and cutoff controls.
87 :     my $commonality = $cgi->param('commonality') || "0.8";
88 :     my $cutoff = $cgi->param('cutoff') || "1e-10";
89 : parrello 1.6 my $statistical = $cgi->param('statistical') || 1;
90 : parrello 1.9 # Now we build the table rows.
91 : parrello 1.1 my @rows = ();
92 : parrello 1.9 # First we have the given genome.
93 :     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Reference Genome"),
94 :     $cgi->td({colspan => 2}, $givenMenu));
95 :     # Now show the target and exclusion menus.
96 :     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Inclusion Genomes (Set 1)"),
97 :     $cgi->td({colspan => 2}, $targetMenu));
98 :     push @rows, $cgi->Tr($cgi->td({valign => "top"}, "Exclusion Genomes (Set 2)"),
99 :     $cgi->td({colspan => 2}, $excludeMenu));
100 :     # Next, the numeric parameters.
101 : parrello 1.1 push @rows, $cgi->Tr($cgi->td("Commonality"),
102 :     $cgi->td($cgi->textfield(-name => 'commonality',
103 :     -value => $commonality,
104 : parrello 1.9 -size => 5))),
105 :     $cgi->Tr($cgi->td(), $cgi->td(
106 : parrello 1.5 $cgi->checkbox(-name => 'statistical',
107 :     -checked => $statistical,
108 :     -value => 1,
109 :     -label => 'Use Statistical Algorithm')));
110 : parrello 1.1 push @rows, $cgi->Tr($cgi->td("Cutoff"),
111 :     $cgi->td($cgi->textfield(-name => 'cutoff',
112 :     -value => $cutoff,
113 :     -size => 5)));
114 : parrello 1.9 # Next, the feature filter rows.
115 :     push @rows, $self->FeatureFilterRows();
116 :     # Finally, the submit button.
117 : parrello 1.1 push @rows, $self->SubmitRow();
118 :     # Create the table.
119 :     $retVal .= $self->MakeTable(\@rows);
120 :     # Close the form.
121 :     $retVal .= $self->FormEnd();
122 :     # Return the result.
123 :     return $retVal;
124 :     }
125 :    
126 :     =head3 Find
127 :    
128 :     C<< my $resultCount = $shelp->Find(); >>
129 :    
130 :     Conduct a search based on the current CGI query parameters. The search results will
131 :     be written to the session cache file and the number of results will be
132 :     returned. If the search parameters are invalid, a result count of C<undef> will be
133 :     returned and a result message will be stored in this object describing the problem.
134 :    
135 :     =cut
136 :    
137 :     sub Find {
138 :     # Get the parameters.
139 :     my ($self) = @_;
140 :     # Get the sprout and CGI query objects.
141 :     my $cgi = $self->Q();
142 :     my $sprout = $self->DB();
143 : parrello 1.2 # Declare the return variable. If it remains undefined, the caller will
144 :     # assume there was an error.
145 : parrello 1.1 my $retVal;
146 : parrello 1.2 # Validate the numeric parameters.
147 :     my $commonality = $cgi->param('commonality');
148 :     my $cutoff = $cgi->param('cutoff');
149 :     if ($commonality !~ /^\s*\d(\.\d+)?\s*$/) {
150 :     $self->SetMessage("Commonality value appears invalid, too big, negative, or not a number.");
151 :     } elsif ($commonality <= 0 || $commonality > 1) {
152 :     $self->SetMessage("Commonality cannot be 0 and cannot be greater than 1.");
153 :     } elsif ($cutoff !~ /^\s*\d(.\d+)?(e\-\d+)?\s*$/) {
154 :     $self->SetMessage("Cutoff must be an exponential number (e.g. \"1e-20\" or \"2.5e-11\".");
155 :     } elsif ($cutoff > 1) {
156 :     $self->SetMessage("Cutoff cannot be greater than 1.");
157 :     } else {
158 :     # Now we need to gather and validate the genome sets.
159 : parrello 1.3 my ($givenGenomeID) = $self->GetGenomes('given');
160 :     my %targetGenomes = map { $_ => 1 } $self->GetGenomes('target');
161 :     my %exclusionGenomes = map { $_ => 1 } $self->GetGenomes('exclusion');
162 : parrello 1.2 # Insure the given genome is not in the exclusion set.
163 :     if ($exclusionGenomes{$givenGenomeID}) {
164 :     $self->SetMessage("The given genome ($givenGenomeID) cannot be in the exclusion set.");
165 :     } else {
166 :     # Insure the given genome is in the target set.
167 :     $targetGenomes{$givenGenomeID} = 1
168 :     }
169 : parrello 1.5 # Find out if we want to use a statistical analysis.
170 :     my $statistical = $cgi->param('statistical') || 0;
171 : parrello 1.2 # Denote we have not yet found any genomes.
172 :     $retVal = 0;
173 : parrello 1.4 # Create a feature query object to loop through the chosen features of the given
174 :     # genome.
175 :     my $fquery = FeatureQuery->new($self, $givenGenomeID);
176 :     # Get the sizes of the two sets. This information is useful in computing commonality.
177 :     my $targetSetSize = scalar keys %targetGenomes;
178 :     my $exclusionSetSize = scalar keys %exclusionGenomes;
179 :     # Loop through the features.
180 :     while (my $record = $fquery->Fetch()) {
181 :     # Get the feature's ID.
182 :     my $fid = $fquery->FID();
183 :     # Request its list of BBHs. The list is actually a hash mapping each BBH to its
184 :     # score. All we care about, however, are the BBHs themselves.
185 :     my %bbhList = $sprout->LowBBHs($fid, $cutoff);
186 :     # We next wish to loop through the BBH IDs, counting how many are in each of the
187 :     # sets. If a genome occurs twice, we only want to count the first occurrence, so
188 :     # we have a hash of genomes we've already seen.
189 :     my %alreadySeen = ();
190 :     # Clear the counts.
191 :     my ($targetCount, $exclusionCount) = (0, 0);
192 :     # Loop through the BBHs.
193 :     for my $bbhPeg (keys %bbhList) {
194 :     # Get the genome ID. We want to find out if this genome is new.
195 :     my ($genomeID) = FIGRules::ParseFeatureID($bbhPeg);
196 :     if (! $alreadySeen{$genomeID}) {
197 :     # It's new, so we check to see which set it's in.
198 :     if ($targetGenomes{$genomeID}) {
199 :     $targetCount++;
200 :     } elsif ($exclusionGenomes{$genomeID}) {
201 :     $exclusionCount++;
202 :     }
203 :     # Make sure we don't look at it again.
204 :     $alreadySeen{$genomeID} = 1;
205 :     }
206 :     }
207 : parrello 1.5 # Create a variable to indicate whether or not we want to keep this feature.
208 :     my $okFlag;
209 :     # We need to see if we're using statistics or not. This only matters
210 :     # for a two-set situation.
211 :     if ($statistical && $exclusionSetSize > 0) {
212 :     # This looks like it has something to do with variance computations,
213 :     # but I'm not sure.
214 :     my $targetNotCount = $targetSetSize - $targetCount;
215 :     my $targetSquare = $targetCount * $targetCount + $targetNotCount * $targetNotCount;
216 :     my $exclusionNotCount = $exclusionSetSize - $exclusionCount;
217 :     my $exclusionSquare = $exclusionCount * $exclusionCount + $exclusionNotCount * $exclusionNotCount;
218 :     my $mixed = $targetCount * $exclusionCount + $targetNotCount * $exclusionNotCount;
219 :     my $inD = 1 - (($exclusionSetSize * $mixed) / ($targetSetSize * $exclusionSquare));
220 :     my $outD = 1 - (($targetSetSize * $mixed) / ($exclusionSetSize * $targetSquare));
221 :     # If the two differentials are greater than one, we keep this feature.
222 :     $okFlag = ($inD + $outD > 1);
223 :     } else {
224 :     # Check to see if we're common in set 1 and not in set 2.
225 :     if (IsCommon($targetCount, $targetSetSize, $commonality) &&
226 :     ! IsCommon($exclusionCount, $exclusionSetSize, $commonality)) {
227 :     # We satisfy the criterion, so we put this feature to the output.
228 :     $okFlag = 1;
229 :     }
230 :     }
231 :     if ($okFlag) {
232 :     # Put this feature to the output.
233 : parrello 1.4 $self->PutFeature($fquery);
234 :     # Increase the result count.
235 :     $retVal++;
236 :     }
237 :     }
238 : parrello 1.5 # Close the session file.
239 :     $self->CloseSession();
240 : parrello 1.2 }
241 : parrello 1.1 # Return the result count.
242 :     return $retVal;
243 :     }
244 :    
245 :     =head3 Description
246 :    
247 :     C<< my $htmlText = $shelp->Description(); >>
248 :    
249 :     Return a description of this search. The description is used for the table of contents
250 :     on the main search tools page. It may contain HTML, but it should be character-level,
251 :     not block-level, since the description is going to appear in a list.
252 :    
253 :     =cut
254 :    
255 :     sub Description {
256 :     # Get the parameters.
257 :     my ($self) = @_;
258 :     # Return the result.
259 : parrello 1.8 return "Search for genes that are common to a group of organisms or that discriminate between two groups of organisms.";
260 : parrello 1.1 }
261 :    
262 : parrello 1.4 =head2 Internal Utilities
263 :    
264 :     =head3 IsCommon
265 :    
266 :     C<< my $flag = SHSigGenes::IsCommon($count, $size, $commonality); >>
267 :    
268 :     Return TRUE if a specified count indicates a gene is common in a specified set.
269 :     Commonality is computed by dividing the count by the size of the set and
270 :     comparing the result to the minimum commonality ratio. The one exception is
271 :     if the set size is 0. In that case, this method always returns FALSE.
272 :    
273 :     =over 4
274 :    
275 :     =item count
276 :    
277 :     Number of elements of the set that have the relevant characteristic.
278 :    
279 :     =item size
280 :    
281 :     Total number of elements in the set.
282 :    
283 :     =item commonality
284 :    
285 :     Minimum count/size ratio for the characteristic to be considered common.
286 :    
287 :     =item RETURN
288 :    
289 :     Returns TRUE if the characteristic is common, else FALSE.
290 :    
291 :     =back
292 :    
293 :     =cut
294 :    
295 :     sub IsCommon {
296 :     # Get the parameters.
297 :     my ($count, $size, $commonality) = @_;
298 :     # Declare the return variable.
299 :     my $retVal = 0;
300 :     # Only procced if the size is positive.
301 :     if ($size > 0) {
302 :     $retVal = ($count/$size >= $commonality);
303 :     }
304 :     # Return the result.
305 :     return $retVal;
306 :     }
307 :    
308 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3