Parent Directory
|
Revision Log
Revision 1.4 - (view) (download) (as text)
1 : | parrello | 1.1 | #!/usr/bin/perl -w |
2 : | |||
3 : | package SHOpSearch; | ||
4 : | |||
5 : | use strict; | ||
6 : | use Tracer; | ||
7 : | use CGI; | ||
8 : | use HTML; | ||
9 : | use Sprout; | ||
10 : | parrello | 1.2 | use RHFeatures; |
11 : | parrello | 1.1 | use BasicLocation; |
12 : | parrello | 1.2 | use base 'SearchHelper'; |
13 : | parrello | 1.1 | |
14 : | =head1 Operon Analysis Feature Search Helper | ||
15 : | |||
16 : | =head2 Introduction | ||
17 : | |||
18 : | This search method takes a genome ID and produces a list of the operons. An operon is defined | ||
19 : | as a set of genes in the same direction that are near each other with no intervening genes. | ||
20 : | The concept of I<near> is defined by a parameter. In addition to the standard data | ||
21 : | produced by a feature search, this method also shows the upstream DNA for each feature | ||
22 : | found. The size of the upstream region is also defined by a parameter. | ||
23 : | |||
24 : | parrello | 1.2 | This search has the following extra parameters. |
25 : | parrello | 1.1 | |
26 : | =over 4 | ||
27 : | |||
28 : | =item genome | ||
29 : | |||
30 : | ID of the genome to process for operons. | ||
31 : | |||
32 : | =item nearDistance | ||
33 : | |||
34 : | Maximum distance in base pairs of successive genes in an operon. The default value is C<200>, | ||
35 : | which means that if the distance between the end of one gene and the beginning of the next is more | ||
36 : | than 200 base pairs, the genes are not considered to be part of the same operon. | ||
37 : | |||
38 : | =item upstream | ||
39 : | |||
40 : | Number of base pairs in front of the gene to include in the display of the upstream region. The default | ||
41 : | value is C<250>. | ||
42 : | |||
43 : | =item instream | ||
44 : | |||
45 : | Number of base pairs inside the gene to include in the display of the upstream region. The default | ||
46 : | value is C<50>. | ||
47 : | |||
48 : | =item lintSize | ||
49 : | |||
50 : | The maximum size of a gene that can be interpreted as lint. Genes with this number of base pairs | ||
51 : | or fewer are ignored during the operon analysis. The default is C<100>. | ||
52 : | |||
53 : | =back | ||
54 : | |||
55 : | =cut | ||
56 : | |||
57 : | my %TuningParms = (nearDistance => 200, upstream => 250, instream => 50, lintSize => 100); | ||
58 : | |||
59 : | =head2 Virtual Methods | ||
60 : | |||
61 : | =head3 Form | ||
62 : | |||
63 : | parrello | 1.4 | my $html = $shelp->Form(); |
64 : | parrello | 1.1 | |
65 : | Generate the HTML for a form to request a new search. | ||
66 : | |||
67 : | =cut | ||
68 : | |||
69 : | sub Form { | ||
70 : | # Get the parameters. | ||
71 : | my ($self) = @_; | ||
72 : | # Get the CGI and sprout objects. | ||
73 : | my $cgi = $self->Q(); | ||
74 : | my $sprout = $self->DB(); | ||
75 : | # Start the form. | ||
76 : | my $retVal = $self->FormStart("Operon Anlaysis"); | ||
77 : | # Get the selected gene. | ||
78 : | my $genomeID = $cgi->param('genome'); | ||
79 : | my $genomeList = ($genomeID ? [$genomeID] : []); | ||
80 : | # We'll accumulate the table rows in this variable. | ||
81 : | my @rows = (); | ||
82 : | # Start with the genome menu. We use a tall list but without multiple selection. | ||
83 : | my $menu = $self->NmpdrGenomeMenu('genome', 0, $genomeList, 10); | ||
84 : | push @rows, $cgi->Tr($cgi->td("Select a genome"), $cgi->td({colspan => 2}, $menu)); | ||
85 : | # Next we have the tuning parameters. | ||
86 : | my $options = $self->TuningParameters(%TuningParms); | ||
87 : | push @rows, $cgi->Tr($cgi->td({rowspan => 4}, "Tuning Parameters"), | ||
88 : | $cgi->td("Maximum distance between operon genes"), | ||
89 : | $cgi->td($cgi->textfield(-name => 'nearDistance', | ||
90 : | -value => $options->{nearDistance}, | ||
91 : | parrello | 1.2 | -size => 5) . |
92 : | parrello | 1.3 | SearchHelper::Hint("OpSearch", |
93 : | "If the distance between two genes " . | ||
94 : | parrello | 1.2 | "is greater than this number of base " . |
95 : | "pairs, they will be treated as " . | ||
96 : | "belonging to different operons."))), | ||
97 : | parrello | 1.1 | $cgi->Tr($cgi->td("Upstream base pairs to display"), |
98 : | $cgi->td($cgi->textfield(-name => 'upstream', | ||
99 : | -value => $options->{upstream}, | ||
100 : | parrello | 1.2 | -size => 5) . |
101 : | parrello | 1.3 | SearchHelper::Hint("OpSearch", |
102 : | "When displaying the upstream DNA, this is " . | ||
103 : | parrello | 1.2 | "the number of base pairs preceding the gene " . |
104 : | "that will be shown."))), | ||
105 : | parrello | 1.1 | $cgi->Tr($cgi->td("Instream base pairs to display"), |
106 : | $cgi->td($cgi->textfield(-name => 'instream', | ||
107 : | -value => $options->{instream}, | ||
108 : | parrello | 1.2 | -size => 5) . |
109 : | parrello | 1.3 | SearchHelper::Hint("OpSearch", |
110 : | "When displaying the upstream DNA, this is " . | ||
111 : | parrello | 1.2 | "the number of base pairs from inside the " . |
112 : | "gene that will be shown."))), | ||
113 : | parrello | 1.1 | $cgi->Tr($cgi->td("Maximum lint size"), |
114 : | $cgi->td($cgi->textfield(-name => 'lintSize', | ||
115 : | -value => $options->{lintSize}, | ||
116 : | parrello | 1.2 | -size => 5) . |
117 : | parrello | 1.3 | SearchHelper::Hint("OpSearch", |
118 : | "Genes whose size in base pairs are equal to " . | ||
119 : | parrello | 1.2 | "or less than this amount will; be ignored."))); |
120 : | parrello | 1.1 | # Add the special feature options. |
121 : | parrello | 1.2 | push @rows, RHFeatures::FeatureFilterFormRows($self, 'options'); |
122 : | parrello | 1.1 | # Add the submit button. |
123 : | push @rows, $self->SubmitRow(); | ||
124 : | # Make the rows into a table. | ||
125 : | $retVal .= $self->MakeTable(\@rows); | ||
126 : | # Close the form. | ||
127 : | $retVal .= $self->FormEnd(); | ||
128 : | # Return the result. | ||
129 : | return $retVal; | ||
130 : | } | ||
131 : | |||
132 : | =head3 Find | ||
133 : | |||
134 : | parrello | 1.4 | my $resultCount = $shelp->Find(); |
135 : | parrello | 1.1 | |
136 : | Conduct a search based on the current CGI query parameters. The search results will | ||
137 : | be written to the session cache file and the number of results will be | ||
138 : | returned. If the search parameters are invalid, a result count of C<undef> will be | ||
139 : | returned and a result message will be stored in this object describing the problem. | ||
140 : | |||
141 : | =cut | ||
142 : | |||
143 : | sub Find { | ||
144 : | my ($self) = @_; | ||
145 : | # Get the CGI and Sprout objects. | ||
146 : | my $cgi = $self->Q(); | ||
147 : | my $sprout = $self->DB(); | ||
148 : | # Declare the return variable. If it remains undefined, the caller will | ||
149 : | # know that an error occurred. | ||
150 : | my $retVal; | ||
151 : | # Get the genome ID. | ||
152 : | my $genomeID = $cgi->param('genome'); | ||
153 : | # Get the tuning parameters. | ||
154 : | my $options = $self->TuningParameters(%TuningParms); | ||
155 : | # Validate the tuning parameters. | ||
156 : | my @errorList = (); | ||
157 : | for my $parm (keys %{$options}) { | ||
158 : | if ($options->{$parm} !~ /^\d+$/) { | ||
159 : | push @errorList, "Invalid $parm value \"$options->{$parm}\"."; | ||
160 : | } | ||
161 : | } | ||
162 : | if (@errorList > 0) { | ||
163 : | $self->SetMessage(join(" ", @errorList)); | ||
164 : | } else { | ||
165 : | $self->PrintLine("Retrieving features for genome $genomeID."); | ||
166 : | # Here we have good tuning parameters. The next step is to loop through | ||
167 : | # all the features for the incoming genome. We want to get them in | ||
168 : | # location order within contig. | ||
169 : | my $query = $sprout->Get(['HasFeature', 'Feature', 'IsLocatedIn', 'Contig'], | ||
170 : | "HasFeature(from-link) = ? ORDER BY IsLocatedIn(to-link), IsLocatedIn(beg)", | ||
171 : | [$genomeID]); | ||
172 : | parrello | 1.2 | # Create a feature result helper to help us process the features. |
173 : | my $rhelp = RHFeatures->new($self); | ||
174 : | # Set the columns. | ||
175 : | $self->DefaultColumns($rhelp); | ||
176 : | # Define the extra columns. | ||
177 : | $rhelp->AddExtraColumn(operonID => 0, download => 'text', title => 'Operon ID', style => 'leftAlign'); | ||
178 : | $rhelp->AddExtraColumn(location => 1, download => 'text', title => 'Location', style => 'leftAlign'); | ||
179 : | $rhelp->AddExtraColumn(upstream => undef, download => 'align', title => 'Upstream DNA', style => 'leftAlign'); | ||
180 : | parrello | 1.1 | # Start the session. |
181 : | parrello | 1.2 | $self->OpenSession($rhelp); |
182 : | parrello | 1.1 | # The trickiest part of this whole process is computing the operon information. |
183 : | # Each feature has an operon ID and an operon sequence number. The operon ID | ||
184 : | # is displayed as an extra column. The sequence number is combined with the | ||
185 : | # operon ID to create the sort key. We arbitrarily assume an upper limit | ||
186 : | # of a million operons each with no more than a million features. If we're | ||
187 : | # wrong, the sort won't work but the data will still be okay. To start | ||
188 : | # the whole process along, we prime the operon data with dummy values so | ||
189 : | # that the first feature is considered to be part of a new operon. Note that | ||
190 : | # the operon ID and sequence number are saved in the object so that the | ||
191 : | # sort key method can find them. | ||
192 : | $self->{operonID} = 0; | ||
193 : | $self->{operonFeatureSeq} = 0; | ||
194 : | # This variable contains the last feature's location. We put it on a bogus contig | ||
195 : | # so that the first feature we encounted will be considered the start of a new | ||
196 : | # operon. | ||
197 : | my $lastLocation = BasicLocation->new(" ", 0, '+', 0); | ||
198 : | parrello | 1.2 | # This variable contains the last feature. We may receive multiple results for |
199 : | parrello | 1.1 | # a single feature. Only the last result is output. |
200 : | parrello | 1.2 | my $lastFeature; |
201 : | parrello | 1.1 | my $lastFid = ""; |
202 : | # Finally, we need to save the current contig ID and length. | ||
203 : | $self->{contigID} = ""; | ||
204 : | $self->{contigLen} = 0; | ||
205 : | # Loop until we run out of features. | ||
206 : | while (my $feature = $query->Fetch()) { | ||
207 : | # Get this feature's location. | ||
208 : | my $thisLocation = BasicLocation->new($feature->Values(['IsLocatedIn(to-link)', 'IsLocatedIn(beg)', | ||
209 : | 'IsLocatedIn(dir)', 'IsLocatedIn(len)'])); | ||
210 : | # Get this feature's ID. | ||
211 : | parrello | 1.2 | my $thisFid = $feature->PrimaryValue('IsLocatedIn(from-link)'); |
212 : | parrello | 1.1 | # Only proceed if this feature is not lint. |
213 : | if ($thisLocation->Length >= $options->{lintSize}) { | ||
214 : | # Determine whether or not this is a new feature. | ||
215 : | if ($thisFid eq $lastFid) { | ||
216 : | # This is a second location for the same feature. Combine its location with | ||
217 : | # the last location. | ||
218 : | $lastLocation->Combine($thisLocation); | ||
219 : | Trace("Sublocation found. New location is " . $lastLocation->String()) if T(4); | ||
220 : | } else { | ||
221 : | # We have a new feature. Write out the previous feature's data (if any). | ||
222 : | if ($lastFid) { | ||
223 : | Trace("Writing feature $lastFid.") if T(4); | ||
224 : | parrello | 1.2 | $self->OutputFeature($rhelp, $lastFeature, $lastLocation, $options); |
225 : | parrello | 1.1 | $retVal++; |
226 : | # Reveal our status every 100 features. | ||
227 : | if ($retVal % 100 == 0) { | ||
228 : | $self->PrintLine("$retVal features processed. $self->{operonID} operons found."); | ||
229 : | } | ||
230 : | } | ||
231 : | parrello | 1.2 | # Remember the new feature and its ID. |
232 : | parrello | 1.1 | $lastFid = $thisFid; |
233 : | parrello | 1.2 | $lastFeature = $feature; |
234 : | parrello | 1.1 | # Check the operon status. |
235 : | if ($lastLocation->Contig eq $thisLocation->Contig && | ||
236 : | $lastLocation->Dir eq $thisLocation->Dir && | ||
237 : | $thisLocation->Left - $lastLocation->Right < $options->{nearDistance}) { | ||
238 : | # Here we're part of the same operon. Increment the feature sequence number. | ||
239 : | # For forward operons we add 1. For backward operons we subtract 1. | ||
240 : | $self->{operonFeatureSeq} += $thisLocation->NumDirection(); | ||
241 : | Trace("New operon feature sequence number for $thisFid is $self->{operonFeatureSeq}.") if T(4); | ||
242 : | } else { | ||
243 : | # Here we belong to a new operon. | ||
244 : | $self->{operonID}++; | ||
245 : | # The sequence number will be incremented for forward operons and decremented | ||
246 : | # for backward operons. The sequence number is used only for sorting: it insures | ||
247 : | # that the operons are presented in their natural order. By starting at 5,000,000 | ||
248 : | # and relying on our assumption that therte are fewer than 1,000,000 features per | ||
249 : | # operon, we insure that the sequence numbers are always the same number of | ||
250 : | # digits, whether we decrement or increment. | ||
251 : | $self->{operonFeatureSeq} = 5000000; | ||
252 : | Trace("New operon ID for $thisFid is $self->{operonID}.") if T(4); | ||
253 : | } | ||
254 : | # Save the feature location. | ||
255 : | $lastLocation = $thisLocation; | ||
256 : | } | ||
257 : | } | ||
258 : | } | ||
259 : | # Output the last feature (if any). | ||
260 : | if ($lastFid) { | ||
261 : | parrello | 1.2 | $self->OutputFeature($rhelp, $lastFeature, $lastLocation, $options); |
262 : | parrello | 1.1 | $retVal++; |
263 : | } | ||
264 : | # Close the session file. | ||
265 : | $self->CloseSession(); | ||
266 : | } | ||
267 : | # Return the result count. | ||
268 : | return $retVal; | ||
269 : | } | ||
270 : | |||
271 : | =head3 Description | ||
272 : | |||
273 : | parrello | 1.4 | my $htmlText = $shelp->Description(); |
274 : | parrello | 1.1 | |
275 : | Return a description of this search. The description is used for the table of contents | ||
276 : | on the main search tools page. It may contain HTML, but it should be character-level, | ||
277 : | not block-level, since the description is going to appear in a list. | ||
278 : | |||
279 : | =cut | ||
280 : | |||
281 : | sub Description { | ||
282 : | # Get the parameters. | ||
283 : | my ($self) = @_; | ||
284 : | # Return the result. | ||
285 : | return "Search for operons in a single genome."; | ||
286 : | } | ||
287 : | |||
288 : | =head3 SortKey | ||
289 : | |||
290 : | parrello | 1.4 | my $key = $shelp->SortKey($rhelp, $record); |
291 : | parrello | 1.1 | |
292 : | parrello | 1.2 | Return the sort key for the current feature. The features are |
293 : | sorted by sequence within operon, which is determined entirely | ||
294 : | by data cached in this object. The sort order may, however, | ||
295 : | be modified by options | ||
296 : | parrello | 1.1 | |
297 : | =over 4 | ||
298 : | |||
299 : | parrello | 1.2 | =item rhelp |
300 : | |||
301 : | Current result helper object. | ||
302 : | |||
303 : | parrello | 1.1 | =item record |
304 : | |||
305 : | parrello | 1.2 | The C<ERDBObject> containing the current feature. |
306 : | parrello | 1.1 | |
307 : | =item RETURN | ||
308 : | |||
309 : | Returns a key field that can be used to sort this row in among the results. | ||
310 : | |||
311 : | =back | ||
312 : | |||
313 : | =cut | ||
314 : | |||
315 : | sub SortKey { | ||
316 : | # Get the parameters. | ||
317 : | parrello | 1.2 | my ($self, $rhelp, $record) = @_; |
318 : | parrello | 1.1 | # Get the current operon ID and sequence. |
319 : | my $operonID = $self->{operonID}; | ||
320 : | my $operonSeqNumber = $self->{operonFeatureSeq}; | ||
321 : | # The operon Sequence number is already at a fixed width. We need to pad to a | ||
322 : | # fixed width for the operon ID. | ||
323 : | parrello | 1.2 | my $operonKey = "$operonID.$operonSeqNumber"; |
324 : | while (length $operonKey < 20) { | ||
325 : | $operonKey = "0$operonKey"; | ||
326 : | parrello | 1.1 | } |
327 : | parrello | 1.2 | # Create a feature sort key for this feature with the operon data mixed in. |
328 : | my $retVal = $rhelp->SortKey($record, $operonKey); | ||
329 : | parrello | 1.1 | # Return the result. |
330 : | return $retVal; | ||
331 : | } | ||
332 : | |||
333 : | =head3 OutputFeature | ||
334 : | |||
335 : | parrello | 1.4 | $shelp->OutputFeature($rhelp, $feature, $location, $options); |
336 : | parrello | 1.1 | |
337 : | Output the current feature. We use the location to compute an upstream location for the feature, | ||
338 : | and this is added to the feature data object as an extra column named C<Upstream DNA>. The operon ID | ||
339 : | is added to the feature data object as an extra column named C<Operon ID>. | ||
340 : | |||
341 : | =over 4 | ||
342 : | |||
343 : | parrello | 1.2 | =item rhelp |
344 : | |||
345 : | Feature result helper. | ||
346 : | parrello | 1.1 | |
347 : | parrello | 1.2 | =item feature |
348 : | parrello | 1.1 | |
349 : | =item location | ||
350 : | |||
351 : | A BasicLocation object describing the combined feature locations for the current feature. | ||
352 : | |||
353 : | =item options | ||
354 : | |||
355 : | Reference to a hash of options for this search. The options of interest to us are C<instream> and | ||
356 : | parrello | 1.4 | C<upstream> |
357 : | parrello | 1.1 | |
358 : | =back | ||
359 : | |||
360 : | =cut | ||
361 : | |||
362 : | sub OutputFeature { | ||
363 : | # Get the parameters. | ||
364 : | parrello | 1.2 | my ($self, $rhelp, $feature, $location, $options) = @_; |
365 : | parrello | 1.1 | # Get access to Sprout. |
366 : | my $sprout = $self->DB(); | ||
367 : | # Get the contig length. | ||
368 : | if ($self->{contigID} ne $location->Contig) { | ||
369 : | $self->{contigID} = $location->Contig; | ||
370 : | $self->{contigLen} = $sprout->ContigLength($location->Contig); | ||
371 : | } | ||
372 : | my $contigLen = $self->{contigLen}; | ||
373 : | # Get the upstream location. | ||
374 : | my $upstreamLocation = $location->Upstream($options->{upstream}, $contigLen); | ||
375 : | # Compute its DNA and convert it to upper case. | ||
376 : | my $upstreamDNA = uc $sprout->DNASeq([$upstreamLocation->String]); | ||
377 : | # Compute the instream DNA. We do this by truncating the feature location. | ||
378 : | my $instreamLocation = BasicLocation->new($location); | ||
379 : | $instreamLocation->Truncate($options->{instream}); | ||
380 : | my $instreamDNA = lc $sprout->DNASeq([$instreamLocation->String]); | ||
381 : | # Combine the DNA sequences. | ||
382 : | $upstreamDNA .= ":$instreamDNA"; | ||
383 : | if (T(4)) { | ||
384 : | my $uString = $upstreamLocation->String; | ||
385 : | my $iString = $instreamLocation->String; | ||
386 : | my $oString = $location->String; | ||
387 : | Trace("Upstream = $uString, Instream = $iString, Original = $oString."); | ||
388 : | } | ||
389 : | # Chop the genome ID off the location string so it looks better. Also, we'll use | ||
390 : | # the SEED format so the user can eyeball the distance between genes. | ||
391 : | my $locationString = $location->SeedString; | ||
392 : | $locationString =~ s/^[^:]+://; | ||
393 : | parrello | 1.2 | # Store the upstream DNA in the result helper along with the operon ID. We add |
394 : | parrello | 1.1 | # the direction to the operon ID so the user knows more easily which way we're pointing. |
395 : | parrello | 1.2 | $rhelp->PutExtraColumns(operonID => $self->{operonID} . $location->Dir, |
396 : | upstream => $upstreamDNA, | ||
397 : | location => $locationString); | ||
398 : | # Compute the sort key and the feature ID. | ||
399 : | my $sortKey = $self->SortKey($rhelp, $feature); | ||
400 : | my $fid = $feature->PrimaryValue('Feature(id)'); | ||
401 : | parrello | 1.1 | # Put the feature to the output. |
402 : | parrello | 1.2 | $rhelp->PutData($sortKey, $fid, $feature); |
403 : | } | ||
404 : | |||
405 : | =head3 SearchTitle | ||
406 : | |||
407 : | parrello | 1.4 | my $titleHtml = $shelp->SearchTitle(); |
408 : | parrello | 1.2 | |
409 : | Return the display title for this search. The display title appears above the search results. | ||
410 : | If no result is returned, no title will be displayed. The result should be an html string | ||
411 : | that can be legally put inside a block tag such as C<h3> or C<p>. | ||
412 : | |||
413 : | =cut | ||
414 : | |||
415 : | sub SearchTitle { | ||
416 : | # Get the parameters. | ||
417 : | my ($self) = @_; | ||
418 : | # Compute the title. We extract the genome ID from the query parameters. | ||
419 : | my $cgi = $self->Q(); | ||
420 : | my $pdbID = $cgi->param('genome'); | ||
421 : | my $retVal = "Docking Results for $pdbID"; | ||
422 : | # Return it. | ||
423 : | return $retVal; | ||
424 : | parrello | 1.1 | } |
425 : | |||
426 : | parrello | 1.2 | |
427 : | parrello | 1.4 | 1; |
MCS Webmaster | ViewVC Help |
Powered by ViewVC 1.0.3 |