[Bio] / Sprout / SHOpSearch.pm Repository:
ViewVC logotype

Annotation of /Sprout/SHOpSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SHOpSearch;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use CGI;
8 :     use HTML;
9 :     use Sprout;
10 : parrello 1.2 use RHFeatures;
11 : parrello 1.1 use BasicLocation;
12 : parrello 1.2 use base 'SearchHelper';
13 : parrello 1.1
14 :     =head1 Operon Analysis Feature Search Helper
15 :    
16 :     =head2 Introduction
17 :    
18 :     This search method takes a genome ID and produces a list of the operons. An operon is defined
19 :     as a set of genes in the same direction that are near each other with no intervening genes.
20 :     The concept of I<near> is defined by a parameter. In addition to the standard data
21 :     produced by a feature search, this method also shows the upstream DNA for each feature
22 :     found. The size of the upstream region is also defined by a parameter.
23 :    
24 : parrello 1.2 This search has the following extra parameters.
25 : parrello 1.1
26 :     =over 4
27 :    
28 :     =item genome
29 :    
30 :     ID of the genome to process for operons.
31 :    
32 :     =item nearDistance
33 :    
34 :     Maximum distance in base pairs of successive genes in an operon. The default value is C<200>,
35 :     which means that if the distance between the end of one gene and the beginning of the next is more
36 :     than 200 base pairs, the genes are not considered to be part of the same operon.
37 :    
38 :     =item upstream
39 :    
40 :     Number of base pairs in front of the gene to include in the display of the upstream region. The default
41 :     value is C<250>.
42 :    
43 :     =item instream
44 :    
45 :     Number of base pairs inside the gene to include in the display of the upstream region. The default
46 :     value is C<50>.
47 :    
48 :     =item lintSize
49 :    
50 :     The maximum size of a gene that can be interpreted as lint. Genes with this number of base pairs
51 :     or fewer are ignored during the operon analysis. The default is C<100>.
52 :    
53 :     =back
54 :    
55 :     =cut
56 :    
57 :     my %TuningParms = (nearDistance => 200, upstream => 250, instream => 50, lintSize => 100);
58 :    
59 :     =head2 Virtual Methods
60 :    
61 :     =head3 Form
62 :    
63 :     C<< my $html = $shelp->Form(); >>
64 :    
65 :     Generate the HTML for a form to request a new search.
66 :    
67 :     =cut
68 :    
69 :     sub Form {
70 :     # Get the parameters.
71 :     my ($self) = @_;
72 :     # Get the CGI and sprout objects.
73 :     my $cgi = $self->Q();
74 :     my $sprout = $self->DB();
75 :     # Start the form.
76 :     my $retVal = $self->FormStart("Operon Anlaysis");
77 :     # Get the selected gene.
78 :     my $genomeID = $cgi->param('genome');
79 :     my $genomeList = ($genomeID ? [$genomeID] : []);
80 :     # We'll accumulate the table rows in this variable.
81 :     my @rows = ();
82 :     # Start with the genome menu. We use a tall list but without multiple selection.
83 :     my $menu = $self->NmpdrGenomeMenu('genome', 0, $genomeList, 10);
84 :     push @rows, $cgi->Tr($cgi->td("Select a genome"), $cgi->td({colspan => 2}, $menu));
85 :     # Next we have the tuning parameters.
86 :     my $options = $self->TuningParameters(%TuningParms);
87 :     push @rows, $cgi->Tr($cgi->td({rowspan => 4}, "Tuning Parameters"),
88 :     $cgi->td("Maximum distance between operon genes"),
89 :     $cgi->td($cgi->textfield(-name => 'nearDistance',
90 :     -value => $options->{nearDistance},
91 : parrello 1.2 -size => 5) .
92 : parrello 1.3 SearchHelper::Hint("OpSearch",
93 :     "If the distance between two genes " .
94 : parrello 1.2 "is greater than this number of base " .
95 :     "pairs, they will be treated as " .
96 :     "belonging to different operons."))),
97 : parrello 1.1 $cgi->Tr($cgi->td("Upstream base pairs to display"),
98 :     $cgi->td($cgi->textfield(-name => 'upstream',
99 :     -value => $options->{upstream},
100 : parrello 1.2 -size => 5) .
101 : parrello 1.3 SearchHelper::Hint("OpSearch",
102 :     "When displaying the upstream DNA, this is " .
103 : parrello 1.2 "the number of base pairs preceding the gene " .
104 :     "that will be shown."))),
105 : parrello 1.1 $cgi->Tr($cgi->td("Instream base pairs to display"),
106 :     $cgi->td($cgi->textfield(-name => 'instream',
107 :     -value => $options->{instream},
108 : parrello 1.2 -size => 5) .
109 : parrello 1.3 SearchHelper::Hint("OpSearch",
110 :     "When displaying the upstream DNA, this is " .
111 : parrello 1.2 "the number of base pairs from inside the " .
112 :     "gene that will be shown."))),
113 : parrello 1.1 $cgi->Tr($cgi->td("Maximum lint size"),
114 :     $cgi->td($cgi->textfield(-name => 'lintSize',
115 :     -value => $options->{lintSize},
116 : parrello 1.2 -size => 5) .
117 : parrello 1.3 SearchHelper::Hint("OpSearch",
118 :     "Genes whose size in base pairs are equal to " .
119 : parrello 1.2 "or less than this amount will; be ignored.")));
120 : parrello 1.1 # Add the special feature options.
121 : parrello 1.2 push @rows, RHFeatures::FeatureFilterFormRows($self, 'options');
122 : parrello 1.1 # Add the submit button.
123 :     push @rows, $self->SubmitRow();
124 :     # Make the rows into a table.
125 :     $retVal .= $self->MakeTable(\@rows);
126 :     # Close the form.
127 :     $retVal .= $self->FormEnd();
128 :     # Return the result.
129 :     return $retVal;
130 :     }
131 :    
132 :     =head3 Find
133 :    
134 :     C<< my $resultCount = $shelp->Find(); >>
135 :    
136 :     Conduct a search based on the current CGI query parameters. The search results will
137 :     be written to the session cache file and the number of results will be
138 :     returned. If the search parameters are invalid, a result count of C<undef> will be
139 :     returned and a result message will be stored in this object describing the problem.
140 :    
141 :     =cut
142 :    
143 :     sub Find {
144 :     my ($self) = @_;
145 :     # Get the CGI and Sprout objects.
146 :     my $cgi = $self->Q();
147 :     my $sprout = $self->DB();
148 :     # Declare the return variable. If it remains undefined, the caller will
149 :     # know that an error occurred.
150 :     my $retVal;
151 :     # Get the genome ID.
152 :     my $genomeID = $cgi->param('genome');
153 :     # Get the tuning parameters.
154 :     my $options = $self->TuningParameters(%TuningParms);
155 :     # Validate the tuning parameters.
156 :     my @errorList = ();
157 :     for my $parm (keys %{$options}) {
158 :     if ($options->{$parm} !~ /^\d+$/) {
159 :     push @errorList, "Invalid $parm value \"$options->{$parm}\".";
160 :     }
161 :     }
162 :     if (@errorList > 0) {
163 :     $self->SetMessage(join(" ", @errorList));
164 :     } else {
165 :     $self->PrintLine("Retrieving features for genome $genomeID.");
166 :     # Here we have good tuning parameters. The next step is to loop through
167 :     # all the features for the incoming genome. We want to get them in
168 :     # location order within contig.
169 :     my $query = $sprout->Get(['HasFeature', 'Feature', 'IsLocatedIn', 'Contig'],
170 :     "HasFeature(from-link) = ? ORDER BY IsLocatedIn(to-link), IsLocatedIn(beg)",
171 :     [$genomeID]);
172 : parrello 1.2 # Create a feature result helper to help us process the features.
173 :     my $rhelp = RHFeatures->new($self);
174 :     # Set the columns.
175 :     $self->DefaultColumns($rhelp);
176 :     # Define the extra columns.
177 :     $rhelp->AddExtraColumn(operonID => 0, download => 'text', title => 'Operon ID', style => 'leftAlign');
178 :     $rhelp->AddExtraColumn(location => 1, download => 'text', title => 'Location', style => 'leftAlign');
179 :     $rhelp->AddExtraColumn(upstream => undef, download => 'align', title => 'Upstream DNA', style => 'leftAlign');
180 : parrello 1.1 # Start the session.
181 : parrello 1.2 $self->OpenSession($rhelp);
182 : parrello 1.1 # The trickiest part of this whole process is computing the operon information.
183 :     # Each feature has an operon ID and an operon sequence number. The operon ID
184 :     # is displayed as an extra column. The sequence number is combined with the
185 :     # operon ID to create the sort key. We arbitrarily assume an upper limit
186 :     # of a million operons each with no more than a million features. If we're
187 :     # wrong, the sort won't work but the data will still be okay. To start
188 :     # the whole process along, we prime the operon data with dummy values so
189 :     # that the first feature is considered to be part of a new operon. Note that
190 :     # the operon ID and sequence number are saved in the object so that the
191 :     # sort key method can find them.
192 :     $self->{operonID} = 0;
193 :     $self->{operonFeatureSeq} = 0;
194 :     # This variable contains the last feature's location. We put it on a bogus contig
195 :     # so that the first feature we encounted will be considered the start of a new
196 :     # operon.
197 :     my $lastLocation = BasicLocation->new(" ", 0, '+', 0);
198 : parrello 1.2 # This variable contains the last feature. We may receive multiple results for
199 : parrello 1.1 # a single feature. Only the last result is output.
200 : parrello 1.2 my $lastFeature;
201 : parrello 1.1 my $lastFid = "";
202 :     # Finally, we need to save the current contig ID and length.
203 :     $self->{contigID} = "";
204 :     $self->{contigLen} = 0;
205 :     # Loop until we run out of features.
206 :     while (my $feature = $query->Fetch()) {
207 :     # Get this feature's location.
208 :     my $thisLocation = BasicLocation->new($feature->Values(['IsLocatedIn(to-link)', 'IsLocatedIn(beg)',
209 :     'IsLocatedIn(dir)', 'IsLocatedIn(len)']));
210 :     # Get this feature's ID.
211 : parrello 1.2 my $thisFid = $feature->PrimaryValue('IsLocatedIn(from-link)');
212 : parrello 1.1 # Only proceed if this feature is not lint.
213 :     if ($thisLocation->Length >= $options->{lintSize}) {
214 :     # Determine whether or not this is a new feature.
215 :     if ($thisFid eq $lastFid) {
216 :     # This is a second location for the same feature. Combine its location with
217 :     # the last location.
218 :     $lastLocation->Combine($thisLocation);
219 :     Trace("Sublocation found. New location is " . $lastLocation->String()) if T(4);
220 :     } else {
221 :     # We have a new feature. Write out the previous feature's data (if any).
222 :     if ($lastFid) {
223 :     Trace("Writing feature $lastFid.") if T(4);
224 : parrello 1.2 $self->OutputFeature($rhelp, $lastFeature, $lastLocation, $options);
225 : parrello 1.1 $retVal++;
226 :     # Reveal our status every 100 features.
227 :     if ($retVal % 100 == 0) {
228 :     $self->PrintLine("$retVal features processed. $self->{operonID} operons found.");
229 :     }
230 :     }
231 : parrello 1.2 # Remember the new feature and its ID.
232 : parrello 1.1 $lastFid = $thisFid;
233 : parrello 1.2 $lastFeature = $feature;
234 : parrello 1.1 # Check the operon status.
235 :     if ($lastLocation->Contig eq $thisLocation->Contig &&
236 :     $lastLocation->Dir eq $thisLocation->Dir &&
237 :     $thisLocation->Left - $lastLocation->Right < $options->{nearDistance}) {
238 :     # Here we're part of the same operon. Increment the feature sequence number.
239 :     # For forward operons we add 1. For backward operons we subtract 1.
240 :     $self->{operonFeatureSeq} += $thisLocation->NumDirection();
241 :     Trace("New operon feature sequence number for $thisFid is $self->{operonFeatureSeq}.") if T(4);
242 :     } else {
243 :     # Here we belong to a new operon.
244 :     $self->{operonID}++;
245 :     # The sequence number will be incremented for forward operons and decremented
246 :     # for backward operons. The sequence number is used only for sorting: it insures
247 :     # that the operons are presented in their natural order. By starting at 5,000,000
248 :     # and relying on our assumption that therte are fewer than 1,000,000 features per
249 :     # operon, we insure that the sequence numbers are always the same number of
250 :     # digits, whether we decrement or increment.
251 :     $self->{operonFeatureSeq} = 5000000;
252 :     Trace("New operon ID for $thisFid is $self->{operonID}.") if T(4);
253 :     }
254 :     # Save the feature location.
255 :     $lastLocation = $thisLocation;
256 :     }
257 :     }
258 :     }
259 :     # Output the last feature (if any).
260 :     if ($lastFid) {
261 : parrello 1.2 $self->OutputFeature($rhelp, $lastFeature, $lastLocation, $options);
262 : parrello 1.1 $retVal++;
263 :     }
264 :     # Close the session file.
265 :     $self->CloseSession();
266 :     }
267 :     # Return the result count.
268 :     return $retVal;
269 :     }
270 :    
271 :     =head3 Description
272 :    
273 :     C<< my $htmlText = $shelp->Description(); >>
274 :    
275 :     Return a description of this search. The description is used for the table of contents
276 :     on the main search tools page. It may contain HTML, but it should be character-level,
277 :     not block-level, since the description is going to appear in a list.
278 :    
279 :     =cut
280 :    
281 :     sub Description {
282 :     # Get the parameters.
283 :     my ($self) = @_;
284 :     # Return the result.
285 :     return "Search for operons in a single genome.";
286 :     }
287 :    
288 :     =head3 SortKey
289 :    
290 : parrello 1.2 C<< my $key = $shelp->SortKey($rhelp, $record); >>
291 : parrello 1.1
292 : parrello 1.2 Return the sort key for the current feature. The features are
293 :     sorted by sequence within operon, which is determined entirely
294 :     by data cached in this object. The sort order may, however,
295 :     be modified by options
296 : parrello 1.1
297 :     =over 4
298 :    
299 : parrello 1.2 =item rhelp
300 :    
301 :     Current result helper object.
302 :    
303 : parrello 1.1 =item record
304 :    
305 : parrello 1.2 The C<ERDBObject> containing the current feature.
306 : parrello 1.1
307 :     =item RETURN
308 :    
309 :     Returns a key field that can be used to sort this row in among the results.
310 :    
311 :     =back
312 :    
313 :     =cut
314 :    
315 :     sub SortKey {
316 :     # Get the parameters.
317 : parrello 1.2 my ($self, $rhelp, $record) = @_;
318 : parrello 1.1 # Get the current operon ID and sequence.
319 :     my $operonID = $self->{operonID};
320 :     my $operonSeqNumber = $self->{operonFeatureSeq};
321 :     # The operon Sequence number is already at a fixed width. We need to pad to a
322 :     # fixed width for the operon ID.
323 : parrello 1.2 my $operonKey = "$operonID.$operonSeqNumber";
324 :     while (length $operonKey < 20) {
325 :     $operonKey = "0$operonKey";
326 : parrello 1.1 }
327 : parrello 1.2 # Create a feature sort key for this feature with the operon data mixed in.
328 :     my $retVal = $rhelp->SortKey($record, $operonKey);
329 : parrello 1.1 # Return the result.
330 :     return $retVal;
331 :     }
332 :    
333 :     =head3 OutputFeature
334 :    
335 : parrello 1.2 C<< $shelp->OutputFeature($rhelp, $feature, $location, $options); >>
336 : parrello 1.1
337 :     Output the current feature. We use the location to compute an upstream location for the feature,
338 :     and this is added to the feature data object as an extra column named C<Upstream DNA>. The operon ID
339 :     is added to the feature data object as an extra column named C<Operon ID>.
340 :    
341 :     =over 4
342 :    
343 : parrello 1.2 =item rhelp
344 :    
345 :     Feature result helper.
346 : parrello 1.1
347 : parrello 1.2 =item feature
348 : parrello 1.1
349 :     =item location
350 :    
351 :     A BasicLocation object describing the combined feature locations for the current feature.
352 :    
353 :     =item options
354 :    
355 :     Reference to a hash of options for this search. The options of interest to us are C<instream> and
356 :     C<upstream>
357 :    
358 :     =back
359 :    
360 :     =cut
361 :    
362 :     sub OutputFeature {
363 :     # Get the parameters.
364 : parrello 1.2 my ($self, $rhelp, $feature, $location, $options) = @_;
365 : parrello 1.1 # Get access to Sprout.
366 :     my $sprout = $self->DB();
367 :     # Get the contig length.
368 :     if ($self->{contigID} ne $location->Contig) {
369 :     $self->{contigID} = $location->Contig;
370 :     $self->{contigLen} = $sprout->ContigLength($location->Contig);
371 :     }
372 :     my $contigLen = $self->{contigLen};
373 :     # Get the upstream location.
374 :     my $upstreamLocation = $location->Upstream($options->{upstream}, $contigLen);
375 :     # Compute its DNA and convert it to upper case.
376 :     my $upstreamDNA = uc $sprout->DNASeq([$upstreamLocation->String]);
377 :     # Compute the instream DNA. We do this by truncating the feature location.
378 :     my $instreamLocation = BasicLocation->new($location);
379 :     $instreamLocation->Truncate($options->{instream});
380 :     my $instreamDNA = lc $sprout->DNASeq([$instreamLocation->String]);
381 :     # Combine the DNA sequences.
382 :     $upstreamDNA .= ":$instreamDNA";
383 :     if (T(4)) {
384 :     my $uString = $upstreamLocation->String;
385 :     my $iString = $instreamLocation->String;
386 :     my $oString = $location->String;
387 :     Trace("Upstream = $uString, Instream = $iString, Original = $oString.");
388 :     }
389 :     # Chop the genome ID off the location string so it looks better. Also, we'll use
390 :     # the SEED format so the user can eyeball the distance between genes.
391 :     my $locationString = $location->SeedString;
392 :     $locationString =~ s/^[^:]+://;
393 : parrello 1.2 # Store the upstream DNA in the result helper along with the operon ID. We add
394 : parrello 1.1 # the direction to the operon ID so the user knows more easily which way we're pointing.
395 : parrello 1.2 $rhelp->PutExtraColumns(operonID => $self->{operonID} . $location->Dir,
396 :     upstream => $upstreamDNA,
397 :     location => $locationString);
398 :     # Compute the sort key and the feature ID.
399 :     my $sortKey = $self->SortKey($rhelp, $feature);
400 :     my $fid = $feature->PrimaryValue('Feature(id)');
401 : parrello 1.1 # Put the feature to the output.
402 : parrello 1.2 $rhelp->PutData($sortKey, $fid, $feature);
403 :     }
404 :    
405 :     =head3 SearchTitle
406 :    
407 :     C<< my $titleHtml = $shelp->SearchTitle(); >>
408 :    
409 :     Return the display title for this search. The display title appears above the search results.
410 :     If no result is returned, no title will be displayed. The result should be an html string
411 :     that can be legally put inside a block tag such as C<h3> or C<p>.
412 :    
413 :     =cut
414 :    
415 :     sub SearchTitle {
416 :     # Get the parameters.
417 :     my ($self) = @_;
418 :     # Compute the title. We extract the genome ID from the query parameters.
419 :     my $cgi = $self->Q();
420 :     my $pdbID = $cgi->param('genome');
421 :     my $retVal = "Docking Results for $pdbID";
422 :     # Return it.
423 :     return $retVal;
424 : parrello 1.1 }
425 :    
426 : parrello 1.2
427 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3