[Bio] / Sprout / SHOpSearch.pm Repository:
ViewVC logotype

Annotation of /Sprout/SHOpSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SHOpSearch;
4 :    
5 :     use strict;
6 :     use Tracer;
7 : parrello 1.5 use CGI qw(-nosticky);
8 : parrello 1.1 use HTML;
9 :     use Sprout;
10 : parrello 1.2 use RHFeatures;
11 : parrello 1.1 use BasicLocation;
12 : parrello 1.2 use base 'SearchHelper';
13 : parrello 1.1
14 :     =head1 Operon Analysis Feature Search Helper
15 :    
16 :     =head2 Introduction
17 :    
18 :     This search method takes a genome ID and produces a list of the operons. An operon is defined
19 :     as a set of genes in the same direction that are near each other with no intervening genes.
20 :     The concept of I<near> is defined by a parameter. In addition to the standard data
21 :     produced by a feature search, this method also shows the upstream DNA for each feature
22 :     found. The size of the upstream region is also defined by a parameter.
23 :    
24 : parrello 1.2 This search has the following extra parameters.
25 : parrello 1.1
26 :     =over 4
27 :    
28 :     =item genome
29 :    
30 :     ID of the genome to process for operons.
31 :    
32 :     =item nearDistance
33 :    
34 :     Maximum distance in base pairs of successive genes in an operon. The default value is C<200>,
35 :     which means that if the distance between the end of one gene and the beginning of the next is more
36 :     than 200 base pairs, the genes are not considered to be part of the same operon.
37 :    
38 :     =item upstream
39 :    
40 :     Number of base pairs in front of the gene to include in the display of the upstream region. The default
41 :     value is C<250>.
42 :    
43 :     =item instream
44 :    
45 :     Number of base pairs inside the gene to include in the display of the upstream region. The default
46 :     value is C<50>.
47 :    
48 :     =item lintSize
49 :    
50 :     The maximum size of a gene that can be interpreted as lint. Genes with this number of base pairs
51 :     or fewer are ignored during the operon analysis. The default is C<100>.
52 :    
53 :     =back
54 :    
55 :     =cut
56 :    
57 :     my %TuningParms = (nearDistance => 200, upstream => 250, instream => 50, lintSize => 100);
58 :    
59 :     =head2 Virtual Methods
60 :    
61 :     =head3 Form
62 :    
63 : parrello 1.4 my $html = $shelp->Form();
64 : parrello 1.1
65 :     Generate the HTML for a form to request a new search.
66 :    
67 :     =cut
68 :    
69 :     sub Form {
70 :     # Get the parameters.
71 :     my ($self) = @_;
72 :     # Get the CGI and sprout objects.
73 :     my $cgi = $self->Q();
74 :     my $sprout = $self->DB();
75 :     # Start the form.
76 : parrello 1.5 my $retVal = $self->FormStart("Operon Analysis");
77 : parrello 1.1 # Get the selected gene.
78 :     my $genomeID = $cgi->param('genome');
79 :     my $genomeList = ($genomeID ? [$genomeID] : []);
80 :     # We'll accumulate the table rows in this variable.
81 :     my @rows = ();
82 :     # Start with the genome menu. We use a tall list but without multiple selection.
83 :     my $menu = $self->NmpdrGenomeMenu('genome', 0, $genomeList, 10);
84 : parrello 1.5 push @rows, CGI::Tr(CGI::td("Select a genome"), CGI::td({colspan => 2}, $menu));
85 : parrello 1.1 # Next we have the tuning parameters.
86 :     my $options = $self->TuningParameters(%TuningParms);
87 : parrello 1.5 push @rows, CGI::Tr(CGI::td({rowspan => 4}, "Tuning Parameters"),
88 :     CGI::td("Maximum distance between operon genes"),
89 :     CGI::td(CGI::textfield(-name => 'nearDistance',
90 : parrello 1.1 -value => $options->{nearDistance},
91 : parrello 1.2 -size => 5) .
92 : parrello 1.6 SearchHelper::Hint("OpSearch", 20))),
93 : parrello 1.5 CGI::Tr(CGI::td("Upstream base pairs to display"),
94 :     CGI::td(CGI::textfield(-name => 'upstream',
95 : parrello 1.1 -value => $options->{upstream},
96 : parrello 1.2 -size => 5) .
97 : parrello 1.6 SearchHelper::Hint("OpSearch", 21))),
98 : parrello 1.5 CGI::Tr(CGI::td("Instream base pairs to display"),
99 :     CGI::td(CGI::textfield(-name => 'instream',
100 : parrello 1.1 -value => $options->{instream},
101 : parrello 1.2 -size => 5) .
102 : parrello 1.6 SearchHelper::Hint("OpSearch", 22))),
103 : parrello 1.5 CGI::Tr(CGI::td("Maximum lint size"),
104 :     CGI::td(CGI::textfield(-name => 'lintSize',
105 : parrello 1.1 -value => $options->{lintSize},
106 : parrello 1.2 -size => 5) .
107 : parrello 1.6 SearchHelper::Hint("OpSearch", 23)));
108 : parrello 1.1 # Add the special feature options.
109 : parrello 1.2 push @rows, RHFeatures::FeatureFilterFormRows($self, 'options');
110 : parrello 1.1 # Add the submit button.
111 :     push @rows, $self->SubmitRow();
112 :     # Make the rows into a table.
113 :     $retVal .= $self->MakeTable(\@rows);
114 :     # Close the form.
115 :     $retVal .= $self->FormEnd();
116 :     # Return the result.
117 :     return $retVal;
118 :     }
119 :    
120 :     =head3 Find
121 :    
122 : parrello 1.4 my $resultCount = $shelp->Find();
123 : parrello 1.1
124 :     Conduct a search based on the current CGI query parameters. The search results will
125 :     be written to the session cache file and the number of results will be
126 :     returned. If the search parameters are invalid, a result count of C<undef> will be
127 :     returned and a result message will be stored in this object describing the problem.
128 :    
129 :     =cut
130 :    
131 :     sub Find {
132 :     my ($self) = @_;
133 :     # Get the CGI and Sprout objects.
134 :     my $cgi = $self->Q();
135 :     my $sprout = $self->DB();
136 :     # Declare the return variable. If it remains undefined, the caller will
137 :     # know that an error occurred.
138 :     my $retVal;
139 :     # Get the genome ID.
140 :     my $genomeID = $cgi->param('genome');
141 :     # Get the tuning parameters.
142 :     my $options = $self->TuningParameters(%TuningParms);
143 :     # Validate the tuning parameters.
144 :     my @errorList = ();
145 :     for my $parm (keys %{$options}) {
146 :     if ($options->{$parm} !~ /^\d+$/) {
147 :     push @errorList, "Invalid $parm value \"$options->{$parm}\".";
148 :     }
149 :     }
150 :     if (@errorList > 0) {
151 :     $self->SetMessage(join(" ", @errorList));
152 :     } else {
153 :     $self->PrintLine("Retrieving features for genome $genomeID.");
154 :     # Here we have good tuning parameters. The next step is to loop through
155 :     # all the features for the incoming genome. We want to get them in
156 :     # location order within contig.
157 :     my $query = $sprout->Get(['HasFeature', 'Feature', 'IsLocatedIn', 'Contig'],
158 :     "HasFeature(from-link) = ? ORDER BY IsLocatedIn(to-link), IsLocatedIn(beg)",
159 :     [$genomeID]);
160 : parrello 1.2 # Create a feature result helper to help us process the features.
161 :     my $rhelp = RHFeatures->new($self);
162 :     # Set the columns.
163 :     $self->DefaultColumns($rhelp);
164 :     # Define the extra columns.
165 :     $rhelp->AddExtraColumn(operonID => 0, download => 'text', title => 'Operon ID', style => 'leftAlign');
166 :     $rhelp->AddExtraColumn(location => 1, download => 'text', title => 'Location', style => 'leftAlign');
167 :     $rhelp->AddExtraColumn(upstream => undef, download => 'align', title => 'Upstream DNA', style => 'leftAlign');
168 : parrello 1.1 # Start the session.
169 : parrello 1.2 $self->OpenSession($rhelp);
170 : parrello 1.1 # The trickiest part of this whole process is computing the operon information.
171 :     # Each feature has an operon ID and an operon sequence number. The operon ID
172 :     # is displayed as an extra column. The sequence number is combined with the
173 :     # operon ID to create the sort key. We arbitrarily assume an upper limit
174 :     # of a million operons each with no more than a million features. If we're
175 :     # wrong, the sort won't work but the data will still be okay. To start
176 :     # the whole process along, we prime the operon data with dummy values so
177 :     # that the first feature is considered to be part of a new operon. Note that
178 :     # the operon ID and sequence number are saved in the object so that the
179 :     # sort key method can find them.
180 :     $self->{operonID} = 0;
181 :     $self->{operonFeatureSeq} = 0;
182 :     # This variable contains the last feature's location. We put it on a bogus contig
183 :     # so that the first feature we encounted will be considered the start of a new
184 :     # operon.
185 :     my $lastLocation = BasicLocation->new(" ", 0, '+', 0);
186 : parrello 1.2 # This variable contains the last feature. We may receive multiple results for
187 : parrello 1.1 # a single feature. Only the last result is output.
188 : parrello 1.2 my $lastFeature;
189 : parrello 1.1 my $lastFid = "";
190 :     # Finally, we need to save the current contig ID and length.
191 :     $self->{contigID} = "";
192 :     $self->{contigLen} = 0;
193 :     # Loop until we run out of features.
194 :     while (my $feature = $query->Fetch()) {
195 :     # Get this feature's location.
196 :     my $thisLocation = BasicLocation->new($feature->Values(['IsLocatedIn(to-link)', 'IsLocatedIn(beg)',
197 :     'IsLocatedIn(dir)', 'IsLocatedIn(len)']));
198 :     # Get this feature's ID.
199 : parrello 1.2 my $thisFid = $feature->PrimaryValue('IsLocatedIn(from-link)');
200 : parrello 1.1 # Only proceed if this feature is not lint.
201 :     if ($thisLocation->Length >= $options->{lintSize}) {
202 :     # Determine whether or not this is a new feature.
203 :     if ($thisFid eq $lastFid) {
204 :     # This is a second location for the same feature. Combine its location with
205 :     # the last location.
206 :     $lastLocation->Combine($thisLocation);
207 :     Trace("Sublocation found. New location is " . $lastLocation->String()) if T(4);
208 :     } else {
209 :     # We have a new feature. Write out the previous feature's data (if any).
210 :     if ($lastFid) {
211 :     Trace("Writing feature $lastFid.") if T(4);
212 : parrello 1.2 $self->OutputFeature($rhelp, $lastFeature, $lastLocation, $options);
213 : parrello 1.1 $retVal++;
214 :     # Reveal our status every 100 features.
215 :     if ($retVal % 100 == 0) {
216 :     $self->PrintLine("$retVal features processed. $self->{operonID} operons found.");
217 :     }
218 :     }
219 : parrello 1.2 # Remember the new feature and its ID.
220 : parrello 1.1 $lastFid = $thisFid;
221 : parrello 1.2 $lastFeature = $feature;
222 : parrello 1.1 # Check the operon status.
223 :     if ($lastLocation->Contig eq $thisLocation->Contig &&
224 :     $lastLocation->Dir eq $thisLocation->Dir &&
225 :     $thisLocation->Left - $lastLocation->Right < $options->{nearDistance}) {
226 :     # Here we're part of the same operon. Increment the feature sequence number.
227 :     # For forward operons we add 1. For backward operons we subtract 1.
228 :     $self->{operonFeatureSeq} += $thisLocation->NumDirection();
229 :     Trace("New operon feature sequence number for $thisFid is $self->{operonFeatureSeq}.") if T(4);
230 :     } else {
231 :     # Here we belong to a new operon.
232 :     $self->{operonID}++;
233 :     # The sequence number will be incremented for forward operons and decremented
234 :     # for backward operons. The sequence number is used only for sorting: it insures
235 :     # that the operons are presented in their natural order. By starting at 5,000,000
236 :     # and relying on our assumption that therte are fewer than 1,000,000 features per
237 :     # operon, we insure that the sequence numbers are always the same number of
238 :     # digits, whether we decrement or increment.
239 :     $self->{operonFeatureSeq} = 5000000;
240 :     Trace("New operon ID for $thisFid is $self->{operonID}.") if T(4);
241 :     }
242 :     # Save the feature location.
243 :     $lastLocation = $thisLocation;
244 :     }
245 :     }
246 :     }
247 :     # Output the last feature (if any).
248 :     if ($lastFid) {
249 : parrello 1.2 $self->OutputFeature($rhelp, $lastFeature, $lastLocation, $options);
250 : parrello 1.1 $retVal++;
251 :     }
252 :     # Close the session file.
253 :     $self->CloseSession();
254 :     }
255 :     # Return the result count.
256 :     return $retVal;
257 :     }
258 :    
259 :     =head3 Description
260 :    
261 : parrello 1.4 my $htmlText = $shelp->Description();
262 : parrello 1.1
263 :     Return a description of this search. The description is used for the table of contents
264 :     on the main search tools page. It may contain HTML, but it should be character-level,
265 :     not block-level, since the description is going to appear in a list.
266 :    
267 :     =cut
268 :    
269 :     sub Description {
270 :     # Get the parameters.
271 :     my ($self) = @_;
272 :     # Return the result.
273 : parrello 1.5 return "Search for %FIG{operons}% in a single %FIG{genome}%.";
274 : parrello 1.1 }
275 :    
276 :     =head3 SortKey
277 :    
278 : parrello 1.4 my $key = $shelp->SortKey($rhelp, $record);
279 : parrello 1.1
280 : parrello 1.2 Return the sort key for the current feature. The features are
281 :     sorted by sequence within operon, which is determined entirely
282 :     by data cached in this object. The sort order may, however,
283 :     be modified by options
284 : parrello 1.1
285 :     =over 4
286 :    
287 : parrello 1.2 =item rhelp
288 :    
289 :     Current result helper object.
290 :    
291 : parrello 1.1 =item record
292 :    
293 : parrello 1.2 The C<ERDBObject> containing the current feature.
294 : parrello 1.1
295 :     =item RETURN
296 :    
297 :     Returns a key field that can be used to sort this row in among the results.
298 :    
299 :     =back
300 :    
301 :     =cut
302 :    
303 :     sub SortKey {
304 :     # Get the parameters.
305 : parrello 1.2 my ($self, $rhelp, $record) = @_;
306 : parrello 1.1 # Get the current operon ID and sequence.
307 :     my $operonID = $self->{operonID};
308 :     my $operonSeqNumber = $self->{operonFeatureSeq};
309 :     # The operon Sequence number is already at a fixed width. We need to pad to a
310 :     # fixed width for the operon ID.
311 : parrello 1.2 my $operonKey = "$operonID.$operonSeqNumber";
312 :     while (length $operonKey < 20) {
313 :     $operonKey = "0$operonKey";
314 : parrello 1.1 }
315 : parrello 1.2 # Create a feature sort key for this feature with the operon data mixed in.
316 :     my $retVal = $rhelp->SortKey($record, $operonKey);
317 : parrello 1.1 # Return the result.
318 :     return $retVal;
319 :     }
320 :    
321 :     =head3 OutputFeature
322 :    
323 : parrello 1.4 $shelp->OutputFeature($rhelp, $feature, $location, $options);
324 : parrello 1.1
325 :     Output the current feature. We use the location to compute an upstream location for the feature,
326 :     and this is added to the feature data object as an extra column named C<Upstream DNA>. The operon ID
327 :     is added to the feature data object as an extra column named C<Operon ID>.
328 :    
329 :     =over 4
330 :    
331 : parrello 1.2 =item rhelp
332 :    
333 :     Feature result helper.
334 : parrello 1.1
335 : parrello 1.2 =item feature
336 : parrello 1.1
337 :     =item location
338 :    
339 :     A BasicLocation object describing the combined feature locations for the current feature.
340 :    
341 :     =item options
342 :    
343 :     Reference to a hash of options for this search. The options of interest to us are C<instream> and
344 : parrello 1.4 C<upstream>
345 : parrello 1.1
346 :     =back
347 :    
348 :     =cut
349 :    
350 :     sub OutputFeature {
351 :     # Get the parameters.
352 : parrello 1.2 my ($self, $rhelp, $feature, $location, $options) = @_;
353 : parrello 1.1 # Get access to Sprout.
354 :     my $sprout = $self->DB();
355 :     # Get the contig length.
356 :     if ($self->{contigID} ne $location->Contig) {
357 :     $self->{contigID} = $location->Contig;
358 :     $self->{contigLen} = $sprout->ContigLength($location->Contig);
359 :     }
360 :     my $contigLen = $self->{contigLen};
361 :     # Get the upstream location.
362 :     my $upstreamLocation = $location->Upstream($options->{upstream}, $contigLen);
363 :     # Compute its DNA and convert it to upper case.
364 :     my $upstreamDNA = uc $sprout->DNASeq([$upstreamLocation->String]);
365 :     # Compute the instream DNA. We do this by truncating the feature location.
366 :     my $instreamLocation = BasicLocation->new($location);
367 :     $instreamLocation->Truncate($options->{instream});
368 :     my $instreamDNA = lc $sprout->DNASeq([$instreamLocation->String]);
369 :     # Combine the DNA sequences.
370 :     $upstreamDNA .= ":$instreamDNA";
371 :     if (T(4)) {
372 :     my $uString = $upstreamLocation->String;
373 :     my $iString = $instreamLocation->String;
374 :     my $oString = $location->String;
375 :     Trace("Upstream = $uString, Instream = $iString, Original = $oString.");
376 :     }
377 :     # Chop the genome ID off the location string so it looks better. Also, we'll use
378 :     # the SEED format so the user can eyeball the distance between genes.
379 :     my $locationString = $location->SeedString;
380 :     $locationString =~ s/^[^:]+://;
381 : parrello 1.2 # Store the upstream DNA in the result helper along with the operon ID. We add
382 : parrello 1.1 # the direction to the operon ID so the user knows more easily which way we're pointing.
383 : parrello 1.2 $rhelp->PutExtraColumns(operonID => $self->{operonID} . $location->Dir,
384 :     upstream => $upstreamDNA,
385 :     location => $locationString);
386 :     # Compute the sort key and the feature ID.
387 :     my $sortKey = $self->SortKey($rhelp, $feature);
388 :     my $fid = $feature->PrimaryValue('Feature(id)');
389 : parrello 1.1 # Put the feature to the output.
390 : parrello 1.2 $rhelp->PutData($sortKey, $fid, $feature);
391 :     }
392 :    
393 :     =head3 SearchTitle
394 :    
395 : parrello 1.4 my $titleHtml = $shelp->SearchTitle();
396 : parrello 1.2
397 :     Return the display title for this search. The display title appears above the search results.
398 :     If no result is returned, no title will be displayed. The result should be an html string
399 :     that can be legally put inside a block tag such as C<h3> or C<p>.
400 :    
401 :     =cut
402 :    
403 :     sub SearchTitle {
404 :     # Get the parameters.
405 :     my ($self) = @_;
406 :     # Compute the title. We extract the genome ID from the query parameters.
407 :     my $cgi = $self->Q();
408 : parrello 1.5 my $genomeID = $cgi->param('genome');
409 :     my $retVal = "Operon search for $genomeID";
410 : parrello 1.2 # Return it.
411 :     return $retVal;
412 : parrello 1.1 }
413 :    
414 : parrello 1.2
415 : parrello 1.4 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3