[Bio] / Sprout / SHOpSearch.pm Repository:
ViewVC logotype

Annotation of /Sprout/SHOpSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SHOpSearch;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use CGI;
8 :     use HTML;
9 :     use Sprout;
10 : parrello 1.2 use RHFeatures;
11 : parrello 1.1 use BasicLocation;
12 : parrello 1.2 use base 'SearchHelper';
13 : parrello 1.1
14 :     =head1 Operon Analysis Feature Search Helper
15 :    
16 :     =head2 Introduction
17 :    
18 :     This search method takes a genome ID and produces a list of the operons. An operon is defined
19 :     as a set of genes in the same direction that are near each other with no intervening genes.
20 :     The concept of I<near> is defined by a parameter. In addition to the standard data
21 :     produced by a feature search, this method also shows the upstream DNA for each feature
22 :     found. The size of the upstream region is also defined by a parameter.
23 :    
24 : parrello 1.2 This search has the following extra parameters.
25 : parrello 1.1
26 :     =over 4
27 :    
28 :     =item genome
29 :    
30 :     ID of the genome to process for operons.
31 :    
32 :     =item nearDistance
33 :    
34 :     Maximum distance in base pairs of successive genes in an operon. The default value is C<200>,
35 :     which means that if the distance between the end of one gene and the beginning of the next is more
36 :     than 200 base pairs, the genes are not considered to be part of the same operon.
37 :    
38 :     =item upstream
39 :    
40 :     Number of base pairs in front of the gene to include in the display of the upstream region. The default
41 :     value is C<250>.
42 :    
43 :     =item instream
44 :    
45 :     Number of base pairs inside the gene to include in the display of the upstream region. The default
46 :     value is C<50>.
47 :    
48 :     =item lintSize
49 :    
50 :     The maximum size of a gene that can be interpreted as lint. Genes with this number of base pairs
51 :     or fewer are ignored during the operon analysis. The default is C<100>.
52 :    
53 :     =back
54 :    
55 :     =cut
56 :    
57 :     my %TuningParms = (nearDistance => 200, upstream => 250, instream => 50, lintSize => 100);
58 :    
59 :     =head2 Virtual Methods
60 :    
61 :     =head3 Form
62 :    
63 :     C<< my $html = $shelp->Form(); >>
64 :    
65 :     Generate the HTML for a form to request a new search.
66 :    
67 :     =cut
68 :    
69 :     sub Form {
70 :     # Get the parameters.
71 :     my ($self) = @_;
72 :     # Get the CGI and sprout objects.
73 :     my $cgi = $self->Q();
74 :     my $sprout = $self->DB();
75 :     # Start the form.
76 :     my $retVal = $self->FormStart("Operon Anlaysis");
77 :     # Get the selected gene.
78 :     my $genomeID = $cgi->param('genome');
79 :     my $genomeList = ($genomeID ? [$genomeID] : []);
80 :     # We'll accumulate the table rows in this variable.
81 :     my @rows = ();
82 :     # Start with the genome menu. We use a tall list but without multiple selection.
83 :     my $menu = $self->NmpdrGenomeMenu('genome', 0, $genomeList, 10);
84 :     push @rows, $cgi->Tr($cgi->td("Select a genome"), $cgi->td({colspan => 2}, $menu));
85 :     # Next we have the tuning parameters.
86 :     my $options = $self->TuningParameters(%TuningParms);
87 :     push @rows, $cgi->Tr($cgi->td({rowspan => 4}, "Tuning Parameters"),
88 :     $cgi->td("Maximum distance between operon genes"),
89 :     $cgi->td($cgi->textfield(-name => 'nearDistance',
90 :     -value => $options->{nearDistance},
91 : parrello 1.2 -size => 5) .
92 :     SearchHelper::Hint("If the distance between two genes " .
93 :     "is greater than this number of base " .
94 :     "pairs, they will be treated as " .
95 :     "belonging to different operons."))),
96 : parrello 1.1 $cgi->Tr($cgi->td("Upstream base pairs to display"),
97 :     $cgi->td($cgi->textfield(-name => 'upstream',
98 :     -value => $options->{upstream},
99 : parrello 1.2 -size => 5) .
100 :     SearchHelper::Hint("When displaying the upstream DNA, this is " .
101 :     "the number of base pairs preceding the gene " .
102 :     "that will be shown."))),
103 : parrello 1.1 $cgi->Tr($cgi->td("Instream base pairs to display"),
104 :     $cgi->td($cgi->textfield(-name => 'instream',
105 :     -value => $options->{instream},
106 : parrello 1.2 -size => 5) .
107 :     SearchHelper::Hint("When displaying the upstream DNA, this is " .
108 :     "the number of base pairs from inside the " .
109 :     "gene that will be shown."))),
110 : parrello 1.1 $cgi->Tr($cgi->td("Maximum lint size"),
111 :     $cgi->td($cgi->textfield(-name => 'lintSize',
112 :     -value => $options->{lintSize},
113 : parrello 1.2 -size => 5) .
114 :     SearchHelper::Hint("Genes whose size in base pairs are equal to " .
115 :     "or less than this amount will; be ignored.")));
116 : parrello 1.1 # Add the special feature options.
117 : parrello 1.2 push @rows, RHFeatures::FeatureFilterFormRows($self, 'options');
118 : parrello 1.1 # Add the submit button.
119 :     push @rows, $self->SubmitRow();
120 :     # Make the rows into a table.
121 :     $retVal .= $self->MakeTable(\@rows);
122 :     # Close the form.
123 :     $retVal .= $self->FormEnd();
124 :     # Return the result.
125 :     return $retVal;
126 :     }
127 :    
128 :     =head3 Find
129 :    
130 :     C<< my $resultCount = $shelp->Find(); >>
131 :    
132 :     Conduct a search based on the current CGI query parameters. The search results will
133 :     be written to the session cache file and the number of results will be
134 :     returned. If the search parameters are invalid, a result count of C<undef> will be
135 :     returned and a result message will be stored in this object describing the problem.
136 :    
137 :     =cut
138 :    
139 :     sub Find {
140 :     my ($self) = @_;
141 :     # Get the CGI and Sprout objects.
142 :     my $cgi = $self->Q();
143 :     my $sprout = $self->DB();
144 :     # Declare the return variable. If it remains undefined, the caller will
145 :     # know that an error occurred.
146 :     my $retVal;
147 :     # Get the genome ID.
148 :     my $genomeID = $cgi->param('genome');
149 :     # Get the tuning parameters.
150 :     my $options = $self->TuningParameters(%TuningParms);
151 :     # Validate the tuning parameters.
152 :     my @errorList = ();
153 :     for my $parm (keys %{$options}) {
154 :     if ($options->{$parm} !~ /^\d+$/) {
155 :     push @errorList, "Invalid $parm value \"$options->{$parm}\".";
156 :     }
157 :     }
158 :     if (@errorList > 0) {
159 :     $self->SetMessage(join(" ", @errorList));
160 :     } else {
161 :     $self->PrintLine("Retrieving features for genome $genomeID.");
162 :     # Here we have good tuning parameters. The next step is to loop through
163 :     # all the features for the incoming genome. We want to get them in
164 :     # location order within contig.
165 :     my $query = $sprout->Get(['HasFeature', 'Feature', 'IsLocatedIn', 'Contig'],
166 :     "HasFeature(from-link) = ? ORDER BY IsLocatedIn(to-link), IsLocatedIn(beg)",
167 :     [$genomeID]);
168 : parrello 1.2 # Create a feature result helper to help us process the features.
169 :     my $rhelp = RHFeatures->new($self);
170 :     # Set the columns.
171 :     $self->DefaultColumns($rhelp);
172 :     # Define the extra columns.
173 :     $rhelp->AddExtraColumn(operonID => 0, download => 'text', title => 'Operon ID', style => 'leftAlign');
174 :     $rhelp->AddExtraColumn(location => 1, download => 'text', title => 'Location', style => 'leftAlign');
175 :     $rhelp->AddExtraColumn(upstream => undef, download => 'align', title => 'Upstream DNA', style => 'leftAlign');
176 : parrello 1.1 # Start the session.
177 : parrello 1.2 $self->OpenSession($rhelp);
178 : parrello 1.1 # The trickiest part of this whole process is computing the operon information.
179 :     # Each feature has an operon ID and an operon sequence number. The operon ID
180 :     # is displayed as an extra column. The sequence number is combined with the
181 :     # operon ID to create the sort key. We arbitrarily assume an upper limit
182 :     # of a million operons each with no more than a million features. If we're
183 :     # wrong, the sort won't work but the data will still be okay. To start
184 :     # the whole process along, we prime the operon data with dummy values so
185 :     # that the first feature is considered to be part of a new operon. Note that
186 :     # the operon ID and sequence number are saved in the object so that the
187 :     # sort key method can find them.
188 :     $self->{operonID} = 0;
189 :     $self->{operonFeatureSeq} = 0;
190 :     # This variable contains the last feature's location. We put it on a bogus contig
191 :     # so that the first feature we encounted will be considered the start of a new
192 :     # operon.
193 :     my $lastLocation = BasicLocation->new(" ", 0, '+', 0);
194 : parrello 1.2 # This variable contains the last feature. We may receive multiple results for
195 : parrello 1.1 # a single feature. Only the last result is output.
196 : parrello 1.2 my $lastFeature;
197 : parrello 1.1 my $lastFid = "";
198 :     # Finally, we need to save the current contig ID and length.
199 :     $self->{contigID} = "";
200 :     $self->{contigLen} = 0;
201 :     # Loop until we run out of features.
202 :     while (my $feature = $query->Fetch()) {
203 :     # Get this feature's location.
204 :     my $thisLocation = BasicLocation->new($feature->Values(['IsLocatedIn(to-link)', 'IsLocatedIn(beg)',
205 :     'IsLocatedIn(dir)', 'IsLocatedIn(len)']));
206 :     # Get this feature's ID.
207 : parrello 1.2 my $thisFid = $feature->PrimaryValue('IsLocatedIn(from-link)');
208 : parrello 1.1 # Only proceed if this feature is not lint.
209 :     if ($thisLocation->Length >= $options->{lintSize}) {
210 :     # Determine whether or not this is a new feature.
211 :     if ($thisFid eq $lastFid) {
212 :     # This is a second location for the same feature. Combine its location with
213 :     # the last location.
214 :     $lastLocation->Combine($thisLocation);
215 :     Trace("Sublocation found. New location is " . $lastLocation->String()) if T(4);
216 :     } else {
217 :     # We have a new feature. Write out the previous feature's data (if any).
218 :     if ($lastFid) {
219 :     Trace("Writing feature $lastFid.") if T(4);
220 : parrello 1.2 $self->OutputFeature($rhelp, $lastFeature, $lastLocation, $options);
221 : parrello 1.1 $retVal++;
222 :     # Reveal our status every 100 features.
223 :     if ($retVal % 100 == 0) {
224 :     $self->PrintLine("$retVal features processed. $self->{operonID} operons found.");
225 :     }
226 :     }
227 : parrello 1.2 # Remember the new feature and its ID.
228 : parrello 1.1 $lastFid = $thisFid;
229 : parrello 1.2 $lastFeature = $feature;
230 : parrello 1.1 # Check the operon status.
231 :     if ($lastLocation->Contig eq $thisLocation->Contig &&
232 :     $lastLocation->Dir eq $thisLocation->Dir &&
233 :     $thisLocation->Left - $lastLocation->Right < $options->{nearDistance}) {
234 :     # Here we're part of the same operon. Increment the feature sequence number.
235 :     # For forward operons we add 1. For backward operons we subtract 1.
236 :     $self->{operonFeatureSeq} += $thisLocation->NumDirection();
237 :     Trace("New operon feature sequence number for $thisFid is $self->{operonFeatureSeq}.") if T(4);
238 :     } else {
239 :     # Here we belong to a new operon.
240 :     $self->{operonID}++;
241 :     # The sequence number will be incremented for forward operons and decremented
242 :     # for backward operons. The sequence number is used only for sorting: it insures
243 :     # that the operons are presented in their natural order. By starting at 5,000,000
244 :     # and relying on our assumption that therte are fewer than 1,000,000 features per
245 :     # operon, we insure that the sequence numbers are always the same number of
246 :     # digits, whether we decrement or increment.
247 :     $self->{operonFeatureSeq} = 5000000;
248 :     Trace("New operon ID for $thisFid is $self->{operonID}.") if T(4);
249 :     }
250 :     # Save the feature location.
251 :     $lastLocation = $thisLocation;
252 :     }
253 :     }
254 :     }
255 :     # Output the last feature (if any).
256 :     if ($lastFid) {
257 : parrello 1.2 $self->OutputFeature($rhelp, $lastFeature, $lastLocation, $options);
258 : parrello 1.1 $retVal++;
259 :     }
260 :     # Close the session file.
261 :     $self->CloseSession();
262 :     }
263 :     # Return the result count.
264 :     return $retVal;
265 :     }
266 :    
267 :     =head3 Description
268 :    
269 :     C<< my $htmlText = $shelp->Description(); >>
270 :    
271 :     Return a description of this search. The description is used for the table of contents
272 :     on the main search tools page. It may contain HTML, but it should be character-level,
273 :     not block-level, since the description is going to appear in a list.
274 :    
275 :     =cut
276 :    
277 :     sub Description {
278 :     # Get the parameters.
279 :     my ($self) = @_;
280 :     # Return the result.
281 :     return "Search for operons in a single genome.";
282 :     }
283 :    
284 :     =head3 SortKey
285 :    
286 : parrello 1.2 C<< my $key = $shelp->SortKey($rhelp, $record); >>
287 : parrello 1.1
288 : parrello 1.2 Return the sort key for the current feature. The features are
289 :     sorted by sequence within operon, which is determined entirely
290 :     by data cached in this object. The sort order may, however,
291 :     be modified by options
292 : parrello 1.1
293 :     =over 4
294 :    
295 : parrello 1.2 =item rhelp
296 :    
297 :     Current result helper object.
298 :    
299 : parrello 1.1 =item record
300 :    
301 : parrello 1.2 The C<ERDBObject> containing the current feature.
302 : parrello 1.1
303 :     =item RETURN
304 :    
305 :     Returns a key field that can be used to sort this row in among the results.
306 :    
307 :     =back
308 :    
309 :     =cut
310 :    
311 :     sub SortKey {
312 :     # Get the parameters.
313 : parrello 1.2 my ($self, $rhelp, $record) = @_;
314 : parrello 1.1 # Get the current operon ID and sequence.
315 :     my $operonID = $self->{operonID};
316 :     my $operonSeqNumber = $self->{operonFeatureSeq};
317 :     # The operon Sequence number is already at a fixed width. We need to pad to a
318 :     # fixed width for the operon ID.
319 : parrello 1.2 my $operonKey = "$operonID.$operonSeqNumber";
320 :     while (length $operonKey < 20) {
321 :     $operonKey = "0$operonKey";
322 : parrello 1.1 }
323 : parrello 1.2 # Create a feature sort key for this feature with the operon data mixed in.
324 :     my $retVal = $rhelp->SortKey($record, $operonKey);
325 : parrello 1.1 # Return the result.
326 :     return $retVal;
327 :     }
328 :    
329 :     =head3 OutputFeature
330 :    
331 : parrello 1.2 C<< $shelp->OutputFeature($rhelp, $feature, $location, $options); >>
332 : parrello 1.1
333 :     Output the current feature. We use the location to compute an upstream location for the feature,
334 :     and this is added to the feature data object as an extra column named C<Upstream DNA>. The operon ID
335 :     is added to the feature data object as an extra column named C<Operon ID>.
336 :    
337 :     =over 4
338 :    
339 : parrello 1.2 =item rhelp
340 :    
341 :     Feature result helper.
342 : parrello 1.1
343 : parrello 1.2 =item feature
344 : parrello 1.1
345 :     =item location
346 :    
347 :     A BasicLocation object describing the combined feature locations for the current feature.
348 :    
349 :     =item options
350 :    
351 :     Reference to a hash of options for this search. The options of interest to us are C<instream> and
352 :     C<upstream>
353 :    
354 :     =back
355 :    
356 :     =cut
357 :    
358 :     sub OutputFeature {
359 :     # Get the parameters.
360 : parrello 1.2 my ($self, $rhelp, $feature, $location, $options) = @_;
361 : parrello 1.1 # Get access to Sprout.
362 :     my $sprout = $self->DB();
363 :     # Get the contig length.
364 :     if ($self->{contigID} ne $location->Contig) {
365 :     $self->{contigID} = $location->Contig;
366 :     $self->{contigLen} = $sprout->ContigLength($location->Contig);
367 :     }
368 :     my $contigLen = $self->{contigLen};
369 :     # Get the upstream location.
370 :     my $upstreamLocation = $location->Upstream($options->{upstream}, $contigLen);
371 :     # Compute its DNA and convert it to upper case.
372 :     my $upstreamDNA = uc $sprout->DNASeq([$upstreamLocation->String]);
373 :     # Compute the instream DNA. We do this by truncating the feature location.
374 :     my $instreamLocation = BasicLocation->new($location);
375 :     $instreamLocation->Truncate($options->{instream});
376 :     my $instreamDNA = lc $sprout->DNASeq([$instreamLocation->String]);
377 :     # Combine the DNA sequences.
378 :     $upstreamDNA .= ":$instreamDNA";
379 :     if (T(4)) {
380 :     my $uString = $upstreamLocation->String;
381 :     my $iString = $instreamLocation->String;
382 :     my $oString = $location->String;
383 :     Trace("Upstream = $uString, Instream = $iString, Original = $oString.");
384 :     }
385 :     # Chop the genome ID off the location string so it looks better. Also, we'll use
386 :     # the SEED format so the user can eyeball the distance between genes.
387 :     my $locationString = $location->SeedString;
388 :     $locationString =~ s/^[^:]+://;
389 : parrello 1.2 # Store the upstream DNA in the result helper along with the operon ID. We add
390 : parrello 1.1 # the direction to the operon ID so the user knows more easily which way we're pointing.
391 : parrello 1.2 $rhelp->PutExtraColumns(operonID => $self->{operonID} . $location->Dir,
392 :     upstream => $upstreamDNA,
393 :     location => $locationString);
394 :     # Compute the sort key and the feature ID.
395 :     my $sortKey = $self->SortKey($rhelp, $feature);
396 :     my $fid = $feature->PrimaryValue('Feature(id)');
397 : parrello 1.1 # Put the feature to the output.
398 : parrello 1.2 $rhelp->PutData($sortKey, $fid, $feature);
399 :     }
400 :    
401 :     =head3 SearchTitle
402 :    
403 :     C<< my $titleHtml = $shelp->SearchTitle(); >>
404 :    
405 :     Return the display title for this search. The display title appears above the search results.
406 :     If no result is returned, no title will be displayed. The result should be an html string
407 :     that can be legally put inside a block tag such as C<h3> or C<p>.
408 :    
409 :     =cut
410 :    
411 :     sub SearchTitle {
412 :     # Get the parameters.
413 :     my ($self) = @_;
414 :     # Compute the title. We extract the genome ID from the query parameters.
415 :     my $cgi = $self->Q();
416 :     my $pdbID = $cgi->param('genome');
417 :     my $retVal = "Docking Results for $pdbID";
418 :     # Return it.
419 :     return $retVal;
420 : parrello 1.1 }
421 :    
422 : parrello 1.2
423 : parrello 1.1 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3