[Bio] / Sprout / SHOpSearch.pm Repository:
ViewVC logotype

Annotation of /Sprout/SHOpSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SHOpSearch;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use SearchHelper;
8 :     use CGI;
9 :     use HTML;
10 :     use Sprout;
11 :     use FeatureData;
12 :     use FeatureQuery;
13 :     use BasicLocation;
14 :    
15 :     our @ISA = qw(SearchHelper);
16 :    
17 :     =head1 Operon Analysis Feature Search Helper
18 :    
19 :     =head2 Introduction
20 :    
21 :     This search method takes a genome ID and produces a list of the operons. An operon is defined
22 :     as a set of genes in the same direction that are near each other with no intervening genes.
23 :     The concept of I<near> is defined by a parameter. In addition to the standard data
24 :     produced by a feature search, this method also shows the upstream DNA for each feature
25 :     found. The size of the upstream region is also defined by a parameter.
26 :    
27 :     It has the following extra parameters.
28 :    
29 :     =over 4
30 :    
31 :     =item genome
32 :    
33 :     ID of the genome to process for operons.
34 :    
35 :     =item nearDistance
36 :    
37 :     Maximum distance in base pairs of successive genes in an operon. The default value is C<200>,
38 :     which means that if the distance between the end of one gene and the beginning of the next is more
39 :     than 200 base pairs, the genes are not considered to be part of the same operon.
40 :    
41 :     =item upstream
42 :    
43 :     Number of base pairs in front of the gene to include in the display of the upstream region. The default
44 :     value is C<250>.
45 :    
46 :     =item instream
47 :    
48 :     Number of base pairs inside the gene to include in the display of the upstream region. The default
49 :     value is C<50>.
50 :    
51 :     =item lintSize
52 :    
53 :     The maximum size of a gene that can be interpreted as lint. Genes with this number of base pairs
54 :     or fewer are ignored during the operon analysis. The default is C<100>.
55 :    
56 :     =back
57 :    
58 :     =cut
59 :    
60 :     my %TuningParms = (nearDistance => 200, upstream => 250, instream => 50, lintSize => 100);
61 :    
62 :     =head2 Virtual Methods
63 :    
64 :     =head3 Form
65 :    
66 :     C<< my $html = $shelp->Form(); >>
67 :    
68 :     Generate the HTML for a form to request a new search.
69 :    
70 :     =cut
71 :    
72 :     sub Form {
73 :     # Get the parameters.
74 :     my ($self) = @_;
75 :     # Get the CGI and sprout objects.
76 :     my $cgi = $self->Q();
77 :     my $sprout = $self->DB();
78 :     # Start the form.
79 :     my $retVal = $self->FormStart("Operon Anlaysis");
80 :     # Get the selected gene.
81 :     my $genomeID = $cgi->param('genome');
82 :     my $genomeList = ($genomeID ? [$genomeID] : []);
83 :     # We'll accumulate the table rows in this variable.
84 :     my @rows = ();
85 :     # Start with the genome menu. We use a tall list but without multiple selection.
86 :     my $menu = $self->NmpdrGenomeMenu('genome', 0, $genomeList, 10);
87 :     push @rows, $cgi->Tr($cgi->td("Select a genome"), $cgi->td({colspan => 2}, $menu));
88 :     # Next we have the tuning parameters.
89 :     my $options = $self->TuningParameters(%TuningParms);
90 :     push @rows, $cgi->Tr($cgi->td({rowspan => 4}, "Tuning Parameters"),
91 :     $cgi->td("Maximum distance between operon genes"),
92 :     $cgi->td($cgi->textfield(-name => 'nearDistance',
93 :     -value => $options->{nearDistance},
94 :     -size => 5))),
95 :     $cgi->Tr($cgi->td("Upstream base pairs to display"),
96 :     $cgi->td($cgi->textfield(-name => 'upstream',
97 :     -value => $options->{upstream},
98 :     -size => 5))),
99 :     $cgi->Tr($cgi->td("Instream base pairs to display"),
100 :     $cgi->td($cgi->textfield(-name => 'instream',
101 :     -value => $options->{instream},
102 :     -size => 5))),
103 :     $cgi->Tr($cgi->td("Maximum lint size"),
104 :     $cgi->td($cgi->textfield(-name => 'lintSize',
105 :     -value => $options->{lintSize},
106 :     -size => 5)));
107 :     # Add the special feature options.
108 :     push @rows, FeatureQuery::SpecialOptions($self);
109 :     # Add the submit button.
110 :     push @rows, $self->SubmitRow();
111 :     # Make the rows into a table.
112 :     $retVal .= $self->MakeTable(\@rows);
113 :     # Close the form.
114 :     $retVal .= $self->FormEnd();
115 :     # Return the result.
116 :     return $retVal;
117 :     }
118 :    
119 :     =head3 Find
120 :    
121 :     C<< my $resultCount = $shelp->Find(); >>
122 :    
123 :     Conduct a search based on the current CGI query parameters. The search results will
124 :     be written to the session cache file and the number of results will be
125 :     returned. If the search parameters are invalid, a result count of C<undef> will be
126 :     returned and a result message will be stored in this object describing the problem.
127 :    
128 :     =cut
129 :    
130 :     sub Find {
131 :     my ($self) = @_;
132 :     # Get the CGI and Sprout objects.
133 :     my $cgi = $self->Q();
134 :     my $sprout = $self->DB();
135 :     # Declare the return variable. If it remains undefined, the caller will
136 :     # know that an error occurred.
137 :     my $retVal;
138 :     # Denote that the upstream column gos at the end.
139 :     $self->SetExtraPos('Upstream DNA', 'Location');
140 :     # Get the genome ID.
141 :     my $genomeID = $cgi->param('genome');
142 :     # Get the tuning parameters.
143 :     my $options = $self->TuningParameters(%TuningParms);
144 :     # Validate the tuning parameters.
145 :     my @errorList = ();
146 :     for my $parm (keys %{$options}) {
147 :     if ($options->{$parm} !~ /^\d+$/) {
148 :     push @errorList, "Invalid $parm value \"$options->{$parm}\".";
149 :     }
150 :     }
151 :     if (@errorList > 0) {
152 :     $self->SetMessage(join(" ", @errorList));
153 :     } else {
154 :     $self->PrintLine("Retrieving features for genome $genomeID.");
155 :     # Here we have good tuning parameters. The next step is to loop through
156 :     # all the features for the incoming genome. We want to get them in
157 :     # location order within contig.
158 :     my $query = $sprout->Get(['HasFeature', 'Feature', 'IsLocatedIn', 'Contig'],
159 :     "HasFeature(from-link) = ? ORDER BY IsLocatedIn(to-link), IsLocatedIn(beg)",
160 :     [$genomeID]);
161 :     # Create a feature data object to help us process the features.
162 :     my $fd = FeatureData->new($self);
163 :     # Start the session.
164 :     $self->OpenSession();
165 :     # The trickiest part of this whole process is computing the operon information.
166 :     # Each feature has an operon ID and an operon sequence number. The operon ID
167 :     # is displayed as an extra column. The sequence number is combined with the
168 :     # operon ID to create the sort key. We arbitrarily assume an upper limit
169 :     # of a million operons each with no more than a million features. If we're
170 :     # wrong, the sort won't work but the data will still be okay. To start
171 :     # the whole process along, we prime the operon data with dummy values so
172 :     # that the first feature is considered to be part of a new operon. Note that
173 :     # the operon ID and sequence number are saved in the object so that the
174 :     # sort key method can find them.
175 :     $self->{operonID} = 0;
176 :     $self->{operonFeatureSeq} = 0;
177 :     # This variable contains the last feature's location. We put it on a bogus contig
178 :     # so that the first feature we encounted will be considered the start of a new
179 :     # operon.
180 :     my $lastLocation = BasicLocation->new(" ", 0, '+', 0);
181 :     # This variable contains the last feature's ID. We may receive multiple results for
182 :     # a single feature. Only the last result is output.
183 :     my $lastFid = "";
184 :     # Finally, we need to save the current contig ID and length.
185 :     $self->{contigID} = "";
186 :     $self->{contigLen} = 0;
187 :     # Loop until we run out of features.
188 :     while (my $feature = $query->Fetch()) {
189 :     # Get this feature's location.
190 :     my $thisLocation = BasicLocation->new($feature->Values(['IsLocatedIn(to-link)', 'IsLocatedIn(beg)',
191 :     'IsLocatedIn(dir)', 'IsLocatedIn(len)']));
192 :     # Get this feature's ID.
193 :     my ($thisFid) = $feature->Value('IsLocatedIn(from-link)');
194 :     # Only proceed if this feature is not lint.
195 :     if ($thisLocation->Length >= $options->{lintSize}) {
196 :     # Determine whether or not this is a new feature.
197 :     if ($thisFid eq $lastFid) {
198 :     # This is a second location for the same feature. Combine its location with
199 :     # the last location.
200 :     $lastLocation->Combine($thisLocation);
201 :     Trace("Sublocation found. New location is " . $lastLocation->String()) if T(4);
202 :     } else {
203 :     # We have a new feature. Write out the previous feature's data (if any).
204 :     if ($lastFid) {
205 :     Trace("Writing feature $lastFid.") if T(4);
206 :     $self->OutputFeature($fd, $lastLocation, $options);
207 :     $retVal++;
208 :     # Reveal our status every 100 features.
209 :     if ($retVal % 100 == 0) {
210 :     $self->PrintLine("$retVal features processed. $self->{operonID} operons found.");
211 :     }
212 :     }
213 :     # Remember the new feature ID.
214 :     $lastFid = $thisFid;
215 :     # Store this new feature in the feature data object.
216 :     $fd->Store($feature);
217 :     # Check the operon status.
218 :     if ($lastLocation->Contig eq $thisLocation->Contig &&
219 :     $lastLocation->Dir eq $thisLocation->Dir &&
220 :     $thisLocation->Left - $lastLocation->Right < $options->{nearDistance}) {
221 :     # Here we're part of the same operon. Increment the feature sequence number.
222 :     # For forward operons we add 1. For backward operons we subtract 1.
223 :     $self->{operonFeatureSeq} += $thisLocation->NumDirection();
224 :     Trace("New operon feature sequence number for $thisFid is $self->{operonFeatureSeq}.") if T(4);
225 :     } else {
226 :     # Here we belong to a new operon.
227 :     $self->{operonID}++;
228 :     # The sequence number will be incremented for forward operons and decremented
229 :     # for backward operons. The sequence number is used only for sorting: it insures
230 :     # that the operons are presented in their natural order. By starting at 5,000,000
231 :     # and relying on our assumption that therte are fewer than 1,000,000 features per
232 :     # operon, we insure that the sequence numbers are always the same number of
233 :     # digits, whether we decrement or increment.
234 :     $self->{operonFeatureSeq} = 5000000;
235 :     Trace("New operon ID for $thisFid is $self->{operonID}.") if T(4);
236 :     }
237 :     # Save the feature location.
238 :     $lastLocation = $thisLocation;
239 :     }
240 :     }
241 :     }
242 :     # Output the last feature (if any).
243 :     if ($lastFid) {
244 :     $self->OutputFeature($fd, $lastLocation, $options);
245 :     $retVal++;
246 :     }
247 :     # Close the session file.
248 :     $self->CloseSession();
249 :     }
250 :     # Return the result count.
251 :     return $retVal;
252 :     }
253 :    
254 :     =head3 Description
255 :    
256 :     C<< my $htmlText = $shelp->Description(); >>
257 :    
258 :     Return a description of this search. The description is used for the table of contents
259 :     on the main search tools page. It may contain HTML, but it should be character-level,
260 :     not block-level, since the description is going to appear in a list.
261 :    
262 :     =cut
263 :    
264 :     sub Description {
265 :     # Get the parameters.
266 :     my ($self) = @_;
267 :     # Return the result.
268 :     return "Search for operons in a single genome.";
269 :     }
270 :    
271 :     =head3 SortKey
272 :    
273 :     C<< my $key = $shelp->SortKey($fdata); >>
274 :    
275 :     Return the sort key for the specified feature data. The default is to sort by feature name,
276 :     floating NMPDR organisms to the top. If a full-text search is used, then the default
277 :     sort is by relevance followed by feature name. This sort may be overridden by the
278 :     search class to provide fancier functionality. This method is called by
279 :     B<PutFeature>, so it is only used for feature searches. A non-feature search
280 :     would presumably have its own sort logic.
281 :    
282 :     =over 4
283 :    
284 :     =item record
285 :    
286 :     The C<FeatureData> containing the current feature.
287 :    
288 :     =item RETURN
289 :    
290 :     Returns a key field that can be used to sort this row in among the results.
291 :    
292 :     =back
293 :    
294 :     =cut
295 :    
296 :     sub SortKey {
297 :     # Get the parameters.
298 :     my ($self, $fdata) = @_;
299 :     # Get the current operon ID and sequence.
300 :     my $operonID = $self->{operonID};
301 :     my $operonSeqNumber = $self->{operonFeatureSeq};
302 :     # The operon Sequence number is already at a fixed width. We need to pad to a
303 :     # fixed width for the operon ID.
304 :     my $retVal = "$operonID.$operonSeqNumber";
305 :     while (length $retVal < 20) {
306 :     $retVal = "0$retVal";
307 :     }
308 :     # Return the result.
309 :     return $retVal;
310 :     }
311 :    
312 :     =head3 OutputFeature
313 :    
314 :     C<< $shelp->OutputFeature($fd, $location, $options); >>
315 :    
316 :     Output the current feature. We use the location to compute an upstream location for the feature,
317 :     and this is added to the feature data object as an extra column named C<Upstream DNA>. The operon ID
318 :     is added to the feature data object as an extra column named C<Operon ID>.
319 :    
320 :     =over 4
321 :    
322 :     =item fd
323 :    
324 :     Feature Data object with the current feature stored in it.
325 :    
326 :     =item location
327 :    
328 :     A BasicLocation object describing the combined feature locations for the current feature.
329 :    
330 :     =item options
331 :    
332 :     Reference to a hash of options for this search. The options of interest to us are C<instream> and
333 :     C<upstream>
334 :    
335 :     =back
336 :    
337 :     =cut
338 :    
339 :     sub OutputFeature {
340 :     # Get the parameters.
341 :     my ($self, $fd, $location, $options) = @_;
342 :     # Get access to Sprout.
343 :     my $sprout = $self->DB();
344 :     # Get the contig length.
345 :     if ($self->{contigID} ne $location->Contig) {
346 :     $self->{contigID} = $location->Contig;
347 :     $self->{contigLen} = $sprout->ContigLength($location->Contig);
348 :     }
349 :     my $contigLen = $self->{contigLen};
350 :     # Get the upstream location.
351 :     my $upstreamLocation = $location->Upstream($options->{upstream}, $contigLen);
352 :     # Compute its DNA and convert it to upper case.
353 :     my $upstreamDNA = uc $sprout->DNASeq([$upstreamLocation->String]);
354 :     # Compute the instream DNA. We do this by truncating the feature location.
355 :     my $instreamLocation = BasicLocation->new($location);
356 :     $instreamLocation->Truncate($options->{instream});
357 :     my $instreamDNA = lc $sprout->DNASeq([$instreamLocation->String]);
358 :     # Combine the DNA sequences.
359 :     $upstreamDNA .= ":$instreamDNA";
360 :     if (T(4)) {
361 :     my $uString = $upstreamLocation->String;
362 :     my $iString = $instreamLocation->String;
363 :     my $oString = $location->String;
364 :     Trace("Upstream = $uString, Instream = $iString, Original = $oString.");
365 :     }
366 :     # Chop the genome ID off the location string so it looks better. Also, we'll use
367 :     # the SEED format so the user can eyeball the distance between genes.
368 :     my $locationString = $location->SeedString;
369 :     $locationString =~ s/^[^:]+://;
370 :     # Store the upstream DNA in the feature data object along with the operon ID. We add
371 :     # the direction to the operon ID so the user knows more easily which way we're pointing.
372 :     $fd->AddExtraColumns('Operon ID' => $self->{operonID} . $location->Dir,
373 :     'Upstream DNA' => $upstreamDNA,
374 :     'Location' => $locationString);
375 :     # Put the feature to the output.
376 :     $self->PutFeature($fd);
377 :     }
378 :    
379 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3