[Bio] / Sprout / SHToolSearch.pm Repository:
ViewVC logotype

Annotation of /Sprout/SHToolSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.8 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SHToolSearch;
4 :    
5 :     use strict;
6 :     use Tracer;
7 : parrello 1.6 use CGI qw(-nosticky);
8 : parrello 1.1 use HTML;
9 :     use Sprout;
10 :     use Sim;
11 :     use RHLocations;
12 :     use ERDBObject;
13 :     use XML::Simple;
14 :     use Data::Dumper;
15 :     use base 'SearchHelper';
16 :    
17 :     =head1 Tool Search Feature Search Helper
18 :    
19 :     =head2 Introduction
20 :    
21 :     This object conducts searches using command-line tools that compare a DNA or Protein
22 :     pattern against a text database of genetic information.
23 :    
24 :     The tools are invoked by a method called L</ExecScan>. This processes the input,
25 :     invokes the tool, and returns a list of hash references. The hash references contain
26 :     whatever fields are required in the search output. Two fields are required: C<hitLoc>,
27 :     which specifies the location of the matching data, and C<sortKey>, which is used
28 :     to sort it.
29 :    
30 :     =over 4
31 :    
32 :     =item sequence
33 :    
34 :     DNA or protein sequence. This can be a feature ID, pattern, or a FASTA string. A feature ID
35 :     is automatically converted to a FASTA string. If a pattern is specified, it is passed
36 :     directly to the pattern match tool.
37 :    
38 :     =item tool
39 :    
40 :     Tool to use: currently C<blastp>, C<blastn>, C<blastx>, C<dnaScan>, or C<protScan>.
41 :    
42 :     =item options
43 :    
44 :     BLAST options, encoded as a string
45 :    
46 :     =item genome[]
47 :    
48 :     IDs of the genomes to be included in the search.
49 :    
50 :     =back
51 :    
52 :     =cut
53 :    
54 :     =head2 Data Structures
55 :    
56 :     =head3 ToolTable
57 :    
58 :     This search is a fairly complicated thing that is basically used to search for
59 :     sequence data in one or more genomes using command-line tools. The idea is that
60 :     any command-line tool can be put in here and made available for use by the
61 :     search system. From a coding point of view, it would make sense to have a
62 :     separate search helper for each tool, but combining them in this way has
63 :     a psychological benefit for the user. The one requirement across all the tools
64 :     is that the input is a DNA or protein sequence of some sort. For the BLAST
65 :     tools, the sequence is a standard FASTA thing. For the scan-for-matches
66 :     tools, the sequence is a search pattern.
67 :    
68 :     The tool table is a hash that performs the services normally expected of
69 :     subclasses. The hash maps each tool name to a hash reference. The various
70 :     fields in the hash references are as follows.
71 :    
72 :     =over 4
73 :    
74 :     =item db_type
75 :    
76 :     Type of database against which the tool runs: C<prot> for a protein database and
77 :     C<dna> for a DNA database. This tells us which files to pass into the tool as
78 :     the search database.
79 :    
80 :     =item exec
81 :    
82 :     Execution string for the tool. The variable C<$seqFile> is presumed to be the location
83 :     of the input sequence, C<$db> is the directory, and C<$options> are the user-specified options.
84 :    
85 :     =item output
86 :    
87 :     The method for converting the output from the tool (presented as a set of lines of text) to
88 :     the output format expected by the BLAST search tool (a list of hash references).
89 :    
90 :     =item inputType
91 :    
92 :     The type of input expected. This is either a FASTA for protein or DNA (C<prot> or C<dna>),
93 :     or a scan pattern for protein or DNA (C<protPattern> or C<dnaPattern>). If additional
94 :     formats are needed, they must be programmed into the B<ComputeFASTA> method of C<SearchHelper>.
95 :    
96 :     =item extras
97 :    
98 :     A list of lists describing the extra columns desired in the output. The extra column names
99 :     must exist as keys in the hashes produced by the output method. In addition to the fields named
100 :     in this list, the output hashes must contain a field named C<sortKey> that can be used to sort
101 :     the results. The lists correspond exactly to the parameter list for the B<AddExtraColumn>
102 :     method of the B<ResultHelper> class.
103 :    
104 :     =item buttonName
105 :    
106 :     The name to display on the search button if this tool is selected.
107 :    
108 :     =item targetRelationship
109 :    
110 :     The name of the relationship from the genome to the entity whose IDs will appear in the output regions. This is C<HasContig> for
111 :     a DNA search and C<HasFeature> for a feature search.
112 :    
113 :     =item title
114 :    
115 :     Title to be used for the search results pages.
116 :    
117 :     =back
118 :    
119 :     =cut
120 :    
121 :     my %ToolTable = ( blastp => { db_type => 'prot',
122 :     exec => 'blastall -i $seqFile -d $db -m 7 -FF -p blastp $options',
123 :     output => \&blastXML,
124 :     inputType => 'prot',
125 : parrello 1.8 extras => [[hitLoc => 0, title => "Hit Location", style => "leftAlign", download => 'text'],
126 :     [evalue => undef, title => "eValue", style => "leftAlign", download => 'num'],
127 : parrello 1.4 [queryLoc => undef, title => "Query Location", style => "leftAlign", download => 'text'],
128 : parrello 1.1 [alignment => undef, title => "Alignment", style => "code", download => 'align']],
129 :     buttonName => 'BLAST',
130 :     targetRelationship => 'HasFeature',
131 :     title => 'Blastp Search',
132 :     },
133 :     blastx => { db_type => 'prot',
134 :     exec => 'blastall -i $seqFile -d $db -m 7 -FF -p blastx $options',
135 :     output => \&blastXML,
136 :     inputType => 'prot',
137 : parrello 1.8 extras => [[hitLoc => 0, title => "Hit Location", style => "leftAlign", download => 'text'],
138 :     [evalue => undef, title => "eValue", style => "leftAlign", download => 'num'],
139 : parrello 1.4 [queryLoc => undef, title => "Query Location", style => "leftAlign", download => 'text'],
140 : parrello 1.1 [alignment => undef, title => "Alignment", style => "code", download => 'align']],
141 :     buttonName => 'BLAST',
142 :     targetRelationship => 'HasFeature',
143 :     title => 'Blastx Search',
144 :     },
145 :     blastn => { db_type => 'dna',
146 :     exec => 'blastall -i $seqFile -d $db -m 7 -FF -p blastn $options',
147 :     output => \&blastXML,
148 :     inputType => 'dna',
149 : parrello 1.8 extras => [[hitLoc => 0, title => "Hit Location", style => "leftAlign", download => 'text'],
150 :     [evalue => undef, title => "eValue", style => "leftAlign", download => 'num'],
151 : parrello 1.4 [queryLoc => undef, title => "Query Location", style => "leftAlign", download => 'text'],
152 : parrello 1.1 [alignment => undef, title => "Alignment", style => "code", download => 'align']],
153 :     buttonName => 'BLAST',
154 :     targetRelationship => 'HasContig',
155 :     title => 'Blastn Search',
156 :     },
157 :     dnaScan => { db_type => 'dna',
158 :     exec => 'scan_for_matches -c $seqFile $options <$db',
159 :     output => \&scanLines,
160 :     inputType => 'dnaPattern',
161 :     extras => [[hitLoc => 0, title => "Hit Location", style => "leftAlign", download => 'text'],
162 :     [alignment => undef, title => "Matching Sequence", style => "code", download => 'align']],
163 :     buttonName => 'SCAN',
164 :     targetRelationship => 'HasContig',
165 :     title => 'DNA Scan for Matches',
166 :     },
167 :     protScan => { db_type => 'prot',
168 :     exec => 'scan_for_matches -p $seqFile $options <$db',
169 :     output => \&scanLines,
170 :     inputType => 'protPattern',
171 :     extras => [[hitLoc => 0, title => "Hit Location", style => "leftAlign", download => 'text'],
172 :     [alignment => undef, title => "Matching Sequence", style => "code", download => 'align']],
173 :     buttonName => 'SCAN',
174 :     targetRelationship => 'HasFeature',
175 :     title => 'Protein Scan for Matches',
176 :     }
177 :     );
178 :    
179 :     =head2 Public Methods
180 :    
181 :     =head3 ExecTool
182 :    
183 : parrello 1.5 my @sims = $shelp->ExecTool($seqFile, \@genomes, $tool, $options);
184 : parrello 1.1
185 :     Call a tool to search for DNA sequences or features.
186 :    
187 :     =over 4
188 :    
189 :     =item seqFile
190 :    
191 :     Name of a file containing the input sequence. This will either be a FASTA or a scan pattern.
192 :    
193 :     =item genomes
194 :    
195 :     A list of the IDs for the target genomes of the search.
196 :    
197 :     =item tool
198 :    
199 :     Name of the tool to use.
200 :    
201 :     =item options
202 :    
203 :     Options to pass to the tool, formatted for the command line.
204 :    
205 :     =item RETURN
206 :    
207 :     Returns a list of hashes, each representing a single match point.
208 :    
209 :     =back
210 :    
211 :     =cut
212 :    
213 :     sub ExecTool {
214 :     # Get the parameters.
215 :     my ($self, $seqFile, $genomes, $tool, $options) = @_;
216 :     # Declare the return variable.
217 :     my @retVal = ();
218 :     # Get a Sprout database object.
219 :     my $sprout = $self->DB();
220 :     # Insure the blast tools can find the blast matrix directory.
221 :     if (! $ENV{"BLASTMAT"}) { $ENV{"BLASTMAT"} = $FIG_Config::blastmat; }
222 :     # Get the location of the tools.
223 :     my $toolDir = "$FIG_Config::ext_bin";
224 :     Trace("ExecTool for file $seqFile, tool => $tool, options => $options.") if T(2);
225 :     # Get this tool's parameters.
226 :     my $toolData = $ToolTable{$tool};
227 :     # Determine whether or not this is a multi-genome call.
228 :     my $genomeCount = scalar(@$genomes);
229 :     Trace("$genomeCount genomes in list.") if T(2);
230 :     my $multiGenome = ($genomeCount > 1 ? 1 : 0);
231 :     # Loop through the genome list.
232 :     for my $genome (@$genomes) {
233 :     $self->PrintLine("Performing $tool for $genome.<br />");
234 :     Trace("Matching against $genome.") if T(3);
235 :     # Get the target database location for this genome.
236 :     my $db = ($toolData->{db_type} eq 'prot' ?
237 :     "$FIG_Config::organisms/$genome/Features/peg/fasta" :
238 :     "$FIG_Config::organisms/$genome/contigs");
239 :     # Only proceed if the database has data in it.
240 :     if (-s $db) {
241 :     # Verify the database.
242 :     VerifyDB($db, $toolData->{db_type});
243 :     # Get the command.
244 :     my $string = $toolData->{exec};
245 :     # Make the substitutions.
246 :     my %subs = (db => $db, options => ($options || ''), seqFile => $seqFile);
247 :     for my $key (keys %subs) {
248 :     $string =~ s/\$$key/$subs{$key}/g;
249 :     }
250 :     my $command = "$toolDir/$string";
251 :     # Call the command.
252 :     Trace("Executing: $command") if T(3);
253 :     my @data = TICK($command);
254 :     my $dataLineCount = scalar(@data);
255 :     Trace("$dataLineCount lines returned from $tool.") if T(3);
256 :     $self->PrintLine("$dataLineCount data lines returned from $tool.<br />");
257 :     # Compute the maximum number of hits we want back from this genome. Note
258 :     # that scalar(@data) will always be at lest the total number of hits
259 :     # returned.
260 :     my $maxHits = ($multiGenome ? $FIG_Config::blast_limit : scalar(@data));
261 :     # Process the lines. Note that we pass ourselves as the first parameter,
262 :     # mimicking what would happen if the output routine were called as an
263 :     # instance method.
264 :     my @results = &{$toolData->{output}}($self, \@data, $maxHits, $genome);
265 :     # Get a hash of valid targets.
266 :     my $targetRel = $toolData->{targetRelationship};
267 :     my %targetHash = map { $_ => 1 } $sprout->GetFlat([$targetRel], "$targetRel(from-link) = ?",
268 :     [$genome], "$targetRel(to-link)");
269 :     # Insure the results all have valid targets.
270 :     my $keepCount = 0;
271 :     for my $result (@results) {
272 :     # Get the hit location and convert it to a location object.
273 :     my $targetLoc = BasicLocation->new($result->{hitLoc});
274 :     # Test the contig to see if it's a valid target. It's valid if
275 :     # we found it when we did the query above. Invalid targets come
276 :     # from things added to SEED after the Sprout was built.
277 :     if ($targetHash{$targetLoc->Contig}) {
278 :     push @retVal, $result;
279 :     $keepCount++;
280 :     }
281 :     }
282 :     my $message = scalar(@results) . " hits found. $keepCount kept.";
283 :     Trace($message) if T(3);
284 :     $self->PrintLine("$message<br />");
285 : parrello 1.6 } else {
286 :     Confess("Blast data not found at $db.");
287 : parrello 1.1 }
288 :     }
289 :     # Return the resulting list.
290 :     return @retVal;
291 :     }
292 :    
293 :     =head2 Output Processing Methods
294 :    
295 :     =head3 blastXML
296 :    
297 : parrello 1.5 my @dataRows = $shelp->blastXML(\@data, $maxHits, $genome);
298 : parrello 1.1
299 :     Process XML output from Blast.
300 :    
301 :     The BLAST output is presented as a list of text lines, each of which contains a line-end
302 :     character. We combine all these lines into a single string and read it in using an XML parser.
303 :     The parsed result contains the interesting data 3 levels deep as C<iteration hits>. Each hit
304 :     corresponds to a single target object-- either a feature or a contig-- identified as the C<Hit_def>.
305 :     Inside each hit is a list of C<Hit_hsps>. These are single-level hashes that represent a point of contact
306 :     between the query sequence and the target. The from- and to-locations for the query sequence
307 :     are stored as C<Hsp_query-from> and C<Hsp_query-to>, and for the target object they are
308 :     C<Hsp_hit-from> and C<Hsp_hit-to>. The alignment is stored in C<Hsp_hseq>, C<Hsp_qseq>, and
309 : parrello 1.6 C<Hsp_midline>. The bit score is in C<Hsp_bit-score> and the e-value is in C<Hsp_evalue>.
310 : parrello 1.1
311 :     =over 4
312 :    
313 :     =item data
314 :    
315 :     Reference to a list of XML output lines from BLAST.
316 :    
317 :     =item maxHits
318 :    
319 :     Maximum number of hits to return. This is used to control the output size in multi-genome
320 :     searches.
321 :    
322 :     =item genome
323 :    
324 :     ID of the target genome.
325 :    
326 :     =item RETURN
327 :    
328 :     Returns a list of hash references. In each hash, C<hitLoc> indicates the hit location, C<queryLoc>
329 :     indicates the query location, C<alignment> the three-line alignment between the
330 :     query and hit locations, C<sortKey> a sort key based on the bit score, and C<bsc> the bit
331 :     score itself.
332 :    
333 :     =back
334 :    
335 :     =cut
336 :    
337 :     sub blastXML {
338 :     # Get the parameters.
339 :     my ($self, $data, $maxHits, $genome) = @_;
340 :     # Declare the return variable.
341 :     my @retVal;
342 :     Trace("Processing blastXML output for $genome. Max Hits = $maxHits.") if T(2);
343 :     # Set up a counter so that we stop after the appropriate
344 :     # number of hits.
345 :     my $outputHits = 0;
346 :     # Create the xml string from the data.
347 :     my $xmlString = join("", @{$data});
348 : parrello 1.7 # Only proceed if we got something back.
349 : parrello 1.1 # Parse the XML. The various options help to keep the result more compact and predictable.
350 :     # Note we do some major error-checking here, because XMLin is very delicate.
351 :     my $xmlThing;
352 :     eval {
353 : parrello 1.7 if ($xmlString) {
354 :     $xmlThing = XMLin($xmlString, GroupTags => { Iteration_hits => 'Hit', Hit_hsps => 'Hsp' },
355 :     ForceArray => ['Hit', 'Hsp']);
356 :     }
357 : parrello 1.1 };
358 :     if ($@) {
359 :     Confess("XML parsing error for $genome: $@");
360 :     } elsif (! defined($xmlThing)) {
361 :     Trace("No result from XML parse for $genome.") if T(3);
362 :     } else {
363 :     Trace("XML thing for $genome = \n" . Dumper($xmlThing)) if T(blastXML => 3);
364 :     # Get the name of the query object.
365 :     my $queryName = $xmlThing->{'BlastOutput_query-def'};
366 :     # Strip out the comments (if any).
367 :     if ($queryName =~ /^(\S+)\s+/) {
368 :     $queryName = $1;
369 :     }
370 :     my $iterationData = $xmlThing->{BlastOutput_iterations}->{Iteration}->{Iteration_hits};
371 :     Trace("Iteration data contains " . scalar(@{$iterationData}) . " hits.") if T(3);
372 :     # "$iterationData" is now a list of hits. We process these one at a time in the
373 :     # following loop.
374 :     for my $hit (@{$iterationData}) { last if ($outputHits >= $maxHits);
375 :     # Get the hit target. This is the contig or feature containing the hit locations.
376 :     # We canonicalize it so that it has the genome name somewhere in it.
377 :     my $hitArea = Canonize($hit->{Hit_def}, $genome);
378 :     Trace("Hit on $hitArea. Point count = " . scalar(@{$hit->{Hit_hsps}}) . ".") if T(3);
379 :     # Now we loop through the hit points for this hit.
380 :     for my $point (@{$hit->{Hit_hsps}}) { last if ($outputHits >= $maxHits);
381 :     # We need to create an output tuple for this hit. First, we
382 :     # create the alignment string.
383 : parrello 1.4 my $alignment = join("<br/>", $point->{Hsp_qseq}, $point->{Hsp_midline}, $point->{Hsp_hseq});
384 :     # Convert the spaces to non-breaking so that they aren't mucked up by HTML formatting.
385 :     $alignment =~ s/ /&nbsp;/g;
386 :     # Next, we need to create the locations. The fields Hsp_query-frame and Hsp_hit-frame indicate
387 :     # whether the location is on the plus or minus strand.
388 : parrello 1.1 my $hitLoc = BasicLocation->new($hitArea, $point->{'Hsp_hit-from'}, "_", $point->{'Hsp_hit-to'});
389 : parrello 1.4 $hitLoc->Reverse if $point->{'Hsp_hit-frame'} < 0;
390 : parrello 1.1 my $queryLoc = BasicLocation->new($queryName, $point->{'Hsp_query-from'}, "_", $point->{'Hsp_query-to'});
391 : parrello 1.4 $queryLoc->Reverse if $point->{'Hsp_query-frame'} < 0;
392 : parrello 1.6 # We also need the e-value.
393 :     my $eValue = $point->{'Hsp_evalue'};
394 :     # Finally, we get the bit score, formatted nicely for the sorting.
395 : parrello 1.1 my $bsc = sprintf("%0.3f", $point->{'Hsp_bit-score'});
396 :     # Now we can build our output tuple.
397 :     push @retVal, { queryLoc => $queryLoc->SeedString,
398 :     hitLoc => $hitLoc->SeedString,
399 : parrello 1.6 evalue => $eValue,
400 : parrello 1.1 alignment => $alignment,
401 :     sortKey => $self->ToolSortKey($genome, $bsc, $hitLoc),
402 :     };
403 :     # Update our result counter. Both loops will exit when this equals the maximum.
404 :     $outputHits++;
405 :     }
406 :     }
407 :     }
408 :     # Return the results.
409 :     return @retVal;
410 :     }
411 :    
412 :    
413 :     =head3 scanLines
414 :    
415 : parrello 1.5 my @dataRows = $shelp->scanLines(\@data, $maxHits, $genome);
416 : parrello 1.1
417 :     Process output from the scan-for-matches tool. Each match consists of two
418 :     output lines. The first contains the hit source (feature or contig), the
419 :     begin point, and the end point. The second contains the sequence matched.
420 :     Unlike a BLAST search, there is no concept of a query location: the entire
421 :     query matches.
422 :    
423 :     =over 4
424 :    
425 :     =item data
426 :    
427 :     Reference to a list of the output lines from the scan.
428 :    
429 :     =item maxHits
430 :    
431 :     The maximum number of matches to return. This value is used to control
432 :     the output size in multi-genome searches.
433 :    
434 :     =item genome
435 :    
436 :     The ID of the genome containing the hits.
437 :    
438 :     =item RETURN
439 :    
440 :     Returns a list of hash references. In each hash, C<hitLoc> indicates the hit location, C<sortKey> a
441 :     sort key, and C<alignment> the matching sequence.
442 :    
443 :     =back
444 :    
445 :     =cut
446 :    
447 :     sub scanLines {
448 :     # Get the parameters.
449 :     my ($self, $data, $maxHits, $genome) = @_;
450 :     Trace("Processing scanLines output for $genome.") if T(2);
451 :     # Declare the return variable.
452 :     my @retVal;
453 :     # Insure we don't try to return more than the maximum number
454 :     # of hits. Each hit is two lines, so this involves multiplying
455 :     # by two.
456 :     my $maxLines = $maxHits * 2;
457 :     if ($maxLines > scalar(@{$data})) {
458 :     $maxLines = scalar(@{$data});
459 :     }
460 :     # Loop through the lines containing the hits we want.
461 :     for (my $i = 0; $i < $maxLines; $i += 2) {
462 :     # Parse the first line, containing the hit location.
463 :     $data->[$i] =~ /^>([^:]+):\[(\d+),(\d+)\]/;
464 :     # Convert the result to a location.
465 :     my ($hitObject, $beg, $end) = ($1, $2, $3);
466 :     $hitObject = Canonize($hitObject, $genome);
467 :     my $hitLoc = BasicLocation->new($hitObject, $beg, "_", $end);
468 :     # Get the match string.
469 :     my $matchString = $data->[$i+1];
470 :     chomp $matchString;
471 :     # Output the result.
472 :     push @retVal, { hitLoc => $hitLoc->SeedString,
473 :     alignment => $matchString,
474 :     sortKey => $self->ToolSortKey($genome, 0, $hitLoc),
475 :     };
476 :     }
477 :     # Return the result.
478 :     return @retVal;
479 :     }
480 :    
481 :     =head2 Utility Methods
482 :    
483 :     =head3 Canonize
484 :    
485 : parrello 1.5 my $newName = CallScanner::Canonize($name, $genomeID);
486 : parrello 1.1
487 :     If the specified name is a contig ID, insure it has a genome ID in front of it.
488 :    
489 :     =over 4
490 :    
491 :     =item name
492 :    
493 :     Name to fix up.
494 :    
495 :     =item genomeID
496 :    
497 :     ID of the genome to be added to the contig ID, if necessary.
498 :    
499 :     =item RETURN
500 :    
501 :     Returns a fixed-up name.
502 :    
503 :     =back
504 :    
505 :     =cut
506 :    
507 :     sub Canonize {
508 :     # Get the parameters.
509 :     my ($name, $genomeID) = @_;
510 :     # Declare the return variable.
511 :     my $retVal = $name;
512 :     # Check for a genome ID already in place or a feature ID.
513 :     if ($retVal !~ /:/ && $retVal !~ /^fig/) {
514 :     $retVal = "$genomeID:$name";
515 :     }
516 :     # Return the result.
517 :     return $retVal;
518 :     }
519 :    
520 :     =head3 VerifyDB
521 :    
522 : parrello 1.5 CallScanner::VerifyDB($db, $type);
523 : parrello 1.1
524 :     Verify that the specified FASTA file has BLAST databases. If the databases
525 :     do not exist, they will be created. If they are older than the FASTA file,
526 :     they will be regenerated.
527 :    
528 :     =over 4
529 :    
530 :     =item db
531 :    
532 :     Name of the FASTA file.
533 :    
534 :     =item type
535 :    
536 :     Type of database desired: C<prot> for protein and C<dna> for DNA.
537 :    
538 :     =back
539 :    
540 :     =cut
541 :    
542 :     sub VerifyDB {
543 :     # Get the parameters.
544 :     my ($db, $type) = @_;
545 :     # Process according to the data type.
546 :     if ($type eq 'prot') {
547 :     if ((! -s "$db.psq") || (-M "$db.psq" > -M $db)) {
548 :     Trace("Building protein FASTA database for $db.") if T(3);
549 :     system "$FIG_Config::ext_bin/formatdb -p T -i $db";
550 :     }
551 :     } else {
552 :     if ((! -s "$db.nsq") || (-M "$db.nsq" > -M $db)) {
553 :     Trace("Building DNA FASTA database for $db.") if T(3);
554 :     system "$FIG_Config::ext_bin/formatdb -p F -i $db";
555 :     }
556 :     }
557 :     }
558 :    
559 :     =head3 ToolSortKey
560 :    
561 : parrello 1.5 my $key = $shelp->ToolSortKey($genome, $bsc, $hitLoc);
562 : parrello 1.1
563 :     Return the sort key for a match result against the specified
564 :     genome with the specified bit-score. The results are
565 :     to be sorted by bit-score, with the highest score at the top
566 :     and preferential treatment given to NMPDR core genomes.
567 :     The tie-breaker for entries with the same score is the
568 :     organism name followed by the hit location.
569 :    
570 :     =over 4
571 :    
572 :     =item genome
573 :    
574 :     ID of the genome containing the target area.
575 :    
576 :     =item bsc
577 :    
578 :     Bit-score for the match.
579 :    
580 :     =item hitLoc
581 :    
582 :     The hit location, encoded as a location object.
583 :    
584 :     =item RETURN
585 :    
586 :     Returns a key field that can be used to sort the match in among the
587 :     results in the desired fashion.
588 :    
589 :     =back
590 :    
591 :     =cut
592 :    
593 :     sub ToolSortKey {
594 :     # Get the parameters.
595 :     my ($self, $genome, $bsc, $hitLoc) = @_;
596 :     # Get the group from the genome ID.
597 :     my ($orgName, $group) = $self->OrganismData($genome);
598 :     # Declare the return value. It begins with an "A" if this is an NMPDR
599 :     # feature and a "Z" otherwise.
600 :     my $retVal = ($group ? "A" : "Z");
601 :     # Convert the bit score to an integer.
602 :     my $bitScore = int(10 * $bsc);
603 :     # We want to sort by descending bit score, so subtract the result from a big
604 :     # number.
605 :     my $bitThing = 10000000000 - $bitScore;
606 :     if ($bitThing < 0) {
607 :     $bitThing = 0;
608 :     }
609 :     # Pad it to 10 characters.
610 :     $bitThing = "0$bitThing" while (length $bitThing < 10);
611 :     # Tack it onto the group character.
612 :     $retVal .= $bitThing;
613 :     # Finish up with the organism name and the hit location.
614 :     $retVal .= "$orgName:::" . $hitLoc->SeedString;
615 : parrello 1.2 Trace("Blast sort key is $retVal, based on group \"$group\".") if T(4);
616 : parrello 1.1 # Return the result.
617 :     return $retVal;
618 :     }
619 :    
620 :     =head2 Virtual Methods
621 :    
622 :     =head3 Form
623 :    
624 : parrello 1.5 my $html = $shelp->Form();
625 : parrello 1.1
626 :     Generate the HTML for a form to request a new search.
627 :    
628 :     =cut
629 :    
630 :     sub Form {
631 :     # Get the parameters.
632 :     my ($self) = @_;
633 :     # Get the CGI and sprout objects.
634 :     my $cgi = $self->Q();
635 :     my $sprout = $self->DB();
636 :     # Start the form.
637 : parrello 1.3 my $retVal = $self->FormStart("Sequence Search");
638 : parrello 1.1 # Get the list of selected genomes.
639 :     my @selected = $cgi->param('genome');
640 :     # Get the incoming genome sequence and the options. These are the incoming scalar
641 :     # values; the others apply to menus.
642 :     my $sequence = $cgi->param('sequence') || "";
643 :     my $options = $cgi->param('options') || "";
644 : parrello 1.6 my $neighborhood = $cgi->param('neighborhood') || RHLocations::NEIGHBORHOOD;
645 : parrello 1.1 # Get the selected tool and compute the corresponding button caption.
646 :     my $toolChosen = $cgi->param('tool') || "";
647 :     my $caption = ($toolChosen ? $ToolTable{$toolChosen}->{buttonName} : "");
648 :     # Create the menus. First is the tool menu.
649 :     my @valueList = ("", sort keys %ToolTable);
650 : parrello 1.6 my $toolMenu = CGI::popup_menu(-name => "tool", -values => \@valueList,
651 :     -onChange => 'setSubmit(this.value)',
652 :     -default => $toolChosen);
653 : parrello 1.1 # Create the genome selection menu.
654 :     my $menu = $self->NmpdrGenomeMenu("genome", 'multiple', \@selected);
655 :     # The general structure here will be to have the DNA/protein sequence on the top,
656 :     # then genome selector and finally the small controls.
657 :     my @rows = ();
658 : parrello 1.6 push @rows, CGI::Tr(CGI::td("Tool"), CGI::td($toolMenu));
659 :     push @rows, CGI::Tr(CGI::td("Sequence in Raw or FASTA Format"),
660 :     CGI::td({colspan => 2}, CGI::textarea(-name => "sequence", -rows => 5,
661 : parrello 1.7 -value => $sequence, -cols => 62,
662 :     -style => 'font-family: monospace')));
663 : parrello 1.6 push @rows, CGI::Tr(CGI::td("Select one or more genomes"),
664 :     CGI::td({colspan => 2}, $menu));
665 :     push @rows, CGI::Tr(CGI::td("Blast Options"),
666 :     CGI::td({colspan => 2}, CGI::textfield(-name => 'options', size => 45,
667 :     -value => $options)));
668 :     push @rows, CGI::Tr(CGI::td("Neighborhood Width"),
669 :     CGI::td(CGI::textfield(-name => 'neighborhood', -size => 5,
670 :     -value => $neighborhood)));
671 : parrello 1.1 push @rows, $self->SubmitRow($caption);
672 :     # Create the table.
673 :     $retVal .= $self->MakeTable(\@rows);
674 :     # Close the form.
675 :     $retVal .= $self->FormEnd();
676 :     # Get the form name. We'll need it for the upcoming javascript.
677 :     my $formName = $self->FormName();
678 :     # Add the javaScript to update the button.
679 :     $retVal .= "<script type=\"text/javascript\">\n" .
680 :     " function setSubmit(tool) {\n" .
681 :     " switch (tool) {\n";
682 :     for my $tool (sort keys %ToolTable) {
683 :     $retVal .= " case '$tool' : document.$formName.Search.value = '$ToolTable{$tool}->{buttonName}'; break;\n";
684 :     }
685 :     $retVal .= " }\n" .
686 :     " }\n" .
687 :     "</script>\n";
688 :     # Return the result.
689 :     return $retVal;
690 :     }
691 :    
692 :     =head3 Find
693 :    
694 : parrello 1.5 my $resultCount = $shelp->Find();
695 : parrello 1.1
696 :     Conduct a search based on the current CGI query parameters. The search results will
697 :     be written to the session cache file and the number of results will be
698 :     returned. If the search parameters are invalid, a result count of C<undef> will be
699 :     returned and a result message will be stored in this object describing the problem.
700 :    
701 :     =cut
702 :    
703 :     sub Find {
704 :     # Get the parameters.
705 :     my ($self) = @_;
706 :     # Declare the return variable. If it remains undefined, the caller will
707 :     # know there's been an error.
708 :     my $retVal;
709 :     # Get the sprout and CGI query objects.
710 :     my $cgi = $self->Q();
711 :     my $sprout = $self->DB();
712 :     # Get the genome IDs.
713 :     my @genomes = $self->GetGenomes('genome');
714 :     if (! @genomes) {
715 :     $self->SetMessage("No genomes specified.");
716 :     } else {
717 :     # Get the sequence in its raw form.
718 :     my $sequence = $cgi->param('sequence');
719 :     if (! $sequence) {
720 :     $self->SetMessage("No sequence specified.");
721 :     } else {
722 :     # Get the blast options and the tool type.
723 :     my $toolType = $cgi->param('tool') || "";
724 :     my $options = $cgi->param('options') || "";
725 :     # Insure the tool type is valid.
726 :     if (! $toolType) {
727 :     $self->SetMessage("No tool specified.");
728 :     } else {
729 :     # Create the result helper.
730 :     my $rhelp = RHLocations->new($self);
731 :     # Compute the columns. The default columns are determined mostly by the result type.
732 :     $self->DefaultColumns($rhelp);
733 :     #
734 :     # (NOTE: Optional columns should be added here so they go before the alignment column.)
735 :     #
736 :     # Define the extra columns for this tool.
737 :     for my $extraCol (@{$ToolTable{$toolType}->{extras}}) {
738 :     $rhelp->AddExtraColumn(@{$extraCol});
739 :     }
740 :     # Now, the sequence. We use a utility to convert it to a uniform
741 :     # FASTA format of the correct type. Note that for patterns there
742 :     # will not be a header line.
743 :     my $sequenceThing = $self->ComputeFASTA($ToolTable{$toolType}->{inputType}, $cgi->param('sequence'));
744 :     Trace("Fasta sequence has length " . length($sequenceThing) . ".") if T(3);
745 :     # Only proceed if the FASTA sequence was converted successfully.
746 :     if ($sequenceThing) {
747 :     # Write the FASTA to a temporary file. We use the session name with a suffix of
748 :     # "fasta" for the file name.
749 :     my $tmpFile = $self->GetTempFileName('fasta');
750 :     Tracer::PutFile($tmpFile, $sequenceThing);
751 :     # Call the tool.
752 :     my @results = $self->ExecTool($tmpFile, \@genomes, $toolType, $options);
753 :     # Start the output session.
754 :     $self->OpenSession($rhelp);
755 :     # Compute the number of results.
756 :     $retVal = scalar(@results);
757 :     $self->PrintLine("$retVal hit locations found.<br />");
758 :     # Loop through the results.
759 :     my $resultCounter = 0;
760 :     for my $result (@results) {
761 :     # Store the result fields as extra columns. Only the result fields
762 :     # defined as extra columns in the tool table will be kept by the
763 :     # result helper.
764 :     $rhelp->PutExtraColumns(%{$result});
765 :     # Create an ERDB object for the hit location. We use BasicLocation to
766 :     # parse it into its components.
767 : parrello 1.6 my $data = $rhelp->BuildLocationRecord($result->{hitLoc});
768 : parrello 1.1 # Now put the location data and its sort key into the output stream.
769 :     $rhelp->PutData($result->{sortKey}, $result->{hitLoc}, $data);
770 :     # Tell the user every so often what kind of progress we're making.
771 :     $resultCounter++;
772 :     if ($resultCounter % 100 == 0) {
773 :     $self->PrintLine("$resultCounter of $retVal hit locations processed.<br />");
774 :     }
775 :     }
776 :     # Close the output session.
777 :     $self->CloseSession();
778 :     }
779 :     }
780 :     }
781 :     }
782 :     # Return the result count.
783 :     return $retVal;
784 :     }
785 :    
786 :     =head3 Description
787 :    
788 : parrello 1.5 my $htmlText = $shelp->Description();
789 : parrello 1.1
790 :     Return a description of this search. The description is used for the table of contents
791 :     on the main search tools page. It may contain HTML, but it should be character-level,
792 :     not block-level, since the description is going to appear in a list.
793 :    
794 :     =cut
795 :    
796 :     sub Description {
797 :     # Get the parameters.
798 :     my ($self) = @_;
799 :     # Return the result.
800 :     return "Search for matching DNA or protein regions.";
801 :     }
802 :    
803 :     =head3 SearchTitle
804 :    
805 : parrello 1.5 my $titleHtml = $shelp->SearchTitle();
806 : parrello 1.1
807 :     Return the display title for this search. The display title appears above the search results.
808 :     If no result is returned, no title will be displayed. The result should be an html string
809 :     that can be legally put inside a block tag such as C<h3> or C<p>.
810 :    
811 :     =cut
812 :    
813 :     sub SearchTitle {
814 :     # Get the parameters.
815 :     my ($self) = @_;
816 :     # Compute the title. We extract the tool ID from the query parameters.
817 :     my $cgi = $self->Q();
818 :     my $tool = $cgi->param('tool');
819 :     my $retVal = $ToolTable{$tool}->{title};
820 :     # Return it.
821 :     return $retVal;
822 :     }
823 :    
824 :    
825 : parrello 1.5 1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3