[Bio] / Sprout / SHToolSearch.pm Repository:
ViewVC logotype

Annotation of /Sprout/SHToolSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SHToolSearch;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use CGI;
8 :     use HTML;
9 :     use Sprout;
10 :     use Sim;
11 :     use RHLocations;
12 :     use ERDBObject;
13 :     use XML::Simple;
14 :     use Data::Dumper;
15 :     use base 'SearchHelper';
16 :    
17 :     =head1 Tool Search Feature Search Helper
18 :    
19 :     =head2 Introduction
20 :    
21 :     This object conducts searches using command-line tools that compare a DNA or Protein
22 :     pattern against a text database of genetic information.
23 :    
24 :     The tools are invoked by a method called L</ExecScan>. This processes the input,
25 :     invokes the tool, and returns a list of hash references. The hash references contain
26 :     whatever fields are required in the search output. Two fields are required: C<hitLoc>,
27 :     which specifies the location of the matching data, and C<sortKey>, which is used
28 :     to sort it.
29 :    
30 :     =over 4
31 :    
32 :     =item sequence
33 :    
34 :     DNA or protein sequence. This can be a feature ID, pattern, or a FASTA string. A feature ID
35 :     is automatically converted to a FASTA string. If a pattern is specified, it is passed
36 :     directly to the pattern match tool.
37 :    
38 :     =item tool
39 :    
40 :     Tool to use: currently C<blastp>, C<blastn>, C<blastx>, C<dnaScan>, or C<protScan>.
41 :    
42 :     =item options
43 :    
44 :     BLAST options, encoded as a string
45 :    
46 :     =item genome[]
47 :    
48 :     IDs of the genomes to be included in the search.
49 :    
50 :     =back
51 :    
52 :     =cut
53 :    
54 :     =head2 Data Structures
55 :    
56 :     =head3 ToolTable
57 :    
58 :     This search is a fairly complicated thing that is basically used to search for
59 :     sequence data in one or more genomes using command-line tools. The idea is that
60 :     any command-line tool can be put in here and made available for use by the
61 :     search system. From a coding point of view, it would make sense to have a
62 :     separate search helper for each tool, but combining them in this way has
63 :     a psychological benefit for the user. The one requirement across all the tools
64 :     is that the input is a DNA or protein sequence of some sort. For the BLAST
65 :     tools, the sequence is a standard FASTA thing. For the scan-for-matches
66 :     tools, the sequence is a search pattern.
67 :    
68 :     The tool table is a hash that performs the services normally expected of
69 :     subclasses. The hash maps each tool name to a hash reference. The various
70 :     fields in the hash references are as follows.
71 :    
72 :     =over 4
73 :    
74 :     =item db_type
75 :    
76 :     Type of database against which the tool runs: C<prot> for a protein database and
77 :     C<dna> for a DNA database. This tells us which files to pass into the tool as
78 :     the search database.
79 :    
80 :     =item exec
81 :    
82 :     Execution string for the tool. The variable C<$seqFile> is presumed to be the location
83 :     of the input sequence, C<$db> is the directory, and C<$options> are the user-specified options.
84 :    
85 :     =item output
86 :    
87 :     The method for converting the output from the tool (presented as a set of lines of text) to
88 :     the output format expected by the BLAST search tool (a list of hash references).
89 :    
90 :     =item inputType
91 :    
92 :     The type of input expected. This is either a FASTA for protein or DNA (C<prot> or C<dna>),
93 :     or a scan pattern for protein or DNA (C<protPattern> or C<dnaPattern>). If additional
94 :     formats are needed, they must be programmed into the B<ComputeFASTA> method of C<SearchHelper>.
95 :    
96 :     =item extras
97 :    
98 :     A list of lists describing the extra columns desired in the output. The extra column names
99 :     must exist as keys in the hashes produced by the output method. In addition to the fields named
100 :     in this list, the output hashes must contain a field named C<sortKey> that can be used to sort
101 :     the results. The lists correspond exactly to the parameter list for the B<AddExtraColumn>
102 :     method of the B<ResultHelper> class.
103 :    
104 :     =item buttonName
105 :    
106 :     The name to display on the search button if this tool is selected.
107 :    
108 :     =item targetRelationship
109 :    
110 :     The name of the relationship from the genome to the entity whose IDs will appear in the output regions. This is C<HasContig> for
111 :     a DNA search and C<HasFeature> for a feature search.
112 :    
113 :     =item title
114 :    
115 :     Title to be used for the search results pages.
116 :    
117 :     =back
118 :    
119 :     =cut
120 :    
121 :     my %ToolTable = ( blastp => { db_type => 'prot',
122 :     exec => 'blastall -i $seqFile -d $db -m 7 -FF -p blastp $options',
123 :     output => \&blastXML,
124 :     inputType => 'prot',
125 :     extras => [[bsc => 0, title => "Bit Score", style => "rightAlign", download => 'num'],
126 : parrello 1.4 [queryLoc => undef, title => "Query Location", style => "leftAlign", download => 'text'],
127 :     [hitLoc => undef, title => "Hit Location", style => "leftAlign", download => 'text'],
128 : parrello 1.1 [alignment => undef, title => "Alignment", style => "code", download => 'align']],
129 :     buttonName => 'BLAST',
130 :     targetRelationship => 'HasFeature',
131 :     title => 'Blastp Search',
132 :     },
133 :     blastx => { db_type => 'prot',
134 :     exec => 'blastall -i $seqFile -d $db -m 7 -FF -p blastx $options',
135 :     output => \&blastXML,
136 :     inputType => 'prot',
137 :     extras => [[bsc => 0, title => "Bit Score", style => "rightAlign", download => 'num'],
138 : parrello 1.4 [queryLoc => undef, title => "Query Location", style => "leftAlign", download => 'text'],
139 :     [hitLoc => undef, title => "Hit Location", style => "leftAlign", download => 'text'],
140 : parrello 1.1 [alignment => undef, title => "Alignment", style => "code", download => 'align']],
141 :     buttonName => 'BLAST',
142 :     targetRelationship => 'HasFeature',
143 :     title => 'Blastx Search',
144 :     },
145 :     blastn => { db_type => 'dna',
146 :     exec => 'blastall -i $seqFile -d $db -m 7 -FF -p blastn $options',
147 :     output => \&blastXML,
148 :     inputType => 'dna',
149 :     extras => [[bsc => 0, title => "Bit Score", style => "rightAlign", download => 'num'],
150 : parrello 1.4 [queryLoc => undef, title => "Query Location", style => "leftAlign", download => 'text'],
151 :     [hitLoc => undef, title => "Hit Location", style => "leftAlign", download => 'text'],
152 : parrello 1.1 [alignment => undef, title => "Alignment", style => "code", download => 'align']],
153 :     buttonName => 'BLAST',
154 :     targetRelationship => 'HasContig',
155 :     title => 'Blastn Search',
156 :     },
157 :     dnaScan => { db_type => 'dna',
158 :     exec => 'scan_for_matches -c $seqFile $options <$db',
159 :     output => \&scanLines,
160 :     inputType => 'dnaPattern',
161 :     extras => [[hitLoc => 0, title => "Hit Location", style => "leftAlign", download => 'text'],
162 :     [alignment => undef, title => "Matching Sequence", style => "code", download => 'align']],
163 :     buttonName => 'SCAN',
164 :     targetRelationship => 'HasContig',
165 :     title => 'DNA Scan for Matches',
166 :     },
167 :     protScan => { db_type => 'prot',
168 :     exec => 'scan_for_matches -p $seqFile $options <$db',
169 :     output => \&scanLines,
170 :     inputType => 'protPattern',
171 :     extras => [[hitLoc => 0, title => "Hit Location", style => "leftAlign", download => 'text'],
172 :     [alignment => undef, title => "Matching Sequence", style => "code", download => 'align']],
173 :     buttonName => 'SCAN',
174 :     targetRelationship => 'HasFeature',
175 :     title => 'Protein Scan for Matches',
176 :     }
177 :     );
178 :    
179 :     =head2 Public Methods
180 :    
181 :     =head3 ExecTool
182 :    
183 :     C<< my @sims = $shelp->ExecTool($seqFile, \@genomes, $tool, $options); >>
184 :    
185 :     Call a tool to search for DNA sequences or features.
186 :    
187 :     =over 4
188 :    
189 :     =item seqFile
190 :    
191 :     Name of a file containing the input sequence. This will either be a FASTA or a scan pattern.
192 :    
193 :     =item genomes
194 :    
195 :     A list of the IDs for the target genomes of the search.
196 :    
197 :     =item tool
198 :    
199 :     Name of the tool to use.
200 :    
201 :     =item options
202 :    
203 :     Options to pass to the tool, formatted for the command line.
204 :    
205 :     =item RETURN
206 :    
207 :     Returns a list of hashes, each representing a single match point.
208 :    
209 :     =back
210 :    
211 :     =cut
212 :    
213 :     sub ExecTool {
214 :     # Get the parameters.
215 :     my ($self, $seqFile, $genomes, $tool, $options) = @_;
216 :     # Declare the return variable.
217 :     my @retVal = ();
218 :     # Get a Sprout database object.
219 :     my $sprout = $self->DB();
220 :     # Insure the blast tools can find the blast matrix directory.
221 :     if (! $ENV{"BLASTMAT"}) { $ENV{"BLASTMAT"} = $FIG_Config::blastmat; }
222 :     # Get the location of the tools.
223 :     my $toolDir = "$FIG_Config::ext_bin";
224 :     Trace("ExecTool for file $seqFile, tool => $tool, options => $options.") if T(2);
225 :     # Get this tool's parameters.
226 :     my $toolData = $ToolTable{$tool};
227 :     # Determine whether or not this is a multi-genome call.
228 :     my $genomeCount = scalar(@$genomes);
229 :     Trace("$genomeCount genomes in list.") if T(2);
230 :     my $multiGenome = ($genomeCount > 1 ? 1 : 0);
231 :     # Loop through the genome list.
232 :     for my $genome (@$genomes) {
233 :     $self->PrintLine("Performing $tool for $genome.<br />");
234 :     Trace("Matching against $genome.") if T(3);
235 :     # Get the target database location for this genome.
236 :     my $db = ($toolData->{db_type} eq 'prot' ?
237 :     "$FIG_Config::organisms/$genome/Features/peg/fasta" :
238 :     "$FIG_Config::organisms/$genome/contigs");
239 :     # Only proceed if the database has data in it.
240 :     if (-s $db) {
241 :     # Verify the database.
242 :     VerifyDB($db, $toolData->{db_type});
243 :     # Get the command.
244 :     my $string = $toolData->{exec};
245 :     # Make the substitutions.
246 :     my %subs = (db => $db, options => ($options || ''), seqFile => $seqFile);
247 :     for my $key (keys %subs) {
248 :     $string =~ s/\$$key/$subs{$key}/g;
249 :     }
250 :     my $command = "$toolDir/$string";
251 :     # Call the command.
252 :     Trace("Executing: $command") if T(3);
253 :     my @data = TICK($command);
254 :     my $dataLineCount = scalar(@data);
255 :     Trace("$dataLineCount lines returned from $tool.") if T(3);
256 :     $self->PrintLine("$dataLineCount data lines returned from $tool.<br />");
257 :     # Compute the maximum number of hits we want back from this genome. Note
258 :     # that scalar(@data) will always be at lest the total number of hits
259 :     # returned.
260 :     my $maxHits = ($multiGenome ? $FIG_Config::blast_limit : scalar(@data));
261 :     # Process the lines. Note that we pass ourselves as the first parameter,
262 :     # mimicking what would happen if the output routine were called as an
263 :     # instance method.
264 :     my @results = &{$toolData->{output}}($self, \@data, $maxHits, $genome);
265 :     # Get a hash of valid targets.
266 :     my $targetRel = $toolData->{targetRelationship};
267 :     my %targetHash = map { $_ => 1 } $sprout->GetFlat([$targetRel], "$targetRel(from-link) = ?",
268 :     [$genome], "$targetRel(to-link)");
269 :     # Insure the results all have valid targets.
270 :     my $keepCount = 0;
271 :     for my $result (@results) {
272 :     # Get the hit location and convert it to a location object.
273 :     my $targetLoc = BasicLocation->new($result->{hitLoc});
274 :     # Test the contig to see if it's a valid target. It's valid if
275 :     # we found it when we did the query above. Invalid targets come
276 :     # from things added to SEED after the Sprout was built.
277 :     if ($targetHash{$targetLoc->Contig}) {
278 :     push @retVal, $result;
279 :     $keepCount++;
280 :     }
281 :     }
282 :     my $message = scalar(@results) . " hits found. $keepCount kept.";
283 :     Trace($message) if T(3);
284 :     $self->PrintLine("$message<br />");
285 :     }
286 :     }
287 :     # Return the resulting list.
288 :     return @retVal;
289 :     }
290 :    
291 :     =head2 Output Processing Methods
292 :    
293 :     =head3 blastXML
294 :    
295 :     C<< my @dataRows = $shelp->blastXML(\@data, $maxHits, $genome); >>
296 :    
297 :     Process XML output from Blast.
298 :    
299 :     The BLAST output is presented as a list of text lines, each of which contains a line-end
300 :     character. We combine all these lines into a single string and read it in using an XML parser.
301 :     The parsed result contains the interesting data 3 levels deep as C<iteration hits>. Each hit
302 :     corresponds to a single target object-- either a feature or a contig-- identified as the C<Hit_def>.
303 :     Inside each hit is a list of C<Hit_hsps>. These are single-level hashes that represent a point of contact
304 :     between the query sequence and the target. The from- and to-locations for the query sequence
305 :     are stored as C<Hsp_query-from> and C<Hsp_query-to>, and for the target object they are
306 :     C<Hsp_hit-from> and C<Hsp_hit-to>. The alignment is stored in C<Hsp_hseq>, C<Hsp_qseq>, and
307 :     C<Hsp_midline>. The bit score is in C<Hsp_bit-score>.
308 :    
309 :     =over 4
310 :    
311 :     =item data
312 :    
313 :     Reference to a list of XML output lines from BLAST.
314 :    
315 :     =item maxHits
316 :    
317 :     Maximum number of hits to return. This is used to control the output size in multi-genome
318 :     searches.
319 :    
320 :     =item genome
321 :    
322 :     ID of the target genome.
323 :    
324 :     =item RETURN
325 :    
326 :     Returns a list of hash references. In each hash, C<hitLoc> indicates the hit location, C<queryLoc>
327 :     indicates the query location, C<alignment> the three-line alignment between the
328 :     query and hit locations, C<sortKey> a sort key based on the bit score, and C<bsc> the bit
329 :     score itself.
330 :    
331 :     =back
332 :    
333 :     =cut
334 :    
335 :     sub blastXML {
336 :     # Get the parameters.
337 :     my ($self, $data, $maxHits, $genome) = @_;
338 :     # Declare the return variable.
339 :     my @retVal;
340 :     Trace("Processing blastXML output for $genome. Max Hits = $maxHits.") if T(2);
341 :     # Set up a counter so that we stop after the appropriate
342 :     # number of hits.
343 :     my $outputHits = 0;
344 :     # Create the xml string from the data.
345 :     my $xmlString = join("", @{$data});
346 :     # Parse the XML. The various options help to keep the result more compact and predictable.
347 :     # Note we do some major error-checking here, because XMLin is very delicate.
348 :     my $xmlThing;
349 :     eval {
350 :     $xmlThing = XMLin($xmlString, GroupTags => { Iteration_hits => 'Hit', Hit_hsps => 'Hsp' },
351 :     ForceArray => ['Hit', 'Hsp']);
352 :     };
353 :     if ($@) {
354 :     Confess("XML parsing error for $genome: $@");
355 :     } elsif (! defined($xmlThing)) {
356 :     Trace("No result from XML parse for $genome.") if T(3);
357 :     } else {
358 :     Trace("XML thing for $genome = \n" . Dumper($xmlThing)) if T(blastXML => 3);
359 :     # Get the name of the query object.
360 :     my $queryName = $xmlThing->{'BlastOutput_query-def'};
361 :     # Strip out the comments (if any).
362 :     if ($queryName =~ /^(\S+)\s+/) {
363 :     $queryName = $1;
364 :     }
365 :     my $iterationData = $xmlThing->{BlastOutput_iterations}->{Iteration}->{Iteration_hits};
366 :     Trace("Iteration data contains " . scalar(@{$iterationData}) . " hits.") if T(3);
367 :     # "$iterationData" is now a list of hits. We process these one at a time in the
368 :     # following loop.
369 :     for my $hit (@{$iterationData}) { last if ($outputHits >= $maxHits);
370 :     # Get the hit target. This is the contig or feature containing the hit locations.
371 :     # We canonicalize it so that it has the genome name somewhere in it.
372 :     my $hitArea = Canonize($hit->{Hit_def}, $genome);
373 :     Trace("Hit on $hitArea. Point count = " . scalar(@{$hit->{Hit_hsps}}) . ".") if T(3);
374 :     # Now we loop through the hit points for this hit.
375 :     for my $point (@{$hit->{Hit_hsps}}) { last if ($outputHits >= $maxHits);
376 :     # We need to create an output tuple for this hit. First, we
377 :     # create the alignment string.
378 : parrello 1.4 my $alignment = join("<br/>", $point->{Hsp_qseq}, $point->{Hsp_midline}, $point->{Hsp_hseq});
379 :     # Convert the spaces to non-breaking so that they aren't mucked up by HTML formatting.
380 :     $alignment =~ s/ /&nbsp;/g;
381 :     # Next, we need to create the locations. The fields Hsp_query-frame and Hsp_hit-frame indicate
382 :     # whether the location is on the plus or minus strand.
383 : parrello 1.1 my $hitLoc = BasicLocation->new($hitArea, $point->{'Hsp_hit-from'}, "_", $point->{'Hsp_hit-to'});
384 : parrello 1.4 $hitLoc->Reverse if $point->{'Hsp_hit-frame'} < 0;
385 : parrello 1.1 my $queryLoc = BasicLocation->new($queryName, $point->{'Hsp_query-from'}, "_", $point->{'Hsp_query-to'});
386 : parrello 1.4 $queryLoc->Reverse if $point->{'Hsp_query-frame'} < 0;
387 : parrello 1.1 # Finally, we get the bit score, formatted nicely for the display.
388 :     my $bsc = sprintf("%0.3f", $point->{'Hsp_bit-score'});
389 :     # Now we can build our output tuple.
390 :     push @retVal, { queryLoc => $queryLoc->SeedString,
391 :     hitLoc => $hitLoc->SeedString,
392 :     bsc => $bsc,
393 :     alignment => $alignment,
394 :     sortKey => $self->ToolSortKey($genome, $bsc, $hitLoc),
395 :     };
396 :     # Update our result counter. Both loops will exit when this equals the maximum.
397 :     $outputHits++;
398 :     }
399 :     }
400 :     }
401 :     # Return the results.
402 :     return @retVal;
403 :     }
404 :    
405 :    
406 :     =head3 scanLines
407 :    
408 :     C<< my @dataRows = $shelp->scanLines(\@data, $maxHits, $genome); >>
409 :    
410 :     Process output from the scan-for-matches tool. Each match consists of two
411 :     output lines. The first contains the hit source (feature or contig), the
412 :     begin point, and the end point. The second contains the sequence matched.
413 :     Unlike a BLAST search, there is no concept of a query location: the entire
414 :     query matches.
415 :    
416 :     =over 4
417 :    
418 :     =item data
419 :    
420 :     Reference to a list of the output lines from the scan.
421 :    
422 :     =item maxHits
423 :    
424 :     The maximum number of matches to return. This value is used to control
425 :     the output size in multi-genome searches.
426 :    
427 :     =item genome
428 :    
429 :     The ID of the genome containing the hits.
430 :    
431 :     =item RETURN
432 :    
433 :     Returns a list of hash references. In each hash, C<hitLoc> indicates the hit location, C<sortKey> a
434 :     sort key, and C<alignment> the matching sequence.
435 :    
436 :     =back
437 :    
438 :     =cut
439 :    
440 :     sub scanLines {
441 :     # Get the parameters.
442 :     my ($self, $data, $maxHits, $genome) = @_;
443 :     Trace("Processing scanLines output for $genome.") if T(2);
444 :     # Declare the return variable.
445 :     my @retVal;
446 :     # Insure we don't try to return more than the maximum number
447 :     # of hits. Each hit is two lines, so this involves multiplying
448 :     # by two.
449 :     my $maxLines = $maxHits * 2;
450 :     if ($maxLines > scalar(@{$data})) {
451 :     $maxLines = scalar(@{$data});
452 :     }
453 :     # Loop through the lines containing the hits we want.
454 :     for (my $i = 0; $i < $maxLines; $i += 2) {
455 :     # Parse the first line, containing the hit location.
456 :     $data->[$i] =~ /^>([^:]+):\[(\d+),(\d+)\]/;
457 :     # Convert the result to a location.
458 :     my ($hitObject, $beg, $end) = ($1, $2, $3);
459 :     $hitObject = Canonize($hitObject, $genome);
460 :     my $hitLoc = BasicLocation->new($hitObject, $beg, "_", $end);
461 :     # Get the match string.
462 :     my $matchString = $data->[$i+1];
463 :     chomp $matchString;
464 :     # Output the result.
465 :     push @retVal, { hitLoc => $hitLoc->SeedString,
466 :     alignment => $matchString,
467 :     sortKey => $self->ToolSortKey($genome, 0, $hitLoc),
468 :     };
469 :     }
470 :     # Return the result.
471 :     return @retVal;
472 :     }
473 :    
474 :     =head2 Utility Methods
475 :    
476 :     =head3 Canonize
477 :    
478 :     C<< my $newName = CallScanner::Canonize($name, $genomeID); >>
479 :    
480 :     If the specified name is a contig ID, insure it has a genome ID in front of it.
481 :    
482 :     =over 4
483 :    
484 :     =item name
485 :    
486 :     Name to fix up.
487 :    
488 :     =item genomeID
489 :    
490 :     ID of the genome to be added to the contig ID, if necessary.
491 :    
492 :     =item RETURN
493 :    
494 :     Returns a fixed-up name.
495 :    
496 :     =back
497 :    
498 :     =cut
499 :    
500 :     sub Canonize {
501 :     # Get the parameters.
502 :     my ($name, $genomeID) = @_;
503 :     # Declare the return variable.
504 :     my $retVal = $name;
505 :     # Check for a genome ID already in place or a feature ID.
506 :     if ($retVal !~ /:/ && $retVal !~ /^fig/) {
507 :     $retVal = "$genomeID:$name";
508 :     }
509 :     # Return the result.
510 :     return $retVal;
511 :     }
512 :    
513 :     =head3 VerifyDB
514 :    
515 :     C<< CallScanner::VerifyDB($db, $type); >>
516 :    
517 :     Verify that the specified FASTA file has BLAST databases. If the databases
518 :     do not exist, they will be created. If they are older than the FASTA file,
519 :     they will be regenerated.
520 :    
521 :     =over 4
522 :    
523 :     =item db
524 :    
525 :     Name of the FASTA file.
526 :    
527 :     =item type
528 :    
529 :     Type of database desired: C<prot> for protein and C<dna> for DNA.
530 :    
531 :     =back
532 :    
533 :     =cut
534 :    
535 :     sub VerifyDB {
536 :     # Get the parameters.
537 :     my ($db, $type) = @_;
538 :     # Process according to the data type.
539 :     if ($type eq 'prot') {
540 :     if ((! -s "$db.psq") || (-M "$db.psq" > -M $db)) {
541 :     Trace("Building protein FASTA database for $db.") if T(3);
542 :     system "$FIG_Config::ext_bin/formatdb -p T -i $db";
543 :     }
544 :     } else {
545 :     if ((! -s "$db.nsq") || (-M "$db.nsq" > -M $db)) {
546 :     Trace("Building DNA FASTA database for $db.") if T(3);
547 :     system "$FIG_Config::ext_bin/formatdb -p F -i $db";
548 :     }
549 :     }
550 :     }
551 :    
552 :     =head3 ToolSortKey
553 :    
554 :     C<< my $key = $shelp->ToolSortKey($genome, $bsc, $hitLoc); >>
555 :    
556 :     Return the sort key for a match result against the specified
557 :     genome with the specified bit-score. The results are
558 :     to be sorted by bit-score, with the highest score at the top
559 :     and preferential treatment given to NMPDR core genomes.
560 :     The tie-breaker for entries with the same score is the
561 :     organism name followed by the hit location.
562 :    
563 :     =over 4
564 :    
565 :     =item genome
566 :    
567 :     ID of the genome containing the target area.
568 :    
569 :     =item bsc
570 :    
571 :     Bit-score for the match.
572 :    
573 :     =item hitLoc
574 :    
575 :     The hit location, encoded as a location object.
576 :    
577 :     =item RETURN
578 :    
579 :     Returns a key field that can be used to sort the match in among the
580 :     results in the desired fashion.
581 :    
582 :     =back
583 :    
584 :     =cut
585 :    
586 :     sub ToolSortKey {
587 :     # Get the parameters.
588 :     my ($self, $genome, $bsc, $hitLoc) = @_;
589 :     # Get the group from the genome ID.
590 :     my ($orgName, $group) = $self->OrganismData($genome);
591 :     # Declare the return value. It begins with an "A" if this is an NMPDR
592 :     # feature and a "Z" otherwise.
593 :     my $retVal = ($group ? "A" : "Z");
594 :     # Convert the bit score to an integer.
595 :     my $bitScore = int(10 * $bsc);
596 :     # We want to sort by descending bit score, so subtract the result from a big
597 :     # number.
598 :     my $bitThing = 10000000000 - $bitScore;
599 :     if ($bitThing < 0) {
600 :     $bitThing = 0;
601 :     }
602 :     # Pad it to 10 characters.
603 :     $bitThing = "0$bitThing" while (length $bitThing < 10);
604 :     # Tack it onto the group character.
605 :     $retVal .= $bitThing;
606 :     # Finish up with the organism name and the hit location.
607 :     $retVal .= "$orgName:::" . $hitLoc->SeedString;
608 : parrello 1.2 Trace("Blast sort key is $retVal, based on group \"$group\".") if T(4);
609 : parrello 1.1 # Return the result.
610 :     return $retVal;
611 :     }
612 :    
613 :     =head2 Virtual Methods
614 :    
615 :     =head3 Form
616 :    
617 :     C<< my $html = $shelp->Form(); >>
618 :    
619 :     Generate the HTML for a form to request a new search.
620 :    
621 :     =cut
622 :    
623 :     sub Form {
624 :     # Get the parameters.
625 :     my ($self) = @_;
626 :     # Get the CGI and sprout objects.
627 :     my $cgi = $self->Q();
628 :     my $sprout = $self->DB();
629 :     # Start the form.
630 : parrello 1.3 my $retVal = $self->FormStart("Sequence Search");
631 : parrello 1.1 # Get the list of selected genomes.
632 :     my @selected = $cgi->param('genome');
633 :     # Get the incoming genome sequence and the options. These are the incoming scalar
634 :     # values; the others apply to menus.
635 :     my $sequence = $cgi->param('sequence') || "";
636 :     my $options = $cgi->param('options') || "";
637 :     # Get the selected tool and compute the corresponding button caption.
638 :     my $toolChosen = $cgi->param('tool') || "";
639 :     my $caption = ($toolChosen ? $ToolTable{$toolChosen}->{buttonName} : "");
640 :     # Create the menus. First is the tool menu.
641 :     my @valueList = ("", sort keys %ToolTable);
642 :     my $toolMenu = $cgi->popup_menu(-name => "tool", -values => \@valueList,
643 :     -onChange => 'setSubmit(this.value)');
644 :     # Create the genome selection menu.
645 :     my $menu = $self->NmpdrGenomeMenu("genome", 'multiple', \@selected);
646 :     # The general structure here will be to have the DNA/protein sequence on the top,
647 :     # then genome selector and finally the small controls.
648 :     my @rows = ();
649 :     push @rows, $cgi->Tr($cgi->td("Tool"), $cgi->td($toolMenu));
650 :     push @rows, $cgi->Tr($cgi->td("Sequence in Raw or FASTA Format"),
651 :     $cgi->td({colspan => 2}, $cgi->textarea(-name => "sequence", -rows => 5, -cols => 62)));
652 :     push @rows, $cgi->Tr($cgi->td("Select one or more genomes"),
653 :     $cgi->td({colspan => 2}, $menu));
654 :     push @rows, $cgi->Tr($cgi->td("Blast Options"),
655 :     $cgi->td({colspan => 2}, $cgi->textfield(-name => 'options', size => 45)));
656 :     push @rows, $cgi->Tr($cgi->td("Neighborhood Width"),
657 :     $cgi->td($cgi->textfield(-name => 'neighborhood', -size => 5, -value => RHLocations::NEIGHBORHOOD)));
658 :     push @rows, $self->SubmitRow($caption);
659 :     # Create the table.
660 :     $retVal .= $self->MakeTable(\@rows);
661 :     # Close the form.
662 :     $retVal .= $self->FormEnd();
663 :     # Get the form name. We'll need it for the upcoming javascript.
664 :     my $formName = $self->FormName();
665 :     # Add the javaScript to update the button.
666 :     $retVal .= "<script type=\"text/javascript\">\n" .
667 :     " function setSubmit(tool) {\n" .
668 :     " switch (tool) {\n";
669 :     for my $tool (sort keys %ToolTable) {
670 :     $retVal .= " case '$tool' : document.$formName.Search.value = '$ToolTable{$tool}->{buttonName}'; break;\n";
671 :     }
672 :     $retVal .= " }\n" .
673 :     " }\n" .
674 :     "</script>\n";
675 :     # Return the result.
676 :     return $retVal;
677 :     }
678 :    
679 :     =head3 Find
680 :    
681 :     C<< my $resultCount = $shelp->Find(); >>
682 :    
683 :     Conduct a search based on the current CGI query parameters. The search results will
684 :     be written to the session cache file and the number of results will be
685 :     returned. If the search parameters are invalid, a result count of C<undef> will be
686 :     returned and a result message will be stored in this object describing the problem.
687 :    
688 :     =cut
689 :    
690 :     sub Find {
691 :     # Get the parameters.
692 :     my ($self) = @_;
693 :     # Declare the return variable. If it remains undefined, the caller will
694 :     # know there's been an error.
695 :     my $retVal;
696 :     # Get the sprout and CGI query objects.
697 :     my $cgi = $self->Q();
698 :     my $sprout = $self->DB();
699 :     # Get the genome IDs.
700 :     my @genomes = $self->GetGenomes('genome');
701 :     if (! @genomes) {
702 :     $self->SetMessage("No genomes specified.");
703 :     } else {
704 :     # Get the sequence in its raw form.
705 :     my $sequence = $cgi->param('sequence');
706 :     if (! $sequence) {
707 :     $self->SetMessage("No sequence specified.");
708 :     } else {
709 :     # Get the blast options and the tool type.
710 :     my $toolType = $cgi->param('tool') || "";
711 :     my $options = $cgi->param('options') || "";
712 :     # Insure the tool type is valid.
713 :     if (! $toolType) {
714 :     $self->SetMessage("No tool specified.");
715 :     } else {
716 :     # Create the result helper.
717 :     my $rhelp = RHLocations->new($self);
718 :     # Compute the columns. The default columns are determined mostly by the result type.
719 :     $self->DefaultColumns($rhelp);
720 :     #
721 :     # (NOTE: Optional columns should be added here so they go before the alignment column.)
722 :     #
723 :     # Define the extra columns for this tool.
724 :     for my $extraCol (@{$ToolTable{$toolType}->{extras}}) {
725 :     $rhelp->AddExtraColumn(@{$extraCol});
726 :     }
727 :     # Now, the sequence. We use a utility to convert it to a uniform
728 :     # FASTA format of the correct type. Note that for patterns there
729 :     # will not be a header line.
730 :     my $sequenceThing = $self->ComputeFASTA($ToolTable{$toolType}->{inputType}, $cgi->param('sequence'));
731 :     Trace("Fasta sequence has length " . length($sequenceThing) . ".") if T(3);
732 :     # Only proceed if the FASTA sequence was converted successfully.
733 :     if ($sequenceThing) {
734 :     # Write the FASTA to a temporary file. We use the session name with a suffix of
735 :     # "fasta" for the file name.
736 :     my $tmpFile = $self->GetTempFileName('fasta');
737 :     Tracer::PutFile($tmpFile, $sequenceThing);
738 :     # Call the tool.
739 :     my @results = $self->ExecTool($tmpFile, \@genomes, $toolType, $options);
740 :     # Start the output session.
741 :     $self->OpenSession($rhelp);
742 :     # Compute the number of results.
743 :     $retVal = scalar(@results);
744 :     $self->PrintLine("$retVal hit locations found.<br />");
745 :     # Loop through the results.
746 :     my $resultCounter = 0;
747 :     for my $result (@results) {
748 :     # Store the result fields as extra columns. Only the result fields
749 :     # defined as extra columns in the tool table will be kept by the
750 :     # result helper.
751 :     $rhelp->PutExtraColumns(%{$result});
752 :     # Create an ERDB object for the hit location. We use BasicLocation to
753 :     # parse it into its components.
754 :     my $data = RHLocations::BuildLocationRecord($result->{hitLoc});
755 :     # Now put the location data and its sort key into the output stream.
756 :     $rhelp->PutData($result->{sortKey}, $result->{hitLoc}, $data);
757 :     # Tell the user every so often what kind of progress we're making.
758 :     $resultCounter++;
759 :     if ($resultCounter % 100 == 0) {
760 :     $self->PrintLine("$resultCounter of $retVal hit locations processed.<br />");
761 :     }
762 :     }
763 :     # Close the output session.
764 :     $self->CloseSession();
765 :     }
766 :     }
767 :     }
768 :     }
769 :     # Return the result count.
770 :     return $retVal;
771 :     }
772 :    
773 :     =head3 Description
774 :    
775 :     C<< my $htmlText = $shelp->Description(); >>
776 :    
777 :     Return a description of this search. The description is used for the table of contents
778 :     on the main search tools page. It may contain HTML, but it should be character-level,
779 :     not block-level, since the description is going to appear in a list.
780 :    
781 :     =cut
782 :    
783 :     sub Description {
784 :     # Get the parameters.
785 :     my ($self) = @_;
786 :     # Return the result.
787 :     return "Search for matching DNA or protein regions.";
788 :     }
789 :    
790 :     =head3 SearchTitle
791 :    
792 :     C<< my $titleHtml = $shelp->SearchTitle(); >>
793 :    
794 :     Return the display title for this search. The display title appears above the search results.
795 :     If no result is returned, no title will be displayed. The result should be an html string
796 :     that can be legally put inside a block tag such as C<h3> or C<p>.
797 :    
798 :     =cut
799 :    
800 :     sub SearchTitle {
801 :     # Get the parameters.
802 :     my ($self) = @_;
803 :     # Compute the title. We extract the tool ID from the query parameters.
804 :     my $cgi = $self->Q();
805 :     my $tool = $cgi->param('tool');
806 :     my $retVal = $ToolTable{$tool}->{title};
807 :     # Return it.
808 :     return $retVal;
809 :     }
810 :    
811 :    
812 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3