[Bio] / Sprout / SHToolSearch.pm Repository:
ViewVC logotype

Annotation of /Sprout/SHToolSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SHToolSearch;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use CGI;
8 :     use HTML;
9 :     use Sprout;
10 :     use Sim;
11 :     use FeatureData;
12 :     use FeatureQuery;
13 :     use RHLocations;
14 :     use ERDBObject;
15 :     use XML::Simple;
16 :     use Data::Dumper;
17 :     use base 'SearchHelper';
18 :    
19 :     =head1 Tool Search Feature Search Helper
20 :    
21 :     =head2 Introduction
22 :    
23 :     This object conducts searches using command-line tools that compare a DNA or Protein
24 :     pattern against a text database of genetic information.
25 :    
26 :     The tools are invoked by a method called L</ExecScan>. This processes the input,
27 :     invokes the tool, and returns a list of hash references. The hash references contain
28 :     whatever fields are required in the search output. Two fields are required: C<hitLoc>,
29 :     which specifies the location of the matching data, and C<sortKey>, which is used
30 :     to sort it.
31 :    
32 :     =over 4
33 :    
34 :     =item sequence
35 :    
36 :     DNA or protein sequence. This can be a feature ID, pattern, or a FASTA string. A feature ID
37 :     is automatically converted to a FASTA string. If a pattern is specified, it is passed
38 :     directly to the pattern match tool.
39 :    
40 :     =item tool
41 :    
42 :     Tool to use: currently C<blastp>, C<blastn>, C<blastx>, C<dnaScan>, or C<protScan>.
43 :    
44 :     =item options
45 :    
46 :     BLAST options, encoded as a string
47 :    
48 :     =item genome[]
49 :    
50 :     IDs of the genomes to be included in the search.
51 :    
52 :     =back
53 :    
54 :     =cut
55 :    
56 :     =head2 Data Structures
57 :    
58 :     =head3 ToolTable
59 :    
60 :     This search is a fairly complicated thing that is basically used to search for
61 :     sequence data in one or more genomes using command-line tools. The idea is that
62 :     any command-line tool can be put in here and made available for use by the
63 :     search system. From a coding point of view, it would make sense to have a
64 :     separate search helper for each tool, but combining them in this way has
65 :     a psychological benefit for the user. The one requirement across all the tools
66 :     is that the input is a DNA or protein sequence of some sort. For the BLAST
67 :     tools, the sequence is a standard FASTA thing. For the scan-for-matches
68 :     tools, the sequence is a search pattern.
69 :    
70 :     The tool table is a hash that performs the services normally expected of
71 :     subclasses. The hash maps each tool name to a hash reference. The various
72 :     fields in the hash references are as follows.
73 :    
74 :     =over 4
75 :    
76 :     =item db_type
77 :    
78 :     Type of database against which the tool runs: C<prot> for a protein database and
79 :     C<dna> for a DNA database. This tells us which files to pass into the tool as
80 :     the search database.
81 :    
82 :     =item exec
83 :    
84 :     Execution string for the tool. The variable C<$seqFile> is presumed to be the location
85 :     of the input sequence, C<$db> is the directory, and C<$options> are the user-specified options.
86 :    
87 :     =item output
88 :    
89 :     The method for converting the output from the tool (presented as a set of lines of text) to
90 :     the output format expected by the BLAST search tool (a list of hash references).
91 :    
92 :     =item inputType
93 :    
94 :     The type of input expected. This is either a FASTA for protein or DNA (C<prot> or C<dna>),
95 :     or a scan pattern for protein or DNA (C<protPattern> or C<dnaPattern>). If additional
96 :     formats are needed, they must be programmed into the B<ComputeFASTA> method of C<SearchHelper>.
97 :    
98 :     =item extras
99 :    
100 :     A list of lists describing the extra columns desired in the output. The extra column names
101 :     must exist as keys in the hashes produced by the output method. In addition to the fields named
102 :     in this list, the output hashes must contain a field named C<sortKey> that can be used to sort
103 :     the results. The lists correspond exactly to the parameter list for the B<AddExtraColumn>
104 :     method of the B<ResultHelper> class.
105 :    
106 :     =item buttonName
107 :    
108 :     The name to display on the search button if this tool is selected.
109 :    
110 :     =item targetRelationship
111 :    
112 :     The name of the relationship from the genome to the entity whose IDs will appear in the output regions. This is C<HasContig> for
113 :     a DNA search and C<HasFeature> for a feature search.
114 :    
115 :     =item title
116 :    
117 :     Title to be used for the search results pages.
118 :    
119 :     =back
120 :    
121 :     =cut
122 :    
123 :     my %ToolTable = ( blastp => { db_type => 'prot',
124 :     exec => 'blastall -i $seqFile -d $db -m 7 -FF -p blastp $options',
125 :     output => \&blastXML,
126 :     inputType => 'prot',
127 :     extras => [[bsc => 0, title => "Bit Score", style => "rightAlign", download => 'num'],
128 :     [queryLoc => 1, title => "Query Location", style => "leftAlign", download => 'text'],
129 :     [hitLoc => 2, title => "Hit Location", style => "leftAlign", download => 'text'],
130 :     [alignment => undef, title => "Alignment", style => "code", download => 'align']],
131 :     buttonName => 'BLAST',
132 :     targetRelationship => 'HasFeature',
133 :     title => 'Blastp Search',
134 :     },
135 :     blastx => { db_type => 'prot',
136 :     exec => 'blastall -i $seqFile -d $db -m 7 -FF -p blastx $options',
137 :     output => \&blastXML,
138 :     inputType => 'prot',
139 :     extras => [[bsc => 0, title => "Bit Score", style => "rightAlign", download => 'num'],
140 :     [queryLoc => 1, title => "Query Location", style => "leftAlign", download => 'text'],
141 :     [hitLoc => 2, title => "Hit Location", style => "leftAlign", download => 'text'],
142 :     [alignment => undef, title => "Alignment", style => "code", download => 'align']],
143 :     buttonName => 'BLAST',
144 :     targetRelationship => 'HasFeature',
145 :     title => 'Blastx Search',
146 :     },
147 :     blastn => { db_type => 'dna',
148 :     exec => 'blastall -i $seqFile -d $db -m 7 -FF -p blastn $options',
149 :     output => \&blastXML,
150 :     inputType => 'dna',
151 :     extras => [[bsc => 0, title => "Bit Score", style => "rightAlign", download => 'num'],
152 :     [queryLoc => 1, title => "Query Location", style => "leftAlign", download => 'text'],
153 :     [hitLoc => 2, title => "Hit Location", style => "leftAlign", download => 'text'],
154 :     [alignment => undef, title => "Alignment", style => "code", download => 'align']],
155 :     buttonName => 'BLAST',
156 :     targetRelationship => 'HasContig',
157 :     title => 'Blastn Search',
158 :     },
159 :     dnaScan => { db_type => 'dna',
160 :     exec => 'scan_for_matches -c $seqFile $options <$db',
161 :     output => \&scanLines,
162 :     inputType => 'dnaPattern',
163 :     extras => [[hitLoc => 0, title => "Hit Location", style => "leftAlign", download => 'text'],
164 :     [alignment => undef, title => "Matching Sequence", style => "code", download => 'align']],
165 :     buttonName => 'SCAN',
166 :     targetRelationship => 'HasContig',
167 :     title => 'DNA Scan for Matches',
168 :     },
169 :     protScan => { db_type => 'prot',
170 :     exec => 'scan_for_matches -p $seqFile $options <$db',
171 :     output => \&scanLines,
172 :     inputType => 'protPattern',
173 :     extras => [[hitLoc => 0, title => "Hit Location", style => "leftAlign", download => 'text'],
174 :     [alignment => undef, title => "Matching Sequence", style => "code", download => 'align']],
175 :     buttonName => 'SCAN',
176 :     targetRelationship => 'HasFeature',
177 :     title => 'Protein Scan for Matches',
178 :     }
179 :     );
180 :    
181 :     =head2 Public Methods
182 :    
183 :     =head3 ExecTool
184 :    
185 :     C<< my @sims = $shelp->ExecTool($seqFile, \@genomes, $tool, $options); >>
186 :    
187 :     Call a tool to search for DNA sequences or features.
188 :    
189 :     =over 4
190 :    
191 :     =item seqFile
192 :    
193 :     Name of a file containing the input sequence. This will either be a FASTA or a scan pattern.
194 :    
195 :     =item genomes
196 :    
197 :     A list of the IDs for the target genomes of the search.
198 :    
199 :     =item tool
200 :    
201 :     Name of the tool to use.
202 :    
203 :     =item options
204 :    
205 :     Options to pass to the tool, formatted for the command line.
206 :    
207 :     =item RETURN
208 :    
209 :     Returns a list of hashes, each representing a single match point.
210 :    
211 :     =back
212 :    
213 :     =cut
214 :    
215 :     sub ExecTool {
216 :     # Get the parameters.
217 :     my ($self, $seqFile, $genomes, $tool, $options) = @_;
218 :     # Declare the return variable.
219 :     my @retVal = ();
220 :     # Get a Sprout database object.
221 :     my $sprout = $self->DB();
222 :     # Insure the blast tools can find the blast matrix directory.
223 :     if (! $ENV{"BLASTMAT"}) { $ENV{"BLASTMAT"} = $FIG_Config::blastmat; }
224 :     # Get the location of the tools.
225 :     my $toolDir = "$FIG_Config::ext_bin";
226 :     Trace("ExecTool for file $seqFile, tool => $tool, options => $options.") if T(2);
227 :     # Get this tool's parameters.
228 :     my $toolData = $ToolTable{$tool};
229 :     # Determine whether or not this is a multi-genome call.
230 :     my $genomeCount = scalar(@$genomes);
231 :     Trace("$genomeCount genomes in list.") if T(2);
232 :     my $multiGenome = ($genomeCount > 1 ? 1 : 0);
233 :     # Loop through the genome list.
234 :     for my $genome (@$genomes) {
235 :     $self->PrintLine("Performing $tool for $genome.<br />");
236 :     Trace("Matching against $genome.") if T(3);
237 :     # Get the target database location for this genome.
238 :     my $db = ($toolData->{db_type} eq 'prot' ?
239 :     "$FIG_Config::organisms/$genome/Features/peg/fasta" :
240 :     "$FIG_Config::organisms/$genome/contigs");
241 :     # Only proceed if the database has data in it.
242 :     if (-s $db) {
243 :     # Verify the database.
244 :     VerifyDB($db, $toolData->{db_type});
245 :     # Get the command.
246 :     my $string = $toolData->{exec};
247 :     # Make the substitutions.
248 :     my %subs = (db => $db, options => ($options || ''), seqFile => $seqFile);
249 :     for my $key (keys %subs) {
250 :     $string =~ s/\$$key/$subs{$key}/g;
251 :     }
252 :     my $command = "$toolDir/$string";
253 :     # Call the command.
254 :     Trace("Executing: $command") if T(3);
255 :     my @data = TICK($command);
256 :     my $dataLineCount = scalar(@data);
257 :     Trace("$dataLineCount lines returned from $tool.") if T(3);
258 :     $self->PrintLine("$dataLineCount data lines returned from $tool.<br />");
259 :     # Compute the maximum number of hits we want back from this genome. Note
260 :     # that scalar(@data) will always be at lest the total number of hits
261 :     # returned.
262 :     my $maxHits = ($multiGenome ? $FIG_Config::blast_limit : scalar(@data));
263 :     # Process the lines. Note that we pass ourselves as the first parameter,
264 :     # mimicking what would happen if the output routine were called as an
265 :     # instance method.
266 :     my @results = &{$toolData->{output}}($self, \@data, $maxHits, $genome);
267 :     # Get a hash of valid targets.
268 :     my $targetRel = $toolData->{targetRelationship};
269 :     my %targetHash = map { $_ => 1 } $sprout->GetFlat([$targetRel], "$targetRel(from-link) = ?",
270 :     [$genome], "$targetRel(to-link)");
271 :     # Insure the results all have valid targets.
272 :     my $keepCount = 0;
273 :     for my $result (@results) {
274 :     # Get the hit location and convert it to a location object.
275 :     my $targetLoc = BasicLocation->new($result->{hitLoc});
276 :     # Test the contig to see if it's a valid target. It's valid if
277 :     # we found it when we did the query above. Invalid targets come
278 :     # from things added to SEED after the Sprout was built.
279 :     if ($targetHash{$targetLoc->Contig}) {
280 :     push @retVal, $result;
281 :     $keepCount++;
282 :     }
283 :     }
284 :     my $message = scalar(@results) . " hits found. $keepCount kept.";
285 :     Trace($message) if T(3);
286 :     $self->PrintLine("$message<br />");
287 :     }
288 :     }
289 :     # Return the resulting list.
290 :     return @retVal;
291 :     }
292 :    
293 :     =head2 Output Processing Methods
294 :    
295 :     =head3 blastXML
296 :    
297 :     C<< my @dataRows = $shelp->blastXML(\@data, $maxHits, $genome); >>
298 :    
299 :     Process XML output from Blast.
300 :    
301 :     The BLAST output is presented as a list of text lines, each of which contains a line-end
302 :     character. We combine all these lines into a single string and read it in using an XML parser.
303 :     The parsed result contains the interesting data 3 levels deep as C<iteration hits>. Each hit
304 :     corresponds to a single target object-- either a feature or a contig-- identified as the C<Hit_def>.
305 :     Inside each hit is a list of C<Hit_hsps>. These are single-level hashes that represent a point of contact
306 :     between the query sequence and the target. The from- and to-locations for the query sequence
307 :     are stored as C<Hsp_query-from> and C<Hsp_query-to>, and for the target object they are
308 :     C<Hsp_hit-from> and C<Hsp_hit-to>. The alignment is stored in C<Hsp_hseq>, C<Hsp_qseq>, and
309 :     C<Hsp_midline>. The bit score is in C<Hsp_bit-score>.
310 :    
311 :     =over 4
312 :    
313 :     =item data
314 :    
315 :     Reference to a list of XML output lines from BLAST.
316 :    
317 :     =item maxHits
318 :    
319 :     Maximum number of hits to return. This is used to control the output size in multi-genome
320 :     searches.
321 :    
322 :     =item genome
323 :    
324 :     ID of the target genome.
325 :    
326 :     =item RETURN
327 :    
328 :     Returns a list of hash references. In each hash, C<hitLoc> indicates the hit location, C<queryLoc>
329 :     indicates the query location, C<alignment> the three-line alignment between the
330 :     query and hit locations, C<sortKey> a sort key based on the bit score, and C<bsc> the bit
331 :     score itself.
332 :    
333 :     =back
334 :    
335 :     =cut
336 :    
337 :     sub blastXML {
338 :     # Get the parameters.
339 :     my ($self, $data, $maxHits, $genome) = @_;
340 :     # Declare the return variable.
341 :     my @retVal;
342 :     Trace("Processing blastXML output for $genome. Max Hits = $maxHits.") if T(2);
343 :     # Set up a counter so that we stop after the appropriate
344 :     # number of hits.
345 :     my $outputHits = 0;
346 :     # Create the xml string from the data.
347 :     my $xmlString = join("", @{$data});
348 :     # Parse the XML. The various options help to keep the result more compact and predictable.
349 :     # Note we do some major error-checking here, because XMLin is very delicate.
350 :     my $xmlThing;
351 :     eval {
352 :     $xmlThing = XMLin($xmlString, GroupTags => { Iteration_hits => 'Hit', Hit_hsps => 'Hsp' },
353 :     ForceArray => ['Hit', 'Hsp']);
354 :     };
355 :     if ($@) {
356 :     Confess("XML parsing error for $genome: $@");
357 :     } elsif (! defined($xmlThing)) {
358 :     Trace("No result from XML parse for $genome.") if T(3);
359 :     } else {
360 :     Trace("XML thing for $genome = \n" . Dumper($xmlThing)) if T(blastXML => 3);
361 :     # Get the name of the query object.
362 :     my $queryName = $xmlThing->{'BlastOutput_query-def'};
363 :     # Strip out the comments (if any).
364 :     if ($queryName =~ /^(\S+)\s+/) {
365 :     $queryName = $1;
366 :     }
367 :     my $iterationData = $xmlThing->{BlastOutput_iterations}->{Iteration}->{Iteration_hits};
368 :     Trace("Iteration data contains " . scalar(@{$iterationData}) . " hits.") if T(3);
369 :     # "$iterationData" is now a list of hits. We process these one at a time in the
370 :     # following loop.
371 :     for my $hit (@{$iterationData}) { last if ($outputHits >= $maxHits);
372 :     # Get the hit target. This is the contig or feature containing the hit locations.
373 :     # We canonicalize it so that it has the genome name somewhere in it.
374 :     my $hitArea = Canonize($hit->{Hit_def}, $genome);
375 :     Trace("Hit on $hitArea. Point count = " . scalar(@{$hit->{Hit_hsps}}) . ".") if T(3);
376 :     # Now we loop through the hit points for this hit.
377 :     for my $point (@{$hit->{Hit_hsps}}) { last if ($outputHits >= $maxHits);
378 :     # We need to create an output tuple for this hit. First, we
379 :     # create the alignment string.
380 :     my $alignment = join("<br />", $point->{Hsp_qseq}, $point->{Hsp_midline}, $point->{Hsp_hseq});
381 :     # Next, we need to create the locations.
382 :     my $hitLoc = BasicLocation->new($hitArea, $point->{'Hsp_hit-from'}, "_", $point->{'Hsp_hit-to'});
383 :     my $queryLoc = BasicLocation->new($queryName, $point->{'Hsp_query-from'}, "_", $point->{'Hsp_query-to'});
384 :     # Finally, we get the bit score, formatted nicely for the display.
385 :     my $bsc = sprintf("%0.3f", $point->{'Hsp_bit-score'});
386 :     # Now we can build our output tuple.
387 :     push @retVal, { queryLoc => $queryLoc->SeedString,
388 :     hitLoc => $hitLoc->SeedString,
389 :     bsc => $bsc,
390 :     alignment => $alignment,
391 :     sortKey => $self->ToolSortKey($genome, $bsc, $hitLoc),
392 :     };
393 :     # Update our result counter. Both loops will exit when this equals the maximum.
394 :     $outputHits++;
395 :     }
396 :     }
397 :     }
398 :     # Return the results.
399 :     return @retVal;
400 :     }
401 :    
402 :    
403 :     =head3 scanLines
404 :    
405 :     C<< my @dataRows = $shelp->scanLines(\@data, $maxHits, $genome); >>
406 :    
407 :     Process output from the scan-for-matches tool. Each match consists of two
408 :     output lines. The first contains the hit source (feature or contig), the
409 :     begin point, and the end point. The second contains the sequence matched.
410 :     Unlike a BLAST search, there is no concept of a query location: the entire
411 :     query matches.
412 :    
413 :     =over 4
414 :    
415 :     =item data
416 :    
417 :     Reference to a list of the output lines from the scan.
418 :    
419 :     =item maxHits
420 :    
421 :     The maximum number of matches to return. This value is used to control
422 :     the output size in multi-genome searches.
423 :    
424 :     =item genome
425 :    
426 :     The ID of the genome containing the hits.
427 :    
428 :     =item RETURN
429 :    
430 :     Returns a list of hash references. In each hash, C<hitLoc> indicates the hit location, C<sortKey> a
431 :     sort key, and C<alignment> the matching sequence.
432 :    
433 :     =back
434 :    
435 :     =cut
436 :    
437 :     sub scanLines {
438 :     # Get the parameters.
439 :     my ($self, $data, $maxHits, $genome) = @_;
440 :     Trace("Processing scanLines output for $genome.") if T(2);
441 :     # Declare the return variable.
442 :     my @retVal;
443 :     # Insure we don't try to return more than the maximum number
444 :     # of hits. Each hit is two lines, so this involves multiplying
445 :     # by two.
446 :     my $maxLines = $maxHits * 2;
447 :     if ($maxLines > scalar(@{$data})) {
448 :     $maxLines = scalar(@{$data});
449 :     }
450 :     # Loop through the lines containing the hits we want.
451 :     for (my $i = 0; $i < $maxLines; $i += 2) {
452 :     # Parse the first line, containing the hit location.
453 :     $data->[$i] =~ /^>([^:]+):\[(\d+),(\d+)\]/;
454 :     # Convert the result to a location.
455 :     my ($hitObject, $beg, $end) = ($1, $2, $3);
456 :     $hitObject = Canonize($hitObject, $genome);
457 :     my $hitLoc = BasicLocation->new($hitObject, $beg, "_", $end);
458 :     # Get the match string.
459 :     my $matchString = $data->[$i+1];
460 :     chomp $matchString;
461 :     # Output the result.
462 :     push @retVal, { hitLoc => $hitLoc->SeedString,
463 :     alignment => $matchString,
464 :     sortKey => $self->ToolSortKey($genome, 0, $hitLoc),
465 :     };
466 :     }
467 :     # Return the result.
468 :     return @retVal;
469 :     }
470 :    
471 :     =head2 Utility Methods
472 :    
473 :     =head3 Canonize
474 :    
475 :     C<< my $newName = CallScanner::Canonize($name, $genomeID); >>
476 :    
477 :     If the specified name is a contig ID, insure it has a genome ID in front of it.
478 :    
479 :     =over 4
480 :    
481 :     =item name
482 :    
483 :     Name to fix up.
484 :    
485 :     =item genomeID
486 :    
487 :     ID of the genome to be added to the contig ID, if necessary.
488 :    
489 :     =item RETURN
490 :    
491 :     Returns a fixed-up name.
492 :    
493 :     =back
494 :    
495 :     =cut
496 :    
497 :     sub Canonize {
498 :     # Get the parameters.
499 :     my ($name, $genomeID) = @_;
500 :     # Declare the return variable.
501 :     my $retVal = $name;
502 :     # Check for a genome ID already in place or a feature ID.
503 :     if ($retVal !~ /:/ && $retVal !~ /^fig/) {
504 :     $retVal = "$genomeID:$name";
505 :     }
506 :     # Return the result.
507 :     return $retVal;
508 :     }
509 :    
510 :     =head3 VerifyDB
511 :    
512 :     C<< CallScanner::VerifyDB($db, $type); >>
513 :    
514 :     Verify that the specified FASTA file has BLAST databases. If the databases
515 :     do not exist, they will be created. If they are older than the FASTA file,
516 :     they will be regenerated.
517 :    
518 :     =over 4
519 :    
520 :     =item db
521 :    
522 :     Name of the FASTA file.
523 :    
524 :     =item type
525 :    
526 :     Type of database desired: C<prot> for protein and C<dna> for DNA.
527 :    
528 :     =back
529 :    
530 :     =cut
531 :    
532 :     sub VerifyDB {
533 :     # Get the parameters.
534 :     my ($db, $type) = @_;
535 :     # Process according to the data type.
536 :     if ($type eq 'prot') {
537 :     if ((! -s "$db.psq") || (-M "$db.psq" > -M $db)) {
538 :     Trace("Building protein FASTA database for $db.") if T(3);
539 :     system "$FIG_Config::ext_bin/formatdb -p T -i $db";
540 :     }
541 :     } else {
542 :     if ((! -s "$db.nsq") || (-M "$db.nsq" > -M $db)) {
543 :     Trace("Building DNA FASTA database for $db.") if T(3);
544 :     system "$FIG_Config::ext_bin/formatdb -p F -i $db";
545 :     }
546 :     }
547 :     }
548 :    
549 :     =head3 ToolSortKey
550 :    
551 :     C<< my $key = $shelp->ToolSortKey($genome, $bsc, $hitLoc); >>
552 :    
553 :     Return the sort key for a match result against the specified
554 :     genome with the specified bit-score. The results are
555 :     to be sorted by bit-score, with the highest score at the top
556 :     and preferential treatment given to NMPDR core genomes.
557 :     The tie-breaker for entries with the same score is the
558 :     organism name followed by the hit location.
559 :    
560 :     =over 4
561 :    
562 :     =item genome
563 :    
564 :     ID of the genome containing the target area.
565 :    
566 :     =item bsc
567 :    
568 :     Bit-score for the match.
569 :    
570 :     =item hitLoc
571 :    
572 :     The hit location, encoded as a location object.
573 :    
574 :     =item RETURN
575 :    
576 :     Returns a key field that can be used to sort the match in among the
577 :     results in the desired fashion.
578 :    
579 :     =back
580 :    
581 :     =cut
582 :    
583 :     sub ToolSortKey {
584 :     # Get the parameters.
585 :     my ($self, $genome, $bsc, $hitLoc) = @_;
586 :     # Get the group from the genome ID.
587 :     my ($orgName, $group) = $self->OrganismData($genome);
588 :     # Declare the return value. It begins with an "A" if this is an NMPDR
589 :     # feature and a "Z" otherwise.
590 :     my $retVal = ($group ? "A" : "Z");
591 :     # Convert the bit score to an integer.
592 :     my $bitScore = int(10 * $bsc);
593 :     # We want to sort by descending bit score, so subtract the result from a big
594 :     # number.
595 :     my $bitThing = 10000000000 - $bitScore;
596 :     if ($bitThing < 0) {
597 :     $bitThing = 0;
598 :     }
599 :     # Pad it to 10 characters.
600 :     $bitThing = "0$bitThing" while (length $bitThing < 10);
601 :     # Tack it onto the group character.
602 :     $retVal .= $bitThing;
603 :     # Finish up with the organism name and the hit location.
604 :     $retVal .= "$orgName:::" . $hitLoc->SeedString;
605 :     Trace("Blast sort key is $retVal.") if T(4);
606 :     # Return the result.
607 :     return $retVal;
608 :     }
609 :    
610 :     =head2 Virtual Methods
611 :    
612 :     =head3 Form
613 :    
614 :     C<< my $html = $shelp->Form(); >>
615 :    
616 :     Generate the HTML for a form to request a new search.
617 :    
618 :     =cut
619 :    
620 :     sub Form {
621 :     # Get the parameters.
622 :     my ($self) = @_;
623 :     # Get the CGI and sprout objects.
624 :     my $cgi = $self->Q();
625 :     my $sprout = $self->DB();
626 :     # Start the form.
627 :     my $retVal = $self->FormStart("Tool Search");
628 :     # Get the list of selected genomes.
629 :     my @selected = $cgi->param('genome');
630 :     # Get the incoming genome sequence and the options. These are the incoming scalar
631 :     # values; the others apply to menus.
632 :     my $sequence = $cgi->param('sequence') || "";
633 :     my $options = $cgi->param('options') || "";
634 :     # Get the selected tool and compute the corresponding button caption.
635 :     my $toolChosen = $cgi->param('tool') || "";
636 :     my $caption = ($toolChosen ? $ToolTable{$toolChosen}->{buttonName} : "");
637 :     # Create the menus. First is the tool menu.
638 :     my @valueList = ("", sort keys %ToolTable);
639 :     my $toolMenu = $cgi->popup_menu(-name => "tool", -values => \@valueList,
640 :     -onChange => 'setSubmit(this.value)');
641 :     # Create the genome selection menu.
642 :     my $menu = $self->NmpdrGenomeMenu("genome", 'multiple', \@selected);
643 :     # The general structure here will be to have the DNA/protein sequence on the top,
644 :     # then genome selector and finally the small controls.
645 :     my @rows = ();
646 :     push @rows, $cgi->Tr($cgi->td("Tool"), $cgi->td($toolMenu));
647 :     push @rows, $cgi->Tr($cgi->td("Sequence in Raw or FASTA Format"),
648 :     $cgi->td({colspan => 2}, $cgi->textarea(-name => "sequence", -rows => 5, -cols => 62)));
649 :     push @rows, $cgi->Tr($cgi->td("Select one or more genomes"),
650 :     $cgi->td({colspan => 2}, $menu));
651 :     push @rows, $cgi->Tr($cgi->td("Blast Options"),
652 :     $cgi->td({colspan => 2}, $cgi->textfield(-name => 'options', size => 45)));
653 :     push @rows, $cgi->Tr($cgi->td("Neighborhood Width"),
654 :     $cgi->td($cgi->textfield(-name => 'neighborhood', -size => 5, -value => RHLocations::NEIGHBORHOOD)));
655 :     push @rows, $self->SubmitRow($caption);
656 :     # Create the table.
657 :     $retVal .= $self->MakeTable(\@rows);
658 :     # Close the form.
659 :     $retVal .= $self->FormEnd();
660 :     # Get the form name. We'll need it for the upcoming javascript.
661 :     my $formName = $self->FormName();
662 :     # Add the javaScript to update the button.
663 :     $retVal .= "<script type=\"text/javascript\">\n" .
664 :     " function setSubmit(tool) {\n" .
665 :     " switch (tool) {\n";
666 :     for my $tool (sort keys %ToolTable) {
667 :     $retVal .= " case '$tool' : document.$formName.Search.value = '$ToolTable{$tool}->{buttonName}'; break;\n";
668 :     }
669 :     $retVal .= " }\n" .
670 :     " }\n" .
671 :     "</script>\n";
672 :     # Return the result.
673 :     return $retVal;
674 :     }
675 :    
676 :     =head3 Find
677 :    
678 :     C<< my $resultCount = $shelp->Find(); >>
679 :    
680 :     Conduct a search based on the current CGI query parameters. The search results will
681 :     be written to the session cache file and the number of results will be
682 :     returned. If the search parameters are invalid, a result count of C<undef> will be
683 :     returned and a result message will be stored in this object describing the problem.
684 :    
685 :     =cut
686 :    
687 :     sub Find {
688 :     # Get the parameters.
689 :     my ($self) = @_;
690 :     # Declare the return variable. If it remains undefined, the caller will
691 :     # know there's been an error.
692 :     my $retVal;
693 :     # Get the sprout and CGI query objects.
694 :     my $cgi = $self->Q();
695 :     my $sprout = $self->DB();
696 :     # Get the genome IDs.
697 :     my @genomes = $self->GetGenomes('genome');
698 :     if (! @genomes) {
699 :     $self->SetMessage("No genomes specified.");
700 :     } else {
701 :     # Get the sequence in its raw form.
702 :     my $sequence = $cgi->param('sequence');
703 :     if (! $sequence) {
704 :     $self->SetMessage("No sequence specified.");
705 :     } else {
706 :     # Get the blast options and the tool type.
707 :     my $toolType = $cgi->param('tool') || "";
708 :     my $options = $cgi->param('options') || "";
709 :     # Insure the tool type is valid.
710 :     if (! $toolType) {
711 :     $self->SetMessage("No tool specified.");
712 :     } else {
713 :     # Create the result helper.
714 :     my $rhelp = RHLocations->new($self);
715 :     # Compute the columns. The default columns are determined mostly by the result type.
716 :     $self->DefaultColumns($rhelp);
717 :     #
718 :     # (NOTE: Optional columns should be added here so they go before the alignment column.)
719 :     #
720 :     # Define the extra columns for this tool.
721 :     for my $extraCol (@{$ToolTable{$toolType}->{extras}}) {
722 :     $rhelp->AddExtraColumn(@{$extraCol});
723 :     }
724 :     # Now, the sequence. We use a utility to convert it to a uniform
725 :     # FASTA format of the correct type. Note that for patterns there
726 :     # will not be a header line.
727 :     my $sequenceThing = $self->ComputeFASTA($ToolTable{$toolType}->{inputType}, $cgi->param('sequence'));
728 :     Trace("Fasta sequence has length " . length($sequenceThing) . ".") if T(3);
729 :     # Only proceed if the FASTA sequence was converted successfully.
730 :     if ($sequenceThing) {
731 :     # Write the FASTA to a temporary file. We use the session name with a suffix of
732 :     # "fasta" for the file name.
733 :     my $tmpFile = $self->GetTempFileName('fasta');
734 :     Tracer::PutFile($tmpFile, $sequenceThing);
735 :     # Call the tool.
736 :     my @results = $self->ExecTool($tmpFile, \@genomes, $toolType, $options);
737 :     # Start the output session.
738 :     $self->OpenSession($rhelp);
739 :     # Compute the number of results.
740 :     $retVal = scalar(@results);
741 :     $self->PrintLine("$retVal hit locations found.<br />");
742 :     # Loop through the results.
743 :     my $resultCounter = 0;
744 :     for my $result (@results) {
745 :     # Store the result fields as extra columns. Only the result fields
746 :     # defined as extra columns in the tool table will be kept by the
747 :     # result helper.
748 :     $rhelp->PutExtraColumns(%{$result});
749 :     # Create an ERDB object for the hit location. We use BasicLocation to
750 :     # parse it into its components.
751 :     my $data = RHLocations::BuildLocationRecord($result->{hitLoc});
752 :     # Now put the location data and its sort key into the output stream.
753 :     $rhelp->PutData($result->{sortKey}, $result->{hitLoc}, $data);
754 :     # Tell the user every so often what kind of progress we're making.
755 :     $resultCounter++;
756 :     if ($resultCounter % 100 == 0) {
757 :     $self->PrintLine("$resultCounter of $retVal hit locations processed.<br />");
758 :     }
759 :     }
760 :     # Close the output session.
761 :     $self->CloseSession();
762 :     }
763 :     }
764 :     }
765 :     }
766 :     # Return the result count.
767 :     return $retVal;
768 :     }
769 :    
770 :     =head3 Description
771 :    
772 :     C<< my $htmlText = $shelp->Description(); >>
773 :    
774 :     Return a description of this search. The description is used for the table of contents
775 :     on the main search tools page. It may contain HTML, but it should be character-level,
776 :     not block-level, since the description is going to appear in a list.
777 :    
778 :     =cut
779 :    
780 :     sub Description {
781 :     # Get the parameters.
782 :     my ($self) = @_;
783 :     # Return the result.
784 :     return "Search for matching DNA or protein regions.";
785 :     }
786 :    
787 :     =head3 SearchTitle
788 :    
789 :     C<< my $titleHtml = $shelp->SearchTitle(); >>
790 :    
791 :     Return the display title for this search. The display title appears above the search results.
792 :     If no result is returned, no title will be displayed. The result should be an html string
793 :     that can be legally put inside a block tag such as C<h3> or C<p>.
794 :    
795 :     =cut
796 :    
797 :     sub SearchTitle {
798 :     # Get the parameters.
799 :     my ($self) = @_;
800 :     # Compute the title. We extract the tool ID from the query parameters.
801 :     my $cgi = $self->Q();
802 :     my $tool = $cgi->param('tool');
803 :     my $retVal = $ToolTable{$tool}->{title};
804 :     # Return it.
805 :     return $retVal;
806 :     }
807 :    
808 :    
809 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3