[Bio] / Sprout / SHToolSearch.pm Repository:
ViewVC logotype

Annotation of /Sprout/SHToolSearch.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/perl -w
2 :    
3 :     package SHToolSearch;
4 :    
5 :     use strict;
6 :     use Tracer;
7 :     use CGI;
8 :     use HTML;
9 :     use Sprout;
10 :     use Sim;
11 :     use RHLocations;
12 :     use ERDBObject;
13 :     use XML::Simple;
14 :     use Data::Dumper;
15 :     use base 'SearchHelper';
16 :    
17 :     =head1 Tool Search Feature Search Helper
18 :    
19 :     =head2 Introduction
20 :    
21 :     This object conducts searches using command-line tools that compare a DNA or Protein
22 :     pattern against a text database of genetic information.
23 :    
24 :     The tools are invoked by a method called L</ExecScan>. This processes the input,
25 :     invokes the tool, and returns a list of hash references. The hash references contain
26 :     whatever fields are required in the search output. Two fields are required: C<hitLoc>,
27 :     which specifies the location of the matching data, and C<sortKey>, which is used
28 :     to sort it.
29 :    
30 :     =over 4
31 :    
32 :     =item sequence
33 :    
34 :     DNA or protein sequence. This can be a feature ID, pattern, or a FASTA string. A feature ID
35 :     is automatically converted to a FASTA string. If a pattern is specified, it is passed
36 :     directly to the pattern match tool.
37 :    
38 :     =item tool
39 :    
40 :     Tool to use: currently C<blastp>, C<blastn>, C<blastx>, C<dnaScan>, or C<protScan>.
41 :    
42 :     =item options
43 :    
44 :     BLAST options, encoded as a string
45 :    
46 :     =item genome[]
47 :    
48 :     IDs of the genomes to be included in the search.
49 :    
50 :     =back
51 :    
52 :     =cut
53 :    
54 :     =head2 Data Structures
55 :    
56 :     =head3 ToolTable
57 :    
58 :     This search is a fairly complicated thing that is basically used to search for
59 :     sequence data in one or more genomes using command-line tools. The idea is that
60 :     any command-line tool can be put in here and made available for use by the
61 :     search system. From a coding point of view, it would make sense to have a
62 :     separate search helper for each tool, but combining them in this way has
63 :     a psychological benefit for the user. The one requirement across all the tools
64 :     is that the input is a DNA or protein sequence of some sort. For the BLAST
65 :     tools, the sequence is a standard FASTA thing. For the scan-for-matches
66 :     tools, the sequence is a search pattern.
67 :    
68 :     The tool table is a hash that performs the services normally expected of
69 :     subclasses. The hash maps each tool name to a hash reference. The various
70 :     fields in the hash references are as follows.
71 :    
72 :     =over 4
73 :    
74 :     =item db_type
75 :    
76 :     Type of database against which the tool runs: C<prot> for a protein database and
77 :     C<dna> for a DNA database. This tells us which files to pass into the tool as
78 :     the search database.
79 :    
80 :     =item exec
81 :    
82 :     Execution string for the tool. The variable C<$seqFile> is presumed to be the location
83 :     of the input sequence, C<$db> is the directory, and C<$options> are the user-specified options.
84 :    
85 :     =item output
86 :    
87 :     The method for converting the output from the tool (presented as a set of lines of text) to
88 :     the output format expected by the BLAST search tool (a list of hash references).
89 :    
90 :     =item inputType
91 :    
92 :     The type of input expected. This is either a FASTA for protein or DNA (C<prot> or C<dna>),
93 :     or a scan pattern for protein or DNA (C<protPattern> or C<dnaPattern>). If additional
94 :     formats are needed, they must be programmed into the B<ComputeFASTA> method of C<SearchHelper>.
95 :    
96 :     =item extras
97 :    
98 :     A list of lists describing the extra columns desired in the output. The extra column names
99 :     must exist as keys in the hashes produced by the output method. In addition to the fields named
100 :     in this list, the output hashes must contain a field named C<sortKey> that can be used to sort
101 :     the results. The lists correspond exactly to the parameter list for the B<AddExtraColumn>
102 :     method of the B<ResultHelper> class.
103 :    
104 :     =item buttonName
105 :    
106 :     The name to display on the search button if this tool is selected.
107 :    
108 :     =item targetRelationship
109 :    
110 :     The name of the relationship from the genome to the entity whose IDs will appear in the output regions. This is C<HasContig> for
111 :     a DNA search and C<HasFeature> for a feature search.
112 :    
113 :     =item title
114 :    
115 :     Title to be used for the search results pages.
116 :    
117 :     =back
118 :    
119 :     =cut
120 :    
121 :     my %ToolTable = ( blastp => { db_type => 'prot',
122 :     exec => 'blastall -i $seqFile -d $db -m 7 -FF -p blastp $options',
123 :     output => \&blastXML,
124 :     inputType => 'prot',
125 :     extras => [[bsc => 0, title => "Bit Score", style => "rightAlign", download => 'num'],
126 :     [queryLoc => 1, title => "Query Location", style => "leftAlign", download => 'text'],
127 :     [hitLoc => 2, title => "Hit Location", style => "leftAlign", download => 'text'],
128 :     [alignment => undef, title => "Alignment", style => "code", download => 'align']],
129 :     buttonName => 'BLAST',
130 :     targetRelationship => 'HasFeature',
131 :     title => 'Blastp Search',
132 :     },
133 :     blastx => { db_type => 'prot',
134 :     exec => 'blastall -i $seqFile -d $db -m 7 -FF -p blastx $options',
135 :     output => \&blastXML,
136 :     inputType => 'prot',
137 :     extras => [[bsc => 0, title => "Bit Score", style => "rightAlign", download => 'num'],
138 :     [queryLoc => 1, title => "Query Location", style => "leftAlign", download => 'text'],
139 :     [hitLoc => 2, title => "Hit Location", style => "leftAlign", download => 'text'],
140 :     [alignment => undef, title => "Alignment", style => "code", download => 'align']],
141 :     buttonName => 'BLAST',
142 :     targetRelationship => 'HasFeature',
143 :     title => 'Blastx Search',
144 :     },
145 :     blastn => { db_type => 'dna',
146 :     exec => 'blastall -i $seqFile -d $db -m 7 -FF -p blastn $options',
147 :     output => \&blastXML,
148 :     inputType => 'dna',
149 :     extras => [[bsc => 0, title => "Bit Score", style => "rightAlign", download => 'num'],
150 :     [queryLoc => 1, title => "Query Location", style => "leftAlign", download => 'text'],
151 :     [hitLoc => 2, title => "Hit Location", style => "leftAlign", download => 'text'],
152 :     [alignment => undef, title => "Alignment", style => "code", download => 'align']],
153 :     buttonName => 'BLAST',
154 :     targetRelationship => 'HasContig',
155 :     title => 'Blastn Search',
156 :     },
157 :     dnaScan => { db_type => 'dna',
158 :     exec => 'scan_for_matches -c $seqFile $options <$db',
159 :     output => \&scanLines,
160 :     inputType => 'dnaPattern',
161 :     extras => [[hitLoc => 0, title => "Hit Location", style => "leftAlign", download => 'text'],
162 :     [alignment => undef, title => "Matching Sequence", style => "code", download => 'align']],
163 :     buttonName => 'SCAN',
164 :     targetRelationship => 'HasContig',
165 :     title => 'DNA Scan for Matches',
166 :     },
167 :     protScan => { db_type => 'prot',
168 :     exec => 'scan_for_matches -p $seqFile $options <$db',
169 :     output => \&scanLines,
170 :     inputType => 'protPattern',
171 :     extras => [[hitLoc => 0, title => "Hit Location", style => "leftAlign", download => 'text'],
172 :     [alignment => undef, title => "Matching Sequence", style => "code", download => 'align']],
173 :     buttonName => 'SCAN',
174 :     targetRelationship => 'HasFeature',
175 :     title => 'Protein Scan for Matches',
176 :     }
177 :     );
178 :    
179 :     =head2 Public Methods
180 :    
181 :     =head3 ExecTool
182 :    
183 :     C<< my @sims = $shelp->ExecTool($seqFile, \@genomes, $tool, $options); >>
184 :    
185 :     Call a tool to search for DNA sequences or features.
186 :    
187 :     =over 4
188 :    
189 :     =item seqFile
190 :    
191 :     Name of a file containing the input sequence. This will either be a FASTA or a scan pattern.
192 :    
193 :     =item genomes
194 :    
195 :     A list of the IDs for the target genomes of the search.
196 :    
197 :     =item tool
198 :    
199 :     Name of the tool to use.
200 :    
201 :     =item options
202 :    
203 :     Options to pass to the tool, formatted for the command line.
204 :    
205 :     =item RETURN
206 :    
207 :     Returns a list of hashes, each representing a single match point.
208 :    
209 :     =back
210 :    
211 :     =cut
212 :    
213 :     sub ExecTool {
214 :     # Get the parameters.
215 :     my ($self, $seqFile, $genomes, $tool, $options) = @_;
216 :     # Declare the return variable.
217 :     my @retVal = ();
218 :     # Get a Sprout database object.
219 :     my $sprout = $self->DB();
220 :     # Insure the blast tools can find the blast matrix directory.
221 :     if (! $ENV{"BLASTMAT"}) { $ENV{"BLASTMAT"} = $FIG_Config::blastmat; }
222 :     # Get the location of the tools.
223 :     my $toolDir = "$FIG_Config::ext_bin";
224 :     Trace("ExecTool for file $seqFile, tool => $tool, options => $options.") if T(2);
225 :     # Get this tool's parameters.
226 :     my $toolData = $ToolTable{$tool};
227 :     # Determine whether or not this is a multi-genome call.
228 :     my $genomeCount = scalar(@$genomes);
229 :     Trace("$genomeCount genomes in list.") if T(2);
230 :     my $multiGenome = ($genomeCount > 1 ? 1 : 0);
231 :     # Loop through the genome list.
232 :     for my $genome (@$genomes) {
233 :     $self->PrintLine("Performing $tool for $genome.<br />");
234 :     Trace("Matching against $genome.") if T(3);
235 :     # Get the target database location for this genome.
236 :     my $db = ($toolData->{db_type} eq 'prot' ?
237 :     "$FIG_Config::organisms/$genome/Features/peg/fasta" :
238 :     "$FIG_Config::organisms/$genome/contigs");
239 :     # Only proceed if the database has data in it.
240 :     if (-s $db) {
241 :     # Verify the database.
242 :     VerifyDB($db, $toolData->{db_type});
243 :     # Get the command.
244 :     my $string = $toolData->{exec};
245 :     # Make the substitutions.
246 :     my %subs = (db => $db, options => ($options || ''), seqFile => $seqFile);
247 :     for my $key (keys %subs) {
248 :     $string =~ s/\$$key/$subs{$key}/g;
249 :     }
250 :     my $command = "$toolDir/$string";
251 :     # Call the command.
252 :     Trace("Executing: $command") if T(3);
253 :     my @data = TICK($command);
254 :     my $dataLineCount = scalar(@data);
255 :     Trace("$dataLineCount lines returned from $tool.") if T(3);
256 :     $self->PrintLine("$dataLineCount data lines returned from $tool.<br />");
257 :     # Compute the maximum number of hits we want back from this genome. Note
258 :     # that scalar(@data) will always be at lest the total number of hits
259 :     # returned.
260 :     my $maxHits = ($multiGenome ? $FIG_Config::blast_limit : scalar(@data));
261 :     # Process the lines. Note that we pass ourselves as the first parameter,
262 :     # mimicking what would happen if the output routine were called as an
263 :     # instance method.
264 :     my @results = &{$toolData->{output}}($self, \@data, $maxHits, $genome);
265 :     # Get a hash of valid targets.
266 :     my $targetRel = $toolData->{targetRelationship};
267 :     my %targetHash = map { $_ => 1 } $sprout->GetFlat([$targetRel], "$targetRel(from-link) = ?",
268 :     [$genome], "$targetRel(to-link)");
269 :     # Insure the results all have valid targets.
270 :     my $keepCount = 0;
271 :     for my $result (@results) {
272 :     # Get the hit location and convert it to a location object.
273 :     my $targetLoc = BasicLocation->new($result->{hitLoc});
274 :     # Test the contig to see if it's a valid target. It's valid if
275 :     # we found it when we did the query above. Invalid targets come
276 :     # from things added to SEED after the Sprout was built.
277 :     if ($targetHash{$targetLoc->Contig}) {
278 :     push @retVal, $result;
279 :     $keepCount++;
280 :     }
281 :     }
282 :     my $message = scalar(@results) . " hits found. $keepCount kept.";
283 :     Trace($message) if T(3);
284 :     $self->PrintLine("$message<br />");
285 :     }
286 :     }
287 :     # Return the resulting list.
288 :     return @retVal;
289 :     }
290 :    
291 :     =head2 Output Processing Methods
292 :    
293 :     =head3 blastXML
294 :    
295 :     C<< my @dataRows = $shelp->blastXML(\@data, $maxHits, $genome); >>
296 :    
297 :     Process XML output from Blast.
298 :    
299 :     The BLAST output is presented as a list of text lines, each of which contains a line-end
300 :     character. We combine all these lines into a single string and read it in using an XML parser.
301 :     The parsed result contains the interesting data 3 levels deep as C<iteration hits>. Each hit
302 :     corresponds to a single target object-- either a feature or a contig-- identified as the C<Hit_def>.
303 :     Inside each hit is a list of C<Hit_hsps>. These are single-level hashes that represent a point of contact
304 :     between the query sequence and the target. The from- and to-locations for the query sequence
305 :     are stored as C<Hsp_query-from> and C<Hsp_query-to>, and for the target object they are
306 :     C<Hsp_hit-from> and C<Hsp_hit-to>. The alignment is stored in C<Hsp_hseq>, C<Hsp_qseq>, and
307 :     C<Hsp_midline>. The bit score is in C<Hsp_bit-score>.
308 :    
309 :     =over 4
310 :    
311 :     =item data
312 :    
313 :     Reference to a list of XML output lines from BLAST.
314 :    
315 :     =item maxHits
316 :    
317 :     Maximum number of hits to return. This is used to control the output size in multi-genome
318 :     searches.
319 :    
320 :     =item genome
321 :    
322 :     ID of the target genome.
323 :    
324 :     =item RETURN
325 :    
326 :     Returns a list of hash references. In each hash, C<hitLoc> indicates the hit location, C<queryLoc>
327 :     indicates the query location, C<alignment> the three-line alignment between the
328 :     query and hit locations, C<sortKey> a sort key based on the bit score, and C<bsc> the bit
329 :     score itself.
330 :    
331 :     =back
332 :    
333 :     =cut
334 :    
335 :     sub blastXML {
336 :     # Get the parameters.
337 :     my ($self, $data, $maxHits, $genome) = @_;
338 :     # Declare the return variable.
339 :     my @retVal;
340 :     Trace("Processing blastXML output for $genome. Max Hits = $maxHits.") if T(2);
341 :     # Set up a counter so that we stop after the appropriate
342 :     # number of hits.
343 :     my $outputHits = 0;
344 :     # Create the xml string from the data.
345 :     my $xmlString = join("", @{$data});
346 :     # Parse the XML. The various options help to keep the result more compact and predictable.
347 :     # Note we do some major error-checking here, because XMLin is very delicate.
348 :     my $xmlThing;
349 :     eval {
350 :     $xmlThing = XMLin($xmlString, GroupTags => { Iteration_hits => 'Hit', Hit_hsps => 'Hsp' },
351 :     ForceArray => ['Hit', 'Hsp']);
352 :     };
353 :     if ($@) {
354 :     Confess("XML parsing error for $genome: $@");
355 :     } elsif (! defined($xmlThing)) {
356 :     Trace("No result from XML parse for $genome.") if T(3);
357 :     } else {
358 :     Trace("XML thing for $genome = \n" . Dumper($xmlThing)) if T(blastXML => 3);
359 :     # Get the name of the query object.
360 :     my $queryName = $xmlThing->{'BlastOutput_query-def'};
361 :     # Strip out the comments (if any).
362 :     if ($queryName =~ /^(\S+)\s+/) {
363 :     $queryName = $1;
364 :     }
365 :     my $iterationData = $xmlThing->{BlastOutput_iterations}->{Iteration}->{Iteration_hits};
366 :     Trace("Iteration data contains " . scalar(@{$iterationData}) . " hits.") if T(3);
367 :     # "$iterationData" is now a list of hits. We process these one at a time in the
368 :     # following loop.
369 :     for my $hit (@{$iterationData}) { last if ($outputHits >= $maxHits);
370 :     # Get the hit target. This is the contig or feature containing the hit locations.
371 :     # We canonicalize it so that it has the genome name somewhere in it.
372 :     my $hitArea = Canonize($hit->{Hit_def}, $genome);
373 :     Trace("Hit on $hitArea. Point count = " . scalar(@{$hit->{Hit_hsps}}) . ".") if T(3);
374 :     # Now we loop through the hit points for this hit.
375 :     for my $point (@{$hit->{Hit_hsps}}) { last if ($outputHits >= $maxHits);
376 :     # We need to create an output tuple for this hit. First, we
377 :     # create the alignment string.
378 :     my $alignment = join("<br />", $point->{Hsp_qseq}, $point->{Hsp_midline}, $point->{Hsp_hseq});
379 :     # Next, we need to create the locations.
380 :     my $hitLoc = BasicLocation->new($hitArea, $point->{'Hsp_hit-from'}, "_", $point->{'Hsp_hit-to'});
381 :     my $queryLoc = BasicLocation->new($queryName, $point->{'Hsp_query-from'}, "_", $point->{'Hsp_query-to'});
382 :     # Finally, we get the bit score, formatted nicely for the display.
383 :     my $bsc = sprintf("%0.3f", $point->{'Hsp_bit-score'});
384 :     # Now we can build our output tuple.
385 :     push @retVal, { queryLoc => $queryLoc->SeedString,
386 :     hitLoc => $hitLoc->SeedString,
387 :     bsc => $bsc,
388 :     alignment => $alignment,
389 :     sortKey => $self->ToolSortKey($genome, $bsc, $hitLoc),
390 :     };
391 :     # Update our result counter. Both loops will exit when this equals the maximum.
392 :     $outputHits++;
393 :     }
394 :     }
395 :     }
396 :     # Return the results.
397 :     return @retVal;
398 :     }
399 :    
400 :    
401 :     =head3 scanLines
402 :    
403 :     C<< my @dataRows = $shelp->scanLines(\@data, $maxHits, $genome); >>
404 :    
405 :     Process output from the scan-for-matches tool. Each match consists of two
406 :     output lines. The first contains the hit source (feature or contig), the
407 :     begin point, and the end point. The second contains the sequence matched.
408 :     Unlike a BLAST search, there is no concept of a query location: the entire
409 :     query matches.
410 :    
411 :     =over 4
412 :    
413 :     =item data
414 :    
415 :     Reference to a list of the output lines from the scan.
416 :    
417 :     =item maxHits
418 :    
419 :     The maximum number of matches to return. This value is used to control
420 :     the output size in multi-genome searches.
421 :    
422 :     =item genome
423 :    
424 :     The ID of the genome containing the hits.
425 :    
426 :     =item RETURN
427 :    
428 :     Returns a list of hash references. In each hash, C<hitLoc> indicates the hit location, C<sortKey> a
429 :     sort key, and C<alignment> the matching sequence.
430 :    
431 :     =back
432 :    
433 :     =cut
434 :    
435 :     sub scanLines {
436 :     # Get the parameters.
437 :     my ($self, $data, $maxHits, $genome) = @_;
438 :     Trace("Processing scanLines output for $genome.") if T(2);
439 :     # Declare the return variable.
440 :     my @retVal;
441 :     # Insure we don't try to return more than the maximum number
442 :     # of hits. Each hit is two lines, so this involves multiplying
443 :     # by two.
444 :     my $maxLines = $maxHits * 2;
445 :     if ($maxLines > scalar(@{$data})) {
446 :     $maxLines = scalar(@{$data});
447 :     }
448 :     # Loop through the lines containing the hits we want.
449 :     for (my $i = 0; $i < $maxLines; $i += 2) {
450 :     # Parse the first line, containing the hit location.
451 :     $data->[$i] =~ /^>([^:]+):\[(\d+),(\d+)\]/;
452 :     # Convert the result to a location.
453 :     my ($hitObject, $beg, $end) = ($1, $2, $3);
454 :     $hitObject = Canonize($hitObject, $genome);
455 :     my $hitLoc = BasicLocation->new($hitObject, $beg, "_", $end);
456 :     # Get the match string.
457 :     my $matchString = $data->[$i+1];
458 :     chomp $matchString;
459 :     # Output the result.
460 :     push @retVal, { hitLoc => $hitLoc->SeedString,
461 :     alignment => $matchString,
462 :     sortKey => $self->ToolSortKey($genome, 0, $hitLoc),
463 :     };
464 :     }
465 :     # Return the result.
466 :     return @retVal;
467 :     }
468 :    
469 :     =head2 Utility Methods
470 :    
471 :     =head3 Canonize
472 :    
473 :     C<< my $newName = CallScanner::Canonize($name, $genomeID); >>
474 :    
475 :     If the specified name is a contig ID, insure it has a genome ID in front of it.
476 :    
477 :     =over 4
478 :    
479 :     =item name
480 :    
481 :     Name to fix up.
482 :    
483 :     =item genomeID
484 :    
485 :     ID of the genome to be added to the contig ID, if necessary.
486 :    
487 :     =item RETURN
488 :    
489 :     Returns a fixed-up name.
490 :    
491 :     =back
492 :    
493 :     =cut
494 :    
495 :     sub Canonize {
496 :     # Get the parameters.
497 :     my ($name, $genomeID) = @_;
498 :     # Declare the return variable.
499 :     my $retVal = $name;
500 :     # Check for a genome ID already in place or a feature ID.
501 :     if ($retVal !~ /:/ && $retVal !~ /^fig/) {
502 :     $retVal = "$genomeID:$name";
503 :     }
504 :     # Return the result.
505 :     return $retVal;
506 :     }
507 :    
508 :     =head3 VerifyDB
509 :    
510 :     C<< CallScanner::VerifyDB($db, $type); >>
511 :    
512 :     Verify that the specified FASTA file has BLAST databases. If the databases
513 :     do not exist, they will be created. If they are older than the FASTA file,
514 :     they will be regenerated.
515 :    
516 :     =over 4
517 :    
518 :     =item db
519 :    
520 :     Name of the FASTA file.
521 :    
522 :     =item type
523 :    
524 :     Type of database desired: C<prot> for protein and C<dna> for DNA.
525 :    
526 :     =back
527 :    
528 :     =cut
529 :    
530 :     sub VerifyDB {
531 :     # Get the parameters.
532 :     my ($db, $type) = @_;
533 :     # Process according to the data type.
534 :     if ($type eq 'prot') {
535 :     if ((! -s "$db.psq") || (-M "$db.psq" > -M $db)) {
536 :     Trace("Building protein FASTA database for $db.") if T(3);
537 :     system "$FIG_Config::ext_bin/formatdb -p T -i $db";
538 :     }
539 :     } else {
540 :     if ((! -s "$db.nsq") || (-M "$db.nsq" > -M $db)) {
541 :     Trace("Building DNA FASTA database for $db.") if T(3);
542 :     system "$FIG_Config::ext_bin/formatdb -p F -i $db";
543 :     }
544 :     }
545 :     }
546 :    
547 :     =head3 ToolSortKey
548 :    
549 :     C<< my $key = $shelp->ToolSortKey($genome, $bsc, $hitLoc); >>
550 :    
551 :     Return the sort key for a match result against the specified
552 :     genome with the specified bit-score. The results are
553 :     to be sorted by bit-score, with the highest score at the top
554 :     and preferential treatment given to NMPDR core genomes.
555 :     The tie-breaker for entries with the same score is the
556 :     organism name followed by the hit location.
557 :    
558 :     =over 4
559 :    
560 :     =item genome
561 :    
562 :     ID of the genome containing the target area.
563 :    
564 :     =item bsc
565 :    
566 :     Bit-score for the match.
567 :    
568 :     =item hitLoc
569 :    
570 :     The hit location, encoded as a location object.
571 :    
572 :     =item RETURN
573 :    
574 :     Returns a key field that can be used to sort the match in among the
575 :     results in the desired fashion.
576 :    
577 :     =back
578 :    
579 :     =cut
580 :    
581 :     sub ToolSortKey {
582 :     # Get the parameters.
583 :     my ($self, $genome, $bsc, $hitLoc) = @_;
584 :     # Get the group from the genome ID.
585 :     my ($orgName, $group) = $self->OrganismData($genome);
586 :     # Declare the return value. It begins with an "A" if this is an NMPDR
587 :     # feature and a "Z" otherwise.
588 :     my $retVal = ($group ? "A" : "Z");
589 :     # Convert the bit score to an integer.
590 :     my $bitScore = int(10 * $bsc);
591 :     # We want to sort by descending bit score, so subtract the result from a big
592 :     # number.
593 :     my $bitThing = 10000000000 - $bitScore;
594 :     if ($bitThing < 0) {
595 :     $bitThing = 0;
596 :     }
597 :     # Pad it to 10 characters.
598 :     $bitThing = "0$bitThing" while (length $bitThing < 10);
599 :     # Tack it onto the group character.
600 :     $retVal .= $bitThing;
601 :     # Finish up with the organism name and the hit location.
602 :     $retVal .= "$orgName:::" . $hitLoc->SeedString;
603 : parrello 1.2 Trace("Blast sort key is $retVal, based on group \"$group\".") if T(4);
604 : parrello 1.1 # Return the result.
605 :     return $retVal;
606 :     }
607 :    
608 :     =head2 Virtual Methods
609 :    
610 :     =head3 Form
611 :    
612 :     C<< my $html = $shelp->Form(); >>
613 :    
614 :     Generate the HTML for a form to request a new search.
615 :    
616 :     =cut
617 :    
618 :     sub Form {
619 :     # Get the parameters.
620 :     my ($self) = @_;
621 :     # Get the CGI and sprout objects.
622 :     my $cgi = $self->Q();
623 :     my $sprout = $self->DB();
624 :     # Start the form.
625 : parrello 1.3 my $retVal = $self->FormStart("Sequence Search");
626 : parrello 1.1 # Get the list of selected genomes.
627 :     my @selected = $cgi->param('genome');
628 :     # Get the incoming genome sequence and the options. These are the incoming scalar
629 :     # values; the others apply to menus.
630 :     my $sequence = $cgi->param('sequence') || "";
631 :     my $options = $cgi->param('options') || "";
632 :     # Get the selected tool and compute the corresponding button caption.
633 :     my $toolChosen = $cgi->param('tool') || "";
634 :     my $caption = ($toolChosen ? $ToolTable{$toolChosen}->{buttonName} : "");
635 :     # Create the menus. First is the tool menu.
636 :     my @valueList = ("", sort keys %ToolTable);
637 :     my $toolMenu = $cgi->popup_menu(-name => "tool", -values => \@valueList,
638 :     -onChange => 'setSubmit(this.value)');
639 :     # Create the genome selection menu.
640 :     my $menu = $self->NmpdrGenomeMenu("genome", 'multiple', \@selected);
641 :     # The general structure here will be to have the DNA/protein sequence on the top,
642 :     # then genome selector and finally the small controls.
643 :     my @rows = ();
644 :     push @rows, $cgi->Tr($cgi->td("Tool"), $cgi->td($toolMenu));
645 :     push @rows, $cgi->Tr($cgi->td("Sequence in Raw or FASTA Format"),
646 :     $cgi->td({colspan => 2}, $cgi->textarea(-name => "sequence", -rows => 5, -cols => 62)));
647 :     push @rows, $cgi->Tr($cgi->td("Select one or more genomes"),
648 :     $cgi->td({colspan => 2}, $menu));
649 :     push @rows, $cgi->Tr($cgi->td("Blast Options"),
650 :     $cgi->td({colspan => 2}, $cgi->textfield(-name => 'options', size => 45)));
651 :     push @rows, $cgi->Tr($cgi->td("Neighborhood Width"),
652 :     $cgi->td($cgi->textfield(-name => 'neighborhood', -size => 5, -value => RHLocations::NEIGHBORHOOD)));
653 :     push @rows, $self->SubmitRow($caption);
654 :     # Create the table.
655 :     $retVal .= $self->MakeTable(\@rows);
656 :     # Close the form.
657 :     $retVal .= $self->FormEnd();
658 :     # Get the form name. We'll need it for the upcoming javascript.
659 :     my $formName = $self->FormName();
660 :     # Add the javaScript to update the button.
661 :     $retVal .= "<script type=\"text/javascript\">\n" .
662 :     " function setSubmit(tool) {\n" .
663 :     " switch (tool) {\n";
664 :     for my $tool (sort keys %ToolTable) {
665 :     $retVal .= " case '$tool' : document.$formName.Search.value = '$ToolTable{$tool}->{buttonName}'; break;\n";
666 :     }
667 :     $retVal .= " }\n" .
668 :     " }\n" .
669 :     "</script>\n";
670 :     # Return the result.
671 :     return $retVal;
672 :     }
673 :    
674 :     =head3 Find
675 :    
676 :     C<< my $resultCount = $shelp->Find(); >>
677 :    
678 :     Conduct a search based on the current CGI query parameters. The search results will
679 :     be written to the session cache file and the number of results will be
680 :     returned. If the search parameters are invalid, a result count of C<undef> will be
681 :     returned and a result message will be stored in this object describing the problem.
682 :    
683 :     =cut
684 :    
685 :     sub Find {
686 :     # Get the parameters.
687 :     my ($self) = @_;
688 :     # Declare the return variable. If it remains undefined, the caller will
689 :     # know there's been an error.
690 :     my $retVal;
691 :     # Get the sprout and CGI query objects.
692 :     my $cgi = $self->Q();
693 :     my $sprout = $self->DB();
694 :     # Get the genome IDs.
695 :     my @genomes = $self->GetGenomes('genome');
696 :     if (! @genomes) {
697 :     $self->SetMessage("No genomes specified.");
698 :     } else {
699 :     # Get the sequence in its raw form.
700 :     my $sequence = $cgi->param('sequence');
701 :     if (! $sequence) {
702 :     $self->SetMessage("No sequence specified.");
703 :     } else {
704 :     # Get the blast options and the tool type.
705 :     my $toolType = $cgi->param('tool') || "";
706 :     my $options = $cgi->param('options') || "";
707 :     # Insure the tool type is valid.
708 :     if (! $toolType) {
709 :     $self->SetMessage("No tool specified.");
710 :     } else {
711 :     # Create the result helper.
712 :     my $rhelp = RHLocations->new($self);
713 :     # Compute the columns. The default columns are determined mostly by the result type.
714 :     $self->DefaultColumns($rhelp);
715 :     #
716 :     # (NOTE: Optional columns should be added here so they go before the alignment column.)
717 :     #
718 :     # Define the extra columns for this tool.
719 :     for my $extraCol (@{$ToolTable{$toolType}->{extras}}) {
720 :     $rhelp->AddExtraColumn(@{$extraCol});
721 :     }
722 :     # Now, the sequence. We use a utility to convert it to a uniform
723 :     # FASTA format of the correct type. Note that for patterns there
724 :     # will not be a header line.
725 :     my $sequenceThing = $self->ComputeFASTA($ToolTable{$toolType}->{inputType}, $cgi->param('sequence'));
726 :     Trace("Fasta sequence has length " . length($sequenceThing) . ".") if T(3);
727 :     # Only proceed if the FASTA sequence was converted successfully.
728 :     if ($sequenceThing) {
729 :     # Write the FASTA to a temporary file. We use the session name with a suffix of
730 :     # "fasta" for the file name.
731 :     my $tmpFile = $self->GetTempFileName('fasta');
732 :     Tracer::PutFile($tmpFile, $sequenceThing);
733 :     # Call the tool.
734 :     my @results = $self->ExecTool($tmpFile, \@genomes, $toolType, $options);
735 :     # Start the output session.
736 :     $self->OpenSession($rhelp);
737 :     # Compute the number of results.
738 :     $retVal = scalar(@results);
739 :     $self->PrintLine("$retVal hit locations found.<br />");
740 :     # Loop through the results.
741 :     my $resultCounter = 0;
742 :     for my $result (@results) {
743 :     # Store the result fields as extra columns. Only the result fields
744 :     # defined as extra columns in the tool table will be kept by the
745 :     # result helper.
746 :     $rhelp->PutExtraColumns(%{$result});
747 :     # Create an ERDB object for the hit location. We use BasicLocation to
748 :     # parse it into its components.
749 :     my $data = RHLocations::BuildLocationRecord($result->{hitLoc});
750 :     # Now put the location data and its sort key into the output stream.
751 :     $rhelp->PutData($result->{sortKey}, $result->{hitLoc}, $data);
752 :     # Tell the user every so often what kind of progress we're making.
753 :     $resultCounter++;
754 :     if ($resultCounter % 100 == 0) {
755 :     $self->PrintLine("$resultCounter of $retVal hit locations processed.<br />");
756 :     }
757 :     }
758 :     # Close the output session.
759 :     $self->CloseSession();
760 :     }
761 :     }
762 :     }
763 :     }
764 :     # Return the result count.
765 :     return $retVal;
766 :     }
767 :    
768 :     =head3 Description
769 :    
770 :     C<< my $htmlText = $shelp->Description(); >>
771 :    
772 :     Return a description of this search. The description is used for the table of contents
773 :     on the main search tools page. It may contain HTML, but it should be character-level,
774 :     not block-level, since the description is going to appear in a list.
775 :    
776 :     =cut
777 :    
778 :     sub Description {
779 :     # Get the parameters.
780 :     my ($self) = @_;
781 :     # Return the result.
782 :     return "Search for matching DNA or protein regions.";
783 :     }
784 :    
785 :     =head3 SearchTitle
786 :    
787 :     C<< my $titleHtml = $shelp->SearchTitle(); >>
788 :    
789 :     Return the display title for this search. The display title appears above the search results.
790 :     If no result is returned, no title will be displayed. The result should be an html string
791 :     that can be legally put inside a block tag such as C<h3> or C<p>.
792 :    
793 :     =cut
794 :    
795 :     sub SearchTitle {
796 :     # Get the parameters.
797 :     my ($self) = @_;
798 :     # Compute the title. We extract the tool ID from the query parameters.
799 :     my $cgi = $self->Q();
800 :     my $tool = $cgi->param('tool');
801 :     my $retVal = $ToolTable{$tool}->{title};
802 :     # Return it.
803 :     return $retVal;
804 :     }
805 :    
806 :    
807 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3