[Bio] / FigKernelScripts / p3-blast.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/p3-blast.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/env perl
2 :     #
3 :     # Copyright (c) 2003-2015 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     #
18 :    
19 :    
20 :     use strict;
21 :     use warnings;
22 : parrello 1.3 use P3Utils;
23 : parrello 1.1 use gjoseqlib;
24 :     use BlastInterface;
25 :     use Data::Dumper;
26 :     use P3DataAPI;
27 : parrello 1.3 use Hsp;
28 : parrello 1.1
29 :     =head1 Blast FASTA Data
30 :    
31 : parrello 1.2 p3-blast.pl [ options ] type blastdb
32 : parrello 1.1
33 :     Blast the input against a specified blast database. The input should be a FASTA file. The blast database
34 :     can also be a FASTA file, the input itself, or it can be a genome ID.
35 :    
36 :     =head2 Parameters
37 :    
38 :     The positional parameters are the name of the blast program (C<blastn>, C<blastp>, C<blastx>, or C<tblastn>)
39 :     followed by the file name of the blast database. If the blast database is not pre-built, it will be built in
40 :     place. If the database is not found, it is presumed to be a genome ID. If the database name is omitted, the
41 :     input will be blasted against itself.
42 :    
43 : parrello 1.3 The options in L<P3Utils/ih_options> can be used to override the standard input.
44 :    
45 : parrello 1.1 The additional command-line options are as follows.
46 :    
47 :     =over 4
48 :    
49 :     =item hsp
50 :    
51 : parrello 1.3 If specified, then the output is in the form of HSP data (see L<Hsp>). This is the default, and is mutually exclusive with C<sim> and C<tbl>.
52 : parrello 1.1
53 :     =item sim
54 :    
55 : parrello 1.3 If specified, then the output is in the form of similarity data (see L<Sim>). This parameter is mutually exclusive with C<hsp> and C<tbl>.
56 :    
57 :     =item tbl
58 :    
59 :     If specified, then the output is in the form of a six-column table: query ID, query description, subject ID, subject description, percent identity, and e-value.
60 :    
61 :     =item best
62 :    
63 :     If specified, then only the best match for each query sequence will be output.
64 : parrello 1.1
65 :     =item BLAST Parameters
66 :    
67 :     The following may be specified as BLAST parameters
68 :    
69 :     =over 8
70 :    
71 :     =item maxE
72 :    
73 :     Maximum E-value (default C<1e-10>).
74 :    
75 :     =item maxHSP
76 :    
77 :     Maximum number of returned results (before filtering). The default is to return all results.
78 :    
79 :     =item minScr
80 :    
81 :     Minimum required bit-score. The default is no minimum.
82 :    
83 :     =item percIdentity
84 :    
85 :     Minimum percent identity. The default is no minimum.
86 :    
87 :     =item minLen
88 :    
89 :     Minimum permissible match length (used to filter the results). The default is no filtering.
90 :    
91 :     =back
92 :    
93 :     =back
94 :    
95 :     =cut
96 :    
97 :     # map each blast tool name to the type of blast database required
98 :     use constant BLAST_TOOL => { blastp => 'prot', blastn => 'dna', blastx => 'prot', tblastn => 'dna' };
99 :    
100 : parrello 1.3 $| = 1;
101 : parrello 1.1 # Get the command-line parameters.
102 : parrello 1.3 my $opt = P3Utils::script_opts('type blastdb',
103 :     P3Utils::ih_options(),
104 :     ['output' => hidden => { one_of => [ [ 'hsp' => 'produce HSP output'], ['sim' => 'produce similarity output'], ['tbl' => 'produce table output']]}],
105 : parrello 1.1 ['maxE|e=f', 'maximum e-value', { default => 1e-10 }],
106 :     ['maxHSP|b', 'if specified, the maximum number of returned results (before filtering)'],
107 :     ['minScr=f', 'if specified, the minimum permissible bit score'],
108 :     ['percIdentity=f', 'if specified, the minimum permissible percent identity'],
109 :     ['minLen|l=i', 'if specified, the minimum permissible match lengt (for filtering)'],
110 : parrello 1.3 ['best', 'only output best match for each query sequence']
111 :     );
112 : parrello 1.1 # Open the input file.
113 : parrello 1.3 my $ih = P3Utils::ih($opt);
114 : parrello 1.1 # Get the positional parameters.
115 :     my ($blastProg, $blastdb) = @ARGV;
116 :     if (! $blastProg) {
117 :     die "You must specify the blast tool.";
118 :     }
119 :     my $blastDbType = BLAST_TOOL->{$blastProg};
120 :     if (! $blastDbType) {
121 :     die "Invalid blast tool specified.";
122 :     }
123 : parrello 1.3 # Determine the output type.
124 :     my $outForm = $opt->output // 'hsp';
125 : parrello 1.1 # This hash contains the BLAST parameters.
126 :     my %blast;
127 : parrello 1.3 $blast{outForm} = ($outForm eq 'sim' ? 'sim' : 'hsp');
128 : parrello 1.1 $blast{maxE} = $opt->maxe;
129 :     $blast{maxHSP} = $opt->maxhsp // 0;
130 :     $blast{minIden} = $opt->percidentity // 0;
131 :     $blast{minLen} = $opt->minlen // 0;
132 : parrello 1.3 # Save the best-only option.
133 :     my $best = $opt->best;
134 :     # Print the output headers.
135 :     if ($outForm eq 'hsp') {
136 :     P3Utils::print_cols([qw(qid qdef qlen sid sdef slen score e-val pN p-val match-len identity positive gaps dir q-start q-end q-sequence s-start s-end s-sequence)]);
137 :     } elsif ($outForm eq 'sim') {
138 :     P3Utils::print_cols([qw(qid sid pct-identity match-len mismatches gaps q-start q-end s-start s-end e-val score q-len s-len tool)]);
139 :     } elsif ($outForm eq 'tbl') {
140 :     P3Utils::print_cols([qw(qid qdef sid sdef pct-identity e-val)]);
141 :     }
142 : parrello 1.1 # Get the input triples. These are the query sequences.
143 :     my @query = gjoseqlib::read_fasta($ih);
144 :     # Now we need to determine the BLAST database.
145 :     my $blastDatabase;
146 :     if (! $blastdb) {
147 :     # Use the query.
148 :     $blastDatabase = \@query;
149 :     } elsif (-s $blastdb) {
150 :     # Here the user specified a file name.
151 :     $blastDatabase = $blastdb;
152 :     } else {
153 :     # Not a file name, so we assume it is a genome.
154 :     p3_genome_fasta($blastdb, $blastDbType);
155 :     $blastDatabase = $blastdb;
156 :     if (! $blastDatabase) {
157 :     die "$blastdb is not a file or a genome ID."
158 :     }
159 :     }
160 :     # Now run the BLAST.
161 :     my $matches = BlastInterface::blast(\@query, $blastDatabase, $blastProg, \%blast);
162 : parrello 1.3 my $lastQuery = '';
163 : parrello 1.1 # Format the output.
164 :     for my $match (@$matches) {
165 : parrello 1.3 my $qid = $match->[0];
166 :     # The best match for a query sequence is always the first encountered.
167 :     if (! $best || $qid ne $lastQuery) {
168 :     if ($outForm eq 'tbl') {
169 :     # Must convert from HSP to table.
170 :     my $hsp = $match;
171 : parrello 1.4 if (ref $hsp eq 'ARRAY') {
172 :     $hsp = Hsp->new($match);
173 :     }
174 : parrello 1.3 $match = [$hsp->qid(), $hsp->qdef(), $hsp->sid(), $hsp->sdef(), $hsp->pct(), $hsp->e_val()];
175 :     }
176 :     P3Utils::print_cols($match);
177 :     $lastQuery = $qid;
178 :     }
179 : parrello 1.1 }
180 :    
181 :    
182 :     sub p3_genome_fasta
183 :     {
184 :     my ($genome_id, $blastDbType) = @_;
185 :     my $d = P3DataAPI->new();
186 :    
187 :     my $gto = $d->gto_of($genome_id);
188 :     if ($blastDbType eq "dna") {
189 :     $gto->write_contigs_to_file("$genome_id");
190 :     } else {
191 :     $gto->write_protein_translations_to_file("$genome_id");
192 :     }
193 :     }
194 :    
195 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3