[Bio] / FigKernelScripts / svr_blast.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/svr_blast.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : overbeek 1.1 use strict;
2 :     use Data::Dumper;
3 :     use Carp;
4 :     use gjoseqlib;
5 :    
6 :     #
7 :     # This is a SAS Component
8 :     #
9 :    
10 :    
11 :     use SeedEnv;
12 :     my $sapO = SAPserver->new();
13 :    
14 :     =head1 svr_blast
15 :    
16 :     Run blast locally
17 :    
18 :     ------
19 :     Example: svr_blast -p pegs 83333.1 [ blast PEGs identified in file against genome 83333.1 ]
20 :     svr_blast -d pegs 83333.1 [ use blastn, not blastp ]
21 :     svr_blast -s pegs 83333.1 [ blast PEGs in fasta file against genome 83333.1 ]
22 :     svr_blast -p pegs [ blast PEGs identified in file against themselves ]
23 :     svr_blast 83333.1 [ sequences of PEGs from the last column of STDIN input against genome]
24 :     svr_blast [ sequences of PEGs from the last column of STDIN input against themselves ]
25 :     svr_blast -c 1 [ sequences of PEGs from the first column of STDIN input against themselves ]
26 : overbeek 1.2 svr_blast -c 1 -parms='-m8' [ sequences of PEGs from the first column of STDIN input against themselves - -m8 format ]
27 : overbeek 1.1
28 :     The output is exactly the unfiltered blast output
29 :    
30 :     ------
31 :    
32 :     This svr command may be thought of as implementing two types of requests:
33 :    
34 :     1. "Blast a set of PEGs against the genes (or protein products) in a set of genomes"
35 :     2. "Blast a set of PEGs against itself".
36 :    
37 :     When we say "set of pegs" or "pegs in genome" we mean either the DNA or the protein sequences
38 :     corresponding to the pegs. Which is determined by the -d flag or its absence (think of
39 :     protein by default, -d for DNA is that is what you want).
40 :    
41 :     A set of PEGs can be read from a file. If the file contains just IDs, use "-p IDfile".
42 :     If the file contains actual sequence in FASTA format use "-s fasta.file".
43 :    
44 :     If you are blasting PEGs against genomes, the genomes are given as one or more
45 :     arguments of the form xxx.yyy (where xxx.yyy is the genome ID; for example, E.coli is 83333.1).
46 :    
47 :     You can read the PEG ids from standard input, much like most of the
48 :     other SVR scripts (this is done only if -s File and -p File were
49 :     omitted). IDs are from the last column in the STDIN file, or from
50 :     another column specified using the -c argument. The standard input
51 :     should be a tab-separated table (i.e., each line is a tab-separated
52 :     set of fields). Normally, the last field in each line would contain
53 :     the PEG for which aliases are being requested. If some other column
54 :     contains the PEGs, use
55 :    
56 :     -c N
57 :    
58 :     where N is the column (from 1) that contains the PEG in each case.
59 :    
60 :     NOTE: the PEG sequences are formed as the union of the sequences derived
61 :     from
62 :    
63 :     1. the IDs from STDIN (only if -p and -s are omitted)
64 :     2. the ids from the -p file
65 :     3. the sequences from the -s file
66 :    
67 :     This is a pipe command. The input is taken from the standard input, and the
68 :     output is to the standard output.
69 :    
70 :     The parameters of the BLAST run are the defaults, unless you use
71 :    
72 :     -parms='parameters passed to blast'
73 :    
74 :     =head2 Command-Line Options
75 :    
76 :     =over 4
77 :    
78 :     =item -c Column
79 :    
80 :     This is used only if the column containing PEGs is not the last.
81 :    
82 :     =back
83 :    
84 :     =head2 Output Format
85 :    
86 :     The output is just the BLAST output.
87 :    
88 :     =cut
89 :    
90 :     my $usage = "usage: svr_blast [-c column] [-s fasta.file] [-p IDfile] [-d] [G1 G2...]";
91 :    
92 :     my $column;
93 :     my $pFile;
94 :     my $sFile;
95 :     my $d = 0;
96 :     my $parms = "";
97 :    
98 :     while ($ARGV[0] && ($ARGV[0] =~ /^-/))
99 :     {
100 :     $_ = shift @ARGV;
101 :     if ($_ =~ s/^-c=*//) { $column = ($_ || shift @ARGV) }
102 :     elsif ($_ =~ s/^-d//) { $d = 1 }
103 :     elsif ($_ =~ s/^-s=*//) { $sFile = ($_ || shift @ARGV) }
104 :     elsif ($_ =~ s/^-parms=*//) { $parms = ($_ || shift @ARGV) }
105 :     elsif ($_ =~ s/^-p=*//) { $pFile = ($_ || shift @ARGV) }
106 :     else { die "Bad Flag: $_" }
107 :     }
108 :     my @genomes = @ARGV;
109 :    
110 :     my @fid_ids = ();
111 :     if ((! $sFile) && (! $pFile))
112 :     {
113 :     ScriptThing::AdjustStdin();
114 :     my @lines = map { chomp; [split(/\t/,$_)] } <STDIN>;
115 :     if (! $column) { $column = @{$lines[0]} }
116 :     @fid_ids = map { $_->[$column-1] } @lines;
117 :     }
118 :    
119 :     my @fids_seq = ();
120 :     my @fids_id = ();
121 :    
122 :     if ($sFile)
123 :     {
124 :     @fids_seq = &gjoseqlib::read_fasta($sFile);
125 :     }
126 :    
127 :     my %seen = map { ($_->[0] => 1 ) } @fids_seq; ### [ID,Comment,Sequence]
128 :    
129 :     if ($pFile)
130 :     {
131 :     open(PF,"<",$pFile) || die "could not open $pFile";
132 :     while (defined($_ = <PF>))
133 :     {
134 :     if ($_ =~ /^(fig\|\d+\.\d+\.peg\.\d+)/)
135 :     {
136 :     push(@fid_ids,$1);
137 :     }
138 :     }
139 :     close(PF);
140 :     }
141 :    
142 :     my %ids_to_get = map { $_ => 1 } grep { ! $seen{$_} } @fid_ids;
143 :     my @extra_ids = keys(%ids_to_get);
144 :     push(@fids_seq,&tuples(\@extra_ids,$d));
145 :    
146 :     my @genome_pegs = ();
147 :     if (@genomes > 0)
148 :     {
149 :     my $genomeH = $sapO->all_features( -ids => \@genomes, -type => ['peg'] );
150 :     foreach my $genome (keys(%$genomeH))
151 :     {
152 :     my $pegs = $genomeH->{$genome};
153 :     push(@genome_pegs,&tuples($pegs,$d));
154 :     }
155 :     }
156 :    
157 :     my $qF = "query.$$.fasta";
158 :     my $dbF = "db.$$.fasta";
159 :    
160 :     &gjoseqlib::print_alignment_as_fasta($qF,\@fids_seq);
161 :     if (@genomes > 0)
162 :     {
163 :     &gjoseqlib::print_alignment_as_fasta($dbF,\@genome_pegs);
164 :     }
165 :     else
166 :     {
167 :     &gjoseqlib::print_alignment_as_fasta($dbF,\@fids_seq);
168 :     }
169 :    
170 :     my $pflag = $d ? 'F' : 'T';
171 :     system "formatdb -i $dbF -p $pflag";
172 :     my $cmd = $d ? 'blastn' : 'blastp';
173 :     open(BLAST,"blastall $parms -p $cmd -i $qF -d $dbF |")
174 :     || die "could not make blast run, sorry";
175 :     while (defined($_ = <BLAST>))
176 :     {
177 :     print $_;
178 :     }
179 :     close(BLAST);
180 :     unlink($dbF,$qF);
181 :    
182 :     sub tuples {
183 :     my($ids,$dna) = @_;
184 :    
185 :     my $idsH = $sapO->ids_to_sequences( -ids => $ids,
186 :     -protein => ($dna ? 0 : 1),
187 :     -fasta => 0 );
188 :    
189 :     return map { [$_,'',$idsH->{$_}] } keys(%$idsH);
190 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3