[Bio] / FigKernelScripts / svr_with_close_blast_hits.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/svr_with_close_blast_hits.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : overbeek 1.1 ########################################################################
2 :     use strict;
3 :     use Data::Dumper;
4 :     use Carp;
5 :    
6 :     #
7 :     # This is a SAS Component
8 :     #
9 :    
10 :    
11 :     =head1 svr_with_close_blast_hits -d DB [-m MaxDist ] [-p MaxPsc] < PEGs > +[PegLocation,HitLocation] 2> no.hits
12 :    
13 :     Determine which of the input PEGs have blastX hits to a given DB "close".
14 :    
15 :     ------
16 :    
17 :     The standard input should be a tab-separated table (i.e., each line
18 :     is a tab-separated set of fields). Normally, the last field in each
19 :     line would contain a PEG, but you can specify what column the
20 :     PEG IDs come from.
21 :    
22 :     If some other column contains the PEGs, use
23 :    
24 :     -c N
25 :    
26 :     where N is the column (from 1) that contains the PEG in each case.
27 :    
28 :     This is a pipe command. The input is taken from the standard input, and the
29 :     output is to the standard output and standard error. Genes in which there is a close
30 :     blastX hit against a given protein DB are written to STDOUT (with three appended columns:
31 :     the Gene location, the hit location, and the other gene that had the best blast score).
32 :     Genes that fail to hit anything are written to STDERR.
33 :    
34 :     =head2 Command-Line Options
35 :    
36 :     =over 4
37 :    
38 :     =item -c Column
39 :    
40 :     This is used only if the column containing PEGs is not the last.
41 :    
42 :     =item -d BlastDB
43 :    
44 :     This is the name of a protein blast DB. It is assumed that formatdb has already
45 :     been run to properly format it.
46 :    
47 :     =item -m MaxDist
48 :    
49 :     This is the distance used to snip out a section of DNA centered on the PEG.
50 :     The snipped DNA will be of length ((2 * MaxDist) + length of PEG). Default is 2000.
51 :    
52 :     =item -p MaxPsc
53 :    
54 :     This is the maximum Psc used to determine whether or not there was a significant similarity
55 :    
56 :     =back
57 :    
58 :     =head2 Output Format
59 :    
60 :     PEGs that can be clustered are written to STDOUT. Three columns are
61 :     added at the end of each line in STDOUT -- the location of the PEG,
62 :     the location of a significant blast hit, and the other PEG that generated the hit.
63 :     The locations will be in the form GID:Contig_Start[+-]Length. For example,
64 :    
65 :     100226.1:NC_003888_3766170+612
66 :    
67 :     would designate a gene in genome 10226.1 on contig NC_003888 that starts
68 :     at position 3766170 (positions are numbered from 1) that is on the
69 :     positive strand and has a length of 612.
70 :    
71 :     When a PEG has no hit, the original line (with no added columns)
72 :     is written to STDERR.
73 :    
74 :     =cut
75 :    
76 :     use SeedEnv;
77 :     use SAPserver;
78 :     my $sapO = SAPserver->new();
79 :     use Getopt::Long;
80 :    
81 :     my $usage = "usage: svr_with_close_blast_hits -d DB [-m MaxDist ] [-p MaxPsc] [-c column] < PEGs > +[PegLocation,HitLocation] 2> no.hits";
82 :    
83 :     my $column;
84 :     my $blastdb;
85 :     my $max_psc = 1.0e-5;
86 :     my $maxD = 2000;
87 :    
88 :     my $rc = GetOptions('c=i' => \$column,
89 :     'd=s' => \$blastdb,
90 :     'p=f' => \$max_psc,
91 :     'm=i' => \$maxD);
92 :    
93 :     if (! $rc) { print STDERR $usage; exit }
94 :     ($blastdb) || die "you need to give the formatted blastdb";
95 :    
96 :     my @lines = map { chomp; [split(/\t/,$_)] } <STDIN>;
97 :     if (! $column) { $column = @{$lines[0]} }
98 :     my @pegs = map { $_->[$column-1] } @lines;
99 :     my $pegH = $sapO->fid_locations( -ids => \@pegs, -boundaries => 1 );
100 :    
101 :     my $wus = {};
102 :     foreach my $line (@lines)
103 :     {
104 :     my($peg,$peg_loc);
105 :     if (($peg = $line->[$column-1]) && ($peg_loc = $pegH->{$peg}))
106 :     {
107 :     if ($peg_loc =~ /^(\d+\.\d+):(\S+)_(\d+)([+-])(\d+)$/)
108 :     {
109 :     my($genome,$contig,$beg,$strand,$ln) = ($1,$2,$3,$4,$5);
110 :     my($min,$max) = ($strand eq "+") ? ($3,$3+$5-1) : ($3+1-$5,$3);
111 :     $min -= $maxD;
112 :     $max += $maxD;
113 :     my $ln = $max +1 - $min;
114 :     $wus->{$peg} = "$genome:$contig" . "_" . "$min" . "+" . "$ln";
115 :     }
116 :     }
117 :     }
118 :    
119 :     my $locH = $sapO->locs_to_dna( -locations => $wus, -fasta => 1 );
120 :     open(TMP,">","tmp.$$.fasta") || die "could not open tmp.$$.fasta";
121 :     foreach my $peg (keys(%$locH))
122 :     {
123 :     my $seq = $locH->{$peg};
124 :     print TMP $seq;
125 :     }
126 :     close(TMP);
127 :    
128 :     my %blast_hits;
129 :     open(BLAST,"blastall -d $blastdb -i tmp.$$.fasta -p blastx -e $max_psc -m 8 -FF |")
130 :     || die "could not run blastall -d $blastdb -i tmp.$$.fasta -p blastx -e $max_psc -m 8 -FF";
131 :     while (defined($_ = <BLAST>))
132 :     {
133 :     chomp;
134 :     my @flds = split(/\s+/,$_);
135 :     my($id1,$id2,$iden,undef,undef,undef,$b1,$e1,$b2,$e2,$psc,$bsc) = @flds;
136 :     if (! $blast_hits{$id1})
137 :     {
138 :     my $wu = $wus->{$id1};
139 :     if ($wu =~ /^(\d+\.\d+):(\S+)_(\d+)\+(\d+)/)
140 :     {
141 :     my($genome,$contig,$beg,$ln) = ($1,$2,$3,$4);
142 :     if (($b1 < ($maxD-10)) || ($e1 < ($maxD-10)) || ($b1 > (($ln+10) - $maxD)) || ($e1 > (($ln+10) - $maxD)))
143 :     {
144 :     my $start = $b1 + ($beg - 1);
145 :     my $end = $e1 + ($beg - 1);
146 :     my $ln1 = abs($end-$start)+1;
147 :     my $region;
148 :     if ($b1 < $e1)
149 :     {
150 :     $region = "$genome:$contig\_$start\+$ln1";
151 :     }
152 :     else
153 :     {
154 :     $region = "$genome:$contig\_$start\-$ln1";
155 :     }
156 :     $blast_hits{$id1} = [$id2,$region];
157 :     }
158 :     }
159 :     }
160 :     }
161 :     close(BLAST);
162 :     unlink("tmp.$$.fasta");
163 :    
164 :     foreach my $line (@lines)
165 :     {
166 :     my $peg = $line->[$column-1];
167 :     my $peg_loc = $pegH->{$peg};
168 :     my $hit_loc = $blast_hits{$peg}->[1];
169 :     my $id2 = $blast_hits{$peg}->[0];
170 :     if ($hit_loc)
171 :     {
172 :     print join("\t",(@$line,$peg_loc,$hit_loc,$id2)),"\n";
173 :     }
174 :     else
175 :     {
176 :     print STDERR join("\t",@$line),"\n";
177 :     }
178 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3