[Bio] / FigKernelScripts / make_PG.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/make_PG.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : overbeek 1.1 use strict;
2 :     use Data::Dumper;
3 :     use SeedEnv;
4 :     use gjoseqlib;
5 :    
6 :     # This program attempts to create an initial pan-genome. Specifically
7 :     #
8 :     # make_PG FIGdiskOfSEED PG < genomes.to.include
9 :     #
10 :     # takes as input a file containing genome IDs in each line (the first
11 :     # thing in each line that looks like /\d+\.\d+/ is taken as a genome ID)
12 :     # and it creates a directory (PG) containing
13 :     #
14 :     # pg.genomes -- list of the genome IDs used
15 :     # repeat.clusters -- clusters of volatile PEGs
16 :     # pg.sets -- protein families for normal PEGs
17 :     # singletons -- PEGs that appear to be singletons (occur just once)
18 :     #
19 :     # The singletons should probably be ignored (for now). They will often be the
20 :     # product of bad gene calls.
21 :     #
22 :     #
23 :     #
24 :    
25 :     my($pgD,$pubseed);
26 :     my $usage = "usage: make_PG PubSEED PG < genomes";
27 :     (
28 :     ($pubseed = shift @ARGV) &&
29 :     ($pgD = shift @ARGV)
30 :     )
31 :     || die $usage;
32 :    
33 :     (-d $pubseed) || die "you need to say where PubSEED resides";
34 :     my $orgdir = "$pubseed/FIG/Data/Organisms";
35 :     mkdir($pgD,0777) || die "$pgD already exists";
36 :     open(PGG,">","$pgD/pg.genomes") || die "could not open $pgD/pg.genomes";
37 :     my @genomes = map { ($_ =~ /(\d+\.\d+)/) ? $1 : () } <STDIN>;
38 :    
39 :     foreach my $g (@genomes)
40 :     {
41 :     print PGG "$g\n";
42 :     }
43 :     close(PGG);
44 :     my %funcs;
45 :     foreach my $g (@genomes)
46 :     {
47 :     if (open(FUNCS,"<","$orgdir/$g/assigned_functions"))
48 :     {
49 :     while (defined($_ = <FUNCS>))
50 :     {
51 :     if ($_ =~ /^(\S+)\t(\S[^\t]*\S)/)
52 :     {
53 :     $funcs{$1} = $2;
54 :     }
55 :     }
56 :     close(FUNCS);
57 :     }
58 :     }
59 :    
60 :     #
61 :     # $all_pegs will contain fasta verions (protein) of all pegs in the genomes
62 :     #
63 :     my $all_pegs = "tmp.$$.allpegs";
64 :     my @reps;
65 :     my %deleted_features;
66 :     foreach my $g (@genomes)
67 :     {
68 :     my @tmp = &repeats($g,$orgdir);
69 :     push(@reps,@tmp);
70 :     if (open(DEL,"<","$orgdir/$g/Features/peg/deleted.features"))
71 :     {
72 :     while (defined($_ = <DEL>) && ($_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/))
73 :     {
74 :     $deleted_features{$1} = 1;
75 :     }
76 :     close(DEL);
77 :     }
78 :     my @fasta = grep { ! $deleted_features{$_->[0]} } &gjoseqlib::read_fasta("$orgdir/$g/Features/peg/fasta");
79 :     open(ALL,">>",$all_pegs) || die "could not open $all_pegs";
80 :     &gjoseqlib::print_alignment_as_fasta(\*ALL,\@fasta);
81 :     close(ALL);
82 :     }
83 :     my $blast_reps = "tmp.$$.blastdb";
84 :     &gjoseqlib::print_alignment_as_fasta($blast_reps,\@reps);
85 :     &run("formatdb -i $blast_reps -p F");
86 :    
87 :     #
88 :     # Now we blast $all_pegs against the computed repeat regions to get probable
89 :     # "volatile pegs"
90 :     #
91 :     #
92 :     my %hit_reps;
93 : overbeek 1.2 foreach $_ (`blastall -m8 -p tblastn -i $all_pegs -d $blast_reps`)
94 : overbeek 1.1 # system "blastall -m8 -p tblastn -i $all_pegs -d $blast_reps > rep.blast.db";
95 : overbeek 1.2 # foreach $_ (`cat rep.blast.db`)
96 : overbeek 1.1 {
97 :     my($peg,$contig,$iden,undef,undef,undef,$b1,$e1,$b2,$e2) = split(/\s+/,$_);
98 :     if (($iden > 80) && (($e1+1-$b1) >= 40))
99 :     {
100 :     $hit_reps{$peg} = 1;
101 :     }
102 :     }
103 :    
104 :     my @rep_pegs = grep { $hit_reps{$_->[0]} } &gjoseqlib::read_fasta($all_pegs);
105 :     #
106 :     # @rep_pegs is now the probable volatile pegs
107 :     #
108 :    
109 : overbeek 1.2 system "rm $all_pegs $blast_reps*";
110 : overbeek 1.1 my $rep_pegsF = "tmp.$$.rep.pegs.fasta";
111 :     &gjoseqlib::print_alignment_as_fasta($rep_pegsF,\@rep_pegs);
112 :     &run("formatdb -p T -i $rep_pegsF");
113 :     my $repeat_clusters = "$pgD/repeat.clusters";
114 :     open(REPS,"| cluster_objects > $repeat_clusters") || die "could not open $repeat_clusters";
115 :     foreach $_ (`blastall -i $rep_pegsF -d $rep_pegsF -p blastp -m 8`)
116 :     {
117 :     my($peg1,$peg2,$iden,undef,undef,undef,$b1,$e1,$b2,$e2) = split(/\s+/,$_);
118 :     if (($peg1 ne $peg2) && ($iden > 60) && (($e1+1-$b1) > 30))
119 :     {
120 :     print REPS "$peg1\t$peg2\n";
121 :     }
122 :     }
123 :     close(REPS);
124 :     system "rm $rep_pegsF*";
125 :     #############################
126 :     #
127 :     # We now have to handle occasional duplications, removing them from repeat clusters.
128 :     # This involves finding which repeat.clusters tend to have single entries per genome,
129 :     # and then resetting $hit_reps and the repeat.clusters file.
130 :     #
131 :     #
132 :     #############################
133 :     if (-s $repeat_clusters)
134 :     {
135 :     rename($repeat_clusters,"$repeat_clusters~") || die "could not rename $repeat_clusters";
136 :     open(IN,"<","$repeat_clusters~") || die "could not open $repeat_clusters~";
137 :     open(OUT,"| tabs2rel > $repeat_clusters") || die "could not open $repeat_clusters";
138 :     my $set;
139 :     while (defined($set = <IN>))
140 :     {
141 :     chop $set;
142 :     my @pegs = split(/\t/,$set);
143 :     my %byG;
144 :     foreach my $peg (@pegs)
145 :     {
146 :     $byG{&SeedUtils::genome_of($peg)}++;
147 :     }
148 :     my $tot = 0;
149 :     my $mult = 0;
150 :     foreach my $g (keys(%byG))
151 :     {
152 :     if ($byG{$g} > 1)
153 :     {
154 :     $mult++;
155 :     }
156 :     $tot++;
157 :     }
158 :     if ($tot && (($mult/$tot) > 0.5))
159 :     {
160 :     print OUT "$set\n";
161 :     }
162 :     else
163 :     {
164 :     foreach my $peg (@pegs)
165 :     {
166 :     delete $hit_reps{$peg};
167 :     }
168 :     }
169 :     }
170 :     }
171 :     close(IN);
172 :     close(OUT);
173 :    
174 :     #############################
175 :     &add_funcs($repeat_clusters,\%funcs);
176 :     my $NonReps = "tmp.$$.non.reps";
177 :     mkdir($NonReps,0777) || die "could not make $NonReps";
178 :    
179 :     my $genome_specs = "tmp.genome.specs.$$";
180 :     my %non_rep_peg;
181 :     open(SPECS,">",$genome_specs) || die "could not open $genome_specs";
182 :     foreach my $g (@genomes)
183 :     {
184 :     &make_nonrepG($orgdir,$g,$NonReps,\%hit_reps,\%non_rep_peg,\%deleted_features);
185 :     print SPECS join(",",("$NonReps/$g/fasta",
186 :     "$NonReps/$g/tbl",
187 :     "$NonReps/$g/assigned_functions")),"\n";
188 :     }
189 :     close(SPECS);
190 :     &run("svr_make_pan_genome_prot_families -p 8 < $genome_specs > $pgD/pg.sets 2> /dev/null");
191 :     &add_funcs("$pgD/pg.sets",\%funcs);
192 :    
193 :     my %occurred = map { chop; ($_ => 1) } `cut -f2 $pgD/pg.sets`;
194 :     open(SINGLETONS,">","$pgD/singletons") || die "could not open $pgD/singletons";
195 :     foreach my $peg (keys(%non_rep_peg))
196 :     {
197 :     if (! $occurred{$peg})
198 :     {
199 :     print SINGLETONS "$peg\n";
200 :     }
201 :     }
202 :     close(SINGLETONS);
203 :     &add_funcs("$pgD/singletons",\%funcs);
204 : overbeek 1.2 &run("rm -r $NonReps $genome_specs");
205 : overbeek 1.1
206 :     sub add_funcs {
207 :     my($file,$funcs) = @_;
208 :    
209 :     rename($file,"$file~") || die "could not rename $file";
210 :     open(IN,"<$file~") || die "could not open $file~";
211 :     open(OUT,">$file") || die "could not open $file";
212 :     while (defined($_ = <IN>))
213 :     {
214 :     if ($_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/)
215 :     {
216 :     chop;
217 :     my $f = $funcs->{$1} ? $funcs->{$1} : '';
218 :     print OUT "$_\t$f\n";
219 :     }
220 :     else
221 :     {
222 :     print OUT "$_\t\n";
223 :     }
224 :     }
225 :     close(IN);
226 :     close(OUT);
227 :     }
228 :    
229 :     sub make_nonrepG {
230 :     my($fromD,$g,$toD,$hit_reps,$non_rep_peg,$deleted_features) = @_;
231 :    
232 :     mkdir("$toD/$g",0777) || die "could not make $toD/$g";
233 :     &filter_one("$fromD/$g/assigned_functions","$toD/$g/assigned_functions",$hit_reps,$deleted_features);
234 :     &filter_one("$fromD/$g/Features/peg/tbl","$toD/$g/tbl",$hit_reps,$deleted_features);
235 :     &filter_one_fasta("$fromD/$g/Features/peg/fasta","$toD/$g/fasta",$hit_reps,$deleted_features);
236 :     foreach $_ (`cut -f1 $toD/$g/tbl`)
237 :     {
238 :     chop;
239 :     if ((! $hit_reps->{$_}) && (! $deleted_features->{$_}))
240 :     {
241 :     $non_rep_peg->{$_} = 1;
242 :     }
243 :     }
244 :     }
245 :    
246 :     sub filter_one {
247 :     my($from,$to,$hit_reps,$deleted_features) = @_;
248 :    
249 :     open(IN,"<",$from) || die "could not open $from";
250 :     open(OUT,">",$to) || die "could not open $to";
251 :     while (defined($_ = <IN>))
252 :     {
253 :     if (! (($_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/) &&
254 : overbeek 1.3 ($hit_reps->{$1} || $deleted_features->{$1})))
255 : overbeek 1.1 {
256 :     print OUT $_;
257 :     }
258 :     }
259 :     close(IN);
260 :     close(OUT);
261 :     }
262 :    
263 :     sub filter_one_fasta {
264 :     my($from,$to,$hit_reps,$deleted_features) = @_;
265 :    
266 :     my @seqs = grep { (! $hit_reps->{$_->[0]}) && (! $deleted_features->{$_->[1]}) }
267 :     &gjoseqlib::read_fasta($from);
268 :     &gjoseqlib::print_alignment_as_fasta($to,\@seqs);
269 :     }
270 :    
271 :     sub repeats {
272 :     my($g,$orgdir) = @_;
273 :    
274 :     my %contigs = map { ($_->[0] => $_->[2]) } &gjoseqlib::read_fasta("$orgdir/$g/contigs");
275 :     open(REPS,"svr_big_repeats -i 90 -l 200 -f $orgdir/$g/contigs |")
276 :     || die "could not run svr_big_repeats";
277 :     my @reps;
278 :     while (defined($_ = <REPS>))
279 :     {
280 :     chop;
281 :     my($ln,$ident,$c1,$b1,$e1,$c2,$b2,$e2) = split(/\t/,$_);
282 :     push(@reps,["$g:$c1",'',&dna_seq($c1,$b1,$e1,\%contigs)]);
283 :     }
284 :     return @reps;
285 :     }
286 :    
287 :     sub dna_seq {
288 :     my($contig,$beg,$end,$contigs) = @_;
289 :    
290 :     if ($beg < $end)
291 :     {
292 :     return substr($contigs->{$contig},$beg-1,$end-($beg-1));
293 :     }
294 :     else
295 :     {
296 :     return &SeedUtils::rev_comp(substr($contigs->{$contig},$end-1,$beg-($end-1)));
297 :     }
298 :     }
299 :    
300 :     sub copy_to {
301 :     my($g,$fromD,$toD,$rename) = @_;
302 :    
303 :     my $from = "$fromD/$g";
304 :     my $to = "$toD/" . ($rename ? $rename : $g);
305 :     mkdir($to,0777) || die "could not make $to";
306 :     &run("cp $from/contigs $to/contigs");
307 :     my %delete;
308 :     if (-s "$from/Features/peg/deleted.features")
309 :     {
310 :     %delete = map { chop; ($_ => 1) } `cat $from/Features/peg/deleted.features`;
311 :     }
312 :     &copy_file("$from/assigned_functions","$to/assigned_functions",\%delete,$g,$rename);
313 :     mkdir("$to/Features",0777) || die "could not make $to/Features";
314 :     mkdir("$to/Features/peg",0777) || die "could not make $to/Features/peg";
315 :     &copy_file("$from/Features/peg/tbl","$to/Features/peg/tbl",\%delete,$g,$rename);
316 :     &copy_file("$from/Features/peg/fasta","$to/Features/peg/fasta",\%delete,$g,$rename);
317 :     }
318 :    
319 :     sub copy_file {
320 :     my($from,$to,$delete,$g,$rename) = @_;
321 :    
322 :     open(FROM,"<",$from) || die "could not open $from";
323 :     open(TO,">",$to) || die "could not open $to";
324 :     while (defined($_ = <FROM>))
325 :     {
326 :     if (($_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/) && (! $delete->{$1}))
327 :     {
328 :     if ($rename)
329 :     {
330 :     $_ =~ s/(fig\|)\d+\.\d+(\.peg\.\d+)/$1$rename$2/;
331 :     }
332 :     print TO $_;
333 :     }
334 :     else
335 :     {
336 :     print TO $_;
337 :     }
338 :     }
339 :     close(FROM);
340 :     close(TO);
341 :     }
342 :    
343 :     sub run {
344 :     my($cmd) = @_;
345 :    
346 :     # print STDERR "running: $cmd\n";
347 :     my $rc = system($cmd);
348 :     if ($rc)
349 :     {
350 :     die "$rc: $cmd failed";
351 :     }
352 :     }
353 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3