[Bio] / FigKernelScripts / make_PG.pl Repository:
ViewVC logotype

View of /FigKernelScripts/make_PG.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Thu Nov 10 16:14:01 2011 UTC (8 years ago) by overbeek
Branch: MAIN
CVS Tags: rast_rel_2014_0912, rast_rel_2014_0729, mgrast_release_3_1_2, mgrast_version_3_2, mgrast_dev_12152011, HEAD
Changes since 1.2: +1 -1 lines
clarify condition

use strict;
use Data::Dumper;
use SeedEnv;
use gjoseqlib;

# This program attempts to create an initial pan-genome.  Specifically
# 
# 	make_PG FIGdiskOfSEED PG < genomes.to.include
# 
# takes as input a file containing genome IDs in each line (the first
# thing in each line that looks like /\d+\.\d+/ is taken as a genome ID)
# and it creates a directory (PG) containing
# 
# 	pg.genomes        -- list of the genome IDs used
# 	repeat.clusters   -- clusters of volatile PEGs
# 	pg.sets           -- protein families for normal PEGs
# 	singletons        -- PEGs that appear to be singletons (occur just once)
# 
# The singletons should probably be ignored (for now).  They will often be the
# product of bad gene calls.
# 
# 
# 

my($pgD,$pubseed);
my $usage = "usage: make_PG PubSEED PG < genomes";
(
 ($pubseed = shift @ARGV) &&
 ($pgD     = shift @ARGV)
)
    || die $usage;

(-d $pubseed) || die "you need to say where PubSEED resides";
my $orgdir = "$pubseed/FIG/Data/Organisms";
mkdir($pgD,0777) || die "$pgD already exists";
open(PGG,">","$pgD/pg.genomes") || die "could not open $pgD/pg.genomes";
my @genomes = map { ($_ =~ /(\d+\.\d+)/) ? $1 : () } <STDIN>;

foreach my $g (@genomes)
{
    print PGG "$g\n";
}
close(PGG);
my %funcs;
foreach my $g (@genomes)
{
    if (open(FUNCS,"<","$orgdir/$g/assigned_functions"))
    {
	while (defined($_ = <FUNCS>))
	{
	    if ($_ =~ /^(\S+)\t(\S[^\t]*\S)/)
	    {
		$funcs{$1} = $2;
	    }
	}
	close(FUNCS);
    }
}

#
# $all_pegs will contain fasta verions (protein) of all pegs in the genomes
#
my $all_pegs = "tmp.$$.allpegs";
my @reps;
my %deleted_features;
foreach my $g (@genomes)
{
    my @tmp =  &repeats($g,$orgdir);
    push(@reps,@tmp);
    if (open(DEL,"<","$orgdir/$g/Features/peg/deleted.features"))
    {
	while (defined($_ = <DEL>) && ($_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/))
	{
	    $deleted_features{$1} = 1;
	}
	close(DEL);
    }
    my @fasta = grep { ! $deleted_features{$_->[0]} } &gjoseqlib::read_fasta("$orgdir/$g/Features/peg/fasta");
    open(ALL,">>",$all_pegs) || die "could not open $all_pegs";
    &gjoseqlib::print_alignment_as_fasta(\*ALL,\@fasta);
    close(ALL);
}
my $blast_reps = "tmp.$$.blastdb";
&gjoseqlib::print_alignment_as_fasta($blast_reps,\@reps);
&run("formatdb -i $blast_reps -p F");

#
# Now we blast $all_pegs against the computed repeat regions to get probable
# "volatile pegs"
#
#
my %hit_reps;
foreach $_ (`blastall -m8 -p tblastn -i $all_pegs -d $blast_reps`)
# system "blastall -m8 -p tblastn -i $all_pegs -d $blast_reps > rep.blast.db";
# foreach $_ (`cat rep.blast.db`)
{
    my($peg,$contig,$iden,undef,undef,undef,$b1,$e1,$b2,$e2) = split(/\s+/,$_);
    if (($iden > 80) && (($e1+1-$b1) >= 40))
    {
	$hit_reps{$peg} = 1;
    }
}

my @rep_pegs = grep { $hit_reps{$_->[0]} } &gjoseqlib::read_fasta($all_pegs);
#
# @rep_pegs is now the probable volatile pegs
#

system "rm $all_pegs $blast_reps*";
my $rep_pegsF = "tmp.$$.rep.pegs.fasta";
&gjoseqlib::print_alignment_as_fasta($rep_pegsF,\@rep_pegs);
&run("formatdb -p T -i $rep_pegsF");
my $repeat_clusters = "$pgD/repeat.clusters";
open(REPS,"| cluster_objects > $repeat_clusters") || die "could not open $repeat_clusters";
foreach $_ (`blastall -i $rep_pegsF -d $rep_pegsF -p blastp -m 8`)
{
    my($peg1,$peg2,$iden,undef,undef,undef,$b1,$e1,$b2,$e2) = split(/\s+/,$_);
    if (($peg1 ne $peg2) && ($iden > 60) && (($e1+1-$b1) > 30))
    {
	print REPS "$peg1\t$peg2\n";
    }
}
close(REPS);
system "rm $rep_pegsF*";
#############################
#
# We now have to handle occasional duplications, removing them from repeat clusters.
# This involves finding which repeat.clusters tend to have single entries per genome,
# and then resetting $hit_reps and the repeat.clusters file.
#
#
#############################
if (-s $repeat_clusters)
{
    rename($repeat_clusters,"$repeat_clusters~") || die "could not rename $repeat_clusters";
    open(IN,"<","$repeat_clusters~") || die "could not open $repeat_clusters~";
    open(OUT,"| tabs2rel > $repeat_clusters") || die "could not open $repeat_clusters";
    my $set;
    while (defined($set = <IN>))
    {
	chop $set;
	my @pegs = split(/\t/,$set);
	my %byG;
	foreach my $peg (@pegs)
	{
	    $byG{&SeedUtils::genome_of($peg)}++;
	}
	my $tot  = 0;
	my $mult = 0;
	foreach my $g (keys(%byG))
	{
	    if ($byG{$g} > 1)
	    {
		$mult++;
	    }
	    $tot++;
	}
	if ($tot && (($mult/$tot) > 0.5))
	{
	    print OUT "$set\n";
	}
	else
	{
	    foreach my $peg (@pegs)
	    {
		delete $hit_reps{$peg};
	    }
	}
    }
}
close(IN);
close(OUT);

#############################
&add_funcs($repeat_clusters,\%funcs);
my $NonReps = "tmp.$$.non.reps";
mkdir($NonReps,0777) || die "could not make $NonReps";

my $genome_specs = "tmp.genome.specs.$$";
my %non_rep_peg;
open(SPECS,">",$genome_specs) || die "could not open $genome_specs";
foreach my $g (@genomes)
{
    &make_nonrepG($orgdir,$g,$NonReps,\%hit_reps,\%non_rep_peg,\%deleted_features);
    print SPECS join(",",("$NonReps/$g/fasta",
			  "$NonReps/$g/tbl",
			  "$NonReps/$g/assigned_functions")),"\n";
}
close(SPECS);
&run("svr_make_pan_genome_prot_families -p 8 < $genome_specs > $pgD/pg.sets 2> /dev/null");
&add_funcs("$pgD/pg.sets",\%funcs);

my %occurred = map { chop; ($_ => 1) } `cut -f2 $pgD/pg.sets`;
open(SINGLETONS,">","$pgD/singletons") || die "could not open $pgD/singletons";
foreach my $peg (keys(%non_rep_peg))
{
    if (! $occurred{$peg})
    {
	print SINGLETONS "$peg\n";
    }
}
close(SINGLETONS);
&add_funcs("$pgD/singletons",\%funcs);
&run("rm -r $NonReps $genome_specs");

sub add_funcs {
    my($file,$funcs) = @_;

    rename($file,"$file~") || die "could not rename $file";
    open(IN,"<$file~") || die "could not open $file~";
    open(OUT,">$file")  || die "could not open $file";
    while (defined($_ = <IN>))
    {
	if ($_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/)
	{
	    chop;
	    my $f = $funcs->{$1} ? $funcs->{$1} : '';
	    print OUT "$_\t$f\n";
	}
	else
	{
	    print OUT "$_\t\n";
	}
    }
    close(IN);
    close(OUT);
}

sub make_nonrepG {
    my($fromD,$g,$toD,$hit_reps,$non_rep_peg,$deleted_features) = @_;

    mkdir("$toD/$g",0777) || die "could not make $toD/$g";
    &filter_one("$fromD/$g/assigned_functions","$toD/$g/assigned_functions",$hit_reps,$deleted_features);
    &filter_one("$fromD/$g/Features/peg/tbl","$toD/$g/tbl",$hit_reps,$deleted_features);
    &filter_one_fasta("$fromD/$g/Features/peg/fasta","$toD/$g/fasta",$hit_reps,$deleted_features);
    foreach $_ (`cut -f1 $toD/$g/tbl`)
    {
	chop;
	if ((! $hit_reps->{$_}) && (! $deleted_features->{$_}))
	{
	    $non_rep_peg->{$_} = 1;
	}
    }
}

sub filter_one {
    my($from,$to,$hit_reps,$deleted_features) = @_;

    open(IN,"<",$from) || die "could not open $from";
    open(OUT,">",$to)  || die "could not open $to";
    while (defined($_ = <IN>))
    {
	if (! (($_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/) && 
	       ($hit_reps->{$1} || $deleted_features->{$1})))
	{
	    print OUT $_;
	}
    }
    close(IN);
    close(OUT);
}

sub filter_one_fasta {
    my($from,$to,$hit_reps,$deleted_features) = @_;

    my @seqs = grep { (! $hit_reps->{$_->[0]}) && (! $deleted_features->{$_->[1]}) } 
               &gjoseqlib::read_fasta($from);
    &gjoseqlib::print_alignment_as_fasta($to,\@seqs);
}

sub repeats {
    my($g,$orgdir) = @_;

    my %contigs = map { ($_->[0] => $_->[2]) } &gjoseqlib::read_fasta("$orgdir/$g/contigs");
    open(REPS,"svr_big_repeats -i 90 -l 200 -f $orgdir/$g/contigs |")
	|| die "could not run svr_big_repeats";
    my @reps;
    while (defined($_ = <REPS>))
    {
	chop;
	my($ln,$ident,$c1,$b1,$e1,$c2,$b2,$e2) = split(/\t/,$_);
	push(@reps,["$g:$c1",'',&dna_seq($c1,$b1,$e1,\%contigs)]);
    }
    return @reps;
}

sub dna_seq {
    my($contig,$beg,$end,$contigs) = @_;

    if ($beg < $end)
    {
	return substr($contigs->{$contig},$beg-1,$end-($beg-1));
    }
    else
    {
	return &SeedUtils::rev_comp(substr($contigs->{$contig},$end-1,$beg-($end-1)));
    }
}

sub copy_to {
    my($g,$fromD,$toD,$rename) = @_;

    my $from = "$fromD/$g";
    my $to = "$toD/" . ($rename ? $rename : $g);
    mkdir($to,0777) || die "could not make $to";
    &run("cp $from/contigs $to/contigs");
    my %delete;
    if (-s "$from/Features/peg/deleted.features")
    {
	%delete = map { chop; ($_ => 1) } `cat $from/Features/peg/deleted.features`;
    }
    &copy_file("$from/assigned_functions","$to/assigned_functions",\%delete,$g,$rename);
    mkdir("$to/Features",0777) || die "could not make $to/Features";
    mkdir("$to/Features/peg",0777) || die "could not make $to/Features/peg";
    &copy_file("$from/Features/peg/tbl","$to/Features/peg/tbl",\%delete,$g,$rename);
    &copy_file("$from/Features/peg/fasta","$to/Features/peg/fasta",\%delete,$g,$rename);
}

sub copy_file {
    my($from,$to,$delete,$g,$rename) = @_;

    open(FROM,"<",$from) || die "could not open $from";
    open(TO,">",$to)     || die "could not open $to";
    while (defined($_ = <FROM>))
    {
	if (($_ =~ /(fig\|\d+\.\d+\.peg\.\d+)/) && (! $delete->{$1}))
	{
	    if ($rename)
	    {
		$_ =~ s/(fig\|)\d+\.\d+(\.peg\.\d+)/$1$rename$2/;
	    }
	    print TO $_;
	}
	else
	{
	    print TO $_;
	}
    }
    close(FROM);
    close(TO);
}

sub run {
    my($cmd) = @_;

#    print STDERR "running: $cmd\n";
    my $rc = system($cmd);
    if ($rc)
    {
	die "$rc: $cmd failed";
    }
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3