[Bio] / FigKernelScripts / extract_genomes_from_samples.pl Repository:
ViewVC logotype

View of /FigKernelScripts/extract_genomes_from_samples.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Fri Dec 21 14:00:12 2012 UTC (6 years, 10 months ago) by overbeek
Branch: MAIN
CVS Tags: rast_rel_2014_0729, rast_rel_2014_0912, HEAD
Changes since 1.1: +97 -8 lines
needs work on getting rRNAs

# 
# This program takes as input a "samples directory":
# 
# SamplesDir
# 	Sample
# 		all.contigs
# 		16S.sequences
# 
# It fills out the directory structure into
#   
# SamplesDir
# 	Sample
# 		all.contigs
# 		16S.sequences
# 		16S.reps [representative SSUs]
# 		refs.for.rep [reference genomes for each SSU-rep]
# 		ref.fasta    [all PEGs from reference genomes]
# 		no.reference.hits.report [contigs with no matches against reference PEGs]
# 		unplaced.with.hits [contigs that did hit references, but we did not resolve
# 		                    which SSU-rep set was the correct one]
# 	        contig.separation.report [summarizes hits against reference pegs for each contig]
# 		Sets 
# 		     ssu-rep-id [ each of these directories should contain a single genome ]
# 				contigs  [ this is the separated genome ]
# 				kmer.report [ crude analysis of what is there]
# 				status [complete/incomplete, multiple genomes]
# 
#
#  That is, it assumes that the 16S.sequences file accurately captures the 16S sequences
#  needed for the analysis
#
use strict;
use SeedUtils;
use Data::Dumper;
use FIG_Config;
use gjoseqlib;
use Sim;
use BlastInterface;
use Proc::ParallelLoop;
use Getopt::Long;

my $processes = 1;
my $sampleDir;
my $rep_thresh = 0.94;
my $usage = "usage: extract_genomes_from_samples -d SamplesDir [-p Processes] [-r representatives-threshhold]";
my $rc    = GetOptions("p=i"            => \$processes,
                       "d=s"            => \$sampleDir,
                       "r=f"            => \$rep_thresh
                      );
if (! $rc) { print STDERR $usage; exit }
(-d $sampleDir) || die "You need to specify an existing Samples directory";


opendir(DIR,"$sampleDir") || die "cannot open $sampleDir";
my @samples = sort grep { $_ !~ /^\./ } readdir(DIR);
closedir(DIR);
my $args = [map { [$rep_thresh,$sampleDir,$_] } @samples];
&pareach($args,\&do_one,{ Max_Workers => $processes });
#foreach my $sample (@samples)

sub do_one {
    my($tuple) = @_;
    my($rep_thresh,$SampleDir,$sample) = @$tuple;
    print STDERR "processing $sample\n";
    my $dir = "$sampleDir/$sample";

    if ((-s "$dir/16S.sequences") && (! -s "$dir/contig.separation.report"))
    {
	print STDERR "building separation report\n";
	&SeedUtils::run("svr_representative_sequences -d $dir/Clustered-$rep_thresh -f $dir/cluster.summary-$rep_thresh -s $rep_thresh < $dir/16S.sequences > $dir/16S.reps");
	my @reps = &gjoseqlib::read_fasta("$dir/16S.reps");
	open(REPS,">$dir/refs.for.rep") || die "could not open $dir/refs.for.rep";
	unlink("$dir/ref.fasta");
	my @blast = sort { $b->bsc <=> $a->bsc } &BlastInterface::blast(\@reps,$FIG_Config::global . "/genome_rRNA.fasta","blastn",{ maxE => 1.0e-30});
	my @close = &close_genomes(\@blast,\@reps);
	foreach my $tuple (@close)
	{
	    my($ssu,$gs) = @$tuple;

	    foreach my $g (@$gs)
	    {
		&SeedUtils::run("cat $FIG_Config::organisms/$g/Features/peg/fasta >> $dir/ref.fasta");
	    }
	    print REPS join("\t",($ssu,@$gs)),"\n";
	}
	close(REPS);

	&SeedUtils::run("formatdb -i $dir/ref.fasta -p T");
	open(REPORT,">$dir/contig.separation.report") || die "could not open $dir/contig.separation.report";
	open(MISSED,">$dir/contig.no.separation.report") || die "could not open $dir/contig.no.separation.report";
	foreach my $contig (&gjoseqlib::read_fasta("$dir/all.contigs"))
	{
	    my @blast = sort { $b->iden <=> $a->iden } grep { (abs($_->e2 - $_->b2) >= 200) } &BlastInterface::blast($contig,"$dir/ref.fasta","blastx",{ maxE => 1.0e-30});
#	    print STDERR &Dumper(\@blast);
	    my @used;
	    my $i;
	    my $hit;
	    while (($i < 10) && ($hit = shift @blast))
	    {
		my $j;
		for ($j=0; ($j < @used) && (! &overlaps($used[$j]->[2],$used[$j]->[3],$hit->b1,$hit->e1)); $j++) {}
		if ($j == @used)
		{
		    print REPORT join("\t",($contig->[0],$hit->id2,$hit->iden,$hit->b1,$hit->e1)),"\n";
		    $i++;
		    push(@used,[$hit->id2,$hit->iden,$hit->b1,$hit->e1]);
		}
	    }
	    print REPORT "====\n";
	    if ($i == 0)
	    {
		print MISSED "*** could not place $contig->[0]\n";
	    }
	}
	close(REPORT);
	close(MISSED);
    }

    if ((-s "$dir/all.contigs") && (-s "$dir/16S.sequences"))
    {
	&split_contigs($sampleDir,$sample);

	my %contigs = map { ($_ =~ /^>(\S+)\s+len_(\d+)_cov_([^_]+)/) ? ($1 => [$2,$3]) : () } `grep "^>" $dir/all.contigs`;
	my @nosep;
	my %nosep;
	if (-s "$dir/contig.no.separation.report")
	{
	    @nosep = map { ($_ =~ /^\*\*\* could not place (\S+)/) ? $1 : () } `sort $dir/contig.no.separation.report`;
	    %nosep = map { ($_ => 1) } @nosep;
	    &make_contig_report("$dir/no.reference.hits.report",\@nosep,\%contigs);
	}
	if ((-s "$dir/contig.separation.report") && (-d "$dir/Sets"))
	{
	    my %placed_contigs = map { ($_ =~ /^>(\S+)/) ? ($1 => 1) : () } `cat $dir/Sets/*/contigs | grep "^>"`;
	    my @unplaced = grep { (! $nosep{$_}) && (! $placed_contigs{$_}) } keys(%contigs);
	    &make_contig_report("$dir/unplaced.with.hits.report",\@unplaced,\%contigs);
	    opendir(SETS,"$dir/Sets") || die "could not open $dir/Sets";
	    my @genomeD = grep { $_ !~ /^\./ } readdir(SETS);
	    close(SETS);
	    foreach my $d (@genomeD)
	    {
		if (-s "$dir/Sets/$d/contigs")
		{
		    &SeedUtils::run("svr_assign_to_dna_using_figfams < $dir/Sets/$d/contigs > $dir/Sets/$d/kmer.report");
		}
	    }
	}
    }

    if (-d "$dir/Sets")
    {
	opendir(DIR,"$dir/Sets") || die "could not open $dir/Sets";
	my @ssu_reps = grep { $_ !~ /^\./ } readdir(DIR);
	closedir(DIR);

	foreach my $ssu_rep (@ssu_reps)
	{
	    if (-s "$dir/Sets/$ssu_rep/kmer.report")
	    {
		my @ribosomal_proteins = map { ($_ =~ /[LS]SU ribosomal protein/) ? $_ : () }
		                         `cat $dir/Sets/$ssu_rep/kmer.report | cut -f 4 | grep ribosomal`;
		my %distinct = map { ($_ => 1) } @ribosomal_proteins;
		my $N = keys(%distinct);
		open(STATUS,">$dir/Sets/$ssu_rep/status") || die "could not open $dir/Sets/$ssu_rep/status";
		print STATUS "$N distinct ribsomal proteins\n";
		my $dups = @ribosomal_proteins - $N;
		print STATUS "$dups duplicates\n";
		if ($N < 40)
		{
		    print STATUS "genome is incomplete\n";
		}
		else
		{
		    print STATUS "genome is complete\n";
		}
		if ($dups > 10)
		{
		    print STATUS "There are multiple genomes in the set of contigs\n";
		}
		close(STATUS);
            }
        }
    }
}

sub make_contig_report {
    my($rep,$contig_ids,$contigH) = @_;

    open(FILE,">",$rep) || die "could not open $rep";

    my $tot = 0;
    my $totN = 0;
    foreach my $contig (@$contig_ids)
    {
	if (my $x = $contigH->{$contig})
	{
	    print FILE join("\t",($contig,@$x)),"\n";
	    $tot += $x->[0];
	    $totN++;
	}
    }
    print FILE "totN=$totN tot=$tot\n";
    close(FILE);
}

sub split_contigs {
    my($dir,$sample) = @_;

    my %contigs = map { ($_->[0] => $_) } &gjoseqlib::read_fasta("$dir/$sample/all.contigs");
    if (! -d "$dir/$sample/Sets") { mkdir("$dir/$sample/Sets",0777) }
    if (! -s "$dir/$sample/refs.for.rep")
    {
	open(SSU,"<","$dir/$sample/16S.sequences")
	    || die "could not open 16S.sequences";
	$_ = <SSU>;
	($_ =~ /^>(\S+)/) || die "what is in 16S.sequences?";
	my $ssu = $1;
	mkdir("$dir/$sample/Sets/$ssu",0777);
	my $contig_set = &gjoseqlib::read_fasta("$dir/$sample/all.contigs");
	&gjoseqlib::write_fasta("$dir/$sample/Sets/$ssu/contigs",$contig_set);
	my $n = @$contig_set;
	my $bp = 0;
	foreach my $contig (@$contig_set)
	{
	    $bp += length($contig->[2]);
	}
	print STDERR join("\t",($ssu,$n,$bp)),"\n";
    }
    else
    {
	my %refH;
	foreach $_ (`cat $dir/$sample/refs.for.rep`)
	{
	    chop;
	    my($ssu_id,@refG) = split(/\t/,$_);
	    foreach my $g (@refG)
	    {
		$refH{$g} = $ssu_id;
	    }
	}
	$/ = "\n====\n";
	my @report = `cat $dir/$sample/contig.separation.report`;
	my %placed_contigs;
	foreach my $chunk (@report)
	{
	    chomp $chunk;
	    my $first;
	    my $count = 0;
	    my @hits = split(/\n/,$chunk);
	    if (@hits > 0)
	    {
		my $contig;
		foreach my $hit (@hits)
		{
		    my @flds = split(/\t/,$hit);
		    $contig = $flds[0];
		    my $g = &SeedUtils::genome_of($flds[1]);
		    my $ssu = $refH{$g};
		    if (! $first)
		    {
			$first = $ssu;
		    }
		    if ($ssu eq $first)
		    {
			$count++;
		    }
		}
		if ($first && ($count >= (@hits / 2)))
		{
		    push(@{$placed_contigs{$first}},$contigs{$contig});
		}
	    }
	}
	$/ = "\n";
	foreach my $ssu (keys(%placed_contigs))
	{
	    mkdir("$dir/$sample/Sets/$ssu",0777);
	    my $contig_set = $placed_contigs{$ssu};
	    &gjoseqlib::write_fasta("$dir/$sample/Sets/$ssu/contigs",$contig_set);
	    my $n = @$contig_set;
	    my $bp = 0;
	    foreach my $contig (@$contig_set)
	    {
		$bp += length($contig->[2]);
	    }
	    print STDERR join("\t",($ssu,$n,$bp)),"\n";
	}
    }
}

## make this keep the close genomes at least 50 pegs different in size
## This must return 2-tuples (ssu-id,[g1,g2,g3])
sub close_genomes {
    my($blast,$reps) = @_;

    my %seenG;
    my %got;
    while (my $x = shift @$blast)
    {
	my $g   = $x->id2;
	my $ssu = $x->id1;
	if (! $got{$ssu}) 
	{ 
	    $got{$ssu}->{$g} = 1 ;
	}
	else 
	{
	    if ((keys(%{$got{$ssu}}) < 3) && (! $seenG{$g}))
	    {
		my @gs = keys(%{$got{$ssu}});
		my $numpegs_in_g = &numpegs($g);
		my $i;
		for ($i=0; ($i < @gs) && (abs($numpegs_in_g - $seenG{$gs[$i]}) < 50); $i++) {}
		if ($i == @gs)
		{
		    $seenG{$g} = $numpegs_in_g;
		    $got{$ssu}->{$g} = 1;
		}
	    }
	}
    }
    my @ret;
    foreach my $ssu (keys(%got))
    {
	my @refs = sort { $a <=> $b } keys(%{$got{$ssu}});
	push(@ret,[$ssu,\@refs]);
    }
    return @ret;
}

sub numpegs {
    my($g) = @_;

    my $n = `cat $FIG_Config::organisms/$g/Features/peg/tbl`;
    return $n;
}

sub overlaps {
    my($b1,$e1,$b2,$e2) = @_;

    if ($b1 > $e1) { ($b1,$e1) = ($e1,$b1) }
    if ($b2 > $e2) { ($b2,$e2) = ($e2,$b2) }
    return ((($b1 <= $b2) && ($b2 <= $e1)) || (($b2 <= $b1) && ($b1 <= $e2)));
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3