[Bio] / Kmers2 / build_data_directory.pl Repository:
ViewVC logotype

View of /Kmers2/build_data_directory.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Mon Jul 22 20:00:15 2013 UTC (6 years, 4 months ago) by overbeek
Branch: MAIN
CVS Tags: HEAD
Changes since 1.4: +4 -3 lines
loosened parameters

use strict;
use Data::Dumper;
use SeedEnv;
use FIG;
my $fig = new FIG;

my $usage = "usage: build_data_directory DataDir K";
my($genus,$dataD,$k);
(
 ($dataD      = shift @ARGV) &&
 ($k          = shift @ARGV) && ($k =~ /^\d{1,2}$/)
)
    || die $usage;
print STDERR "building $dataD for Kmers of size $k\n";
mkdir($dataD,0777);

my %seen;
my $existing = $ENV{'SAS_SERVER'};
$ENV{'SAS_SERVER'} = 'PUBSEED';
open(G,">$dataD/all.genomes") || die "could not open $dataD/all.genomes";
foreach $_ (`svr_all_genomes`)
{
    if (($_ !~ /[pP]hage\b/) && ($_ !~ /[pP]lasmid\b/))
    {
	if (($_ =~ /^(\S+)\s(\S+).*\s(\d+\.\d+)$/) && ($2 ne "sp."))
	{
	    my $genus = $1;
	    my $species = $2;
	    my $genome = $3;
	    if ($species !~ /^sp\.?$/i)
	    {
		my $md5 = $fig->genome_md5sum($genome);
		if ((! $seen{$md5}) && $fig->is_prokaryotic($genome))
		{
		    $seen{$md5} = 1;
		    print G $_;
		}
	    }
	}
    }
}
close(G);
$ENV{'SAS_SERVER'} = $existing;

&build_function_index($dataD);
&build_otu_index($dataD);
&build_reduced_kmers($dataD,$k);
&extend_otu_index($dataD);
&update_kmers_with_extended_OTUs($dataD);

sub build_function_index {
    my($dataD) = @_;

    open(ASSIGNMENTS,"cut -f2 $dataD/all.genomes | svr_all_features peg | svr_function_of |")
	|| die "cannot access assignments";

    my %funcs;
    while (defined($_ = <ASSIGNMENTS>))
    {
	if ($_ =~ /^\S+\t(\S.*\S+)/)
	{
	    my $stripped = &SeedUtils::strip_func($1);   #### CHECK THIS
	    $funcs{$stripped}++;
	}
    }
    close(ASSIGNMENTS);
    open(FI,">$dataD/function.index") || die "could not open $dataD/function.index";

    my $nxt = 0;
    foreach my $f (sort keys(%funcs))
    {
	if (($funcs{$f} > 5)  && ((! &SeedUtils::hypo($f)) || ($f =~ /FIG/)))
	{
	    print FI "$nxt\t$f\n";
	    $nxt++;
	}
    }
    close(FI);
}

sub build_otu_index {
    my($dataD) = @_;

    my %otus;
    foreach $_ (`cat $dataD/all.genomes`)
    {
	if ($_ =~ /(\S+\s\S+)/)
	{
	    $otus{$1}++;
	}
    }

    open(OI,">$dataD/otu.index") || die "could not open $dataD/otu.index";
    my $nxt = 0;
    foreach my $otu (keys(%otus))
    {
	if ($otu !~ / sp\.$/)
	{
	    print OI "$nxt\t$otu\n";
	    $nxt++;
	}
    }
    close(OI);
}

sub build_reduced_kmers {
    my($dataD,$k) = @_;

    my %to_oI;
    foreach $_ (`cat $dataD/otu.index`)
    {
	if ($_ =~ /^(\S+)\t(\S.*\S)/)
	{
	    $to_oI{$2} = $1;
	}
    }

    my %g_to_oI;

    foreach $_ (`cat $dataD/all.genomes`)
    {
	if ($_ =~ /^(\S[^\t]+)\t(\S.*\S)/)
	{
	    my $g = $2;
	    my $gs = $1;
	    if (($gs =~ /^(\S+ \S+)/) && defined($to_oI{$1}))
	    {
		$g_to_oI{$g} = $to_oI{$1};
	    }
	}
    }

    my %to_fI;
    foreach $_ (`cat $dataD/function.index`)
    {
	if ($_ =~ /^(\S+)\t(\S.*\S)/)
	{
	    $to_fI{$2} = $1;
	}
    }
#
#   we take assignments from both the pubSEED and the ASEED.  ASEED functions
#   override PSEED functions
#
    my %peg_to_fI;
    &load_peg_to_fI($dataD,\%to_fI,'PUBSEED',\%peg_to_fI);
    &load_peg_to_fI($dataD,\%to_fI,'SEED',\%peg_to_fI);

    open(RAW,"| sort -T . > $dataD/sorted.kmers") || die "could not open $dataD/sorted.kmers";
    foreach my $g (`cut -f2 $dataD/all.genomes`)
    {
#	next if (! $g_to_oI{$g});
	foreach $_ (`echo '$g' | svr_all_features peg | svr_translations_of`)
	{
	    if ($_ =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(\S.*\S)$/)
	    {
		chomp;
		my $seq = $2;
		my $id = $1;
		($id =~ /^fig\|(\d+\.\d+)/) || die "bad peg $_";
		my $g = $1;
		my $oI = $g_to_oI{$g};
		my $fI = $peg_to_fI{$id};
		for (my $i=0; ($i < (length($seq) - $k)); $i++)
		{
		    my $kmer = uc substr($seq,$i,$k);
		    if ($kmer !~ /[^ACDEFGHIKLMNPQRSTVWY]/)
		    {
			print RAW join("\t",($kmer,$fI,$oI,length($seq)-$i)),"\n";
		    }
		}
	    }
	}
    }
    close(RAW);

    open(RAW,"<$dataD/sorted.kmers") || die "could not open sorted.kmers";
    open(REDUCED,">$dataD/reduced.kmers") || die "could not open reduced kmers";
    my $last = <RAW>;
    while ($last && ($last =~ /^(\S+)/))
    {
	my $curr = $1;
	my @set;
	while ($last && ($last =~ /^(\S+)\t(\S*)\t(\S*)\t(\S*)$/) && ($1 eq $curr))
	{
	    push(@set,[$2,$3,$4]);
	    $last = <RAW>;
	}
	&process_set($curr,\@set,\*REDUCED);
    }
    close(REDUCED);
}

sub load_peg_to_fI {
    my($dataD,$to_fI,$which_seed,$peg_to_fI) = @_;
    my $existing = $ENV{'SAS_SERVER'};
    $ENV{'SAS_SERVER'} = $which_seed;

    foreach $_ (`cut -f2 $dataD/all.genomes | svr_all_features peg | svr_function_of`)
    {
	if ($_ =~ /^(\S+)\t(\S.*\S)/)
	{
	    my $peg = $1;
	    my $func = &SeedUtils::strip_func($2);
	    if ($to_fI->{$func})
	    {
		$peg_to_fI->{$peg} = $to_fI->{$func};
	    }
	}
    }
    $ENV{'SAS_SERVER'} = $existing;
}


sub process_set {
    my($kmer,$set,$fh) = @_;

    my %funcs;
    foreach my $tuple (@$set)
    {
	my($fI,$oI,$off) = @$tuple;
	$funcs{$fI}++;
    }
    my @tmp = sort { $funcs{$b} <=> $funcs{$a} } keys(%funcs);
    if ($tmp[0] && ($funcs{$tmp[0]} > (0.5 * @$set)))
    {
	my $best_fI = $tmp[0];
	my %otus;
	my $otu = '';
	foreach my $tuple (@$set)
	{
	    my($fI,$oI,$off) = @$tuple;
	    if (($fI == $best_fI) && $oI)
	    {
		$otus{$oI}++;
	    }
	}
	@tmp = sort { $otus{$b} <=> $otus{$a} } keys(%otus);
	if ($tmp[0] && ($_ = &pickable_otu(\@tmp,\%otus,scalar @$set)))
	{
	    $otu = $_;
	}
	my $sum_off = 0;
	my $n = 0;
	foreach my $tuple (@$set)
	{
	    my($fI,$oI,$off) = @$tuple;
	    if ($fI == $best_fI)
	    {
		$sum_off += $off;
		$n++;
	    }
	}
	print $fh join("\t",($kmer,int($sum_off/$n),$best_fI,$otu)),"\n";
    }
}

sub pickable_otu {
    my($poss_otus,$counts,$in_set) = @_;

    my @called;
    if (@$poss_otus == 1) { return $poss_otus->[0] }
    my $best;
    while (($best = shift @$poss_otus) && ($counts->{$best} >= (0.2 * $in_set)))
    {
	push(@called,$best);
    }
    if (@called > 0)
    {
	return join(",",sort { $a <=> $b } @called);
    }
    return '';
}
    
sub extend_otu_index {
    my($DataD) = @_;

    my @otus = map { chop; [split(/\t/,$_)] } `cat $dataD/otu.index`;
    my $nxt = $otus[-1]->[0] + 1;
    open(EXT,">>$DataD/otu.index") || die "could not extend otu.index";
    foreach $_ (`cut -f4 $DataD/reduced.kmers | grep ',' | sort -u`)
    {
	print EXT join("\t",($nxt++,$_));
    }
    close(EXT);
}

sub update_kmers_with_extended_OTUs {
    my($dataD) = @_;

    my %to_oI;
    foreach $_ (`cat $dataD/otu.index`)
    {
	if ($_ =~ /^(\d+)\t(\S+)/)
	{
	    $to_oI{$2} = $1;
	}
    }
    open(IN,"<$dataD/reduced.kmers") || die "bad";
    open(OUT,">$dataD/final.kmers") || die "could not open final.kmers";
    while (defined($_ = <IN>))
    {
	if ($_ !~ /,/) { print OUT $_ }
	else
	{
	    chop;
	    my($kmer,$off,$fI,$otus) = split(/\t/,$_);
	    my $otu = $to_oI{$otus};
	    print OUT join("\t",($kmer,$off,$fI,$otu)),"\n";
	}
    }
    close(IN);
    close(OUT);
}


	

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3