use strict; use Data::Dumper; use SeedEnv; use FIG; my $fig = new FIG; my $usage = "usage: build_data_directory DataDir K"; my($genus,$dataD,$k); ( ($dataD = shift @ARGV) && ($k = shift @ARGV) && ($k =~ /^\d{1,2}$/) ) || die $usage; print STDERR "building $dataD for Kmers of size $k\n"; mkdir($dataD,0777); my %seen; my $existing = $ENV{'SAS_SERVER'}; $ENV{'SAS_SERVER'} = 'PUBSEED'; open(G,">$dataD/all.genomes") || die "could not open $dataD/all.genomes"; foreach $_ (`svr_all_genomes`) { if (($_ !~ /[pP]hage\b/) && ($_ !~ /[pP]lasmid\b/)) { if (($_ =~ /^(\S+)\s(\S+).*\s(\d+\.\d+)$/) && ($2 ne "sp.")) { my $genus = $1; my $species = $2; my $genome = $3; if ($species !~ /^sp\.?$/i) { my $md5 = $fig->genome_md5sum($genome); if ((! $seen{$md5}) && $fig->is_prokaryotic($genome)) { $seen{$md5} = 1; print G $_; } } } } } close(G); $ENV{'SAS_SERVER'} = $existing; &build_function_index($dataD); &build_otu_index($dataD); &build_reduced_kmers($dataD,$k); &extend_otu_index($dataD); &update_kmers_with_extended_OTUs($dataD); sub build_function_index { my($dataD) = @_; open(ASSIGNMENTS,"cut -f2 $dataD/all.genomes | svr_all_features peg | svr_function_of |") || die "cannot access assignments"; my %funcs; while (defined($_ = )) { if ($_ =~ /^\S+\t(\S.*\S+)/) { my $stripped = &SeedUtils::strip_func($1); #### CHECK THIS $funcs{$stripped}++; } } close(ASSIGNMENTS); open(FI,">$dataD/function.index") || die "could not open $dataD/function.index"; my $nxt = 0; foreach my $f (sort keys(%funcs)) { if (($funcs{$f} > 5) && ((! &SeedUtils::hypo($f)) || ($f =~ /FIG/))) { print FI "$nxt\t$f\n"; $nxt++; } } close(FI); } sub build_otu_index { my($dataD) = @_; my %otus; foreach $_ (`cat $dataD/all.genomes`) { if ($_ =~ /(\S+\s\S+)/) { $otus{$1}++; } } open(OI,">$dataD/otu.index") || die "could not open $dataD/otu.index"; my $nxt = 0; foreach my $otu (keys(%otus)) { if ($otu !~ / sp\.$/) { print OI "$nxt\t$otu\n"; $nxt++; } } close(OI); } sub build_reduced_kmers { my($dataD,$k) = @_; my %to_oI; foreach $_ (`cat $dataD/otu.index`) { if ($_ =~ /^(\S+)\t(\S.*\S)/) { $to_oI{$2} = $1; } } my %g_to_oI; foreach $_ (`cat $dataD/all.genomes`) { if ($_ =~ /^(\S[^\t]+)\t(\S.*\S)/) { my $g = $2; my $gs = $1; if (($gs =~ /^(\S+ \S+)/) && defined($to_oI{$1})) { $g_to_oI{$g} = $to_oI{$1}; } } } my %to_fI; foreach $_ (`cat $dataD/function.index`) { if ($_ =~ /^(\S+)\t(\S.*\S)/) { $to_fI{$2} = $1; } } # # we take assignments from both the pubSEED and the ASEED. ASEED functions # override PSEED functions # my %peg_to_fI; &load_peg_to_fI($dataD,\%to_fI,'PUBSEED',\%peg_to_fI); &load_peg_to_fI($dataD,\%to_fI,'SEED',\%peg_to_fI); open(RAW,"| sort -T . > $dataD/sorted.kmers") || die "could not open $dataD/sorted.kmers"; foreach my $g (`cut -f2 $dataD/all.genomes`) { # next if (! $g_to_oI{$g}); foreach $_ (`echo '$g' | svr_all_features peg | svr_translations_of`) { if ($_ =~ /^(fig\|\d+\.\d+\.peg\.\d+)\t(\S.*\S)$/) { chomp; my $seq = $2; my $id = $1; ($id =~ /^fig\|(\d+\.\d+)/) || die "bad peg $_"; my $g = $1; my $oI = $g_to_oI{$g}; my $fI = $peg_to_fI{$id}; for (my $i=0; ($i < (length($seq) - $k)); $i++) { my $kmer = uc substr($seq,$i,$k); if ($kmer !~ /[^ACDEFGHIKLMNPQRSTVWY]/) { print RAW join("\t",($kmer,$fI,$oI,length($seq)-$i)),"\n"; } } } } } close(RAW); open(RAW,"<$dataD/sorted.kmers") || die "could not open sorted.kmers"; open(REDUCED,">$dataD/reduced.kmers") || die "could not open reduced kmers"; my $last = ; while ($last && ($last =~ /^(\S+)/)) { my $curr = $1; my @set; while ($last && ($last =~ /^(\S+)\t(\S*)\t(\S*)\t(\S*)$/) && ($1 eq $curr)) { push(@set,[$2,$3,$4]); $last = ; } &process_set($curr,\@set,\*REDUCED); } close(REDUCED); } sub load_peg_to_fI { my($dataD,$to_fI,$which_seed,$peg_to_fI) = @_; my $existing = $ENV{'SAS_SERVER'}; $ENV{'SAS_SERVER'} = $which_seed; foreach $_ (`cut -f2 $dataD/all.genomes | svr_all_features peg | svr_function_of`) { if ($_ =~ /^(\S+)\t(\S.*\S)/) { my $peg = $1; my $func = &SeedUtils::strip_func($2); if ($to_fI->{$func}) { $peg_to_fI->{$peg} = $to_fI->{$func}; } } } $ENV{'SAS_SERVER'} = $existing; } sub process_set { my($kmer,$set,$fh) = @_; my %funcs; foreach my $tuple (@$set) { my($fI,$oI,$off) = @$tuple; $funcs{$fI}++; } my @tmp = sort { $funcs{$b} <=> $funcs{$a} } keys(%funcs); if ($tmp[0] && ($funcs{$tmp[0]} > (0.5 * @$set))) { my $best_fI = $tmp[0]; my %otus; my $otu = ''; foreach my $tuple (@$set) { my($fI,$oI,$off) = @$tuple; if (($fI == $best_fI) && $oI) { $otus{$oI}++; } } @tmp = sort { $otus{$b} <=> $otus{$a} } keys(%otus); if ($tmp[0] && ($_ = &pickable_otu(\@tmp,\%otus,scalar @$set))) { $otu = $_; } my $sum_off = 0; my $n = 0; foreach my $tuple (@$set) { my($fI,$oI,$off) = @$tuple; if ($fI == $best_fI) { $sum_off += $off; $n++; } } print $fh join("\t",($kmer,int($sum_off/$n),$best_fI,$otu)),"\n"; } } sub pickable_otu { my($poss_otus,$counts,$in_set) = @_; my @called; if (@$poss_otus == 1) { return $poss_otus->[0] } my $best; while (($best = shift @$poss_otus) && ($counts->{$best} >= (0.2 * $in_set))) { push(@called,$best); } if (@called > 0) { return join(",",sort { $a <=> $b } @called); } return ''; } sub extend_otu_index { my($DataD) = @_; my @otus = map { chop; [split(/\t/,$_)] } `cat $dataD/otu.index`; my $nxt = $otus[-1]->[0] + 1; open(EXT,">>$DataD/otu.index") || die "could not extend otu.index"; foreach $_ (`cut -f4 $DataD/reduced.kmers | grep ',' | sort -u`) { print EXT join("\t",($nxt++,$_)); } close(EXT); } sub update_kmers_with_extended_OTUs { my($dataD) = @_; my %to_oI; foreach $_ (`cat $dataD/otu.index`) { if ($_ =~ /^(\d+)\t(\S+)/) { $to_oI{$2} = $1; } } open(IN,"<$dataD/reduced.kmers") || die "bad"; open(OUT,">$dataD/final.kmers") || die "could not open final.kmers"; while (defined($_ = )) { if ($_ !~ /,/) { print OUT $_ } else { chop; my($kmer,$off,$fI,$otus) = split(/\t/,$_); my $otu = $to_oI{$otus}; print OUT join("\t",($kmer,$off,$fI,$otu)),"\n"; } } close(IN); close(OUT); }