[Bio] / FigKernelScripts / FFB2_xx.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB2_xx.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon Jul 12 19:30:27 2010 UTC (9 years, 4 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Bunch of figfam generation optimizations.

########################################################################

use strict;
use Cache::Memcached::Fast;
use IPC::Open3;
use POSIX ":sys_wait_h";


my($ln, $setI, $N, $offset);
(
 ($ln = shift @ARGV) &&
 ($setI = shift @ARGV) &&
 ($N = shift @ARGV) &&
 (defined($offset = shift @ARGV))
)
    || die "usage: FFB2_get_prot_gs_oligos K setI N Offset [memcache-host memcache-port]";

my $mchost = shift;
my $mcport = shift;
my $mc;

if ($mchost && $mcport)
{
    $mc = new Cache::Memcached::Fast({ servers => ["$mchost:$mcport"] } );
}

use FIG;
my $fig = new FIG;

if ($offset == 0)
{
    open(SETI,">$setI") || die "could not open $setI";
}

my %active_genomes = map { $_ => 1 } $fig->genomes('complete');
open(SETS,"<$FIG_Config::global/genome.sets") || die "could not open genome.sets";

my($x,%seen,%genomes);

while (defined($x = <SETS>))
{
    if (($x =~ /^(\d+)\t(\d+\.\d+)\t(\S.*\S)/) && $active_genomes{$2})
    {
	$genomes{$2} = $1;
        if (! $seen{$1})
        {   
	    $seen{$1} = 1;
	    print SETI "$1\t$3\n" if $offset == 0;
        }
    }
}
close(SETS);
close(SETI) if $offset == 0;

#
# We write $chunksize oligos to a sort and later merge the output files into a pipe to usable_motifs.
#

my $chunksize = 10_000_000;

my $sortargs = "-S 1G";

my $file_idx = 0;
my $n_written = 0;

my $tmp_fmt = "$FIG_Config::temp/gs_oligo_tmp.$offset.$N.$$.%05d";

my $tmp = sprintf $tmp_fmt, $file_idx++;
my @files;
my @pids;

my $out = new FileHandle;
my $err = new FileHandle;
my $pid = open3(\*SORT, $out, $err, "sort $sortargs > $tmp");
$pid or die "cannot open3: $!";
print STDERR "Write $tmp\n";
push(@pids, [$pid, $out, $err, $tmp]);
push(@files, $tmp);

foreach my $genome (sort { $a <=> $b } keys(%genomes))
{
    my($xx) = $genome =~ /(\d)\./;
    next if ($xx % $N != $offset);

    my $gs = $fig->genus_species($genome);
    my $set = $genomes{$genome};
    print STDERR "$genome $gs\n";

    my @pegs = $fig->all_features($genome,'peg');
    my $seqs = $mc->get_multi(map { "s:$_" } @pegs);
			     
    foreach my $peg (@pegs)
    {
	my $seq = $seqs->{"s:$peg"};
	if (!$seq)
	{
	    print STDERR "no seq in bulk for $peg\n";
	    $seq = &get_translation($fig, $peg);
	}
	
        if ($seq)
        {
	    my $i;
	    my $n = length($seq);
            for ($i=0; ($i < ($n - $ln)); $i++)
            {
                my $oligo = uc substr($seq,$i,$ln);
                if ($oligo =~ /^[ACDEFGHIKLMNPQRSTVWY]+$/)
                {   
                    print SORT join("\t",($oligo,$set)),"\n";
		    if ($n_written++ >= $chunksize)
		    {
			$n_written = 0;
			close(SORT);
			do_wait();
			my $tmp = sprintf $tmp_fmt, $file_idx++;
			my $out = new FileHandle;
			my $err = new FileHandle;
			my $pid = open3(\*SORT, $out, $err, "sort $sortargs > $tmp");
			$pid or die "cannot open3: $!";
			print STDERR "Write $tmp\n";
			push(@pids, [$pid, $out, $err, $tmp]);
			push(@files, $tmp);
		    }
                }
            }
        }
    }
}

close(SORT);

for my $ent (@pids)
{
    my($pid, $out, $err, $file) = @$ent;
    print STDERR "Wait on $pid for $file\n";
    my $rc = waitpid($pid, 0);
    print STDERR "Wait returns $rc\n";
    close($out);
    close($err);
}

system("sort", "-m", @files);
unlink(@files);

sub do_wait
{
    my @npids;
    for my $ent (@pids)
    {
	my($pid, $out, $err, $file) = @$ent;
	my $rc = waitpid($pid, WNOHANG);
	if ($rc == $pid)
	{
	    print STDERR "Pid $pid $file done\n";
	    close($out);
	    close($err);
	}
	else
	{
	    push(@npids, $ent);
	}
    }
    @pids = @npids;
}

sub get_translation
{
    my($fig, $peg) = @_;

    my $tr;
    if ($mc)
    {
	$tr = $mc->get("s:$peg");
    }
    if (!$tr)
    {
	$tr = $fig->get_translation($peg);
	if ($tr && $mc)
	{
	    $mc->set("s:$peg", $tr);
	}
    }
    return $tr;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3