[Bio] / FigKernelScripts / FFB2_get_oligos.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB2_get_oligos.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (download) (as text) (annotate)
Thu Nov 11 17:49:30 2010 UTC (9 years ago) by disz
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.9: +1 -1 lines
FigFam 6 digit problem

# -*- perl -*-

use strict;
use Data::Dumper;
use Carp;
use Cache::Memcached::Fast;
#use IPC::Run qw(start finish reap_nb);
use IPC::Open3;
use POSIX ":sys_wait_h";
use FileHandle;

my $usage = "usage: FFB2_get_oligos families.2c family.functions function.index [memcache-host memcache-port] > oligos";

my $max_sz = 12;
my $min_sz = 7;
my($fam2cF,$famFuncF,$famFuncIdx);

(
 ($fam2cF     = shift @ARGV) && open(FAM2C,"<$fam2cF") &&
 ($famFuncF   = shift @ARGV) &&
 ($famFuncIdx = shift @ARGV)
)
    || die $usage;

open(FFIDX, ">", $famFuncIdx) or die "Cannot write $famFuncIdx:$ !";

my $mchost = shift;
my $mcport = shift;
my $mc;

if ($mchost && $mcport)
{
    $mc = new Cache::Memcached::Fast({ servers => ["$mchost:$mcport"] } );
}

#
# We write $chunksize oligos to a sort and later merge the output files into a pipe to usable_motifs.
#

my $chunksize = 10_000_000;

my $sortargs = "-S 1G";

my $file_idx = 0;
my $n_written = 0;

my $tmp_fmt = "$FIG_Config::temp/ff_oligo_tmp.$$.%05d";

my @handles;
my @files;

# my $sort_fh = FileHandle->new();
# my $tmp = sprintf $tmp_fmt, $file_idx++;
# my $child_cmd = ["sort", "-S", "1G"];
# $child_cmd = ["cat"];
# my $sort_handle = start $child_cmd, '<pipe', $sort_fh, '>', $tmp;
# push(@handles, [$sort_handle, $tmp]);
# push(@files, $tmp);

my @pids;
my $out = new FileHandle;
my $err = new FileHandle;

my $tmp = sprintf $tmp_fmt, $file_idx++;
my $sortcmd = "sort $sortargs";
my $pid = open3(\*SORT, $out, $err, "$sortcmd > $tmp");
$pid or die "cannot open3: $!";
print STDERR "Write $tmp\n";
push(@pids, [$pid, $out, $err, $tmp]);
push(@files, $tmp);


use FIG;
my $fig = new FIG;
my %fam_functions = map { $_ =~ /^(FIG\d+)\t(\S.*\S)/; $1 => $2 } `cat $famFuncF`;
my $nextF = 0;
my $x = <FAM2C>;
my %function_index;
while ($x && ($x =~ /^(\S+)/))
{
    my $fam = $1;
    my $fam_function  = $fam_functions{$fam};
    my $fam_functionI = &fam_function_index(\%function_index,\$nextF,$fam_function);
    while ($x && ($x =~ /^(\S+)\t(\S+)/) && ($1 eq $fam))
    {
	my $peg = $2;
	if (my $seq = &get_translation($fig, $peg))
	{
	    my $i;
	    my $ln = length($seq);
	    for($i=0; ($i <= ($ln - $min_sz)); $i++)
	    {
		my $oligo = uc substr($seq,$i,($i < ($ln-$max_sz)) ? $max_sz : ($ln-$i));
		if ($oligo =~ /^[ACDEFGHIKLMNPQRSTVWY]+$/)
		{
 		    print SORT $oligo . "\t" . $fam_functionI . "\t" . $fam . "\n";
# 		    $sort_handle->pump() if $n_written % 10000 == 0;
# 		    if ($n_written++ >= $chunksize)
# 		    {
# 			$sort_fh->close();
# 			undef $sort_fh;
# 			map { $_->[0]->reap_nb() } @handles;
# 			$sort_fh = FileHandle->new();
# 			my $tmp = sprintf $tmp_fmt, $file_idx++;
# 			$sort_handle = start $child_cmd, '<pipe', $sort_fh, '>', $tmp;
# 			push(@handles, [$sort_handle, $tmp]);
# 			push(@files, $tmp);
# 			$n_written = 0;
# 		    }

		    if ($n_written++ >= $chunksize)
		    {
			$n_written = 0;
			close(SORT);
			do_wait();
			my $tmp = sprintf $tmp_fmt, $file_idx++;
			my $out = new FileHandle;
			my $err = new FileHandle;
			my $pid = open3(\*SORT, $out, $err, "$sortcmd > $tmp");
			$pid or die "cannot open3: $!";
			print STDERR "Write $tmp\n";
			push(@pids, [$pid, $out, $err, $tmp]);
			push(@files, $tmp);
		    }
		}
	    }
	}
	$x = <FAM2C>;
    }
}
close(FAM2C);
close FFIDX;

close SORT;
for my $ent (@pids)
{
    my($pid, $out, $err, $file) = @$ent;
    print STDERR "Wait on $pid for $file\n";
    my $rc = waitpid($pid, 0);
    print STDERR "Wait returns $rc\n";
    close($out);
    close($err);
}


my @sort;
if (-x "/scratch/olson/coreutils/bin/sort")
{
    my $n = @files;
    $n++;
    @sort = ("/scratch/olson/coreutils/bin/sort", "--batch-size=$n", "-m", @files);
}
else
{
    @sort = ("sort", "-S", "2G", "-m", @files);
}
print STDERR "sort cmd is @sort\n";
my $rc = system(@sort);
if ($rc != 0)
{
    die "Sort failed with rc=$rc\n";
}
unlink(@files);

sub do_wait
{
    my @npids;
    for my $ent (@pids)
    {
	my($pid, $out, $err, $file) = @$ent;
	my $rc = waitpid($pid, WNOHANG);
	if ($rc == $pid)
	{
	    print STDERR "Pid $pid $file done\n";
	    close($out);
	    close($err);
	}
	else
	{
	    push(@npids, $ent);
	}
    }
    @pids = @npids;
}

sub get_translation
{
    my($fig, $peg) = @_;

    my $tr;
    if ($mc)
    {
	$tr = $mc->get("s:$peg");
    }
    if (!$tr)
    {
	$tr = $fig->get_translation($peg);
	if ($tr && $mc)
	{
	    $mc->set("s:$peg", $tr);
	}
    }
    return $tr;
}

sub fam_function_index {
    my($fam_functions,$nextFP,$fam_function) = @_;

    defined($fam_function) || confess "bad";
    my $x = $fam_functions->{$fam_function};
    if (! defined($x)) 
    {
	$x = $fam_functions->{$fam_function} = $$nextFP;
	print FFIDX "$x\t$fam_function\n";
	$$nextFP++;
    }
    return $x;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3