[Bio] / FigKernelScripts / compute_figfam_assignments.pl Repository:
ViewVC logotype

View of /FigKernelScripts/compute_figfam_assignments.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Wed Sep 15 21:48:45 2010 UTC (9 years, 2 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.1: +25 -7 lines
Clean up debugging, write concatenated output file.

use strict;
use FFs;
use FIG;
use FIGV;
use Data::Dumper;
use SeedAware;
use FileHandle;

=head1 compute_figfam_assignments

     compute_figfam_assignments ff-dir [genome-dir] < peg-list > assignments

Compute the figfam assignments for the given set of pegs. If a genome
directory is provided, use that directory as the source of function
and sequence data. Otherwise assume (require) that the pegs in the
list be present in the SEED we are operating in.

=cut

my $usage = "Usage: compute_figfam_assignments ff_dir org_dir < input > output";

@ARGV == 1 or @ARGV == 2 or die "$usage\n";

my $ff_dir = shift;
my $genome_dir = shift;

-d $ff_dir or die "Figfam dir $ff_dir does not exist\n";

my $fig;
if (defined($genome_dir))
{
    -d $genome_dir or die "Genome directory $genome_dir does not exist\n";
    $fig = FIGV->new($genome_dir);
}
else
{
    $fig = FIG->new();
}

my $nprocs = 8;
my @work;
push(@work, []) for 1..$nprocs;
my $next_work = 0;

my @ids;

while (<STDIN>)
{
    chomp;
    my($id) = split(/\t/);
    push(@ids, $id);
}

my $fns = $fig->function_of_bulk(\@ids);

for my $id (@ids)
{
    my $prot = $fig->get_translation($id);
    my $w = $work[$next_work];
    $next_work = ($next_work + 1) % $nprocs;
    push(@$w, [$id, $fns->{$id}, $prot]);
}

undef $fig;

my $tmpdir = SeedAware::location_of_tmp();

my @subproc_work;
for my $i (0..$#work)
{
    my $out_file = "$tmpdir/out.$$.$i";
    push(@subproc_work, [$i, $out_file, $work[$i]]);
}

my @procs;
for my $i (0..$#work)
{
    my $pid = fork;
    if ($pid == 0)
    {
	process_work($subproc_work[$i]);
	exit;
    }
    print STDERR "Forked $pid for work $i\n";
    $procs[$i] = $pid;
}
print STDERR "Waiting\n";

for my $pid (@procs)
{
    my $n = waitpid($pid, 0);
    my $rc = $?;
    print STDERR "Process $n finishes with $rc\n";
}
print STDERR "All done\n";

for my $ent (@subproc_work)
{
    my($n, $file, $list) = @$ent;
    if (open(my $fh, "<", $file))
    {
	while (<$fh>)
	{
	    print $_;
	}
	close($fh);
	unlink($file);
    }
    else
    {
	die "Could not open intermediate file $file: $!";
    }
}

sub process_work
{
    my($work_def) = @_;

    my $ffs = FFs->new($ff_dir);

    my($id, $file, $work) = @$work_def;
    print STDERR "Process $id $file in $$\n";

    my $out;
    open($out, ">", $file) or die "Cannot write $out: $!";
    $out->autoflush(1);
    for my $ent (@$work)
    {
	my($fid, $func, $seq) = @$ent;
	my($fam, $sims) = $ffs->place_seq_and_function_in_family($seq, $func);
	print $out "$fid\t$fam\t$func\t$ff_dir\n";
    }
    close($out);
}
    


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3