[Bio] / MGRASTBackend / mg_figfam_sims.pl Repository:
ViewVC logotype

View of /MGRASTBackend/mg_figfam_sims.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon Jul 6 19:40:28 2009 UTC (10 years, 4 months ago) by arodri7
Branch: MAIN
CVS Tags: HEAD
first version, figfams pipeline

#
# Chunk inputs and submit a pile of runs to the cluster for the figfam hits.
#

use strict;
use FIG;
use FIG_Config;
use File::Basename;
use GenomeMeta;
use Job48;
use JobStage;
use SGE;
use FortyEightMeta::SimDB;
use FortyEightMeta::SimStatusDB;

# Name of the pipeline stage implemented by this script; handed to JobStage.
my $STAGE = "figfam_sims";

# Exactly one command-line argument is accepted: the job directory.
unless (@ARGV == 1)
{
    die "Usage: $0 job-dir\n";
}

my $jobdir = shift(@ARGV);

unless (-d $jobdir)
{
    die "$0: job dir $jobdir does not exist\n";
}

# Stage object; the rest of the script uses it for status updates,
# metadata access and error reporting.
my $stage = JobStage->new('Job48', $STAGE, $jobdir);
die "Cannot create job for $jobdir\n" unless $stage;

# The job id is the last path component of the job directory.
my $job_id = basename($jobdir);
my $job    = $stage->job();

my $meta   = $job->meta;

print "Running job! $jobdir\n";

$stage->set_status("in_progress");

# Cluster submission handle, used further down to submit the compute job.
my $sge = SGE->new;

#
# Find needed executables
#

# Both helper programs are expected in the configured bin directory:
# the chunker (run from this script) and the per-task compute program
# (handed to SGE).
my $chunk_exe   = "$FIG_Config::bin/mg_chunk_figfam_sims";
my $compute_exe = "$FIG_Config::bin/mg_compute_figfam_sims";

for my $exe ($chunk_exe, $compute_exe)
{
    unless (-x $exe)
    {
        $stage->fatal("Executable missing: $exe");
    }
}

# Directory holding the FIGfams data, taken from the site configuration.
my $ff_data = "$FIG_Config::FigfamsData";
unless (-d $ff_data)
{
    $stage->fatal("FIGfams data missing: $ff_data");
}

# Work from the job's proc directory.
my $proc = "$jobdir/proc";
unless (chdir($proc))
{
    $stage->fatal("cannot chdir $proc: $!");
}

# The input files are looked up in the job metadata. Earlier revisions
# hard-coded them as $jobdir/proc/mg_figfams.out and
# $jobdir/proc/assigned_with_figfams.fasta.
my $ff_out = $meta->get_metadata("figfam_server.assigned_file");
my $fasta  = $meta->get_metadata("figfam_server.assigned_fasta");

unless ($fasta and -f $fasta)
{
    $stage->fatal("fasta not found: '$fasta'");
}

unless ($ff_out and -f $ff_out)
{
    $stage->fatal("figfam assigned file not found: '$ff_out'");
}

# SGE job ids of the submitted runs; stored in the stage metadata once
# submission has finished.
my @sge_ids;

# Similarity cutoff option; falls back to 0.01 when the job does not set one.
# NOTE(review): nothing in this script consumes $cutoff at present - confirm
# whether it should be handed to the chunk/compute programs or dropped.
my $cutoff = $stage->get_metadata("options.sim_compute_cutoff");
$cutoff = 0.01 unless defined($cutoff);

# Disabled: lookup of the similarity database definitions.
#my $simdb = FortyEightMeta::SimDB->new($FIG_Config::mgrast_database_def);
#$simdb or $stage->fatal("Cannot open database description file $FIG_Config::mgrast_database_def");

# Option string given to the chunker through its -o switch.
my $blast_opts = "-m 8";

# Disabled: per-task status database.
#my $status_db = FortyEightMeta::SimStatusDB->new($job_id);
#$status_db or die "Cannot create status_db";

#my @db_list = $simdb->databases();
#
# Remove databases marked with inhibit_sims=1 - these are there
# for analysis of older data that we don't want to compute sims
# for any more.
#
#@db_list = grep { !$_->{inhibit_sims} } @db_list;
#@db_list = ("ff");
#$stage->set_metadata("sims.database_list", \@db_list);
#for my $db (@db_list)
{
    #
    # Single pass for the FIGfams database. The per-database and per-file
    # loops this block was derived from are disabled, so the values that
    # used to come from the database description are fixed here.
    #
    # Steps:
    #   1. create the work directory and record the FIGfams data path in it;
    #   2. run the chunker and pick up the task range it reports;
    #   3. submit one SGE run covering that task range.
    #

    my $abbr = "ff";

    #
    # Batch size given to the chunker with -n. It used to be derived from
    # the database's "uspib" value (microseconds per input byte) to target
    # 15-minute runs; that calculation is disabled and a fixed size is used.
    #
    my $chunk_size = 10000;

    my $blastp = 'blastx';
    my $dir    = "sims.ff";

    my $path = "$proc/$dir";
    -d $path or mkdir($path, 0777) or $stage->fatal("Cannot mkdir $path: $!");

    #
    # Record the FIGfams data location in the work directory. Each step is
    # checked so a full disk or a permission problem is reported here
    # rather than leaving a missing or empty marker file behind.
    #
    my $marker = "$path/FIGFAMS_PATH";
    open(my $marker_fh, ">", $marker)
        or $stage->fatal("Cannot open $marker for writing: $!");
    print $marker_fh "$ff_data\n"
        or $stage->fatal("Cannot write $marker: $!");
    close($marker_fh)
        or $stage->fatal("Cannot close $marker: $!");

    #
    # Run the chunker. The list form of the pipe open bypasses the shell,
    # so paths containing spaces or shell metacharacters reach the program
    # intact. $cmd is only used in error messages.
    #
    my @chunk_cmd = ($chunk_exe,
                     '-f', $fasta,
                     '-p', $blastp,
                     '-o', $blast_opts,
                     '-d', $ff_data,
                     '-n', $chunk_size,
                     '-j', $path,
                     $ff_out);
    my $cmd = "$chunk_exe -f $fasta -p $blastp -o '$blast_opts' -d $ff_data -n $chunk_size -j $path $ff_out";

    open(my $chunk_fh, '-|', @chunk_cmd)
        or $stage->fatal("Failed pipe open: $!: $cmd");

    #
    # Echo the chunker's output and remember the task range from its
    # "tasks <start> <end>" line (the last such line wins).
    #
    my($start, $end);
    while (my $line = <$chunk_fh>)
    {
        print $line;
        if ($line =~ /tasks\s+(\d+)\s+(\d+)/)
        {
            ($start, $end) = ($1, $2);
        }
    }
    close($chunk_fh) or $stage->fatal("Error on close: \$!=$! \$?=$?: $cmd");

    defined($start) or $stage->fatal("tasks not found");

    print "Got tasks from $start to $end\n";

    #
    # And submit.
    #

    my $jobname = "m$abbr" . "_$job_id";

    my @sge_args;
    push(@sge_args, "-N $jobname");
    push(@sge_args, "-v PATH");
    push(@sge_args, "-e $jobdir/sge_output");
    push(@sge_args, "-o $jobdir/sge_output");
    push(@sge_args, "-t $start-$end");
    push(@sge_args, "-b yes");
    #
    # metagenome 48hr jobs get low priority
    #
    push(@sge_args, "-l low");

    my $sge_args = join(" ", @sge_args);

    my $sge_id;

    eval {
        $sge_id = $sge->submit_job($meta, $sge_args, "$compute_exe $jobdir $path");
    };

    if ($@)
    {
        #
        # fatal() is called with a single message everywhere else in this
        # script; the extra leading $meta argument that used to be passed
        # here has been dropped so the error text is what gets reported.
        #
        $stage->fatal("error starting SGE job $compute_exe $jobdir: $@\n");
    }

    #
    # Per-task sim status records (sim_sge_id, abbr, work_dir,
    # blast_retries_left starting at $FIG_Config::mgrast_blast_retries or 3,
    # status 'not_started') used to be written through
    # $status_db->set_task($dir, $t, $rec) for every task in $start .. $end.
    # The status database is disabled, so the loop that built and then
    # discarded those records has been removed.
    #

    push(@sge_ids, $sge_id);
}
    
# Save the submitted SGE job ids in the stage metadata, then flag the
# stage as complete and no longer running.
$stage->set_qualified_metadata('sge_ids', \@sge_ids);
$stage->set_status('complete');
$stage->set_running('no');




MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3