[Bio] / MGRASTBackend / mg_non_seed_sims.pl Repository:
ViewVC logotype

View of /MGRASTBackend/mg_non_seed_sims.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Fri Jul 24 03:48:56 2009 UTC (10 years, 3 months ago) by arodri7
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +3 -3 lines
update_errors

#
# Chunk inputs and submit a pile of runs to the cluster.
#

use strict;
use FIG;
use FIG_Config;
use File::Basename;
use GenomeMeta;
use Job48;
use JobStage;
use SGE;
use FortyEightMeta::SimDB;
use FortyEightMeta::SimStatusDB;

#
# Stage setup: validate the command line, attach to the job's stage record,
# locate the helper executables and the input FASTA, and build the list of
# similarity databases that the chunk-and-submit loop iterates over.
#

my $STAGE = "mg_non_seed_sims";

@ARGV == 1 or die "Usage: $0 job-dir\n";

my $jobdir = shift;

-d $jobdir or die "$0: job dir $jobdir does not exist\n";

#
# Until the stage object exists we can only die; once it exists, failures
# are reported through $stage->fatal.
#
my $stage = JobStage->new('Job48', $STAGE, $jobdir);
$stage or die "Cannot create job for $jobdir\n";

# The job id is the final path component of the job directory.
my $job_id = basename($jobdir);
my $job = $stage->job();

my $meta = $job->meta;

print "Running job! $jobdir\n";

$stage->set_status("in_progress");

my $sge = SGE->new;

#
# Find needed executables
#

my $chunk_exe = "$FIG_Config::bin/mg_chunk_sims";
-x $chunk_exe or $stage->fatal("Executable missing: $chunk_exe");

my $compute_exe = "$FIG_Config::bin/mg_compute_sims";
-x $compute_exe or $stage->fatal("Executable missing: $compute_exe");

#
# All per-database working directories are created beneath $jobdir/proc.
#
my $proc = "$jobdir/proc";
chdir($proc) or $stage->fatal("cannot chdir $proc: $!");

# Input sequences, as recorded in the job metadata.
my $fasta = $meta->get_metadata("preprocess.fasta_file");
($fasta and -f $fasta) or $stage->fatal("fasta not found: '$fasta'");

# SGE job ids of every array job submitted by this stage.
my @sge_ids;

# Cutoff passed to blast via -e; defaults to 0.01 when the option is unset.
my $cutoff = $stage->get_metadata("options.sim_compute_cutoff");
$cutoff = 0.01 unless defined($cutoff);

my $simdb = FortyEightMeta::SimDB->new($FIG_Config::mgrast_database_def);
$simdb or $stage->fatal("Cannot open database description file $FIG_Config::mgrast_database_def");

my $blast_opts = "-m 8 -e $cutoff";

#
# Report this failure through the stage, like every other failure after
# the stage object was created, rather than with a bare die.
#
my $status_db = FortyEightMeta::SimStatusDB->new($job_id);
$status_db or $stage->fatal("Cannot create status_db");

my @db_list = $simdb->databases();
#
# Remove databases marked with inhibit_sims=1 - these are there
# for analysis of older data that we don't want to compute sims
# for any more.
#
@db_list = grep { !$_->{inhibit_sims} } @db_list;
$stage->set_metadata("sims.database_list", \@db_list);
#
# For every file of every non-SEED database: chunk the input FASTA into
# tasks, submit one SGE array job covering those tasks, and record a
# 'not_started' status entry for each task. Afterwards the submitted SGE
# ids are saved and the stage is marked complete.
#

# Number of blast retries granted to each task (3 when not configured).
# Loop-invariant, so it is looked up once.
my $retries = $FIG_Config::mgrast_blast_retries;
$retries = 3 unless defined($retries);

for my $db (@db_list)
{
    my $name = $db->{name};
    next if ($name =~ /SEED/);
    my $version = $db->{version};
    my $files = $db->{files};
    for my $file (@$files)
    {
        # FASTA file of the database being searched against.
        my $nr = $file->{fasta};

        #
        # Compute size of seqs per batch. uspib is microseconds per
        # input byte, calibrated on the fast intel boxes. We target 15-minute runs
        # since those correspond to roughly 30-minute runs on the PPC nodes.
        #
        my $uspib = $file->{uspib};
        my $chunk_size;
        if ($uspib)
        {
            $chunk_size = int(15 * 60 / (1e-6 * $uspib));
        }
        else
        {
            # No calibration available for this file; use a fixed size.
            $chunk_size = 10000;
        }
        $stage->log("Sims db: name=$name version=$version uspib=$uspib path=$nr chunksize=$chunk_size");

        my $blastp = $file->{type} eq 'dna' ? 'blastn' : 'blastx';
        my $dir = $file->{dir};

        #
        # Per-database working directory. The database FASTA path is
        # written into it as FASTA_PATH; a failure to write that file is
        # fatal rather than silently ignored.
        #
        my $work_dir = "$proc/$dir";
        -d $work_dir or mkdir($work_dir, 0777) or $stage->fatal("Cannot mkdir $work_dir: $!");

        my $fasta_path_file = "$work_dir/FASTA_PATH";
        open(my $path_fh, ">", $fasta_path_file)
            or $stage->fatal("Cannot write $fasta_path_file: $!");
        print $path_fh "$nr\n";
        close($path_fh)
            or $stage->fatal("Error closing $fasta_path_file: $!");

        #
        # Run the chunker, echoing its output. It reports the range of
        # task numbers it created on a line of the form "tasks START END".
        # The command is a single string on purpose: the quoted blast
        # options require shell parsing.
        #
        my $cmd = "$chunk_exe -p $blastp -o '$blast_opts' -N $chunk_size -nr $nr -j $work_dir $fasta";
        open(my $chunk_fh, "-|", $cmd) or $stage->fatal("Failed pipe open: $!: $cmd");
        my($start, $end);
        while (my $line = <$chunk_fh>)
        {
            print $line;
            chomp $line;
            if ($line =~ /tasks\s+(\d+)\s+(\d+)/)
            {
                ($start, $end) = ($1, $2);
            }
        }
        # close on a pipe also fails when the child exits non-zero.
        close($chunk_fh) or $stage->fatal("Error on close: \$!=$! \$?=$?: $cmd");

        defined($start) or $stage->fatal("tasks not found");

        print "Got tasks from $start to $end\n";

        #
        # And submit.
        #
        my $jobname = "mns$file->{abbr}_$job_id";

        my @sge_args = (
            "-N $jobname",
            "-v PATH",
            "-e $jobdir/sge_output",
            "-o $jobdir/sge_output",
            "-t $start-$end",
            "-b yes",
            #
            # metagenome 48hr jobs get low priority
            #
            "-l low",
        );

        my $sge_args = join(" ", @sge_args);

        my $sge_id;

        eval {
            $sge_id = $sge->submit_job($meta, $sge_args, "$compute_exe $jobdir $work_dir");
#           print "Would submit '$sge_args' '$compute_exe' '$jobdir' '$work_dir'\n";
        };

        if ($@)
        {
            # fatal is called with just the message everywhere else in
            # this script; the stray leading $meta argument is dropped so
            # the real error text is what gets reported.
            $stage->fatal("error starting SGE job $compute_exe $jobdir: $@\n");
        }

        #
        # Initialize sim status entries.
        #
        for my $t ($start .. $end)
        {
            my $rec = {
                sim_sge_id         => $sge_id,
                abbr               => $file->{abbr},
                work_dir           => $work_dir,
                blast_retries_left => $retries,
                status             => 'not_started',
            };

            $status_db->set_task($dir, $t, $rec);
        }

        push(@sge_ids, $sge_id);
    }
}

#
# Record the SGE ids of everything submitted, then mark the stage complete
# and no longer running.
#
$stage->set_qualified_metadata("sge_ids", \@sge_ids);
$stage->set_status("complete");
$stage->set_running("no");




MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3