[Bio] / FortyEightMeta / recover_broken_job.pl Repository:
ViewVC logotype

View of /FortyEightMeta/recover_broken_job.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Mon Sep 1 19:51:41 2008 UTC (11 years, 4 months ago) by redwards
Branch: MAIN
CVS Tags: mgrast_rel_2008_0923, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_rel_2008_0919, mgrast_rel_2008_1110, mgrast_rel_2008_0917
Changes since 1.4: +8 -0 lines
adding a tag to meta xml and allowing the dot directory

#
# Find the sim chunks of the job that are bad; construct a new task.list for them
# and submit an array job to finish.
#

use strict;
use FIG;
use FIG_Config;
use File::Basename;
use GenomeMeta;
use Job48;
use SGE;
use Carp;

# Name of the pipeline stage whose status this script updates in the job metadata.
my $STAGE = "sims";

# Exactly one argument is accepted: the job directory, or a shorthand for it.
die "Usage: $0 job-dir\n" unless @ARGV == 1;

my $jobdir = shift @ARGV;

# A purely numeric argument is taken as a job number under the production tree.
$jobdir = "/vol/metagenome-48-hour/Jobs.prod/$jobdir" if $jobdir =~ /^\d+$/;

# "." is replaced by whatever pwd reports, so the paths built from $jobdir
# below do not start with ".".
if ($jobdir eq ".")
{
    chomp($jobdir = `pwd`);
}

die "$0: job dir $jobdir does not exist\n" unless -d $jobdir;

# The last path component of the job directory doubles as the job id.
my $job_id = basename($jobdir);
my $job    = Job48->new($jobdir);

my $meta = $job->meta;

print "Running job! $jobdir\n";

# Flag the stage as running again and note when this recovery was started.
$job->meta->set_metadata("status.$STAGE", "in_progress");
$job->meta->set_metadata("recover_broken_job", "started at " . scalar(localtime));


# Handle used for every job submission made by this script.
my $sge = SGE->new;

#
# Find needed executables
#
# All three helper programs are looked up in $FIG_Config::bin; a missing or
# non-executable one aborts the run through fatal().
#

my $compute_exe      = "$FIG_Config::bin/mg_compute_sims";
my $rdb_postproc_exe = "$FIG_Config::bin/mg_postproc_taxa_sims";
my $sims_done_exe    = "$FIG_Config::bin/mg_sims_done";

for my $exe ($compute_exe, $rdb_postproc_exe, $sims_done_exe)
{
    fatal("Executable missing: $exe") unless -x $exe;
}

#
# Set of sims jobs to run.
#

# Option string appended to every entry of the table below.
my $opts = "-m 8 -e 0.1";

# One entry per sims database:
#   [ sims subdirectory under proc/, a count (unpacked as $nseqs by the loop
#     that walks this table), database path, blast program, options ]
my @jobs = map { [ @$_, $opts ] } (
    [ "sims.gg",   10000, "/vol/metagenome-48-hour/Data/greengenes.fasta", "blastn" ],
    [ "sims.lsu", 450000, "/vol/metagenome-48-hour/Data/lsu.fa", "blastn" ],
    [ "sims.ssu",  45000, "/vol/metagenome-48-hour/Data/ssu.fa", "blastn" ],
    [ "sims.16s",  10000, "/vol/metagenome-48-hour/Data/16s.fa", "blastn" ],
#   [ "sims.seed",   500, "/home/redwards/databases/SEED_2006_07_01", "blastx" ],
    [ "sims.seed",   500, "/scratch/metagenome-48-hour/Data/SEED_2006_07_01", "blastx" ],
);

# All further work happens inside the job's proc directory.
my $proc = "$jobdir/proc";
unless (chdir $proc)
{
    fatal("cannot chdir $proc: $!");
}

# The fasta file recorded in the metadata must still be present on disk.
my $fasta = $meta->get_metadata("preprocess.fasta_file");
unless ($fasta and -f $fasta)
{
    fatal("fasta not found: '$fasta'");
}

# Ids of the jobs submitted by this script, collected as they are submitted.
my @sge_ids;
my @short_ids;

#
# For every sims database: scan the original task.list, collect the tasks
# whose error file is missing, contains FATAL, or never reports SUCCESS, write
# them to <sims dir>/sims.repair/task.list, and submit that list as an SGE
# array job. The id returned by each submission is appended to @sge_ids.
#
for my $simj (@jobs)
{
    # Only the directory name is needed here; the other fields of the job
    # description are not used by the repair pass.
    my ($dir) = @$simj;

    my $path = "$proc/$dir";
    -d $path or fatal("Sims directory $path does not exist\n");

    my $repair = "$path/sims.repair";
    -d $repair or mkdir($repair) or die "cannot mkdir $repair: $!";

    my $repair_err = "$path/sims.repair/err";
    -d $repair_err or mkdir($repair_err) or die "cannot mkdir $repair_err: $!";

    # Three-argument open with lexical handles: the file names are never
    # interpreted as open modes, and the handles cannot leak between
    # iterations of this loop.
    open(my $task_fh, "<", "$path/task.list")
        or die "Cannot open $path/task.list: $!";
    open(my $repair_fh, ">", "$repair/task.list")
        or die "Cannot open $repair/task.list: $!";

    my $newtask = 0;
    while (my $tline = <$task_fh>)
    {
        chomp $tline;
        my ($task, $in, $nr, $flags, $out, $err) = split(/\t/, $tline);

        my $fatal   = 0;
        my $success = 0;

        if (defined($err) and -e $err)
        {
            open(my $err_fh, "<", $err) or die "cannot open err file $err: $!";

            while (my $eline = <$err_fh>)
            {
                if ($eline =~ /FATAL/)
                {
                    # One FATAL marker is enough to condemn the task.
                    $fatal++;
                    last;
                }
                elsif ($eline =~ /SUCCESS/)
                {
                    $success++;
                }
            }
            close($err_fh);
        }
        else
        {
            # No error file at all: the task is treated as failed.
            $fatal++;
        }

        if ($fatal or !$success)
        {
            # Repair tasks are renumbered from 1 so they match the -t range
            # used below; the error file keeps the original task number.
            $newtask++;
            print $repair_fh join("\t", $newtask, $in, $nr, $flags, $out,
                                  "$repair_err/err.$task"), "\n";
        }
    }

    close($task_fh);

    # Buffered write errors only surface at close; do not submit a job
    # against a task list that may be truncated.
    close($repair_fh) or die "Cannot close $repair/task.list: $!";

    next unless $newtask > 0;

    print "Have $newtask new tasks in $repair/task.list\n";

    #
    # And submit.
    #

    # Job name: "m" + database name without the "sims." prefix + job id.
    my $n = $dir;
    $n =~ s/^sims\.//;
    $n = "m${n}$job_id";

    my @sge_args = (
        "-N $n",
        "-v PATH",
        "-e $jobdir/sge_output",
        "-o $jobdir/sge_output",
        "-t 1-$newtask",
        "-b yes",
        #
        # repair metagenome 48hr jobs get high priority
        #
        "-l high",
    );

    my $sge_args = join(" ", @sge_args);

    my $sge_id;

    eval {
        $sge_id = $sge->submit_job($meta, $sge_args, "$compute_exe $jobdir $repair");
    };

    if ($@)
    {
        # fatal() takes the message as its first argument; passing $meta
        # first would log the object instead of the error text.
        fatal("error starting SGE job $compute_exe $jobdir: $@\n");
    }

    push(@sge_ids, $sge_id);
}

#
# Schedule a postprocessing job that just marks the sims stage complete.
#

# Submit the job that runs $sims_done_exe on the job directory, held until
# every repair job submitted above has finished, then record all submitted
# job ids in the metadata under "$STAGE.sge_ids".
my @sge_args;

push(@sge_args, "-N sd$job_id");
push(@sge_args, "-v PATH");
push(@sge_args, "-e $jobdir/sge_output");
push(@sge_args, "-o $jobdir/sge_output");
push(@sge_args, "-b yes");
push(@sge_args, "-l high");
#
# Hold on the sims jobs.
#
# When nothing needed repair there are no ids to hold on. A bare "-hold_jid "
# with an empty list would leave the command path sitting where the hold list
# belongs, so the option is only added when there is at least one usable id.
#
my @hold_ids = grep { defined($_) and length($_) } @sge_ids;
push(@sge_args, "-hold_jid " . join(",", @hold_ids)) if @hold_ids;

my $sge_args = join(" ", @sge_args);

my $sge_id;

eval {
    $sge_id = $sge->submit_job($meta, $sge_args, "$sims_done_exe $jobdir");
};

if ($@)
{
    # fatal() takes the message as its first argument; passing $meta first
    # would log the object instead of the error text.
    fatal("error starting SGE job $sims_done_exe $jobdir: $@\n");
}

push(@sge_ids, $sge_id);

$meta->set_metadata("$STAGE.sge_ids", \@sge_ids);


#
# Echo a command line, execute it with system(), and abort through fatal()
# when system() reports a non-zero status.
#
#   @cmd - the command and its arguments, passed to system() unchanged.
#
sub run
{
    my @cmd = @_;

    print "Run @cmd\n";

    my $rc = system(@cmd);

    $rc != 0 and fatal("Failed with rc=$rc: @cmd");
}

#
# Record a fatal error in the job metadata and abort the script.
#
# Usage:
#   fatal($msg)
#   fatal($meta_obj, $msg)
#
# Some call sites in this script pass the metadata object ahead of the
# message. Taking only the first argument as the message would log the object
# and lose the real error text, so a leading reference followed by at least
# one more argument is treated as the metadata object to log against. With a
# single argument the file-level $meta is used.
#
# Side effects: adds a 'fatal error' log entry and sets "status.$STAGE" to
# "error" on the metadata object.
#
# Does not return: croaks with "$0: $msg".
#
sub fatal
{
    my @args = @_;

    my $log_meta = $meta;
    if (@args > 1 and ref $args[0])
    {
        $log_meta = shift @args;
    }

    my ($msg) = @args;

    $log_meta->add_log_entry($0, ['fatal error', $msg]);
    $log_meta->set_metadata("status.$STAGE", "error");

    croak "$0: $msg";
}
    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3