[Bio] / FortyEightMeta / mg_upgrade_sims.pl Repository:
ViewVC logotype

View of /FortyEightMeta/mg_upgrade_sims.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Sat Jan 3 23:33:06 2009 UTC (11 years, 2 months ago) by redwards
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
two parts of the code for upgrading from an old version of mg-rast to a newer version - the preprocess and the sims

#
# Chunk inputs and submit a pile of runs to the cluster.
#

use strict;
use FIG;
use FIG_Config;
use File::Basename;
use GenomeMeta;
use Job48;
use JobStage;
use SGE;
use FortyEightMeta::SimDB;
use FortyEightMeta::SimStatusDB;

=pod

=head1 mg_sims_upgrade

Upgrade sims from one version of mg-rast to another. The way that we will do this is to have two inputs, a from setting and a to setting. 


=cut



my $STAGE = "sims";

#@ARGV == 1 or die "Usage: $0 job-dir\n";

@ARGV == 3 or die "Usage: $0 job-dir from_version to_version\n";

my ($jobdir, $from, $to) = @ARGV;

-d $jobdir or die "$0: job dir $jobdir does not exist\n";

my $stage = new JobStage('Job48', $STAGE, $jobdir);
$stage or die "Cannot create job for $jobdir\n";

my $job_id = basename($jobdir);
my $job = $stage->job();

my $meta = $job->meta;

print "Running job! $jobdir\n";

$stage->set_status("in_progress");

my $sge = new SGE;

#
# Find needed executables
#

my $chunk_exe = "$FIG_Config::bin/mg_chunk_sims";
-x $chunk_exe or $stage->fatal("Executable missing: $chunk_exe");

my $compute_exe = "$FIG_Config::bin/mg_compute_sims";
-x $compute_exe or $stage->fatal("Executable missing: $compute_exe");

my $proc = "$jobdir/proc";
chdir($proc) or $stage->fatal("cannot chdir $proc: $!");

my $fasta = $meta->get_metadata("preprocess.fasta_file");
($fasta and -f $fasta) or $stage->fatal("fasta not found: '$fasta'");

my @sge_ids;
my @short_ids;

my $cutoff = $stage->get_metadata("options.sim_compute_cutoff");
$cutoff = 0.01 unless defined($cutoff);

my $simdb = FortyEightMeta::SimDB->new($FIG_Config::mgrast_database_def);
$simdb or $stage->fatal("Cannot open database description file $FIG_Config::mgrast_database_def");

my $blast_opts = "-m 8 -e $cutoff";
my @jobs;

my $status_db = FortyEightMeta::SimStatusDB->new($job_id);
$status_db or die "Cannot create status_db";

#
# Get a list of databases for the upgrade analysis, and then append the databases that
# are not upgradeable (e.g 16S's)

my @db_list = $simdb->db_files_for_upgrade("SEED_Upgrade", $from, $to);
push @db_list, grep { !$_->{inhibit_sims} } grep { !$_->{upgrade_database} } $simdb->databases();



unless (@db_list) {die "Can't find a database to go from $from to $to\n"}

$stage->set_metadata("sims.database_list", \@db_list);
for my $db (@db_list)
{
    my $name = $db->{name};
    my $version = $db->{version};
    my $files = $db->{files};
    if ( $db->{size_of_new} )
    {
	    my $effective_db_size = $db->{size_of_new};
	    $blast_opts .= " -z $effective_db_size";
    }
    
    for my $file (@$files)
    {
	my $path = $file->{fasta};
	#
	# Compute size of seqs per batch. uspib is microseconds per
	# input byte, calibrated on the fast intel boxes. We target 15-minute runs
	# since those correspond to roughly 30-minute runs on the PPC nodes.
	#
	
	my $uspib = $file->{uspib};
	my $chunk_size;
	if ($uspib)
	{
	    $chunk_size = int(15 * 60 / ( 1e-6 * $uspib));
	}
	else
	{
	    $chunk_size = 10000;
	}
	$stage->log("Sims db: name=$name version=$version uspib=$uspib path=$path chunksize=$chunk_size");

	my $blastp = $file->{type} eq 'dna' ? 'blastn' : 'blastx';
	my $dir = $file->{dir};
	my $nr = $file->{fasta};
	
	my $path = "$proc/$dir";
	-d $path or mkdir ($path, 0777) or $stage->fatal("Cannot mkdir $path: $!");
	open(F, ">", "$path/FASTA_PATH");
	print F "$nr\n";
	close(F);

	my $cmd = "$chunk_exe -p $blastp -o '$blast_opts' -N $chunk_size -nr $nr -j $path $fasta";
	open(P, "$cmd |") or $stage->fatal("Failed pipe open: $!: $cmd");
	my($start, $end);
	while (<P>)
	{
	    print;
	    chomp;
	    if (/tasks\s+(\d+)\s+(\d+)/)
	    {
		($start, $end) = ($1, $2);
	    }
	}
	close(P) or $stage->fatal("Error on close: \$!=$! \$?=$?: $cmd");
	
	defined($start) or $stage->fatal("tasks not found");
	
	print "Got tasks from $start to $end\n";

	#
	# And submit.
	#
	
 	my @sge_args;
	my $jobname = "m$file->{abbr}_$job_id";
	
	push(@sge_args, "-N $jobname");
	push(@sge_args, "-v PATH");
	push(@sge_args, "-e $jobdir/sge_output");
	push(@sge_args, "-o $jobdir/sge_output");
	push(@sge_args, "-t $start-$end");
	push(@sge_args, "-b yes");
	#
	# metagenome 48hr jobs get low priority
	#
	push(@sge_args, "-l low");

	my $sge_args = join(" ", @sge_args);
	
	my $sge_id;
	
	eval {
## DELETE THIS LINE (and uncomment the next one)
	  $sge_id = $sge->submit_job($meta, $sge_args, "$compute_exe $jobdir $path");
	  print "Would submit '$sge_args' '$compute_exe' '$jobdir' '$path'\n";
	};
	
	if ($@)
	{
	    $stage->fatal($meta, "error starting SGE job $compute_exe $jobdir: $@\n");
	}

	#
	# Initialize sim status entries.
	#
	my $retries = $FIG_Config::mgrast_blast_retries;
	$retries = 3 unless defined($retries);
	for my $t ($start .. $end)
	{
	    my $rec = {};
	    $rec->{sim_sge_id} = $sge_id;
	    $rec->{abbr} = $file->{abbr};
	    $rec->{work_dir} = $path;
	    $rec->{blast_retries_left} = $retries;
	    $rec->{status} = 'not_started';

## DELETE THIS LINE (and uncomment the next one)
	    $status_db->set_task($dir, $t, $rec);
	}    

	push(@sge_ids, $sge_id);

	# If this is an upgradeable job we need to copy the files over
	# only upgradeable jobs have old sizes and new sizes, and we need these to recalculate the E values
	if (  $db->{size_of_new} && $db->{size_of_old} ) {&copy_sims_files($db, $fasta, $end, $path, $dir)}
   }
}




##DELETE THIS LINE
#exit(0);
$stage->set_qualified_metadata("sge_ids", \@sge_ids);
$stage->set_status("complete");
$stage->set_running("no");






sub copy_sims_files {
	my ($db, $fasta, $end, $path, $dir)=@_;
	## A block to copy the old sims to the new directory. We are only going to do this for
	# a subset of the sims - we will not bother with small databases like 16S that we
	# can just recompute in a few minutes.
	
	# now we've got this sims queued, we should tack the old ids on the end of the task.list file
	# 
	# The process we're doing is finding the sims files from last time, and copying them
	# but altering the E value based on the ratio of the sizes of the two databases. We are
	# also going to copy the error files, but in this case, we're going to add a new line to the
	# error file that says that we copied it, just in case something goes wrong we might figure
	# out where the error file came from.
	#
	# Note that the mg_upgrade_preprocess stores the upgrade_source and original_preprocess.count_proc.file
	# in the meta.xml so we can get them here.
	#
	
	
	# database size, start of new file, and data file locations 
	
	print STDERR "Copying sims from ", $db->{name}, "\n";
	unless ($db->{size_of_old}) 
	{
		print STDERR "Size of old database is not defined. Skipping\n";
		return;
	}
	
	my $factor = $db->{size_of_new}/$db->{size_of_old};
	my $dest_file_number   = $end+1;
	my $sims_sourcedir  = $FIG_Config::mgrast_jobs."/".$meta->get_metadata("upgrade_source") . 
			"/proc/".$dir."/sims.raw/".$meta->get_metadata("original_preprocess.count_proc.file");
	my $err_sourcedir        = $FIG_Config::mgrast_jobs."/".$meta->get_metadata("upgrade_source") .
			"/proc/".$dir."/sims.err/".$meta->get_metadata("original_preprocess.count_proc.file");

	my $sims_destdir = "$path/sims.raw/".basename($fasta);
	my $err_destdir = "$path/sims.err/".basename($fasta);
	
	unless (-d $sims_sourcedir) {die "Can't find the directory for the sims: $sims_sourcedir"}
	opendir(DIR, "$sims_sourcedir") || die "Can't open $sims_sourcedir";
	foreach my $inf (sort {$a cmp $b} grep {$_ !~ /^\./} readdir(DIR)) 
	{
		my $rewritef = $inf;
		$rewritef =~ s/(\d+)$/$dest_file_number/;
		my $fromerr="err.$1"; # the error file that we need to grab later

		# copy from inf to rewritef
		if (-e "$sims_destdir/$rewritef") {die "We were trying to rewrite to $sims_destdir/$rewritef, but it already exists."}
		else {open(OUT, ">$sims_destdir/$rewritef") || die "We can't open $sims_destdir/$rewritef";}
		open(IN, "$sims_sourcedir/$inf") || die "can't open $sims_sourcedir/$inf";
		while (<IN>)
		{
			my @l=split /\t/;
			unless ($#l>=11 && $#l<=15) # note it should be 11, but we might add protocol and protein length information
			{
				print STDERR "In $sims_sourcedir/$inf we got questionable BLAST output: we only got $#l lines from $_";
				print OUT;
				next;
			}
			$l[10] = sprintf("%.1g",  ($l[10] * $factor)); # adjust the evalue
			print OUT join("\t", @l);
		}
		close IN;
		close OUT;


		# copy the error files across. But we're also going to add a line.
		# Note that we will store the original name here
		open(IN, "$err_sourcedir/$fromerr") || die "Can't open $err_sourcedir/$fromerr";
		open(OUT, ">$err_destdir/err.$dest_file_number") || die "Can't open $err_destdir/err.$dest_file_number";
		my $timestamp = scalar(localtime(time));
		print OUT <<EOF;
This error file was copied during a metagenome upgrade from $from to $to.
The original error file was $err_sourcedir/$fromerr
The original sims output file that it relates to is $sims_sourcedir/$inf
Copying occured on $timestamp
EOF
		print OUT while (<IN>);
		close IN;
		close OUT;
		

		# now we just create a fake status for the database, so that mg-load-sims will start them
		# loading when it runs. 
		my $rec = {};
		$rec->{abbr} = "u";
		$rec->{work_dir} = $path;
		$rec->{blast_retries_left} = 0;
		$rec->{status} = 'not_started';

## DELETE THIS LINE (and uncomment the next one)
	    $status_db->set_task($dir, $dest_file_number, $rec);

		$dest_file_number++;
	}
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3