[Bio] / FortyEightMeta / mg_port_job.pl Repository:
ViewVC logotype

View of /FortyEightMeta/mg_port_job.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Wed Jun 25 16:57:03 2008 UTC (11 years, 7 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
Changes since 1.1: +1 -1 lines
die if job already loaded

#
# Port a mgrast version 1 job to mgrast version 2.
#
# We assume the job directory has been copied verbatim into the v2 job directory.
#
# The following tasks are performed:
#
# Database tables created (as in mg_preprocess) and metadata updated to note their
# names.
#
# Taxonomy database list is scanned. If a sims directory in the proc directory
# exists that matches a taxonomy database, the sims are loaded into the
# database tables. We use the <simsdir>.raw file (the sims directory path with
# ".raw" appended) that contains all sims.
#
# We special-case sims.seed in order to point the analysis at the SEED
# database version "OLD".
#

use strict;
use Data::Dumper;
use FortyEightMeta::SimDB;
use FortyEightMeta::MGDB;
use JobStage;
use File::Basename;
use DBrtns;
    
# Stage name handed to the JobStage constructor below.
my $STAGE = "port_job";

# Exactly one argument is accepted: the job directory to port.
unless (@ARGV == 1)
{
    die "Usage: $0 job-dir\n";
}

my $jobdir = shift @ARGV;

unless (-d $jobdir)
{
    die "$0: job dir $jobdir does not exist\n";
}

# The stage object supplies the job and metadata handles used by the rest
# of the script.
my $stage = JobStage->new('Job48', $STAGE, $jobdir);
die "Cannot create job for $jobdir\n" unless $stage;

# The job id is the last path component of the job directory.
my $job_id = basename($jobdir);
my $job    = $stage->job();
my $meta   = $stage->meta();

my $sim_db = FortyEightMeta::SimDB->new();

# Connect to the mgrast database. A constructor that dies and a constructor
# that returns a false value are both reported through the stage.
my $mgdb = eval {
    DBrtns->new($FIG_Config::mgrast_dbms,   $FIG_Config::mgrast_db,
                $FIG_Config::mgrast_dbuser, $FIG_Config::mgrast_dbpass,
                $FIG_Config::mgrast_dbport, $FIG_Config::mgrast_dbhost,
                $FIG_Config::mgrast_dbsock);
};
$stage->fatal("cannot connect to database: $@") if $@;

unless ($mgdb)
{
    $stage->fatal("Cannot open connection to database");
}

#
# NR file translation table, keyed by the path an old job recorded and
# giving the path where that data lives now. The data was moved after the
# old jobs were run, so their task lists still name the original locations.
#
my %nr_trans = (
    '/vol/metagenome-48-hour/Data/greengenes.fasta'
        => '/vol/mg-rast-test/Data/db/greengenes/1/greengenes.fasta',
    '/vol/metagenome-48-hour/Data/lsu.fa'
        => '/vol/mg-rast-test/Data/db/euro_rrna/1/lsu.fa',
    '/vol/metagenome-48-hour/Data/ssu.fa'
        => '/vol/mg-rast-test/Data/db/euro_rrna/1/ssu.fa',
    '/scratch/metagenome-48-hour/Data/SEED_2006_07_01'
        => '/vol/48-hour/Data/nr',
    '/vol/metagenome-48-hour/Data/16s.fa'
        => '/vol/mg-rast-test/Data/db/rdp/9.27/16s.fa',
);

#
# A job whose metadata already records a sims table name has been ported
# before; stop rather than port it a second time.
#
die "Job is already ported\n"
    if $meta->get_metadata('db.table_name') ne '';

#
# Locate the job's fasta file and scan it for the longest sequence id.
#
# The path recorded in the metadata may point at the job's original
# location. When the file is missing and the directory above its proc/
# component is gone too, the same proc/<name>.fa path is looked up under the
# current job directory, and the metadata is updated (the old value is kept
# under preprocess.fasta_file_org).
#
my $fasta = $meta->get_metadata("preprocess.fasta_file");

unless (-f $fasta)
{
    my ($base, $rel) = $fasta =~ m,^(.*)/(proc/[^/]+\.fa)$,
        or die "Fasta $fasta not found, cannot compute relocation\n";

    unless (-d $base)
    {
        my $nfasta = "$jobdir/$rel";
        -f $nfasta
            or die "Original fasta location not found, relocation to $nfasta not found either\n";

        print "Updating fasta location from $fasta to $nfasta\n";
        $meta->set_metadata("preprocess.fasta_file_org", $fasta);
        $meta->set_metadata("preprocess.fasta_file", $nfasta);
        $fasta = $nfasta;
    }
}

open(my $fasta_fh, "<", $fasta) or die "Cannot open fasta file $fasta: $!";

# Stays at -1 when the file contains no ">" header lines.
my $max_id_len = -1;
while (my $line = <$fasta_fh>)
{
    # Only header lines carry an id: the non-blank run right after ">".
    next unless $line =~ /^>(\S+)/;

    my $id_len = length($1);
    $max_id_len = $id_len if $id_len > $max_id_len;
}
close($fasta_fh);
print "got max $max_id_len from fasta $fasta\n";

# Create the sims tables for this job, sized by the longest sequence id, and
# record the three returned table names in the job metadata.
my ($table_name, $best_iden_name, $best_psc_name) =
    FortyEightMeta::MGDB::create_sims_db($mgdb, $job_id, $max_id_len);

my @table_metadata = (
    [ "db.table_name"                => $table_name ],
    [ "db.best_by_iden_table_name"   => $best_iden_name ],
    [ "db.best_by_psc_table_name"    => $best_psc_name ],
);
for my $pair (@table_metadata)
{
    my ($key, $value) = @$pair;
    $stage->set_metadata($key, $value);
}

#
# Determine list of computed sims dirs in the job. These are the directories
# under jobdir/proc that have a task.list file.
#
# For each one, the NR fasta path is read from the third tab-separated field
# of the first task.list line and looked up in the sims database catalogue,
# first as recorded and then, if that fails, through %nr_trans. One entry
# per directory is pushed onto @dirs:
#
#   [ sim_path, sim_dir, sims_file, nr, translated_nr, nr_used,
#     db_name, db_version, tax_files ]
#
# db_name is left undefined when no database could be matched. %db_spec maps
# each matched database name to its version.
#

my @dirs;
my %db_spec;
for my $tl (glob("$jobdir/proc/*/task.list"))
{
    next unless $tl =~ m,^(.*/proc/([^/]+))/task\.list$,;
    my ($sim_path, $sim_dir) = ($1, $2);

    #
    # Open it up to find the NR used. Only the first line is consulted.
    #
    open(my $tl_fh, "<", $tl) or die "Cannot open $tl: $!";
    my $first_line = <$tl_fh>;
    close($tl_fh);

    my $nr;
    if (defined $first_line)
    {
        # Strip the newline so that a line with exactly three fields does
        # not leave it glued to the NR path.
        chomp $first_line;
        $nr = (split(/\t/, $first_line))[2];
    }

    # An empty task list, or one with fewer than three fields, has no NR.
    my $have_nr = defined($nr) && $nr ne '';
    my $trf     = $have_nr ? $nr_trans{$nr} : undef;
    my $nr_used = $nr;

    my ($db_name, $db_version, $tax_files);
    if ($have_nr)
    {
        ($db_name, $db_version, $tax_files) = $sim_db->db_files_for_fasta_file($nr);
    }

    # Fall back to the relocated path only when one is known; otherwise
    # $nr_used keeps the path recorded by the job instead of becoming undef.
    if (!$db_name && defined($trf))
    {
        ($db_name, $db_version, $tax_files) = $sim_db->db_files_for_fasta_file($trf);
        $nr_used = $trf;
    }

    $db_spec{$db_name} = $db_version if $db_name;

    #
    # Find the sims file: the sims directory path with ".raw" appended.
    #
    my $sims = "$sim_path.raw";
    -f $sims or die "Sims not found at $sims\n";

    push(@dirs, [$sim_path, $sim_dir, $sims, $nr, $trf, $nr_used, $db_name, $db_version, $tax_files]);
}

my @databases = $sim_db->databases(\%db_spec);

$meta->set_metadata('sims.database_list', \@databases);

#
# We now have all the information that we should need to port the job.
#
# We don't actually need the db info we looked up, since mg_load_sims_file
# does the same lookup, but it verifies that we were able to find the
# data. Entries for which no database was found are skipped with a warning
# and are not passed to the loader.
#

for my $ent (@dirs)
{
    # Entry layout: [sim_path, sim_dir, sims_file, nr, translated_nr,
    # nr_used, db_name, db_version, tax_files]; only four fields are needed.
    my ($sim_path, $sim_file, $nr_used, $db_name) = @{$ent}[0, 2, 5, 6];

    if (!$db_name)
    {
        warn "Skipping $sim_path - no database found\n";
        next;
    }

    # List-form system: the arguments are passed without going through a shell.
    my @cmd = ("$FIG_Config::bin/mg_load_sims_file",
               $nr_used, $sim_file,
               $table_name, $best_iden_name, $best_psc_name);
    print "Run: @cmd\n";

    my $rc = system(@cmd);
    if ($rc == -1)
    {
        # -1 means the loader could not be started at all; $! says why.
        die "Error loading $sim_file, rc=$rc (cannot run $cmd[0]: $!)\n";
    }
    elsif ($rc != 0)
    {
        die "Error loading $sim_file, rc=$rc\n";
    }

    print "Success loading $sim_file\n";
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3