[Bio] / FortyEightMeta / mg_postproc_taxa_sims.pl Repository:
ViewVC logotype

View of /FortyEightMeta/mg_postproc_taxa_sims.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.13 - (download) (as text) (annotate)
Fri May 30 23:23:51 2008 UTC (12 years ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
Changes since 1.12: +37 -45 lines
Development checkin of new metagenomics RAST pipeline.

#
# Postprocess the metagenome LSU/SSU/Greengene sims runs.
#

use DB_File;
use Data::Dumper;

use JobStage;
use strict;
use FIG;
use FIGV;
use FIG_Config;
use File::Basename;
use File::Copy;
use GenomeMeta;
use Carp 'croak';
use FortyEightMeta::ExpandSims;

my $STAGE = "sims_postprocess";

@ARGV == 1 or die "Usage: $0 job-dir\n";

my $jobdir = shift;

my $stage = new JobStage('Job48', $STAGE, $jobdir);
$stage or die "Cannot create job for $jobdir\n";

my $job_id = basename($jobdir);
my $job = $stage->job();

my $meta = $job->meta;

print "Running job! $jobdir\n";

my $blast2gff_exe = "$FIG_Config::bin/blast2gff";
-x $blast2gff_exe or $stage->fatal("Executable $blast2gff_exe missing");

my $correct_contigs_exe = "$FIG_Config::bin/correct_gff_missing_contigs";
-x $correct_contigs_exe or $stage->fatal("Executable $correct_contigs_exe missing");

my $gff2seed_exe = "$FIG_Config::bin/gff2seed";
-x $gff2seed_exe or $stage->fatal("Executable $gff2seed_exe missing");

my $get_ss_conns_exe = "$FIG_Config::bin/get_ss_connections";
-x $get_ss_conns_exe or $stage->fatal("Executable $get_ss_conns_exe missing");

$stage->set_status("in_progress");

&process_rdp();
&create_seed_gff();
&install_seed_gff();

$meta->set_metadata("status.sims_postprocess", "complete");
$meta->set_metadata("sims_postprocess.running", "no");

exit(0);

#
# Process the seed blastx results into an organism directory.
#
sub create_seed_gff
{

    #
    # consolidate all the blastx generated sims into a single file
    #

    my $sims = "$jobdir/proc/sims.seed.raw";
    #if (! -e $sims){
    #	consolidate_sims("$jobdir/proc/sims.seed", $sims);
    #}
    
    my $gff = "$jobdir/proc/$genome.gff";
    my $out_sims = "$jobdir/proc/$genome.sims";

    #
    # Use the btree-indexed NR from the data directory if possible.
    #

    my $nr = "$FIG_Config::metagenome_data/nr.2007-0208.btree";
    my $nrlen = "$FIG_Config::metagenome_data/nr.2007-0208.len.btree";
    my @nrarg;
    if (-f $nr)
    {
	@nrarg = ('-nr', $nr);
    }
    if (-f $nrlen)
    {
	push(@nrarg, "-nrlen", $nrlen);
    }
    

    #
    # Verify we have the inputs we need for generating the orgdir.
    #

    my $fasta = $meta->get_metadata("preprocess.fasta_file");
    ($fasta and -f $fasta) or $stage->fatal("fasta not found: '$fasta'");

    my $taxonomy = &FIG::file_head("$jobdir/TAXONOMY", 1);
    my $project = &FIG::file_head("$jobdir/PROJECT", 1);
    chomp $taxonomy;
    chomp $project;

    #
    # Start the process.
    #

    my @args = (-f => $fasta,
		-s => $out_sims,
		-o => $gff,
		-gs => $job->genome_name(),
		-taxonomy => $taxonomy,
		-proj => $project,
		-g => $genome,
		@nrarg,
		$sims);

    my $pid = open(BOUT, "-|");
    if ($pid == 0)
    {
	exec { $blast2gff_exe } $blast2gff_exe, @args;
	die "Failed: $!: $blast2gff_exe @args";
    }

    print "Started pid=$pid $blast2gff_exe @args\n";

    while (<BOUT>)
    {
	print "Blast2gff: $_";
    }

    if (!close(BOUT))
    {
	$stage->fatal("Failed with \$!=$! \$?=$?: $blast2gff_exe @args");
    }
}

#
# Unpack the gff file into the organism directory, do the rest of the post-install cleanup.

#
sub install_seed_gff
{
    my $genome = $job->genome_id();
    my $gff = "$jobdir/proc/$genome.gff";
    my $out_sims = "$jobdir/proc/$genome.sims";

    -f $gff or $stage->fatal("gff file $gff does not exist");
    
    makedir("$jobdir/rp");
    my $orgdir = "$jobdir/rp/$genome";

    my $fasta = $meta->get_metadata("preprocess.fasta_file");
    ($fasta and -f $fasta) or $stage->fatal("fasta not found: '$fasta'");

    #
    # seed2gff won't run if the orgdir exists
    #

    if (-d $orgdir)
    {
	my $new = "$orgdir." . time;
	print "Moving $orgdir to $new\n";
	rename $orgdir, $new or $stage->fatal("Cannot rename $orgdir to $new: $!");
    }

    #
    # Create seed dir
    #
    my @args = ("-debug", $gff, "$jobdir/rp");
    &run_pipe(\*STDOUT, $gff2seed_exe, @args);

    #
    # Copy in missing contigs
    #

    &run_pipe(\*STDOUT, $correct_contigs_exe, "-orgdir", $orgdir, $fasta, $genome);

    #
    # Index contigs.
    #

    &run_pipe(\*STDOUT, "$FIG_Config::bin/make_fasta_btree",
	      "$orgdir/contigs",
	      "$orgdir/contigs.btree",
	      "$orgdir/contig_len.btree");

    #
    # Copy sims.
    #
    copy($out_sims, "$orgdir/similarities");

    #
    # Expand sims.
    #

    my $peg_syns = "$FIG_Config::fortyeight_data/peg.synonyms";
    my $exp_sims = "$orgdir/expanded_similarities";
    &run_pipe(\*STDOUT, "$FIG_Config::bin/expand_sims",
	      "-orgdir", $orgdir,
	      $peg_syns,
	      "$orgdir/similarities", $exp_sims);

#     my $fig = new FIGV($orgdir);


    
#     my $syns_fh = new FileHandle($peg_syns);
#     $syns_fh or $stage->fatal("cannot open $peg_syns: $!");

#     my $syns = load_peg_syns($syns_fh);

#     eval {
# 	expand_sims($fig, $out_sims, $exp_sims, $syns);
#     };
#     if ($@)
#     {
# 	$stage->fatal("error in expand_sims: $@");
#     }
    

    #
    # Set up to compute subsys mappings.
    #

    makedir("$orgdir/Subsystems");
    my $bindings = "$orgdir/Subsystems/bindings";
    my $subsystems = "$orgdir/Subsystems/subsystems";

    &touch($bindings);
    &touch($subsystems);

    my $fh = new FileHandle(">$orgdir/ss.attributes");
    $fh or $stage->fatal("Cannot write $orgdir/ss.attributes: $!");
    
    &run_pipe($fh,
	      $get_ss_conns_exe,
	      "-figv", $bindings, $subsystems,
	      "-orgdir", $orgdir,
	      "-g", $genome);
    close($fh);

    my $fh = new FileHandle(">$orgdir/taxa_summary_by_blast");
    $fh or $stage->fatal("Cannot write $orgdir/taxa_summary_by_blast: $!");
    
    &run_pipe($fh,
	      $summarize_exe,
	      "-orgdir", $orgdir,
	      "-b", "$orgdir/taxa_summary_by_blast.btree",
	      $genome);
    close($fh);

    #
    # Copy in the taxa hits that we computed earlier.
    #
    system("cp $jobdir/proc/taxa*hits $orgdir/.");
}
    
#
# If we are running here, we should have generated sims in the proc/sims.{16s,gg,lsu,ssu} directories.
# Scan the task.list files in each of them, and concatenate the output into files proc/sims.WHAT.raw.
#
sub process_rdp
{
    my $orgdir = "$jobdir/rp/$genome";
    
    for my $name (qw(16s gg lsu ssu))
    {
	my $dir = "$jobdir/proc/sims.$name";
	my $sims = "$jobdir/proc/sims.$name.raw";
	
	#if (! -e $sims){
	#    consolidate_sims($dir, $sims);
	#}
	# Now we can invoke the appropriate analysis.
	

	my $besthits = "$jobdir/proc/taxa.$name.besthits";
	my $allhits =  "$jobdir/proc/taxa.$name.allhits";
	
	if ($name eq 'gg')
	{
	    my @cmd = ("$FIG_Config::bin/blast2taxa", $sims, $besthits, $allhits);
	    my $rc = system(@cmd);
	    if ($rc != 0)
	    {
		$stage->fatal("error running $FIG_Config::bin/blast2taxa $sims");
	    }
	}
	elsif ($name eq 'lsu' or $name eq 'ssu' or $name eq '16s')
	{
	    my $db = $name;
	    $db = "rdp" if $name eq '16s';
	    
	    my @cmd = ("$FIG_Config::bin/16s_taxa_mysql",
		       '-d', $db,
		       '-b', $sims,
		       '-c', '1e-5',
		       '-l', '50',
		       '-ah', $allhits,
		       '-bh', $besthits);
	    print "Run @cmd\n";
	    my $rc = system(@cmd);
	    if ($rc != 0)
	    {
		$stage->fatal("error rc=$rc running @cmd");
	    }
	}
    }
}

sub consolidate_sims
{
    my($dir, $sims) = @_;
    if (! -d $dir)
    {
	$stage->fatal("Sim directory $dir not found");
    }
    
    open(TL, "<$dir/task.list") or $stage->fatal("Cannot open $dir/task.list: $!");
    
    #
    # Open output file.
    #
    open(O, ">$sims") or $stage->fatal("Cannot create $sims: $!");
	
    while (<TL>)
    {
	chomp;
	my ($id, $in, $nr, $flags, $out, $err) = split(/\t/);
	
	if (! -f $out)
	{
	    $stage->fatal("Sims output file $out missing for sims dir $dir");
	}
	
	copy($out, \*O);
    }
    close(O);
}
    
sub run_pipe
{
    my($outfh, $cmd, @args) = @_;

    my $pid = open(BOUT, "-|");
    if ($pid == 0)
    {
	exec { $cmd } $cmd, @args;
	die "Failed: $!: $cmd @args";
    }

    print "Started pid=$pid $cmd @args\n";

    while (<BOUT>)
    {
	print $outfh $_;
    }

    if (!close(BOUT))
    {
	$stage->fatal("Failed with \$!=$! \$?=$?: $cmd @args");
    }
}

sub makedir
{
    my($dir) = @_;

    if (! -d $dir)
    {
	mkdir($dir, 0777) or $stage->fatal("mkdir $dir failed: $!");
    }
    chmod 0777, $dir;
}

sub touch
{
    my($file) = @_;
    my $fh = new FileHandle(">$file") or $stage->fatal("touch: cannot open $file for writing: $!");
    close($fh);
}

    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3