[Bio] / FortyEightMeta / mg_create_seed_org.pl Repository:
ViewVC logotype

View of /FortyEightMeta/mg_create_seed_org.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Fri May 30 23:23:51 2008 UTC (11 years, 8 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0924, mgrast_rel_2008_0625, mgrast_rel_2008_0919, mgrast_rel_2008_0917
Development checkin of new metagenomics RAST pipeline.

#
# Postprocess the metagenome LSU/SSU/Greengene sims runs.
#

use DB_File;
use Data::Dumper;

use JobStage;
use strict;
use FIG;
use FIGV;
use FIG_Config;
use File::Basename;
use File::Copy;
use GenomeMeta;
use Carp 'croak';
use FortyEightMeta::ExpandSims;

my $STAGE = "create_seed_org";

@ARGV == 1 or die "Usage: $0 job-dir\n";

my $jobdir = shift;

my $stage = new JobStage('Job48', $STAGE, $jobdir);
$stage or die "Cannot create job for $jobdir\n";

my $job_id = basename($jobdir);
my $job = $stage->job();
my $genome = $job->genome_id();

my $meta = $job->meta;

print "Running job! $jobdir\n";

my $blast2gff_exe = "$FIG_Config::bin/blast2gff";
-x $blast2gff_exe or $stage->fatal("Executable $blast2gff_exe missing");

my $correct_contigs_exe = "$FIG_Config::bin/correct_gff_missing_contigs";
-x $correct_contigs_exe or $stage->fatal("Executable $correct_contigs_exe missing");

my $gff2seed_exe = "$FIG_Config::bin/gff2seed";
-x $gff2seed_exe or $stage->fatal("Executable $gff2seed_exe missing");

my $get_ss_conns_exe = "$FIG_Config::bin/get_ss_connections";
-x $get_ss_conns_exe or $stage->fatal("Executable $get_ss_conns_exe missing");

$stage->set_status("in_progress");
$stage->set_running("yes");

my $db_info = create_seed_gff();

if ($db_info)
{
    install_seed_gff($db_info);
}

$stage->set_status("complete");
$stage->set_running("no");

exit(0);

#
# Process the seed blastx results into an organism directory.
#
sub create_seed_gff
{
    #
    # We need to first find the information about the SEED database
    # we used to generate this data.
    #
    my $db_list = $stage->get_metadata("sims.database_list");

    #
    # Find the one named "SEED".
    #
    my $db;
    if (my @s = grep { $_->{name} eq 'SEED'} @$db_list)
    {
	if (@s > 1)
	{
	    $stage->log("Multiple SEED databases were processed, using version $s[0]->{version}");
	}
	
	$db = $s[0]->{files}->[0];
    }
    else
    {
	$stage->log("No SEED database was processed in this job");
	return 0; 
    }

    my $sims_dir = "$jobdir/proc/$db->{dir}";
    my $sims = "$sims_dir.raw";
    consolidate_sims($sims_dir, $sims);
    
    my $gff = "$jobdir/proc/$genome.gff";
    my $out_sims = "$jobdir/proc/$genome.sims";

    #
    # Use the btree-indexed NR from the data directory if possible.
    #

    my $nr = "$db->{fasta}.btree";
    my $nrlen = "$db->{fasta}-len.btree";
    my @nrarg;
    if (-f $nr)
    {
	@nrarg = ('-nr', $nr);
    }
    if (-f $nrlen)
    {
	push(@nrarg, "-nrlen", $nrlen);
    }
    

    #
    # Verify we have the inputs we need for generating the orgdir.
    #

    my $fasta = $meta->get_metadata("preprocess.fasta_file");
    ($fasta and -f $fasta) or $stage->fatal("fasta not found: '$fasta'");

    my $taxonomy = &FIG::file_head("$jobdir/TAXONOMY", 1);
    my $project = &FIG::file_head("$jobdir/PROJECT", 1);
    chomp $taxonomy;
    chomp $project;

    #
    # Start the process.
    #

    my @args = (-f => $fasta,
		-s => $out_sims,
		-o => $gff,
		-gs => $job->genome_name(),
		-taxonomy => $taxonomy,
		-proj => $project,
		-g => $genome,
		@nrarg,
		$sims);

    my $pid = open(BOUT, "-|");
    if ($pid == 0)
    {
	exec { $blast2gff_exe } $blast2gff_exe, @args;
	die "Failed: $!: $blast2gff_exe @args";
    }

    print "Started pid=$pid $blast2gff_exe @args\n";

    while (<BOUT>)
    {
	print "Blast2gff: $_";
    }

    if (!close(BOUT))
    {
	$stage->fatal("Failed with \$!=$! \$?=$?: $blast2gff_exe @args");
    }

    return $db;
}

#
# Unpack the gff file into the organism directory, do the rest of the post-install cleanup.

#
sub install_seed_gff
{
    my($db) = @_;
    
    my $genome = $job->genome_id();
    my $gff = "$jobdir/proc/$genome.gff";
    my $out_sims = "$jobdir/proc/$genome.sims";

    -f $gff or $stage->fatal("gff file $gff does not exist");
    
    makedir("$jobdir/rp");
    my $orgdir = "$jobdir/rp/$genome";

    my $fasta = $meta->get_metadata("preprocess.fasta_file");
    ($fasta and -f $fasta) or $stage->fatal("fasta not found: '$fasta'");

    #
    # seed2gff won't run if the orgdir exists
    #

    if (-d $orgdir)
    {
	my $new = "$orgdir." . time;
	print "Moving $orgdir to $new\n";
	rename $orgdir, $new or $stage->fatal("Cannot rename $orgdir to $new: $!");
    }

    #
    # Create seed dir
    #
    my @args = ("-debug", $gff, "$jobdir/rp");
    &run_pipe(\*STDOUT, $gff2seed_exe, @args);

    #
    # Copy in missing contigs
    #

    &run_pipe(\*STDOUT, $correct_contigs_exe, "-orgdir", $orgdir, $fasta, $genome);

    #
    # Index contigs.
    #

    &run_pipe(\*STDOUT, "$FIG_Config::bin/make_fasta_btree",
	      "$orgdir/contigs",
	      "$orgdir/contigs.btree",
	      "$orgdir/contig_len.btree");

    #
    # Copy sims.
    #
    copy($out_sims, "$orgdir/similarities");

    #
    # Expand sims.
    #

    my $peg_syns = "$db->{peg_synonyms}";
    my $exp_sims = "$orgdir/expanded_similarities";
    &run_pipe(\*STDOUT, "$FIG_Config::bin/expand_sims",
	      "-orgdir", $orgdir,
	      $peg_syns,
	      "$orgdir/similarities", $exp_sims);

    #
    # Index unexpanded sims, flip and index them as well.
    #

    &run_pipe(\*STDOUT, "$FIG_Config::bin/flip_sims",
	      "$orgdir/similarities", "$orgdir/similarities.flips");

    index_sims("$orgdir/similarities", "$orgdir/similarities.index");
    index_sims("$orgdir/similarities.flips", "$orgdir/similarities.flips.index");

    &run_pipe(\*STDOUT, "$FIG_Config::bin/index_feature_dir_fasta",
	      "$orgdir/Features/peg");

    

    #
    # Set up to compute subsys mappings.
    #

    makedir("$orgdir/Subsystems");
    my $bindings = "$orgdir/Subsystems/bindings";
    my $subsystems = "$orgdir/Subsystems/subsystems";

    &touch($bindings);
    &touch($subsystems);

    my $fh = new FileHandle(">$orgdir/ss.attributes");
    $fh or $stage->fatal("Cannot write $orgdir/ss.attributes: $!");
    
    &run_pipe($fh,
	      $get_ss_conns_exe,
	      "-figv", $bindings, $subsystems,
	      "-orgdir", $orgdir,
	      "-g", $genome);
    close($fh);

}
    
sub consolidate_sims
{
    my($dir, $sims) = @_;
    if (! -d $dir)
    {
	$stage->fatal("Sim directory $dir not found");
    }
    
    open(TL, "<$dir/task.list") or $stage->fatal("Cannot open $dir/task.list: $!");
    
    #
    # Open output file.
    #
    open(O, ">$sims") or $stage->fatal("Cannot create $sims: $!");
	
    while (<TL>)
    {
	chomp;
	my ($id, $in, $nr, $flags, $out, $err) = split(/\t/);
	
	if (! -f $out)
	{
	    $stage->fatal("Sims output file $out missing for sims dir $dir");
	}
	
	copy($out, \*O);
    }
    close(TL);
    close(O);
}
    
sub run_pipe
{
    my($outfh, $cmd, @args) = @_;

    my $pid = open(BOUT, "-|");
    if ($pid == 0)
    {
	exec { $cmd } $cmd, @args;
	die "Failed: $!: $cmd @args";
    }

    print "Started pid=$pid $cmd @args\n";

    while (<BOUT>)
    {
	print $outfh $_;
    }

    if (!close(BOUT))
    {
	$stage->fatal("Failed with \$!=$! \$?=$?: $cmd @args");
    }
}

sub makedir
{
    my($dir) = @_;

    if (! -d $dir)
    {
	mkdir($dir, 0777) or $stage->fatal("mkdir $dir failed: $!");
    }
    chmod 0777, $dir;
}

sub touch
{
    my($file) = @_;
    my $fh = new FileHandle(">$file") or $stage->fatal("touch: cannot open $file for writing: $!");
    close($fh);
}

    
#
# Use the C index_sims_file app to create a berkeley db index
# of the sims file.
#

sub index_sims
{
    my($sims, $index_file) = @_;

    my $path = &FIG::find_fig_executable("index_sims_file");

    open(IDX, "$path 0 < $sims |") or die "Cannot open index_sims_file pipe: $!\n";

    my %index;
    my $tied = tie %index, 'DB_File', $index_file, O_RDWR | O_CREAT, 0666, $DB_BTREE;

    $tied or $stage->fatal("Creation of hash $index_file failed: $!\n");

    while (<IDX>)
    {
	chomp;
	my($peg, undef, $seek, $len) = split(/\t/);
	
	$index{$peg} = "$seek,$len";
    }
    close(IDX);
    
    $tied->sync();
    untie %index;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3