[Bio] / FortyEightMeta / mg_preprocess.pl Repository:
ViewVC logotype

View of /FortyEightMeta/mg_preprocess.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.16 - (download) (as text) (annotate)
Thu Jun 12 21:35:53 2008 UTC (11 years, 8 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0924, mgrast_rel_2008_0625, mgrast_rel_2008_0919, mgrast_rel_2008_0917
Changes since 1.15: +9 -0 lines
Replace the MLDBM/DB_File sim status database with a postgres version.

#
# Perform metagenome server preprocessing.
#

use strict;
use FIG;
use FIG_Config;
use File::Basename;
use File::Copy;
use GenomeMeta;
use JobStage;
use Carp;
use DBrtns;
use FortyEightMeta::MGDB;
use FortyEightMeta::SimStatusDB;

my $STAGE = "preprocess";

@ARGV == 1 or die "Usage: $0 job-dir\n";

my $jobdir = shift;

-d $jobdir or die "$0: job dir $jobdir does not exist\n";

my $stage = new JobStage('Job48', $STAGE, $jobdir);

$stage or die "Cannot create job for $jobdir\n";

print "Running job! $jobdir\n";

my $mgdb;
eval {
    $mgdb = new DBrtns($FIG_Config::mgrast_dbms, $FIG_Config::mgrast_db,
			  $FIG_Config::mgrast_dbuser, $FIG_Config::mgrast_dbpass,
			  $FIG_Config::mgrast_dbport, $FIG_Config::mgrast_dbhost,
			  $FIG_Config::mgrast_dbsock);
};
if ($@)
{
    $stage->fatal("cannot connect to database: $@");
}

$mgdb or $stage->fatal("Cannot open connection to database");

$stage->set_status("in_progress");
$stage->set_running("yes");
$stage->set_qualified_metadata("host", $stage->hostname);

#
# Set up sim status database.
#
{
    my $simdb = FortyEightMeta::SimStatusDB->new($stage->job->id, $mgdb);
    $simdb->make_tables();
}

my $fig = new FIG;

#
# Find needed executables
#

my $count_fasta_exe = "$FIG_Config::bin/countfastachars";
-x $count_fasta_exe or $stage->fatal("Executable missing: $count_fasta_exe");

#
# Create directories if needed
#

chdir($jobdir) or $stage->fatal("cannot chdir $jobdir: $!");

-d "raw" or mkdir "raw", 0777 or $stage->fatal("cannot mkdir raw in $jobdir: $!");
-d "proc" or mkdir "proc", 0777 or $stage->fatal("cannot mkdir proc in $jobdir: $!");
-d "sge_output" or mkdir "sge_output", 0777 or $stage->fatal("cannot mkdir sge_output in $jobdir: $!");

#
# Extract data into the raw dir.
#
# With the latest frontend, the tarfile will be extracted into the
# incoming dir. All we need do is copy the files in.
#

chdir "raw" or $stage->fatal("cannot chdir to raw in $jobdir: $!");

my $src_fasta = $stage->get_metadata("source_fasta");
my $src_qual = $stage->get_metadata("source_qual");

my($raw_fasta);

#
# Consult the clearinghouse to create a genome id.
#

my $genome_id;

#
# Eval this; it includes a SOAP call...
eval {

    $genome_id = create_and_register_genome_id();
};
if ($@)
{
    $stage->fatal("Failure to assign genome ID for genome\n$@");
}

my $new_fa = "$genome_id.fa";
my $new_qual = "$genome_id.qual";

#
# Process fasta file.
#

open(FA, "<$src_fasta") or $stage->fatal("Cannot open provided fasta file $src_fasta: $!");
$_ = <FA>;
if (!/^>\S+/)
{
    $stage->fatal("Fasta file provided ($src_fasta) does not appear to be a fasta file (first line does not start with >)");
}
close(FA);

$raw_fasta = "$new_fa.unformatted";

copy($src_fasta, "$jobdir/raw/$raw_fasta") or $stage->fatal("copy of $src_fasta to $jobdir/raw/$raw_fasta failed: $!");

#
# Run the fasta through reformat_contigs to get rid of any line ending problems.
#

my $max_id_len = &reformat_contigs("$jobdir/raw/$raw_fasta", "$jobdir/proc/$new_fa");

#
# Copy the qual file in.
#
if ($src_qual ne '')
{
    copy($src_qual, "$jobdir/proc/$new_qual");
}

chdir "$jobdir/proc" or $stage->fatal("cannot chdir to $jobdir/proc: $!");

#
# Save to the GENOME_ID file.
#
open(G, ">$jobdir/GENOME_ID") or $stage->fatal("Cannot write $jobdir/GENOME_ID: $!");
print G "$genome_id\n";
close(GENOME_ID);

#
# While we are here, set up the taxonomy.
#
open(T, ">$jobdir/TAXONOMY") or $stage->fatal("Cannot write $jobdir/TAXONOMY: $!");
print T "unclassified; environmental samples; " . $stage->job->genome_name() . "\n";
close(T);

-f $new_fa or $stage->fatal("Cannot find new fasta file $new_fa");
-f $new_qual or warn "Warning: cannot find new qual file $new_qual";

$stage->set_qualified_metadata("fasta_file", "$jobdir/proc/$new_fa");
$stage->set_qualified_metadata("qual_file", "$jobdir/proc/$new_qual") if $new_qual;

#
# Now we can count.
#

$stage->run_process_in_shell("count_fasta_raw",
			     "$count_fasta_exe -m $jobdir/meta.xml -k $STAGE.count_raw -s $jobdir/raw/$raw_fasta > count_raw.stdout 2> count_raw.stderr");

$stage->run_process_in_shell("count_fasta_proc",
			     "$count_fasta_exe -m $jobdir/meta.xml -k $STAGE.count_proc -s $jobdir/proc/$new_fa > count_proc.stdout 2> count_proc.stderr");

my ($table_name, $best_iden_name, $best_psc_name) = FortyEightMeta::MGDB::create_sims_db($mgdb, $stage->job->id(), $max_id_len);

$stage->set_metadata("db.table_name", $table_name);
$stage->set_metadata("db.best_by_iden_table_name", $best_iden_name);
$stage->set_metadata("db.best_by_psc_table_name", $best_psc_name);
$stage->set_metadata("preprocess.max_contig_id_len", $max_id_len);

$stage->set_status("complete");
$stage->set_running("no");

sub reformat_contigs
{
    my($in, $out) = @_;
    my $prefix = $stage->job->id() . "_";
    my(@args);

    push(@args, "-v");

    if ($stage->meta->get_metadata("options.remove_duplicates"))
    {
	my $mapfile = "$jobdir/proc/duplicates.map";
	push(@args,
	     "-remove-duplicates",
	     "-duplicates-file=$mapfile",
	    );
    }

    if ($stage->meta->get_metadata("options.renumber_sequences"))
    {
	my $mapfile = "$jobdir/proc/renumber.map";
	push(@args, 
	     "-renumber",
	     "-renumber-prefix=$prefix",
	     "-renumber-digits=6",
	     "-renumber-map=$mapfile");
    }

    my $min_read = $FIG_Config::mgrast_min_read_size;
    if ($min_read !~ /^\d+$/)
    {
	$min_read = 10;
    }
    push(@args, "-min=$min_read");
    
    push(@args, $in, $out);

    print "Run '$in' '$out' with @args\n";
    $stage->run_process("reformat_contigs", "$FIG_Config::bin/reformat_contigs", @args);

    #
    # Read stderr output to find longest contig id length.
    #
    my $max_id;
    my $err_fh = $stage->open_error_file("reformat_contigs", "<");
    while (<$err_fh>)
    {
	if (/max\s+id\s+length\s+(\d+)/)
	{
	    $max_id = $1;
	    last;
	}
    }
    close($err_fh);
    return $max_id;
}


sub create_and_register_genome_id
{
    my $tax = $fig->clearinghouse_register_metagenome_taxon_id($stage->job->user(), $stage->job->genome_name());
    $tax or die "could not register taxon id";

    my $proxy = SOAP::Lite-> uri('http://www.soaplite.com/Scripts')-> proxy($FIG_Config::clearinghouse_url);
    
    my $response = $proxy->register_genome($tax);
    if ($response->fault) {	
      die "Failed to deposit:	 " . $response->faultcode . " " . $response->faultstring;
    }	

    my $vers = $response->result;

    return "$tax.$vers";
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3