[Bio] / FortyEightMeta / mg_preprocess.pl Repository:
ViewVC logotype

View of /FortyEightMeta/mg_preprocess.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.18 - (download) (as text) (annotate)
Sun Nov 16 16:58:58 2008 UTC (11 years, 4 months ago) by arodri7
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
Changes since 1.17: +4 -2 lines
update the metagene process tool

#
# Perform metagenome server preprocessing.
#

use strict;
use FIG;
use FIG_Config;
use File::Basename;
use File::Copy;
use GenomeMeta;
use JobStage;
use Carp;
use DBrtns;
use FortyEightMeta::MGDB;
use FortyEightMeta::SimStatusDB;

my $STAGE = "preprocess";

@ARGV == 1 or die "Usage: $0 job-dir\n";

my $jobdir = shift;

-d $jobdir or die "$0: job dir $jobdir does not exist\n";

my $stage = new JobStage('Job48', $STAGE, $jobdir);

$stage or die "Cannot create job for $jobdir\n";

print "Running job! $jobdir\n";

my $mgdb;
eval {
    $mgdb = new DBrtns($FIG_Config::mgrast_dbms, $FIG_Config::mgrast_db,
			  $FIG_Config::mgrast_dbuser, $FIG_Config::mgrast_dbpass,
			  $FIG_Config::mgrast_dbport, $FIG_Config::mgrast_dbhost,
			  $FIG_Config::mgrast_dbsock);
};
if ($@)
{
    $stage->fatal("cannot connect to database: $@");
}

$mgdb or $stage->fatal("Cannot open connection to database");

$stage->set_status("in_progress");
$stage->set_running("yes");
$stage->set_qualified_metadata("host", $stage->hostname);

#
# Set up sim status database.
#
{
    my $simdb = FortyEightMeta::SimStatusDB->new($stage->job->id, $mgdb);
    $simdb->make_tables();
}

my $fig = new FIG;

#
# Find needed executables
#

my $count_fasta_exe = "$FIG_Config::bin/countfastachars";
-x $count_fasta_exe or $stage->fatal("Executable missing: $count_fasta_exe");

#
# Create directories if needed
#

chdir($jobdir) or $stage->fatal("cannot chdir $jobdir: $!");

-d "raw" or mkdir "raw", 0777 or $stage->fatal("cannot mkdir raw in $jobdir: $!");
-d "proc" or mkdir "proc", 0777 or $stage->fatal("cannot mkdir proc in $jobdir: $!");
-d "sge_output" or mkdir "sge_output", 0777 or $stage->fatal("cannot mkdir sge_output in $jobdir: $!");

#
# Extract data into the raw dir.
#
# With the latest frontend, the tarfile will be extracted into the
# incoming dir. All we need do is copy the files in.
#

chdir "raw" or $stage->fatal("cannot chdir to raw in $jobdir: $!");

my $src_fasta = $stage->get_metadata("source_fasta");
my $src_qual = $stage->get_metadata("source_qual");

my($raw_fasta);

#
# Consult the clearinghouse to create a genome id.
#

my $genome_id;

#
# Eval this; it includes a SOAP call...
eval {

    $genome_id = create_and_register_genome_id();
};
if ($@)
{
    $stage->fatal("Failure to assign genome ID for genome\n$@");
}

my $new_fa = "$genome_id.fa";
my $new_qual = "$genome_id.qual";

#
# Process fasta file.
#

open(FA, "<$src_fasta") or $stage->fatal("Cannot open provided fasta file $src_fasta: $!");
$_ = <FA>;
if (!/^>\S+/)
{
    $stage->fatal("Fasta file provided ($src_fasta) does not appear to be a fasta file (first line does not start with >)");
}
close(FA);

$raw_fasta = "$new_fa.unformatted";

copy($src_fasta, "$jobdir/raw/$raw_fasta") or $stage->fatal("copy of $src_fasta to $jobdir/raw/$raw_fasta failed: $!");

#
# Now we can count the original fasta file.
#

$stage->run_process_in_shell("count_fasta_raw",
			     "$count_fasta_exe -m $jobdir/meta.xml -k $STAGE.count_raw -s $jobdir/raw/$raw_fasta > $jobdir/proc/count_raw.stdout 2> $jobdir/proc/count_raw.stderr");
# get the max length from the file generated
my $max_length=0;
my $fh = $stage->open_file("$jobdir/proc/count_raw.stdout", "r");
while (<$fh>)
{
    if (/max\s+id\s+length\s+(\d+)/)
    {
	$max_length = $1;
	last;
    }
}
close($fh);


#
# Run the fasta through reformat_contigs to get rid of any line ending problems.
#    # added by arodri7
#    Metagene will also be run if the user selected the option and any sequences were longer than 500 bp. 
#    A new set of sequences would be made depending on the metagene output for the contig sequences
#

my $max_id_len = &reformat_contigs("$jobdir/raw/$raw_fasta", "$jobdir/proc/$new_fa", $max_length);

#
# Copy the qual file in.
#
if ($src_qual ne '')
{
    copy($src_qual, "$jobdir/proc/$new_qual");
}

chdir "$jobdir/proc" or $stage->fatal("cannot chdir to $jobdir/proc: $!");

#
# Save to the GENOME_ID file.
#
open(G, ">$jobdir/GENOME_ID") or $stage->fatal("Cannot write $jobdir/GENOME_ID: $!");
print G "$genome_id\n";
close(GENOME_ID);

#
# While we are here, set up the taxonomy.
#
open(T, ">$jobdir/TAXONOMY") or $stage->fatal("Cannot write $jobdir/TAXONOMY: $!");
print T "unclassified; environmental samples; " . $stage->job->genome_name() . "\n";
close(T);

-f $new_fa or $stage->fatal("Cannot find new fasta file $new_fa");
-f $new_qual or warn "Warning: cannot find new qual file $new_qual";

$stage->set_qualified_metadata("fasta_file", "$jobdir/proc/$new_fa");
$stage->set_qualified_metadata("qual_file", "$jobdir/proc/$new_qual") if $new_qual;

#
# Now we can count the new fasta file.
#

#$stage->run_process_in_shell("count_fasta_raw",
#			     "$count_fasta_exe -m $jobdir/meta.xml -k $STAGE.count_raw -s $jobdir/raw/$raw_fasta > count_raw.stdout 2> count_raw.stderr");

$stage->run_process_in_shell("count_fasta_proc",
			     "$count_fasta_exe -m $jobdir/meta.xml -k $STAGE.count_proc -s $jobdir/proc/$new_fa > count_proc.stdout 2> count_proc.stderr");

my ($table_name, $best_iden_name, $best_psc_name) = FortyEightMeta::MGDB::create_sims_db($mgdb, $stage->job->id(), $max_id_len);

$stage->set_metadata("db.table_name", $table_name);
$stage->set_metadata("db.best_by_iden_table_name", $best_iden_name);
$stage->set_metadata("db.best_by_psc_table_name", $best_psc_name);
$stage->set_metadata("preprocess.max_contig_id_len", $max_id_len);

$stage->set_status("complete");
$stage->set_running("no");

sub reformat_contigs
{
    my($in, $out, $max) = @_;
    my $prefix = $stage->job->id() . "_";
    my(@args);

    push(@args, "-v");

    if ($stage->meta->get_metadata("options.remove_duplicates"))
    {
	my $mapfile = "$jobdir/proc/duplicates.map";
	push(@args,
	     "-remove-duplicates",
	     "-duplicates-file=$mapfile",
	     );
    }

    if ( ($stage->meta->get_metadata("options.run_metagene")) && ($max>=500) ){
	# run metagene on the raw input file
	print "Run metagene '$in' \n";
	my $out2 = "$jobdir/proc/metagene_contigs.fa";

	my @metagene_args = ("-output-directory=$jobdir/proc", "$in", "$out2");
	$stage->run_process("metagene_output", "$FIG_Config::bin/extract_metagene_contigs", @metagene_args);
	
	$in = $out2;
    }

    if ($stage->meta->get_metadata("options.renumber_sequences"))
    {
	my $mapfile = "$jobdir/proc/renumber.map";
	push(@args, 
	     "-renumber",
	     "-renumber-prefix=$prefix",
	     "-renumber-digits=6",
	     "-renumber-map=$mapfile");
    }

    my $min_read = $FIG_Config::mgrast_min_read_size;
    if ($min_read !~ /^\d+$/)
    {
	$min_read = 10;
    }
    push(@args, "-min=$min_read");
    
    push(@args, $in, $out);

    print "Run '$in' '$out' with @args\n";
    $stage->run_process("reformat_contigs", "$FIG_Config::bin/reformat_contigs", @args);

    #
    # Read stderr output to find longest contig id length.
    #
    my $max_id;
    my $err_fh = $stage->open_error_file("reformat_contigs", "<");
    while (<$err_fh>)
    {
	if (/max\s+id\s+length\s+(\d+)/)
	{
	    $max_id = $1;
	    last;
	}
    }
    close($err_fh);
    return $max_id;
}


sub create_and_register_genome_id
{
    my $tax = $fig->clearinghouse_register_metagenome_taxon_id($stage->job->user(), $stage->job->genome_name());
    $tax or die "could not register taxon id";

    my $proxy = SOAP::Lite-> uri('http://www.soaplite.com/Scripts')-> proxy($FIG_Config::clearinghouse_url);
    
    my $response = $proxy->register_genome($tax);
    if ($response->fault) {	
      die "Failed to deposit:	 " . $response->faultcode . " " . $response->faultstring;
    }	

    my $vers = $response->result;

    return "$tax.$vers";
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3