[Bio] / FortyEightMeta / mg_chunk_sims.pl Repository:
ViewVC logotype

View of /FortyEightMeta/mg_chunk_sims.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Fri May 30 23:23:51 2008 UTC (11 years, 8 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, mgrast_rel_2008_0806, mgrast_dev_10262011, mgrast_dev_02212011, mgrast_rel_2008_0923, mgrast_release_3_0, mgrast_dev_03252011, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, mgrast_rel_2008_0919, mgrast_rel_2008_1110, myrast_33, mgrast_rel_2008_0917, mgrast_dev_04052011, mgrast_dev_02222011, HEAD
Changes since 1.1: +14 -4 lines
Development checkin of new metagenomics RAST pipeline.

#
# Chunk a fasta file into pieces suitable for cluster BLAST calculations.
#
# We are provided the NR and peg.synonyms files that should be used for this.
#
# Usage: mg_chunk_sims [-p blastprog] [-o 'blastopts'] [-N num-chars] [-n num-seqs] -nr nr -j sims-job-dir fasta [fasta ...]
#
#
# We write a file task.list into sims-job-dir that contains the list of work units.
#
# The work units will write raw sims into sims-job-dir/sims.raw
#

use strict;
use File::Basename;
use Cwd 'abs_path';

my $usage = "$0 [-p blastprog] [-o blastopts] [-N num-chars] [-n num-seqs] -nr nr-file -j jobdir fasta [fasta...]";

#
# Option defaults. The two chunk limits, the NR file and the job directory
# stay undefined unless they are supplied on the command line.
#
my $blastprog = "blastp";
my $flags = "-m 8 -e 1.0e-5 -FF";
my $max_seqs;
my $max_chars;
my $nr_file;
my $jobdir;

#
# Every recognised switch takes exactly one value. Map each switch to the
# variable that receives that value.
#
my %option_target = (
    '-p'  => \$blastprog,
    '-o'  => \$flags,
    '-n'  => \$max_seqs,
    '-N'  => \$max_chars,
    '-nr' => \$nr_file,
    '-j'  => \$jobdir,
);

#
# Consume leading switches from @ARGV. Parsing stops at the first argument
# that does not start with a dash; that argument and everything after it are
# left in @ARGV.
#
while (@ARGV)
{
    my $arg = shift @ARGV;
    my $target = $option_target{$arg};

    if ($target)
    {
        # The value is whatever follows the switch, taken as-is.
        $$target = shift @ARGV;
    }
    elsif ($arg =~ /^-/)
    {
        die "Invalid option $arg. Usage: $usage\n";
    }
    else
    {
        # First non-option argument: put it back and stop.
        unshift(@ARGV, $arg);
        last;
    }
}

# Append the selected blast program to the flags as a -p option; the combined
# string is what gets recorded on every task line.
$flags = "$flags -p $blastprog";

# Whatever is left in @ARGV after option parsing is the list of fasta files.
if (@ARGV == 0)
{
    die "Usage: $usage\n";
}

my @fasta_files = @ARGV;

for my $f (@fasta_files)
{
    -f $f or die "Fasta file $f does not exist\n";
}

if (!defined($max_seqs) and !defined($max_chars))
{
    die "Must pass either -n or -N option\n";
}

# Every path below is built from $jobdir, so fail with a clear message here
# instead of an obscure mkdir error on an empty path.
if (!defined($jobdir) or $jobdir eq '')
{
    die "Must pass -j option. Usage: $usage\n";
}

-d $jobdir or mkdir $jobdir or die "Cannot mkdir $jobdir: $!\n";

# Task numbering state shared with write_task(): $next_task is the number the
# next chunk will get, $last_task is the number of the most recent chunk
# written (undefined until the first one).
my $next_task = 1;
my $last_task;

my $task_file = "$jobdir/task.list";
my $input_dir = "$jobdir/sims.in";
my $output_dir = "$jobdir/sims.raw";
my $error_dir = "$jobdir/sims.err";

-d $input_dir or mkdir $input_dir or die "Cannot mkdir $input_dir: $!\n";
-d $output_dir or mkdir $output_dir or die "Cannot mkdir $output_dir: $!\n";
-d $error_dir or mkdir $error_dir or die "Cannot mkdir $error_dir: $!\n";

# The task list records absolute paths, so resolve the directories once they
# are known to exist.
$input_dir = abs_path($input_dir);
$output_dir = abs_path($output_dir);
$error_dir = abs_path($error_dir);

# TASK stays a bareword handle because write_task() prints to it by name.
# The three-argument form keeps odd characters in the path from being read
# as part of the open mode.
open(TASK, '>', $task_file) or die "Cannot write $task_file: $!";

#
# Split each fasta file into chunks and hand every chunk to write_task().
# A chunk is closed when a new header line (">") is seen and the current
# chunk has already reached the -n sequence limit or the -N character limit.
# Limits are only checked at header lines, so a sequence is never split.
#
for my $file (@fasta_files)
{
    my $cur_size = 0;     # characters of non-header lines in the current chunk
    my $cur_count = 0;    # header lines in the current chunk
    my $cur_input = '';   # text of the current chunk

    my $base = basename($file);
    my $path = abs_path($file);

    open(my $fh, '<', $path) or die "Cannot open $path: $!\n";

    print "Chunk file $path\n";

    while (my $line = <$fh>)
    {
        if ($line =~ /^>/)
        {
            my $chunk_full = ((defined($max_seqs) && $cur_count >= $max_seqs) ||
                              (defined($max_chars) && $cur_size >= $max_chars));

            # Never emit an empty chunk: with a limit of zero (or less) the
            # test above is already true at the very first header.
            if ($chunk_full and $cur_input ne '')
            {
                write_task($base, $input_dir, $output_dir, $error_dir, $cur_input);
                $cur_size = 0;
                $cur_count = 0;
                $cur_input = '';
            }
            $cur_input .= $line;
            $cur_count++;
        }
        else
        {
            $cur_input .= $line;
            $cur_size += length($line);
        }
    }
    close($fh);

    # Flush whatever is left over. An empty file leaves nothing to flush and
    # therefore produces no task.
    if ($cur_input ne '')
    {
        write_task($base, $input_dir, $output_dir, $error_dir, $cur_input);
    }
}

# Buffered write errors on the task list only surface at close.
close(TASK) or die "Cannot close $task_file: $!\n";

# $last_task is still undefined when no chunk was written at all; report an
# empty range (1..0) in that case.
warn "No sequence data found; no tasks written\n" unless defined($last_task);
print "tasks\t1\t", (defined($last_task) ? $last_task : 0), "\n";

#
# Write one chunk of fasta text to its own input file and append the matching
# work-unit line to the task list.
#
# Arguments:
#   $base        name of the subdirectory created under each of the three
#                directories below (callers pass the fasta file's basename)
#   $input_dir   directory that receives the chunk as $base/in.<task>
#   $output_dir  directory used to name the output file $base/out.<task>
#   $error_dir   directory used to name the error file $base/err.<task>
#   $fasta       text of the chunk
#
# File-level state used: the task number is taken from $next_task, which is
# then advanced; $last_task is set to that number; $nr_file and $flags are
# copied into the task line; the line is printed to the TASK handle.
#
# The task line is tab-separated: task number, input file, NR file, blast
# flags, output file, error file. Only the input file is created here.
#
# Returns the task number. Dies if a directory cannot be created or if a
# write fails.
#
sub write_task
{
    my($base, $input_dir, $output_dir, $error_dir, $fasta) = @_;

    my $task = $next_task++;

    my $idir = "$input_dir/$base";
    my $odir = "$output_dir/$base";
    my $edir = "$error_dir/$base";

    -d $idir or mkdir($idir) or die "Cannot mkdir $idir: $!\n";
    -d $odir or mkdir($odir) or die "Cannot mkdir $odir: $!\n";
    -d $edir or mkdir($edir) or die "Cannot mkdir $edir: $!\n";

    my $in = "$idir/in.$task";
    my $out = "$odir/out.$task";
    my $err = "$edir/err.$task";

    # Check print and close as well as open: a full disk is only reported
    # when the buffer is flushed, and a truncated chunk must not go unnoticed.
    open(my $in_fh, '>', $in) or die "Cannot write $in: $!";
    print {$in_fh} $fasta or die "Cannot write $in: $!";
    close($in_fh) or die "Cannot close $in: $!";

    print TASK join("\t", $task, $in, $nr_file, $flags, $out, $err), "\n"
        or die "Cannot write task list entry for task $task: $!";

    $last_task = $task;
    return $task;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3