[Bio] / FortyEight / rp_chunk_sims.pl Repository:
ViewVC logotype

View of /FortyEight/rp_chunk_sims.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (download) (as text) (annotate)
Wed Jul 8 20:18:46 2009 UTC (10 years, 5 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, mgrast_dev_05262011, mgrast_dev_04082011, rast_rel_2010_0928, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, rast_rel_2009_07_09, rast_rel_2010_0827, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.4: +32 -13 lines
Add support for pulling sims from sims server and submitting only those that don't match to be computed.

Add use of $FIG_Config::rast_sims_data to point at the directory holding
nr & peg.synonyms; explicitly here for the purposes of more easily updating
to new sims databases.

#
# Chunk a fasta file into pieces suitable for cluster BLAST calculations.
#
# We are provided the NR and peg.synonyms files that should be used for this.
#
# Usage: rp_chunk_sims fasta-file nr peg.synonyms sims-job-dir
#
# We write a file task.list into sims-job-dir that contains the list of work units.
#
# The work units will write raw sims into sims-job-dir/sims.raw
#

use strict;
use File::Basename;
use Cwd 'abs_path';

my $usage = "Usage: $0 [-size max-size] [-include-self] [-self-fasta fasta-file] fasta-file nr peg.synonyms sims-job-dir";

my $max_size = 10_000;
my $include_self = 0;
my $self_fasta;

while (@ARGV > 0 and $ARGV[0] =~ /^-/)
{
    my $arg = shift @ARGV;
    if ($arg =~ /^-size/)
    {
	$max_size = shift @ARGV;
    }
    elsif ($arg =~ /^-include-self/)
    {
	$include_self++;
    }
    elsif ($arg =~ /^-self-fasta/)
    {
	$self_fasta = shift @ARGV;
    }
    else
    {
	die $usage;
    }
}

@ARGV == 4 or die $usage;

my $fasta = shift;
my $nr_file = shift;
my $pegsyn = shift;
my $jobdir = shift;

if (!defined($self_fasta))
{
    $self_fasta = $fasta;
}

-d $jobdir or mkdir $jobdir or die "Cannot mkdir $jobdir: $!\n";

my $next_task = 1;
my $last_task;

my $task_file = "$jobdir/task.list";
my $input_dir = "$jobdir/sims.in";
my $output_dir = "$jobdir/sims.raw";
my $error_dir = "$jobdir/sims.err";

-d $input_dir or mkdir $input_dir or die "Cannot mkdir $input_dir: $!\n";
-d $output_dir or mkdir $output_dir or die "Cannot mkdir $output_dir: $!\n";
-d $error_dir or mkdir $error_dir or die "Cannot mkdir $error_dir: $!\n";

my $flags = "-m 8 -e 1.0e-5 -FF -p blastp";

my @fasta_files = ($fasta);

open(TASK, ">$task_file") or die "Cannot write $task_file: $!";

#
# Buzz through once to ensure we can open them.
#
for my $file (@fasta_files, $self_fasta)
{
    open(F, "<$file") or die "Cannot open $file: $!\n";
    close(F);
}


#
# Prepare and submit self-sims.
#
if ($include_self)
{
    #
    # hack - leftover from legacy chunking code that wanted multiple
    # directories of sims. we need to direct the output all to the
    # same directory.
    #
    my $base = basename($fasta);
    my $file = abs_path($self_fasta);

    system("$FIG_Config::ext_bin/formatdb", "-p", "t", "-i", $file);
    my $task = $next_task++;
    print TASK join("\t", $task, $file, $file, $flags,
		    "$output_dir/$base/out.$task", "$error_dir/$base/err.$task"), "\n";
}

for my $file (@fasta_files)
{
    my $cur_size = 0;
    my $cur_input = '';

    my $base = basename($file);
    $file = abs_path($file);
	
    open(F, "<$file") or die "Cannot open $file: $!\n";

    print "Chunk file $file\n";
    
    while (<F>)
    {
	if (/^>/)
	{
	    if ($cur_size >= $max_size)
	    {
		write_task($base, $input_dir, $output_dir, $error_dir, $cur_input);
		$cur_size = 0;
		$cur_input = '';
	    }
	    $cur_input .= $_;
	    $cur_size += length($_);
	}
	else
	{
	    $cur_input .= $_;
	    $cur_size += length($_);
	}
    }
    if ($cur_size >= 0)
    {
	write_task($base, $input_dir, $output_dir, $error_dir, $cur_input);
	$cur_size = 0;
	$cur_input = '';
    }
    close(F);
}

close(TASK);

print "tasks\t1\t$last_task\n";

#
# Write an input chunk to $dir.
# Write a line on the 
sub write_task
{
    my($base, $input_dir, $output_dir, $error_dir, $fasta) = @_;

    my $task = $next_task++;

    my $idir = "$input_dir/$base";
    my $odir = "$output_dir/$base";
    my $edir = "$error_dir/$base";

    -d $idir or mkdir($idir) or die "Cannot mkdir $idir: $!\n";
    -d $odir or mkdir($odir) or die "Cannot mkdir $odir: $!\n";
    -d $edir or mkdir($edir) or die "Cannot mkdir $edir: $!\n";

    my $in = "$idir/in.$task";
    my $out = "$odir/out.$task";
    my $err = "$edir/err.$task";
    
    open(I, ">$in") or die "Cannot write $in: $!";
    print I $fasta;
    close(I);
    print TASK join("\t", $task, $in, $nr_file, $flags, $out, $err), "\n";
    $last_task = $task;
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3