[Bio] / Kmers2 / mk-kmers.pl Repository:
ViewVC logotype

View of /Kmers2/mk-kmers.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Wed Feb 13 20:42:16 2013 UTC (6 years, 10 months ago) by olson
Branch: MAIN
Kmer / partition creation code.

use Data::Dumper;
use File::Basename;
use lib '../FigKernelPackages';
use strict;
use Proc::ParallelLoop;
use gjoseqlib;

=head1 NAME

mk-kmers

=head1 SYNOPSIS

mk-kmers partition-dir K

=head1 DESCRIPTION

Compute the kmer signatures for the given partition dir.

=cut    

@ARGV == 2 or die "Usage: mk-kmers partition-dir k\n";

my $part_dir = shift;
my $k = shift;

my $sort_tmp = "/space/partitions/tmp";

-d $part_dir or die "Directoy $part_dir does not exist\n";
$k =~ /^\d+$/ or die "Invalid K '$k'\n";

my $out_kdir = "$part_dir/sigs.$k";
-d $out_kdir or mkdir($out_kdir) or die "Cannot mkdir $out_kdir: $!\n";

my $nprocs = 6;

opendir(D, $part_dir);
my @dirs = grep { -d "$part_dir/$_" && /set.\d+/ } readdir(D);

my @work;
for my $dir (@dirs)
{
    my($id) = $dir =~ /set\.(\d+)/;
    open(F, "<", "$part_dir/$dir/definition") or die "Cannot open $dir/definition: $!";
    
    my $tmp = "$part_dir/$dir/tmp.$k";
    -d $tmp or mkdir $tmp or die "Cannot mkdir $tmp: $!";

    $_ = <F>;
    my @files = <F>;
    chomp @files;

    push @work, map { my $base = basename($_); [$id, $_, "$tmp/$base"] } @files;
}

print scalar(@work) . " work items scheduled for $nprocs workers\n";

pareach \@work, sub {
    my $ent = shift;
    my($tag, $file, $output) = @$ent;

    open(my $fh, "<", $file) or die "Cannot open $file; $!";
    my $base = basename($file);
    open(my $out, "| sort --parallel=1 -S 20G -T $sort_tmp -u > $output") or die "Cannot open $output: $!";
#    open(my $out, ">", $output) or die "Cannot open $output: $!";

    while (my($id, $def, $seq) = read_next_fasta_seq($fh))
    {
	my $s = length($seq);
	for (my $i=0; ($i < (length($seq) - $k)); $i++)
	{
	    my $kmer = uc substr($seq,$i,$k);
	    
	    if ($kmer !~ /[^ACDEFGHIKLMNPQRSTVWY]/)
	    {
		my $rev =  reverse( $kmer );
		$rev =~ tr/ACGTUMRWSYKBDHV/TGCAAKYWSRMVHDB/;
		
		print $out ($kmer lt $rev ? $kmer : $rev), "\t", $tag, "\n";
	    }
	}
    }

    close($fh);
}, { Max_Workers => $nprocs };

#
# We have the raw data. Now sort/unique to prepare for merge.
#

pareach \@dirs, sub {
    my($dir) = shift;

    my @files = <$part_dir/$dir/tmp.$k/*>;
    print "$dir: @files\n";
    
    my $cmd = "sort --parallel=1 -m -T $sort_tmp -S 20G -u @files > $part_dir/$dir/raw.$k";
    print "$cmd\n";
    my $rc = system($cmd);
    if ($rc != 0)
    {
	die "failed with rc=$rc: $cmd\n";
    }
}, { Max_Workers => $nprocs };

#
# Now we can create the signatures.
#

my @raws = map { "$part_dir/$_/raw.$k" } @dirs;

my $cmd = "sort -S 20G -T $sort_tmp -m @raws | uniq -u -w $k > $out_kdir/sigs";

print "$cmd\n";
my $rc = system($cmd);
$rc == 0 or die "failed with rc=$rc: $cmd\n";

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3