[Bio] / FigKernelScripts / get_close_pegs_kmer_memcache.pl Repository:
ViewVC logotype

View of /FigKernelScripts/get_close_pegs_kmer_memcache.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (download) (as text) (annotate)
Fri Jan 31 20:43:56 2014 UTC (5 years, 10 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2014_0729, rast_rel_2014_0912, HEAD
Changes since 1.1: +1 -2 lines
put hash size back appropriately

use strict;
use Data::Dumper;
use gjoseqlib;
use IO::Compress::Gzip;
use Getopt::Long;
use Cache::Memcached::Fast;

my $dataD = "/vol/ross/GeneralKmerCompareRegions/Data";

open(SIZE, "<", "$dataD/hash.size") || die "Cannot open $dataD/hash.size: $!";
my $hash_size = <SIZE>;
chomp $hash_size;
close(SIZE);

our @doctypes = qw(peg
		   rna
		   atn
		   att
		   bs
		   opr
		   pbs
		   pi
		   pp
		   prm
		   pseudo
		   rsw
		   sRNA
		   trm
		   box
		   );

$| = 1;

our %tmap;
for my $i (0..$#doctypes)
{
    $tmap{$doctypes[$i]} = $i;
    $tmap{$i} = $doctypes[$i];
}

my $cache = new Cache::Memcached::Fast({
    servers => [ { address => 'elm.mcs.anl.gov:11211' } ],
    compress_threshold => 10_000,
    compress_ratio => 0.9,
    compress_methods => [ \&IO::Compress::Gzip::gzip,
			 \&IO::Uncompress::Gunzip::gunzip ],
    max_failures => 3,
    failure_timeout => 2,
});

while (my($fid, $def, $seq) = read_next_fasta_seq())
{
    $seq = uc $seq;
    my $kmers = &get_kmers($seq);
    my %hits;
    my $res = $cache->get_multi(@$kmers);
    while (my($kmer, $pegsN) = each(%$res))
    {
	foreach my $pegN (split(/,/,$pegsN))
	{
	    $hits{$pegN}++;
	}
    }
    my @sorted = map { [$_,$hits{$_}] } sort { $hits{$b} <=> $hits{$a} } keys(%hits);

    my @output;
    foreach my $tuple (@sorted)
    {
	my($pegE,$n) = @$tuple;
	my $peg = docid_to_fid($pegE);

	push(@output,[$pegE,$peg,$n]);
    }
    foreach $_ (@output)
    {
	print join("\t",@$_),"\n";
    }
    print "//\n";
}

sub get_kmers {
    my($seq) = @_;

    open(TMP,">tmp.$$") || die "could not open tmp.$$";
    print TMP ">peg\n$seq\n";
    close(TMP);
    
    my @hits = map { ($_ =~ /^HIT\s+\d+\s+(\d+)/) ? $1 : () }
               `kmer_guts -d 1 -D $dataD/Data.Kmers -s $hash_size -a < tmp.$$`;
    unlink("tmp.$$");
    my %hits = map { ($_ => 1) } @hits;
    return [sort { $a <=> $b } keys(%hits)];
}


sub fid_to_docid
{
    my($fid) = @_;
    
    if ($fid =~ /^fig\|(\d+)\.(\d+)\.([^.]+)\.(\d+)$/)
    {
	my ($g, $ext, $type, $num) = ($1, $2, $3, $4);
	my $tnum = $tmap{$type};

	#
	# right to left: (cumulative)
	# 17 bits for feature number    (0)
	# 4 bits for type		(17)
	# 15 bits for ext		(21)
	# Rest for genome 		(36)

	my $enc;

	$enc = $g << 36| $ext << 21 | $tnum << 17 | $num;
	
	return $enc;
    }

    return undef;
}

sub docid_to_fid
{
    my($doc) = @_;

    my($g, $e, $t, $n);

    $g = $doc >> 36;
    $e = ($doc >> 21) & 0x7fff;
    $t = ($doc >> 17) & 0xf;
    $n = $doc & 0x1ffff;
    
    my $type = $tmap{$t};
    my $genome = "$g.$e";
    my $fid = "fig|$genome.$type.$n";

    return $fid;
}


MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3