[Bio] / FigKernelScripts / get_close_pegs.pl Repository:
ViewVC logotype

View of /FigKernelScripts/get_close_pegs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Tue Aug 19 16:40:42 2014 UTC (5 years, 2 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2014_0912, HEAD
Initial commit of Ross kmer compare regions code,
coped from ~overbeek/Ross/GeneralKmerCompareRegions/

use strict;
use Data::Dumper;
use Getopt::Long;
use SeedEnv;
use DB_File;
use gjoseqlib;

my $dataD;
my $dataD_kmers;
my $rc  = GetOptions('d=s' => \$dataD);

my $usage = "usage: get_close_pegs -d Data  < request-as-seqs > repsonses\n";

if ((! $rc) || (! $dataD) || (! "$dataD/$dataD_kmers"))
{ 
    print STDERR $usage; exit ;
}

my %kmer_to_pegs;
tie %kmer_to_pegs, "DB_File", "$dataD/kmer_to_pegs.db", O_RDWR|O_CREAT, 0666, $DB_HASH 
	or die "Cannot open file kmer_to_pegs.db: $!\n";

my %genomeN_to_genome = map { ($_ =~ /^(\d+)\t(\S+)/) ? ($1 => $2) : () } `cat $dataD/genome_encoding`;

my @seqs = &gjoseqlib::read_fasta;
foreach my $tuple (@seqs)
{
    my($id,undef,$seq) = @$tuple;
    my $seq = uc $seq;
    my $kmers = &get_kmers($dataD_kmers,$seq);
#    print STDERR &Dumper(['kmers hit',$kmers]);
    my %hits;
    foreach my $kmer (@$kmers)
    {
	my $pegsN = $kmer_to_pegs{$kmer};
	foreach my $pegN (split(/,/,$pegsN))
	{
	    $hits{$pegN}++;
	}
    }
    my @sorted = map { [$_,$hits{$_}] } sort { $hits{$b} <=> $hits{$a} } keys(%hits);

    my @output;
    foreach my $tuple (@sorted)
    {
	my($pegE,$n) = @$tuple;
	my $g = $pegE >> 15;
	my $pegN = $pegE & ((2 ** 15) - 1);
	my $peg = "fig|" . $genomeN_to_genome{$g} . ".peg." . $pegN;
	push(@output,[$peg,$n]);
    }
    foreach $_ (@output)
    {
	print join("\t",@$_),"\n";
    }
    print "//\n";
}
untie %kmer_to_pegs;

sub get_kmers {
    my($dataD_kmers,$seq) = @_;

    open(TMP,">tmp.$$") || die "could not open tmp.$$";
    print TMP ">peg\n$seq\n";
    close(TMP);
    
#    open(SIZE,"<$dataD/hash.size") || die "where is hash.size?";
#    my $hash_size = <SIZE>;
#    chomp $hash_size;

    my @hits = map { ($_ =~ /^HIT\s+\d+\s+(\d+)/) ? $1 : () }
               `kmer_guts -d 1 -D $dataD/Data.Kmers -a < tmp.$$`;
#               `kmer_guts -d 1 -D $dataD/Data.Kmers -s $hash_size -a < tmp.$$`;
    unlink("tmp.$$");
    my %hits = map { ($_ => 1) } @hits;
    return [sort { $a <=> $b } keys(%hits)];
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3