[Bio] / FigKernelScripts / generate_kmer_peg_rel.pl Repository:
ViewVC logotype

View of /FigKernelScripts/generate_kmer_peg_rel.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Tue Aug 19 16:40:42 2014 UTC (5 years, 2 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2014_0912, HEAD
Initial commit of Ross kmer compare regions code,
coped from ~overbeek/Ross/GeneralKmerCompareRegions/

use strict;
use Data::Dumper;
use Getopt::Long;
use SeedEnv;

my $usage = "usage: generate_kmer_peg_rel -d Data -s Seqs -g Genomes\n";

# Seqs is a directory.  Files in it are genome IDs.  Each file contains
# translations of pegs.

# Data is a Data directory containing
#
#     Data.Kmers [kmer Data directory]
#    genome_encoding and kmer_to_pegs.db get added

# The Seqs directory determines the genomes used (coreSEED)
####
#
# This generates files in the GenomeData directory 
#
#   perl generate_kmer_peg_rel.pl -d Data -s Seqs -g GenomeData
#
# Each file is for a specific genome.  It contains a table
#
#      [Encoded-kmer,PEG,NumberHits]
#

my $dataD;
my $seqsD;
my $genomes;
my $rc  = GetOptions('d=s' => \$dataD,
		     'g=s' => \$genomes,
                     's=s' => \$seqsD);
if ((! $rc) || (! $dataD) || (! $seqsD) || (! $genomes))
{ 
    print STDERR $usage; exit ;
}

mkdir($genomes,0777);
opendir(SEQS,"$seqsD") || die "could not open $seqsD";
my @genomes = grep { $_ !~ /^\./ } readdir(SEQS);
closedir(SEQS);

foreach my $g (@genomes)
{
    next if (-s "$genomes/$g");
    open(OUT,"| sort -k1,2 -n  -T . > $genomes/$g") || die "could not open $genomes/$g";
    open(KMERS,"kmer_guts -d 1 -D $dataD/Data.Kmers -a < $seqsD/$g | ")
	|| die "could not open kmers";
    my $peg;
    my %hits;
    while (defined($_ = <KMERS>))
    {
	if ($_ =~ /^PROTEIN-ID\s+(\S+)/)
	{
	    my $next_peg = $1;
	    my @kmers = keys(%hits);
	    foreach my $kmer (@kmers)
	    {
		print OUT $kmer,"\t",$peg,"\t",$hits{$kmer},"\n";
	    }
	    $peg = $next_peg;
	    undef %hits;
	}
	elsif ($_ =~ /^HIT\s+\d+\s+(\d+)\s+(\d+)/)
	{
 	    $hits{$1} = $2;
	}
    }
    my @kmers = keys(%hits);
    foreach my $kmer(@kmers)
    {
	print OUT $kmer,"\t",$peg,"\t",$hits{$kmer},"\n";
    }
    close(OUT);
    close(KMERS);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3