[Bio] / FigKernelScripts / FFB2_create_binary_kmers.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB2_create_binary_kmers.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Mon Nov 16 21:52:22 2009 UTC (10 years, 4 months ago) by overbeek
Branch: MAIN
CVS Tags: rast_rel_2010_0526, rast_rel_2010_0118
new scripts for updating FIGfams and Kmer tables

########################################################################
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#
########################################################################


use KmersC;
use strict;
use Data::Dumper;

#
# Scan input once to determine the max value of the value fields. Then create
# the output file and scan again, writing values.
#

my @max;
my @lens;

if ($ARGV[0] eq '-s')
{
    shift;
    @lens =  split(/,/, $ARGV[0]);
    shift;
    print "Set column lengths to @lens\n";
}

@ARGV >= 2 or die "Usage: $0 [-s column-sizes] input-data-file output-binary-file [cols]\n";

my $in_file = shift;
my $out_file = shift;
my @cols = @ARGV;

open(IN, "<", $in_file) or die "Cannot open $in_file: $!";

my $motif_len;
while (<IN>)
{
    chomp;
    my($motif, @vals) = split(/\t/);
    if (!defined($motif_len))
    {
	$motif_len = length($motif);
    }

    if (@lens)
    {
	last;
    }
    
    if (@cols)
    {
	@vals = @vals[@cols];
    }

    for my $i (0 .. $#vals)
    {
	$max[$i] = $vals[$i] if $vals[$i] > $max[$i];
    }
}

if (!@lens)
{
    for my $m (@max)
    {
	my $l;
	if ($m < 128)
	{
	    $l = 1;
	}
	elsif ($m < 32768)
	{
	    $l = 2;
	}
	elsif ($m < 2147483648)
	{
	    $l = 4;
	}
	else
	{
	    die "Max value $m is greater than a 4-byte signed int";
	}
	push(@lens, $l);
    }
}
print STDERR "Computed max lengths: @max and byte sizes: @lens\n";

my $cr = new KmersFileCreator(0xfeedface, $motif_len, 0, \@lens);
$cr->open_file($out_file);
$cr->write_file_header();

seek(IN, 0, 0);

my $count = 0;
while (<IN>)
{
    chomp;
    my($motif, @vals) = split(/\t/);
    if (@cols)
    {
	@vals = @vals[@cols];
    }
    $cr->write_entry($motif, \@vals);
    $count++;
}
$cr->close_file();

print "Loaded $count oligos\n";

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3