[Bio] / FigKernelScripts / FFB2_create_binary_kmers.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB2_create_binary_kmers.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.6 - (download) (as text) (annotate)
Mon Feb 14 22:44:07 2011 UTC (8 years, 9 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.5: +10 -8 lines
Update to figfam build code

########################################################################
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#
########################################################################


use KmersC;
use strict;
use Data::Dumper;
use PerlIO::gzip;
use SortMerge;
use Getopt::Long;

#
# Scan input once to determine the max value of the value fields. Then create
# the output file and scan again, writing values.
#

my @max;
my @lens;
my $motif_len;
my $column_sizes;

#
# Multiline input is of the form
#  motif column value
# where the column field defines the column into which the data should go.
# It is 1-based.
#
my $multiline;
my $merge = '';

my $rc = GetOptions("size=s" => \$column_sizes,
		    "multiline" => \$multiline,
		    "merge=s" => \$merge,
		    "length=s" => \$motif_len);

if (!$rc || @ARGV < 2)
{
    die "Usage: $0 [-s column-sizes] [-l motif-len] input-data-file output-binary-file [cols]\n";
}

if ($column_sizes)
{
    @lens =  split(/,/, $column_sizes);
    print "Set column lengths to @lens\n";
}

my $in_file = shift;
my $out_file = shift;
my @cols = @ARGV;

#
# for merge, we don't look at input fh.
#
my $input_fh;
if ($merge eq '')
{
    if ($in_file eq '-')
    {
	if (@lens == 0 || !defined($motif_len))
	{
	    die "In order to read from stdin, both the -s and the -l arguments must be specified\n";
	}
	$input_fh = \*STDIN;
    }
    else
    {
	open($input_fh, "<", $in_file) or die "Cannot open $in_file: $!";
	
	while (<$input_fh>)
	{
	    chomp;
	    my($motif, @vals) = split(/\t/);
	    if (!defined($motif_len))
	    {
		$motif_len = length($motif);
	    }
	    
	    if (@lens)
	    {
		last;
	    }
	    
	    if (@cols)
	    {
		@vals = @vals[@cols];
	    }
	    
	    for my $i (0 .. $#vals)
	    {
		$max[$i] = $vals[$i] if $vals[$i] > $max[$i];
	    }
	}
	
	seek($input_fh, 0, 0);
    }
}
else
{
    if (@lens == 0 || !defined($motif_len))
    {
	die "In order to merge, both the -s and the -l arguments must be specified\n";
    }
}

if (!@lens)
{
    for my $m (@max)
    {
	my $l;
	if ($m < 128)
	{
	    $l = 1;
	}
	elsif ($m < 32768)
	{
	    $l = 2;
	}
	elsif ($m < 2147483648)
	{
	    $l = 4;
	}
	else
	{
	    die "Max value $m is greater than a 4-byte signed int";
	}
	push(@lens, $l);
    }
}

my $num_keys = @lens;

print STDERR "Computed max lengths: @max and byte sizes: @lens\n";

my $cr = new KmersFileCreator(0xfeedface, $motif_len, 0, \@lens);
$cr->open_file($out_file);
$cr->write_file_header();

my $count = 0;

if ($merge)
{
    my @code;
    my @fh;

    if (!open(MF, "<", $merge))
    {
	die "Cannot open merge file $merge: $!";
    }

    my %by_col;
    
    while (<MF>)
    {
	chomp;
	my($file, $col, $src_col) = split(/\t/);
	my $fh;
	if ($file =~ /\.gz$/)
	{
	    open($fh, "<:gzip", $file) or die "Cannot open $file with gzip: $!";
#	    open($fh, "-|", "gunzip", "-c", $file) or die "Cannot open $file with gzip: $!";
	}
	else
	{
	    open($fh, "<", $file) or die "Cannot open $file: $!";
	}
	$src_col = 2 if !defined($src_col);
	push(@{$by_col{$col}}, [$file, $fh, $src_col]);
    }
    close(MF);
    for my $col (sort keys %by_col)
    {
	my $fhlist = $by_col{$col};

	push @code, sub {
	    my $ent = $fhlist->[0];
	    my($file, $fh, $src_col) = @$ent;
	    my $l = <$fh>;
	    while (!defined($l))
	    {
		close($fh);
		shift @$fhlist;
		if (@$fhlist == 0)
		{
		    return ();
		}
		my $ent = $fhlist->[0];
		($file, $fh, $src_col) = @$ent;
		print STDERR "Moving to $file for column $col src_col=$src_col\n";
		$l = <$fh>;
	    }

	    if (my @d = split(/\t/, $l))
	    {
		my $motif = $d[0];
		my $val = $d[$src_col - 1];
		return ($motif, [$motif, $col, $val]);
	    }
	    else
	    {
		die "Badly formatted input at line $. file $file: $l\n";
	    }
	};
    }

    my $writer = MultilineWriter->new($cr, $num_keys);

    Sort::Merge::sort_coderefs(\@code,
			       sub {
				   my($code, $key, $value) = @{$_[0]};
				   # print "@$value\n";
				   $writer->push_one_value(@$value);
			       });
    $writer->finish();

    $count = $writer->{count};
}
elsif ($multiline)
{
    my $writer = MultilineWriter->new($cr, $num_keys);
    while (defined(my $l = <$input_fh>))
    {
	if ($l =~ /^(\S+)\t(\d+)\t(\d+)/)
	{
	    my($motif, $col, $val) = ($1, $2, $3);
	    $writer->push_one_value($motif, $col, $val);
	}
	else
	{
	    warn "Badly formatted input at $.: $l\n";
	}
    }
    close($input_fh);
    $writer->finish();
    $count = $writer->count();
}
else
{
    while (<$input_fh>)
    {
	chomp;
	my($motif, @vals) = split(/\t/);
	if (@cols)
	{
	    @vals = @vals[@cols];
	}
	$cr->write_entry($motif, \@vals);
	$count++;
    }
}
$cr->close_file();

print "Loaded $count oligos\n";

package MultilineWriter;
use strict;
use base 'Class::Accessor';

__PACKAGE__->mk_accessors(qw(cr cur val count));

sub new
{
    my($class, $cr, $num_keys) = @_;

    my $self = {
	cr => $cr,
	cur => '',
	count => 0,
	num_keys => $num_keys,
	blank_val => [map { -1 } 1..$num_keys],
    };
    bless $self, $class;
    $self->set_blank();
    return $self;
}

sub set_blank
{
    my($self) = @_;
    @{$self->{val}} = @{$self->{blank_val}};
}

sub push_one_value
{
    my($self, $motif, $col, $val) = @_;
    # print "$self->{cur} $motif $col $val\n";
    if ($self->{cur} ne $motif)
    {
	if ($self->{cur} ne '')
	{
	    if (@{$self->{val}} != $self->{num_keys})
	    {
		die "Val became invalid: " . Dumper($self->{val});
	    }
	    if ($self->{val}->[0] >= 0)
	    {
		#print "write @{$self->{val}}\n";
		$self->{cr}->write_entry($self->{cur}, $self->{val});
		$self->{count}++;
	    }
	    $self->set_blank;
	}
	$self->{cur} = $motif;
    }
    $self->{val}->[$col - 1] = 0 + $val;
}

sub finish
{
    my($self) = @_;
    if ($self->{val}->[0] >= 0)
    {
	$self->{cr}->write_entry($self->{cur}, $self->{val});
	$self->{count}++;
	$self->set_blank;
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3