[Bio] / FigKernelScripts / FFB3_usable_motifs.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB3_usable_motifs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (download) (as text) (annotate)
Tue Jan 10 13:18:14 2012 UTC (7 years, 10 months ago) by olson
Branch: MAIN
CVS Tags: rast_rel_2014_0729, mgrast_version_3_2, rast_rel_2014_0912, HEAD
New figfam building code.



use IO::File;
use strict;
use List::Util qw(reduce sum);
use Data::Dumper;
use Getopt::Long;
use PerlIO::gzip;

my $usage = "usage: FFB3_usable_motifs [-no-offset] Dir range value-column input-file";

my $no_offset;
my $rc = GetOptions("no-offset" => \$no_offset);

($rc && @ARGV == 4) or die $usage;

my $dir = shift;
my $range = shift;
my $value_column = shift;
my $input_file = shift;

if ($value_column != 2 && $value_column != 3)
{
    die "Invalid value column: must be 2 or 3\n";
}

my $match_re;

if ($no_offset)
{
    if ($value_column == 2)
    {
	$match_re = qr(^(\S+)\t(\S+));
    }
    elsif ($value_column == 3)
    {
	$match_re = qr(^(\S+)\t\S+\tFIG(\d+));
    }
}
else
{
    if ($value_column == 2)
    {
	$match_re = qr(^(\S+)\t(\S+)(?:.*OFF(\d+))?);
    }
    elsif ($value_column == 3)
    {
	$match_re = qr(^(\S+)\t\S+\tFIG(\d+)(?:.*OFF(\d+))?);
    }
}
    

my($min_k, $max_k);

if ($range =~ /^\d+$/)
{
    $min_k = $max_k = $range;
}
elsif ($range =~ /^(\d+)-(\d+)/)
{
    $min_k = $1;
    $max_k = $2;
}
else
{
    die "Invalid range '$range'\n";
}

open(INPUT, "-|", "uniq", $input_file) or die "cannot open $input_file: $!";
#open(INPUT, "<", $input_file) or die "cannot open $input_file: $!";

if (! -d $dir) { mkdir($dir,0777) || die "could not make $dir" }
my $i;

my @files;
for ($i=$min_k; ($i <= $max_k); $i++)
{
    if (! -d "$dir/$i" ) { mkdir("$dir/$i",0777) || die "could not make $dir/$i" }
    open($files[$i], ">:gzip", "$dir/$i/good.oligos.gz") or die "could not write to $dir/$i/good.oligos.gz: $!";
    
#    $files[$i] = IO::File->new("| gzip -c > $dir/$i/good.oligos.gz");
#    $files[$i] = IO::File->new(">$dir/$i/good.oligos");
}

my $line = <INPUT>;
while ($line && ($line =~ $match_re))
{
    my $curr = substr($1,0,$min_k);
    my @pairs;
    while ($line && ($line =~ $match_re) && (substr($1,0,$min_k) eq $curr))
    {
	push(@pairs,[$1,$2,length($1),$3]);  # [Oligo,FuncIndex,length,kmer-offset]
	$line = <INPUT>;
    }
    &process_set(\@pairs,\@files);
}
close(INPUT);

for ($i=$min_k; ($i <= $max_k); $i++)
{
    close($files[$i]);
}


sub process_set {
    my($pairs,$files) = @_;

    my $i;
    for ($i=$min_k; ($i <= $max_k); $i++)
    {
	my $fh = $files->[$i];
	my %counts;
	my %sums;
# 	foreach my $pair (@$pairs)
# 	{
# 	    my($oligo,$funcI,$len) = @$pair;

# 	    my $short_oligo;
# 	    if ($len >= $i)
# 	    {
# 		$short_oligo = substr($oligo,0,$i);
# 		$counts{$short_oligo}->{$funcI}++;
# 	    }
# 	}

	# 
	#  The above optimizes to the following.
	#

	foreach my $pair (@$pairs)
	{
	    my($oligo, $func_index, $len, $offset) = @$pair;
	    if ($len >= $i)
	    {
		my $short_oligo = substr($oligo, 0, $i);

		#
		# Accumulate dtaa to compute average offset (and min/max)
		#
		# tuple[0] = accumulator
		# tuple[1] = count
		# tuple[2] = max
		# tuple[3] = min
		#
		if (!exists($sums{$func_index}))
		{
		    my $tuple = [$offset, 1, $offset, $offset];
		    $sums{$func_index} = $tuple;
		}
		else
		{
		    my $tuple = $sums{$func_index};
		    $tuple->[0] += $offset;
		    $tuple->[1]++;
		    $tuple->[2] = $offset if $offset > $tuple->[2];
		    $tuple->[3] = $offset if $offset < $tuple->[3];
		}
		
		$counts{$short_oligo}->{$pair->[1]}++;
	    }
	}


	foreach my $short_oligo (sort keys(%counts))
	{
	    my $funcI_hash = $counts{$short_oligo};

# 	    my @tmp = sort { $funcI_hash->{$b} <=> $funcI_hash->{$a}} keys(%$funcI_hash);
# 	    my $sum = 0;
# 	    foreach my $funcI (@tmp) { $sum += $funcI_hash->{$funcI} }
	    #
	    # The above optimizes to the following. No need to sort when we
	    # just want the max value.
	    #

	    my($k, $v, $max, $maxv);
	    my $sum = 0;
	    while (($k, $v) = each %$funcI_hash)
	    {
		if ($v > $maxv)
		{
		    $maxv = $v;
		    $max = $k;
		}
		$sum += $v;
	    }

#	    my $sum = sum values %$funcI_hash;
#	    my $max = reduce { $funcI_hash->{$a} > $funcI_hash->{$b} ? $a : $b } keys %$funcI_hash;

	    if ($maxv == $sum)
#	    if ($maxv / $sum > 0.9)
#	    if (($funcI_hash->{$tmp[0]} / $sum) > 0.9)
	    {
		if ($no_offset)
		{
		    print $fh $short_oligo, "\t", $max, "\n";
		}
		else
		{
		    my($fn_total, $fn_count, $fn_max, $fn_min) = @{$sums{$max}};
		    my $avg_offset = 0;
		    if ($fn_count > 0)
		    {
			$avg_offset = int($fn_total / $fn_count);
		    }
		    
		    print $fh $short_oligo, "\t", $max, "\t", $avg_offset, "\t", $fn_max, "\t", $fn_min, "\n";
		}
#		print $fh "$short_oligo\t$tmp[0]\t" . $funcI_hash->{$tmp[0]}/$sum . "\t" . $funcI_hash->{$tmp[0]} . "\n";
#		print $fh join("\t",($short_oligo,
#				     $tmp[0],
#				     sprintf("%0.3f",$funcI_hash->{$tmp[0]}/$sum),
#				     $funcI_hash->{$tmp[0]})),"\n";
	    }
	}
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3