[Bio] / FigKernelScripts / FFB2_usable_motifs2.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB2_usable_motifs2.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Mon Feb 14 22:44:07 2011 UTC (8 years, 9 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, mgrast_release_3_0, mgrast_dev_03252011, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.2: +45 -8 lines
Update to figfam build code


my $usage = "usage: usable_motifs Dir range value-column input-file";
use IO::File;
use strict;
use List::Util qw(reduce sum);
use Data::Dumper;

my($dir, $range, $value_column, $input_file);
(
 ($dir = shift @ARGV) &&
 ($range = shift @ARGV) &&
 ($value_column = shift @ARGV) &&
 ($input_file = shift @ARGV)
)
    || die $usage;

my $match_re;

if ($value_column == 2)
{
    $match_re = qr(^(\S+)\t(\S+)(?:.*OFF(\d+))?);
}
elsif ($value_column ==3 )
{
    $match_re = qr(^(\S+)\t\S+\tFIG(\d+)(?:.*OFF(\d+))?);
}
else
{
    die "invalid value column\n";
}

my($min_k, $max_k);

if ($range =~ /^\d+$/)
{
    $min_k = $max_k = $range;
}
elsif ($range =~ /^(\d+)-(\d+)/)
{
    $min_k = $1;
    $max_k = $2;
}
else
{
    die "Invalid range '$range'\n";
}

open(INPUT, "-|", "uniq", $input_file) or die "cannot open $input_file: $!";
#open(INPUT, "<", $input_file) or die "cannot open $input_file: $!";

if (! -d $dir) { mkdir($dir,0777) || die "could not make $dir" }
my $i;

my @files;
for ($i=$min_k; ($i <= $max_k); $i++)
{
    if (! -d "$dir/$i" ) { mkdir("$dir/$i",0777) || die "could not make $dir/$i" }
    $files[$i] = IO::File->new("| gzip -c > $dir/$i/good.oligos.gz");
#    $files[$i] = IO::File->new(">$dir/$i/good.oligos");
}

my $line = <INPUT>;
while ($line && ($line =~ $match_re))
{
    my $curr = substr($1,0,$min_k);
    my @pairs;
    while ($line && ($line =~ $match_re) && (substr($1,0,$min_k) eq $curr))
    {
	push(@pairs,[$1,$2,length($1),$3]);  # [Oligo,FuncIndex,length,kmer-offset]
	$line = <INPUT>;
    }
    &process_set(\@pairs,\@files);
}
close(INPUT);

for ($i=$min_k; ($i <= $max_k); $i++)
{
    close($files[$i]);
}


sub process_set {
    my($pairs,$files) = @_;

    my $i;
    for ($i=$min_k; ($i <= $max_k); $i++)
    {
	my $fh = $files->[$i];
	my %counts;
	my %sums;
# 	foreach my $pair (@$pairs)
# 	{
# 	    my($oligo,$funcI,$len) = @$pair;

# 	    my $short_oligo;
# 	    if ($len >= $i)
# 	    {
# 		$short_oligo = substr($oligo,0,$i);
# 		$counts{$short_oligo}->{$funcI}++;
# 	    }
# 	}

	# 
	#  The above optimizes to the following.
	#

	foreach my $pair (@$pairs)
	{
	    my($oligo, $func_index, $len, $offset) = @$pair;
	    if ($len >= $i)
	    {
		my $short_oligo = substr($oligo, 0, $i);

		#
		# Accumulate data to compute average offset (and min/max)
		#
		# tuple[0] = accumulator
		# tuple[1] = count
		# tuple[2] = max
		# tuple[3] = min
		#
		if (!exists($sums{$func_index}))
		{
		    my $tuple = [$offset, 1, $offset, $offset];
		    $sums{$func_index} = $tuple;
		}
		else
		{
		    my $tuple = $sums{$func_index};
		    $tuple->[0] += $offset;
		    $tuple->[1]++;
		    $tuple->[2] = $offset if $offset > $tuple->[2];
		    $tuple->[3] = $offset if $offset < $tuple->[3];
		}
		
		$counts{$short_oligo}->{$pair->[1]}++;
	    }
	}


	foreach my $short_oligo (sort keys(%counts))
	{
	    my $funcI_hash = $counts{$short_oligo};

# 	    my @tmp = sort { $funcI_hash->{$b} <=> $funcI_hash->{$a}} keys(%$funcI_hash);
# 	    my $sum = 0;
# 	    foreach my $funcI (@tmp) { $sum += $funcI_hash->{$funcI} }
	    #
	    # The above optimizes to the following. No need to sort when we
	    # just want the max value.
	    #

	    my($k, $v, $max, $maxv);
	    my $sum = 0;
	    while (($k, $v) = each %$funcI_hash)
	    {
		if ($v > $maxv)
		{
		    $maxv = $v;
		    $max = $k;
		}
		$sum += $v;
	    }

#	    my $sum = sum values %$funcI_hash;
#	    my $max = reduce { $funcI_hash->{$a} > $funcI_hash->{$b} ? $a : $b } keys %$funcI_hash;

	    if ($maxv == $sum)
#	    if ($maxv / $sum > 0.9)
#	    if (($funcI_hash->{$tmp[0]} / $sum) > 0.9)
	    {
		my($fn_total, $fn_count, $fn_max, $fn_min) = @{$sums{$max}};
		my $avg_offset = 0;
		if ($fn_count > 0)
		{
		    $avg_offset = int($fn_total / $fn_count);
		}

		print $fh $short_oligo, "\t", $max, "\t", $avg_offset, "\t", $fn_max, "\t", $fn_min, "\n";
#		print $fh "$short_oligo\t$tmp[0]\t" . $funcI_hash->{$tmp[0]}/$sum . "\t" . $funcI_hash->{$tmp[0]} . "\n";
#		print $fh join("\t",($short_oligo,
#				     $tmp[0],
#				     sprintf("%0.3f",$funcI_hash->{$tmp[0]}/$sum),
#				     $funcI_hash->{$tmp[0]})),"\n";
	    }
	}
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3