[Bio] / FigKernelScripts / FFB2_usable_motifs.pl Repository:
ViewVC logotype

View of /FigKernelScripts/FFB2_usable_motifs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (download) (as text) (annotate)
Fri Oct 29 17:10:34 2010 UTC (9 years, 5 months ago) by olson
Branch: MAIN
CVS Tags: mgrast_dev_08112011, mgrast_dev_08022011, rast_rel_2014_0912, myrast_rel40, mgrast_dev_05262011, mgrast_dev_04082011, mgrast_version_3_2, mgrast_dev_12152011, mgrast_dev_06072011, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2011_0119, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, mgrast_dev_04012011, myrast_33, rast_rel_2011_0928, mgrast_dev_04052011, mgrast_dev_02222011, mgrast_dev_10262011, HEAD
Changes since 1.3: +5 -0 lines
Update to FF processing

my $usage = "usage: usable_motifs Dir";
use IO::File;
use strict;
use List::Util qw(reduce sum);
use Data::Dumper;

my $dir;
(
 ($dir = shift @ARGV)
)
    || die $usage;

my $range = "7-12";
if (@ARGV)
{
    $range = shift;
}

my($min_k, $max_k);

if ($range =~ /^\d+$/)
{
    $min_k = $max_k = $range;
}
elsif ($range =~ /^(\d+)-(\d+)/)
{
    $min_k = $1;
    $max_k = $2;
}
else
{
    die "Invalid range '$range'\n";
}

if (! -d $dir) { mkdir($dir,0777) || die "could not make $dir" }
my $i;

my @files;
for ($i=$min_k; ($i <= $max_k); $i++)
{
    if (! -d "$dir/$i" ) { mkdir("$dir/$i",0777) || die "could not make $dir/$i" }
    $files[$i] = IO::File->new("| gzip -c > $dir/$i/good.oligos.gz");
#    $files[$i] = IO::File->new(">$dir/$i/good.oligos");
}

my $line = <STDIN>;
while ($line && ($line =~ /^(\S+)\t(\S+)/))
{
    my $curr = substr($1,0,$min_k);
    my @pairs;
    while ($line && ($line =~ /^(\S+)\t(\S+)/) && (substr($1,0,$min_k) eq $curr))
    {
	push(@pairs,[$1,$2,length($1)]);  # [Oligo,FuncIndex]
	$line = <STDIN>;
    }
    &process_set(\@pairs,\@files);
}

for ($i=$min_k; ($i <= $max_k); $i++)
{
    close($files[$i]);
}

sub process_set {
    my($pairs,$files) = @_;

    my $i;
    for ($i=$min_k; ($i <= $max_k); $i++)
    {
	my $fh = $files->[$i];
	my %counts;
# 	foreach my $pair (@$pairs)
# 	{
# 	    my($oligo,$funcI,$len) = @$pair;

# 	    my $short_oligo;
# 	    if ($len >= $i)
# 	    {
# 		$short_oligo = substr($oligo,0,$i);
# 		$counts{$short_oligo}->{$funcI}++;
# 	    }
# 	}

	# 
	#  The above optimizes to the following.
	#

	for (@$pairs)
	{
	    $counts{substr($_->[0], 0, $i)}->{$_->[1]}++ if $_->[2] >= $i;
	}


	foreach my $short_oligo (sort keys(%counts))
	{
	    my $funcI_hash = $counts{$short_oligo};

# 	    my @tmp = sort { $funcI_hash->{$b} <=> $funcI_hash->{$a}} keys(%$funcI_hash);
# 	    my $sum = 0;
# 	    foreach my $funcI (@tmp) { $sum += $funcI_hash->{$funcI} }
	    #
	    # The above optimizes to the following. No need to sort when we
	    # just want the max value.
	    #

	    my($k, $v, $max, $maxv);
	    my $sum = 0;
	    while (($k, $v) = each %$funcI_hash)
	    {
		if ($v > $maxv)
		{
		    $maxv = $v;
		    $max = $k;
		}
		$sum += $v;
	    }

#	    my $sum = sum values %$funcI_hash;
#	    my $max = reduce { $funcI_hash->{$a} > $funcI_hash->{$b} ? $a : $b } keys %$funcI_hash;
	    
	    if ($maxv / $sum > 0.9)
#	    if (($funcI_hash->{$tmp[0]} / $sum) > 0.9)
	    {
		print $fh $short_oligo, "\t", $max, "\n";
#		print $fh "$short_oligo\t$tmp[0]\t" . $funcI_hash->{$tmp[0]}/$sum . "\t" . $funcI_hash->{$tmp[0]} . "\n";
#		print $fh join("\t",($short_oligo,
#				     $tmp[0],
#				     sprintf("%0.3f",$funcI_hash->{$tmp[0]}/$sum),
#				     $funcI_hash->{$tmp[0]})),"\n";
	    }
	}
    }
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3