[Bio] / FigKernelScripts / feature_length_histogram.pl Repository:
ViewVC logotype

View of /FigKernelScripts/feature_length_histogram.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Fri Nov 30 21:29:06 2007 UTC (12 years, 2 months ago) by gdpusch
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07
Changes since 1.2: +32 -9 lines
Fixed duplicated ID bug, plus minor cosmetic changes. -- /gdp

# -*- perl -*-
########################################################################
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
########################################################################

$0 =~ m/([^\/]+)$/;
my $self  =  $1;
my $usage = "$self  [-norm] [-null] [-nolabel] [-pept] [-locus] < tbl.file > tbl.cumul 2> summary";

if (defined($ARGV[0]) && ($ARGV[0] =~ m/-help/))
{
    print STDERR "\n\t$usage\n\n";
    exit(1);
}

$norm = $null = $nolabel = $pept = $locus = $file = "";
while (@ARGV)
{
    if    ($ARGV[0] =~ m/-norm/)     { $norm    = shift; }
    elsif ($ARGV[0] =~ m/-null/)     { $null    = shift; }
    elsif ($ARGV[0] =~ m/-nolabel/)  { $nolabel = shift; }
    elsif ($ARGV[0] =~ m/-pept/)     { $pept    = shift; }
    elsif ($ARGV[0] =~ m/-locus/)    { $locus   = shift; }
    elsif (-s $ARGV[0])              { $file    = shift; }
    else  { die "Invalid arg $ARGV[0] --- usage: $usage"; }
}

if ($file) { 
    open(FILE, "<$file") || die "could not read-open $file";
    $fh = \*FILE; 
}
else {
    $fh = \*STDIN;
}


$chars = 0;
$total = 0;
while (defined($entry = <$fh>))
{
    chomp $entry;
    if ($entry =~ m/^(\S+)\t(\S+\_(\d+)_(\d+))/)
    {
	++$total;
	($id, $loc, $beg, $end) = ($1, $2, $3, $4);
    }
    elsif ($entry =~ m/^(\S+\_(\d+)_(\d+))/)
    {
	++$total;
	($id, $loc, $beg, $end) = ($1, $1, $2, $3);
    }
    else
    {
	next;
    }
    
    $len = (1 + abs($end-$beg));
    if ($pept)
    {
	$len /= 3;
    }
    
    unless (defined($histo{$len})) 
    { 
	$histo{$len}  = 0;
	unless ($nolabel || $null)  { $id_set{$len} = []; }
    }
    
    $chars       += $len;
    $histo{$len} += 1;
    
    unless ($nolabel || $null)  {
	$x = $id_set{$len};
	if ($locus) {
	    push(@$x, $loc);
	}
	else {
	    push(@$x, $id);
	}
    }
}

$cumul  = 0;
$expect = 0;
foreach $len (sort {$a <=> $b} keys %histo)
{
    $expect += $histo{$len} * $len;
    
    $cumul  += $histo{$len};
    
    if (! defined($min))     { $min = $len; }
    
    if ((! defined($median)) && ($cumul > $total/2))  { $median = $len; }
    
    $plot = $norm ? ($cumul/$total) : $cumul ;
    
    unless ($null)
    {
	print "$len\t$histo{$len}\t$plot";
	print "\t", join(", ", @{$id_set{$len}}) unless ($nolabel || $null);
	print "\n";
    }
    
    $max = $len;
}
$expect  = int(0.5 + 10*$expect/$cumul)/10;

print STDERR "\nThere are $chars chars in $total seqs.";
print STDERR "\nmin length = $min, median length = $median, mean length = $expect, max length = $max\n\n";

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3