[Bio] / FigKernelScripts / bayes_expected_start_scores.pl Repository:
ViewVC logotype

View of /FigKernelScripts/bayes_expected_start_scores.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (download) (as text) (annotate)
Thu Apr 19 11:06:01 2007 UTC (12 years, 10 months ago) by overbeek
Branch: MAIN
CVS Tags: mgrast_dev_08112011, rast_rel_2009_05_18, mgrast_dev_08022011, rast_rel_2014_0912, rast_rel_2008_06_18, myrast_rel40, rast_rel_2008_06_16, mgrast_dev_05262011, rast_rel_2008_12_18, mgrast_dev_04082011, rast_rel_2008_07_21, rast_rel_2010_0928, rast_2008_0924, mgrast_version_3_2, mgrast_dev_12152011, rast_rel_2008_04_23, mgrast_dev_06072011, rast_rel_2008_09_30, rast_rel_2009_0925, rast_rel_2010_0526, rast_rel_2014_0729, mgrast_dev_02212011, rast_rel_2010_1206, mgrast_release_3_0, mgrast_dev_03252011, rast_rel_2010_0118, mgrast_rel_2008_0924, mgrast_rel_2008_1110_v2, rast_rel_2009_02_05, rast_rel_2011_0119, mgrast_rel_2008_0625, mgrast_release_3_0_4, mgrast_release_3_0_2, mgrast_release_3_0_3, mgrast_release_3_0_1, mgrast_dev_03312011, mgrast_release_3_1_2, mgrast_release_3_1_1, mgrast_release_3_1_0, mgrast_dev_04132011, rast_rel_2008_10_09, mgrast_dev_04012011, rast_release_2008_09_29, mgrast_rel_2008_0806, mgrast_rel_2008_0923, mgrast_rel_2008_0919, rast_rel_2009_07_09, rast_rel_2010_0827, mgrast_rel_2008_1110, myrast_33, rast_rel_2011_0928, rast_rel_2008_09_29, mgrast_rel_2008_0917, rast_rel_2008_10_29, mgrast_dev_04052011, mgrast_dev_02222011, rast_rel_2009_03_26, mgrast_dev_10262011, rast_rel_2008_11_24, rast_rel_2008_08_07, HEAD
Changes since 1.2: +92 -28 lines
Added comments to expalin logic. -- gdp

# -*- perl -*-
#
# Copyright (c) 2003-2006 University of Chicago and Fellowship
# for Interpretations of Genomes. All Rights Reserved.
#
# This file is part of the SEED Toolkit.
# 
# The SEED Toolkit is free software. You can redistribute
# it and/or modify it under the terms of the SEED Toolkit
# Public License. 
#
# You should have received a copy of the SEED Toolkit Public License
# along with this program; if not write to the University of Chicago
# at info@ci.uchicago.edu or the Fellowship for Interpretation of
# Genomes at veronika@thefig.info or download a copy from
# http://www.theseed.org/LICENSE.TXT.
#


use FIG;

use Memoize;
memoize('binom');

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#...Script to re-score STARTs by bayes expected risk
# Arguments:
#   avg_peg_len    = Average length of a PEG (usually ~267 AA)
#   avg_leader_len = Average length of upstream region
#-----------------------------------------------------------------------

$0 =~ m/^([^\/]+)$/;
$usage = "$1  avg_peg_len avg_leader_len  < scored.start.objects > re-scored.start.objects";

((($avg_peg_len, $avg_leader_len) = @ARGV) == 2)
    || die "usage: $usage";

#...Convert parameters to probabilities,
#   assuming Negative Binomial distribution of PEG lengths,
#   and geometrical distribution of upstream region lengths
$nu = 2;
$p_peg    = $nu / ($nu + $avg_peg_len/3);
$p_leader = 1 / (1 + $avg_leader_len/3);


#...Penalities elicited from Ross for "short" or "long" calls
#   (probably should have been parameters, with these as defaults):
$short_call_weight = -0.50;
$long_call_weight  = -0.25;

#... End Of Record marker (NOTE: subrecords end with "//\n")
$/ = "///\n";

#... $tick and $tock are counters for progress display
$tick = $tock = 0;
while (defined($entry = <STDIN>)) {
    
    #...trim off End of Record marker
    chomp $entry;
    
    #...Print dot to STDERR every 100 records:
    if ($tick >= 100) { print STDERR ".";  $tick = 0; $tock++; } else { $tick++; }
    #...Print newline to STDERR every 50 dots:
    if ($tock >= 50)  { print STDERR "\n"; $tock = 0; }
    
    #...Split subrecords at "//\n" mark:
    @calls  = split m{//\n}, $entry;
    
    #...Shift off record header, leaving just call subrecords:
    $header = shift @calls;
    
    #...Can't remember why this is here, but it appears
    #   to pop off the last call subrecord if it is null;
    #   perhaps one of the ealier programs left a null subrecord
    #   at the end of the call block ???
    unless ($calls[-1]) { pop @calls; }
    
    #...Subrecords consists of list of fields of form "KEY=VALUE\n";
    #   split header subrecord at newlines, and store as a hash:
    %header_fields = map { m/^([^=]+)=([^=]+)$/; $1 => $2 } split /\n/, $header;
    
    #...Extract the begining and end coordinates of the ORF, and compute its length:
    $orf_beg = $header_fields{BEGIN};
    $orf_end = $header_fields{END};
    $orf_len = (1 + abs($orf_end - $orf_beg)) / 3;
    
    #...Split call subrecords at newlines, and convert to hash keyed on START offsets;
    #   for convenience, extract score values into separate hash.
    undef %scores;   #...Clear out scores from last set of calls
    foreach $call (@calls) {
	
	$fields  = {};
	%$fields = map { m/^([^=]+)=([^=]+)$/; $1 => $2 } split /\n/, $call;
	
	$calls{$fields->{START}}  = $fields;
	$scores{$fields->{START}} = $fields->{SCORE};
    }
    
    #...extract sorted list of START offsets:
    @candidates = sort { $a <=> $b } keys %scores;
    
    #...Compute likelihoods of START candidate offsets,
    #   and accumulate normalization constant:
    $normalization = 0;
    foreach $candidate (@candidates) {
	
	$called_len  = $orf_len - ($candidate-1)/3;
	
	$prob_leader = $p_leader * (1-$p_leader)**(($candidate-1)/3);
	$prob_peg    = &binom(($called_len + $nu - 1), $called_len)
	             * $p_peg**$nu * (1-$p_peg)**$called_len;
		
	$unnormed_prob{$candidate} = $prob_leader * $prob_peg;
	
	$normalization += $unnormed_prob{$candidate};
    }
    
    #...Find best bayes expected payoff:
    $best_pos    =  0;              #...Probably should have defaulted to either leftmost or original offset
    $best_expect = -9.99999999e99;  #...Initialize to score worse than any possible score
    
    #...Loop over all possible START candidates ("states of nature")
    foreach $called_pos (@candidates) {
	$expectation = 0;  #...Initialize expacted value to zero
	
	#...Loop over possible START candidate decisions ("actions")
	foreach $candidate (@candidates) {
	    
	    $unnormed_prob = $unnormed_prob{$candidate};
	    
	    if    ($called_pos < $candidate) {
		#...Penalize score if decision call is longer than assumed "state of nature"
		#   (i.e., decision is to the left of (smaller than) "state of nature"),
		#   and weight by probability that assumed "state of nature" is the true START
		#   (Arguably should use score for "state of nature," not "decision" call):
		$expectation += $unnormed_prob * $long_call_weight  * $scores{$candidate};
	    }
	    elsif ($called_pos > $candidate) {
		#...Penalize score if decision call is shorter than assumed "state of nature"
		#   (i.e., decision is to the right of (larger than) "state of nature"),
		#   and weight by probability that assumed "state of nature" is the true START
		#   (Arguably should use score for "state of nature," not "decision" call):
		$expectation += $unnormed_prob * $short_call_weight * $scores{$candidate};
	    }
	    else
	    {
		#...$called_pos == $candidate, so accept unpenalized score
		#   when decision and "state of nature" are identical, weighted
		#   by probability that assumed "state of nature" is the true START:
		$expectation += $unnormed_prob * $scores{$candidate};
	    }
	}
	$expectation /= $normalization;   #...Normalize expected score
	
	#...If expected score of decision is better than best seen so far,
	#   accept it as new best score:
	if ($expectation > $best_expect)
	{
	    $best_pos    = $called_pos;
	    $best_expect = $expectation;
	}
	
	#...Copy original score to "RAW_SCORE" field,
	#   and replace score with bayes expected score:
	$calls{$called_pos}->{RAW_SCORE} = $calls{$called_pos}->{SCORE};
	$calls{$called_pos}->{SCORE}     = $expectation;
    }
    
    
    #...Convert offset with best bayes expected score to new START coordinate:
    if ($orf_beg < $orf_end) {
	$new_start = $orf_beg + $best_pos - 1;
    }
    else {
	$new_start = $orf_beg - $best_pos + 1;
    }
    
    #...Add best result to header:
    $header_fields{NEW_START_POS} = $best_pos;
    $header_fields{NEW_START_LOC} = "$header_fields{CONTIG}\_$new_start\_$orf_end";
    
    #...Print out header key/value pairs:
    while (($k, $v) = each %header_fields) { print "$k=$v\n"; }
    print "//\n";   #...print subrecord separator...
    
    #...Print out key/value pairs for re-scored calls, sorted by new scores:
    foreach $candidate (sort { $calls{$b}->{SCORE} <=> $calls{$a}->{SCORE} } keys %calls)
    {
	$call = $calls{$candidate};
	while (($k, $v) = each %$call) { print "$k=$v\n"; }
	print "//\n";
    }
    
    #...Print "End of Record" marker:
    print "///\n";
}
print STDERR "\n\n";



#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++    
#...Recursive computation of generalized binomial coefficients
#   extrapolated to "negative" arguments using gamma-function identities,
#   and using various identities to make the calculation more efficient:
#-----------------------------------------------------------------------    
sub binom {
    my ($n, $k) = @_;
    my ($sign, $binom);

    $sign = 1;
    if ($n < 0)    { $n = ($k - $n - 1); $sign = -1 if ($k % 2); }
    if ($k > $n/2) { $k = ($n - $k); }
#   print STDERR "n = $n, k = $k\n";

    return 0 if ($k < 0);
    return 1 if ($k == 0);

    $binom = &binom($n-1, $k) + &binom($n-1, $k-1);

    return ($sign * $binom);
}

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3