[Bio] / FigKernelScripts / bayes_expected_start_scores.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/bayes_expected_start_scores.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : overbeek 1.1 # -*- perl -*-
2 : olson 1.2 #
3 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     #
18 :    
19 : overbeek 1.1
20 :     use FIG;
21 :    
22 :     use Memoize;
23 :     memoize('binom');
24 :    
25 : overbeek 1.3 #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
26 :     #...Script to re-score STARTs by bayes expected risk
27 :     # Arguments:
28 :     # avg_peg_len = Average length of a PEG (usually ~267 AA)
29 :     # avg_leader_len = Average length of upstream region
30 :     #-----------------------------------------------------------------------
31 :    
32 : overbeek 1.1 $0 =~ m/^([^\/]+)$/;
33 :     $usage = "$1 avg_peg_len avg_leader_len < scored.start.objects > re-scored.start.objects";
34 :    
35 :     ((($avg_peg_len, $avg_leader_len) = @ARGV) == 2)
36 :     || die "usage: $usage";
37 :    
38 : overbeek 1.3 #...Convert parameters to probabilities,
39 :     # assuming Negative Binomial distribution of PEG lengths,
40 :     # and geometrical distribution of upstream region lengths
41 : overbeek 1.1 $nu = 2;
42 :     $p_peg = $nu / ($nu + $avg_peg_len/3);
43 :     $p_leader = 1 / (1 + $avg_leader_len/3);
44 :    
45 : overbeek 1.3
46 :     #...Penalities elicited from Ross for "short" or "long" calls
47 :     # (probably should have been parameters, with these as defaults):
48 : overbeek 1.1 $short_call_weight = -0.50;
49 :     $long_call_weight = -0.25;
50 :    
51 : overbeek 1.3 #... End Of Record marker (NOTE: subrecords end with "//\n")
52 : overbeek 1.1 $/ = "///\n";
53 :    
54 : overbeek 1.3 #... $tick and $tock are counters for progress display
55 : overbeek 1.1 $tick = $tock = 0;
56 : overbeek 1.3 while (defined($entry = <STDIN>)) {
57 :    
58 :     #...trim off End of Record marker
59 : overbeek 1.1 chomp $entry;
60 :    
61 : overbeek 1.3 #...Print dot to STDERR every 100 records:
62 : overbeek 1.1 if ($tick >= 100) { print STDERR "."; $tick = 0; $tock++; } else { $tick++; }
63 : overbeek 1.3 #...Print newline to STDERR every 50 dots:
64 : overbeek 1.1 if ($tock >= 50) { print STDERR "\n"; $tock = 0; }
65 :    
66 : overbeek 1.3 #...Split subrecords at "//\n" mark:
67 : overbeek 1.1 @calls = split m{//\n}, $entry;
68 : overbeek 1.3
69 :     #...Shift off record header, leaving just call subrecords:
70 : overbeek 1.1 $header = shift @calls;
71 : overbeek 1.3
72 :     #...Can't remember why this is here, but it appears
73 :     # to pop off the last call subrecord if it is null;
74 :     # perhaps one of the ealier programs left a null subrecord
75 :     # at the end of the call block ???
76 : overbeek 1.1 unless ($calls[-1]) { pop @calls; }
77 :    
78 : overbeek 1.3 #...Subrecords consists of list of fields of form "KEY=VALUE\n";
79 :     # split header subrecord at newlines, and store as a hash:
80 :     %header_fields = map { m/^([^=]+)=([^=]+)$/; $1 => $2 } split /\n/, $header;
81 :    
82 :     #...Extract the begining and end coordinates of the ORF, and compute its length:
83 : overbeek 1.1 $orf_beg = $header_fields{BEGIN};
84 :     $orf_end = $header_fields{END};
85 :     $orf_len = (1 + abs($orf_end - $orf_beg)) / 3;
86 :    
87 : overbeek 1.3 #...Split call subrecords at newlines, and convert to hash keyed on START offsets;
88 :     # for convenience, extract score values into separate hash.
89 :     undef %scores; #...Clear out scores from last set of calls
90 :     foreach $call (@calls) {
91 :    
92 : overbeek 1.1 $fields = {};
93 :     %$fields = map { m/^([^=]+)=([^=]+)$/; $1 => $2 } split /\n/, $call;
94 :    
95 :     $calls{$fields->{START}} = $fields;
96 :     $scores{$fields->{START}} = $fields->{SCORE};
97 :     }
98 : overbeek 1.3
99 :     #...extract sorted list of START offsets:
100 : overbeek 1.1 @candidates = sort { $a <=> $b } keys %scores;
101 :    
102 : overbeek 1.3 #...Compute likelihoods of START candidate offsets,
103 :     # and accumulate normalization constant:
104 : overbeek 1.1 $normalization = 0;
105 : overbeek 1.3 foreach $candidate (@candidates) {
106 :    
107 : overbeek 1.1 $called_len = $orf_len - ($candidate-1)/3;
108 :    
109 :     $prob_leader = $p_leader * (1-$p_leader)**(($candidate-1)/3);
110 :     $prob_peg = &binom(($called_len + $nu - 1), $called_len)
111 :     * $p_peg**$nu * (1-$p_peg)**$called_len;
112 :    
113 :     $unnormed_prob{$candidate} = $prob_leader * $prob_peg;
114 :    
115 :     $normalization += $unnormed_prob{$candidate};
116 :     }
117 :    
118 : overbeek 1.3 #...Find best bayes expected payoff:
119 :     $best_pos = 0; #...Probably should have defaulted to either leftmost or original offset
120 :     $best_expect = -9.99999999e99; #...Initialize to score worse than any possible score
121 :    
122 :     #...Loop over all possible START candidates ("states of nature")
123 :     foreach $called_pos (@candidates) {
124 :     $expectation = 0; #...Initialize expacted value to zero
125 :    
126 :     #...Loop over possible START candidate decisions ("actions")
127 :     foreach $candidate (@candidates) {
128 :    
129 : overbeek 1.1 $unnormed_prob = $unnormed_prob{$candidate};
130 :    
131 : overbeek 1.3 if ($called_pos < $candidate) {
132 :     #...Penalize score if decision call is longer than assumed "state of nature"
133 :     # (i.e., decision is to the left of (smaller than) "state of nature"),
134 :     # and weight by probability that assumed "state of nature" is the true START
135 :     # (Arguably should use score for "state of nature," not "decision" call):
136 : overbeek 1.1 $expectation += $unnormed_prob * $long_call_weight * $scores{$candidate};
137 :     }
138 : overbeek 1.3 elsif ($called_pos > $candidate) {
139 :     #...Penalize score if decision call is shorter than assumed "state of nature"
140 :     # (i.e., decision is to the right of (larger than) "state of nature"),
141 :     # and weight by probability that assumed "state of nature" is the true START
142 :     # (Arguably should use score for "state of nature," not "decision" call):
143 : overbeek 1.1 $expectation += $unnormed_prob * $short_call_weight * $scores{$candidate};
144 :     }
145 : overbeek 1.3 else
146 : overbeek 1.1 {
147 : overbeek 1.3 #...$called_pos == $candidate, so accept unpenalized score
148 :     # when decision and "state of nature" are identical, weighted
149 :     # by probability that assumed "state of nature" is the true START:
150 : overbeek 1.1 $expectation += $unnormed_prob * $scores{$candidate};
151 :     }
152 :     }
153 : overbeek 1.3 $expectation /= $normalization; #...Normalize expected score
154 : overbeek 1.1
155 : overbeek 1.3 #...If expected score of decision is better than best seen so far,
156 :     # accept it as new best score:
157 : overbeek 1.1 if ($expectation > $best_expect)
158 :     {
159 :     $best_pos = $called_pos;
160 :     $best_expect = $expectation;
161 :     }
162 :    
163 : overbeek 1.3 #...Copy original score to "RAW_SCORE" field,
164 :     # and replace score with bayes expected score:
165 : overbeek 1.1 $calls{$called_pos}->{RAW_SCORE} = $calls{$called_pos}->{SCORE};
166 :     $calls{$called_pos}->{SCORE} = $expectation;
167 :     }
168 :    
169 : overbeek 1.3
170 :     #...Convert offset with best bayes expected score to new START coordinate:
171 :     if ($orf_beg < $orf_end) {
172 : overbeek 1.1 $new_start = $orf_beg + $best_pos - 1;
173 :     }
174 : overbeek 1.3 else {
175 : overbeek 1.1 $new_start = $orf_beg - $best_pos + 1;
176 :     }
177 :    
178 : overbeek 1.3 #...Add best result to header:
179 : overbeek 1.1 $header_fields{NEW_START_POS} = $best_pos;
180 :     $header_fields{NEW_START_LOC} = "$header_fields{CONTIG}\_$new_start\_$orf_end";
181 :    
182 : overbeek 1.3 #...Print out header key/value pairs:
183 : overbeek 1.1 while (($k, $v) = each %header_fields) { print "$k=$v\n"; }
184 : overbeek 1.3 print "//\n"; #...print subrecord separator...
185 :    
186 :     #...Print out key/value pairs for re-scored calls, sorted by new scores:
187 : overbeek 1.1 foreach $candidate (sort { $calls{$b}->{SCORE} <=> $calls{$a}->{SCORE} } keys %calls)
188 :     {
189 :     $call = $calls{$candidate};
190 :     while (($k, $v) = each %$call) { print "$k=$v\n"; }
191 :     print "//\n";
192 :     }
193 : overbeek 1.3
194 :     #...Print "End of Record" marker:
195 : overbeek 1.1 print "///\n";
196 :     }
197 :     print STDERR "\n\n";
198 :    
199 :    
200 : overbeek 1.3
201 :     #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
202 :     #...Recursive computation of generalized binomial coefficients
203 :     # extrapolated to "negative" arguments using gamma-function identities,
204 :     # and using various identities to make the calculation more efficient:
205 :     #-----------------------------------------------------------------------
206 :     sub binom {
207 : overbeek 1.1 my ($n, $k) = @_;
208 :     my ($sign, $binom);
209 :    
210 :     $sign = 1;
211 :     if ($n < 0) { $n = ($k - $n - 1); $sign = -1 if ($k % 2); }
212 :     if ($k > $n/2) { $k = ($n - $k); }
213 :     # print STDERR "n = $n, k = $k\n";
214 :    
215 :     return 0 if ($k < 0);
216 :     return 1 if ($k == 0);
217 :    
218 :     $binom = &binom($n-1, $k) + &binom($n-1, $k-1);
219 :    
220 :     return ($sign * $binom);
221 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3