[Bio] / FigKernelScripts / bayes_expected_start_scores.pl Repository: Repository Listing Bio

# Diff of /FigKernelScripts/bayes_expected_start_scores.pl

revision 1.2, Mon Dec 5 18:56:37 2005 UTC revision 1.3, Thu Apr 19 11:06:01 2007 UTC
# Line 22  Line 22
22  use Memoize;  use Memoize;
23  memoize('binom');  memoize('binom');
24
25    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
26    #...Script to re-score STARTs by bayes expected risk
27    # Arguments:
28    #   avg_peg_len    = Average length of a PEG (usually ~267 AA)
29    #   avg_leader_len = Average length of upstream region
30    #-----------------------------------------------------------------------
31
32  \$0 =~ m/^([^\/]+)\$/;  \$0 =~ m/^([^\/]+)\$/;
33  \$usage = "\$1  avg_peg_len avg_leader_len  < scored.start.objects > re-scored.start.objects";  \$usage = "\$1  avg_peg_len avg_leader_len  < scored.start.objects > re-scored.start.objects";
34
35  (((\$avg_peg_len, \$avg_leader_len) = @ARGV) == 2)  (((\$avg_peg_len, \$avg_leader_len) = @ARGV) == 2)
36      || die "usage: \$usage";      || die "usage: \$usage";
37
38    #...Convert parameters to probabilities,
39    #   assuming Negative Binomial distribution of PEG lengths,
40    #   and geometrical distribution of upstream region lengths
41  \$nu = 2;  \$nu = 2;
42  \$p_peg    = \$nu / (\$nu + \$avg_peg_len/3);  \$p_peg    = \$nu / (\$nu + \$avg_peg_len/3);
44
45
46    #...Penalities elicited from Ross for "short" or "long" calls
47    #   (probably should have been parameters, with these as defaults):
48  \$short_call_weight = -0.50;  \$short_call_weight = -0.50;
49  \$long_call_weight  = -0.25;  \$long_call_weight  = -0.25;
50
51    #... End Of Record marker (NOTE: subrecords end with "//\n")
52  \$/ = "///\n";  \$/ = "///\n";
53
54    #... \$tick and \$tock are counters for progress display
55  \$tick = \$tock = 0;  \$tick = \$tock = 0;
56  while (defined(\$entry = <STDIN>))  while (defined(\$entry = <STDIN>)) {
57  {
58        #...trim off End of Record marker
59      chomp \$entry;      chomp \$entry;
60
61        #...Print dot to STDERR every 100 records:
62      if (\$tick >= 100) { print STDERR ".";  \$tick = 0; \$tock++; } else { \$tick++; }      if (\$tick >= 100) { print STDERR ".";  \$tick = 0; \$tock++; } else { \$tick++; }
63        #...Print newline to STDERR every 50 dots:
64      if (\$tock >= 50)  { print STDERR "\n"; \$tock = 0; }      if (\$tock >= 50)  { print STDERR "\n"; \$tock = 0; }
65
66        #...Split subrecords at "//\n" mark:
67      @calls  = split m{//\n}, \$entry;      @calls  = split m{//\n}, \$entry;
68
69        #...Shift off record header, leaving just call subrecords:
71
72        #...Can't remember why this is here, but it appears
73        #   to pop off the last call subrecord if it is null;
74        #   perhaps one of the ealier programs left a null subrecord
75        #   at the end of the call block ???
76      unless (\$calls[-1]) { pop @calls; }      unless (\$calls[-1]) { pop @calls; }
77
78      %header_fields = map { m/^([^=]+)=([^=]+)\$/ } split /\n/, \$header;      #...Subrecords consists of list of fields of form "KEY=VALUE\n";
79        #   split header subrecord at newlines, and store as a hash:
80        %header_fields = map { m/^([^=]+)=([^=]+)\$/; \$1 => \$2 } split /\n/, \$header;
81
82        #...Extract the begining and end coordinates of the ORF, and compute its length:
85      \$orf_len = (1 + abs(\$orf_end - \$orf_beg)) / 3;      \$orf_len = (1 + abs(\$orf_end - \$orf_beg)) / 3;
86
87      undef %scores;      #...Split call subrecords at newlines, and convert to hash keyed on START offsets;
88      foreach \$call (@calls)      #   for convenience, extract score values into separate hash.
89      {      undef %scores;   #...Clear out scores from last set of calls
90        foreach \$call (@calls) {
91
92          \$fields  = {};          \$fields  = {};
93          %\$fields = map { m/^([^=]+)=([^=]+)\$/; \$1 => \$2 } split /\n/, \$call;          %\$fields = map { m/^([^=]+)=([^=]+)\$/; \$1 => \$2 } split /\n/, \$call;
94
95          \$calls{\$fields->{START}}  = \$fields;          \$calls{\$fields->{START}}  = \$fields;
96          \$scores{\$fields->{START}} = \$fields->{SCORE};          \$scores{\$fields->{START}} = \$fields->{SCORE};
97      }      }
98
99        #...extract sorted list of START offsets:
100      @candidates = sort { \$a <=> \$b } keys %scores;      @candidates = sort { \$a <=> \$b } keys %scores;
101
102        #...Compute likelihoods of START candidate offsets,
103        #   and accumulate normalization constant:
104      \$normalization = 0;      \$normalization = 0;
105      foreach \$candidate (@candidates)      foreach \$candidate (@candidates) {
106      {
107          \$called_len  = \$orf_len - (\$candidate-1)/3;          \$called_len  = \$orf_len - (\$candidate-1)/3;
108
# Line 79  Line 115
115          \$normalization += \$unnormed_prob{\$candidate};          \$normalization += \$unnormed_prob{\$candidate};
116      }      }
117
118      \$best_pos    =  0;      #...Find best bayes expected payoff:
119      \$best_expect = -9.99999999e99;      \$best_pos    =  0;              #...Probably should have defaulted to either leftmost or original offset
120      foreach \$called_pos (@candidates)      \$best_expect = -9.99999999e99;  #...Initialize to score worse than any possible score
121      {
122          \$expectation = 0;      #...Loop over all possible START candidates ("states of nature")
123          foreach \$candidate (@candidates)      foreach \$called_pos (@candidates) {
124          {          \$expectation = 0;  #...Initialize expacted value to zero
125
126            #...Loop over possible START candidate decisions ("actions")
127            foreach \$candidate (@candidates) {
128
129              \$unnormed_prob = \$unnormed_prob{\$candidate};              \$unnormed_prob = \$unnormed_prob{\$candidate};
130
131              if    (\$called_pos < \$candidate)              if    (\$called_pos < \$candidate) {
132              {                  #...Penalize score if decision call is longer than assumed "state of nature"
133                    #   (i.e., decision is to the left of (smaller than) "state of nature"),
134                    #   and weight by probability that assumed "state of nature" is the true START
135                    #   (Arguably should use score for "state of nature," not "decision" call):
136                  \$expectation += \$unnormed_prob * \$long_call_weight  * \$scores{\$candidate};                  \$expectation += \$unnormed_prob * \$long_call_weight  * \$scores{\$candidate};
137              }              }
138              elsif (\$called_pos > \$candidate)              elsif (\$called_pos > \$candidate) {
139              {                  #...Penalize score if decision call is shorter than assumed "state of nature"
140                    #   (i.e., decision is to the right of (larger than) "state of nature"),
141                    #   and weight by probability that assumed "state of nature" is the true START
142                    #   (Arguably should use score for "state of nature," not "decision" call):
143                  \$expectation += \$unnormed_prob * \$short_call_weight * \$scores{\$candidate};                  \$expectation += \$unnormed_prob * \$short_call_weight * \$scores{\$candidate};
144              }              }
145              else # (\$called_pos == \$candidate)              else
146              {              {
147                    #...\$called_pos == \$candidate, so accept unpenalized score
148                    #   when decision and "state of nature" are identical, weighted
149                    #   by probability that assumed "state of nature" is the true START:
150                  \$expectation += \$unnormed_prob * \$scores{\$candidate};                  \$expectation += \$unnormed_prob * \$scores{\$candidate};
151              }              }
152          }          }
153          \$expectation /= \$normalization;          \$expectation /= \$normalization;   #...Normalize expected score
154
155            #...If expected score of decision is better than best seen so far,
156            #   accept it as new best score:
157          if (\$expectation > \$best_expect)          if (\$expectation > \$best_expect)
158          {          {
159              \$best_pos    = \$called_pos;              \$best_pos    = \$called_pos;
160              \$best_expect = \$expectation;              \$best_expect = \$expectation;
161          }          }
162
163            #...Copy original score to "RAW_SCORE" field,
164            #   and replace score with bayes expected score:
165          \$calls{\$called_pos}->{RAW_SCORE} = \$calls{\$called_pos}->{SCORE};          \$calls{\$called_pos}->{RAW_SCORE} = \$calls{\$called_pos}->{SCORE};
166          \$calls{\$called_pos}->{SCORE}     = \$expectation;          \$calls{\$called_pos}->{SCORE}     = \$expectation;
167      }      }
168
169      if (\$orf_beg < \$orf_end)
170      {      #...Convert offset with best bayes expected score to new START coordinate:
171        if (\$orf_beg < \$orf_end) {
172          \$new_start = \$orf_beg + \$best_pos - 1;          \$new_start = \$orf_beg + \$best_pos - 1;
173      }      }
174      else      else {
{
175          \$new_start = \$orf_beg - \$best_pos + 1;          \$new_start = \$orf_beg - \$best_pos + 1;
176      }      }
177
181
182        #...Print out header key/value pairs:
183      while ((\$k, \$v) = each %header_fields) { print "\$k=\$v\n"; }      while ((\$k, \$v) = each %header_fields) { print "\$k=\$v\n"; }
184      print "//\n";      print "//\n";   #...print subrecord separator...
185
186        #...Print out key/value pairs for re-scored calls, sorted by new scores:
187      foreach \$candidate (sort { \$calls{\$b}->{SCORE} <=> \$calls{\$a}->{SCORE} } keys %calls)      foreach \$candidate (sort { \$calls{\$b}->{SCORE} <=> \$calls{\$a}->{SCORE} } keys %calls)
188      {      {
189          \$call = \$calls{\$candidate};          \$call = \$calls{\$candidate};
190          while ((\$k, \$v) = each %\$call) { print "\$k=\$v\n"; }          while ((\$k, \$v) = each %\$call) { print "\$k=\$v\n"; }
191          print "//\n";          print "//\n";
192      }      }
193
194        #...Print "End of Record" marker:
195      print "///\n";      print "///\n";
196  }  }
197  print STDERR "\n\n";  print STDERR "\n\n";
198
199
200  sub binom
201  {  #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
202    #...Recursive computation of generalized binomial coefficients
203    #   extrapolated to "negative" arguments using gamma-function identities,
204    #   and using various identities to make the calculation more efficient:
205    #-----------------------------------------------------------------------
206    sub binom {
207      my (\$n, \$k) = @_;      my (\$n, \$k) = @_;
208      my (\$sign, \$binom);      my (\$sign, \$binom);
209

Legend:
 Removed from v.1.2 changed lines Added in v.1.3