[Bio] / FigKernelScripts / bayes_expected_start_scores.pl Repository:
ViewVC logotype

Diff of /FigKernelScripts/bayes_expected_start_scores.pl

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1.2, Mon Dec 5 18:56:37 2005 UTC revision 1.3, Thu Apr 19 11:06:01 2007 UTC
# Line 22  Line 22 
22  use Memoize;  use Memoize;
23  memoize('binom');  memoize('binom');
24    
25    #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
26    #...Script to re-score STARTs by bayes expected risk
27    # Arguments:
28    #   avg_peg_len    = Average length of a PEG (usually ~267 AA)
29    #   avg_leader_len = Average length of upstream region
30    #-----------------------------------------------------------------------
31    
32  $0 =~ m/^([^\/]+)$/;  $0 =~ m/^([^\/]+)$/;
33  $usage = "$1  avg_peg_len avg_leader_len  < scored.start.objects > re-scored.start.objects";  $usage = "$1  avg_peg_len avg_leader_len  < scored.start.objects > re-scored.start.objects";
34    
35  ((($avg_peg_len, $avg_leader_len) = @ARGV) == 2)  ((($avg_peg_len, $avg_leader_len) = @ARGV) == 2)
36      || die "usage: $usage";      || die "usage: $usage";
37    
38    #...Convert parameters to probabilities,
39    #   assuming Negative Binomial distribution of PEG lengths,
40    #   and geometrical distribution of upstream region lengths
41  $nu = 2;  $nu = 2;
42  $p_peg    = $nu / ($nu + $avg_peg_len/3);  $p_peg    = $nu / ($nu + $avg_peg_len/3);
43  $p_leader = 1 / (1 + $avg_leader_len/3);  $p_leader = 1 / (1 + $avg_leader_len/3);
44    
45    
46    #...Penalities elicited from Ross for "short" or "long" calls
47    #   (probably should have been parameters, with these as defaults):
48  $short_call_weight = -0.50;  $short_call_weight = -0.50;
49  $long_call_weight  = -0.25;  $long_call_weight  = -0.25;
50    
51    #... End Of Record marker (NOTE: subrecords end with "//\n")
52  $/ = "///\n";  $/ = "///\n";
53    
54    #... $tick and $tock are counters for progress display
55  $tick = $tock = 0;  $tick = $tock = 0;
56  while (defined($entry = <STDIN>))  while (defined($entry = <STDIN>)) {
57  {  
58        #...trim off End of Record marker
59      chomp $entry;      chomp $entry;
60    
61        #...Print dot to STDERR every 100 records:
62      if ($tick >= 100) { print STDERR ".";  $tick = 0; $tock++; } else { $tick++; }      if ($tick >= 100) { print STDERR ".";  $tick = 0; $tock++; } else { $tick++; }
63        #...Print newline to STDERR every 50 dots:
64      if ($tock >= 50)  { print STDERR "\n"; $tock = 0; }      if ($tock >= 50)  { print STDERR "\n"; $tock = 0; }
65    
66        #...Split subrecords at "//\n" mark:
67      @calls  = split m{//\n}, $entry;      @calls  = split m{//\n}, $entry;
68    
69        #...Shift off record header, leaving just call subrecords:
70      $header = shift @calls;      $header = shift @calls;
71    
72        #...Can't remember why this is here, but it appears
73        #   to pop off the last call subrecord if it is null;
74        #   perhaps one of the ealier programs left a null subrecord
75        #   at the end of the call block ???
76      unless ($calls[-1]) { pop @calls; }      unless ($calls[-1]) { pop @calls; }
77    
78      %header_fields = map { m/^([^=]+)=([^=]+)$/ } split /\n/, $header;      #...Subrecords consists of list of fields of form "KEY=VALUE\n";
79        #   split header subrecord at newlines, and store as a hash:
80        %header_fields = map { m/^([^=]+)=([^=]+)$/; $1 => $2 } split /\n/, $header;
81    
82        #...Extract the begining and end coordinates of the ORF, and compute its length:
83      $orf_beg = $header_fields{BEGIN};      $orf_beg = $header_fields{BEGIN};
84      $orf_end = $header_fields{END};      $orf_end = $header_fields{END};
85      $orf_len = (1 + abs($orf_end - $orf_beg)) / 3;      $orf_len = (1 + abs($orf_end - $orf_beg)) / 3;
86    
87      undef %scores;      #...Split call subrecords at newlines, and convert to hash keyed on START offsets;
88      foreach $call (@calls)      #   for convenience, extract score values into separate hash.
89      {      undef %scores;   #...Clear out scores from last set of calls
90        foreach $call (@calls) {
91    
92          $fields  = {};          $fields  = {};
93          %$fields = map { m/^([^=]+)=([^=]+)$/; $1 => $2 } split /\n/, $call;          %$fields = map { m/^([^=]+)=([^=]+)$/; $1 => $2 } split /\n/, $call;
94    
95          $calls{$fields->{START}}  = $fields;          $calls{$fields->{START}}  = $fields;
96          $scores{$fields->{START}} = $fields->{SCORE};          $scores{$fields->{START}} = $fields->{SCORE};
97      }      }
98    
99        #...extract sorted list of START offsets:
100      @candidates = sort { $a <=> $b } keys %scores;      @candidates = sort { $a <=> $b } keys %scores;
101    
102        #...Compute likelihoods of START candidate offsets,
103        #   and accumulate normalization constant:
104      $normalization = 0;      $normalization = 0;
105      foreach $candidate (@candidates)      foreach $candidate (@candidates) {
106      {  
107          $called_len  = $orf_len - ($candidate-1)/3;          $called_len  = $orf_len - ($candidate-1)/3;
108    
109          $prob_leader = $p_leader * (1-$p_leader)**(($candidate-1)/3);          $prob_leader = $p_leader * (1-$p_leader)**(($candidate-1)/3);
# Line 79  Line 115 
115          $normalization += $unnormed_prob{$candidate};          $normalization += $unnormed_prob{$candidate};
116      }      }
117    
118      $best_pos    =  0;      #...Find best bayes expected payoff:
119      $best_expect = -9.99999999e99;      $best_pos    =  0;              #...Probably should have defaulted to either leftmost or original offset
120      foreach $called_pos (@candidates)      $best_expect = -9.99999999e99;  #...Initialize to score worse than any possible score
121      {  
122          $expectation = 0;      #...Loop over all possible START candidates ("states of nature")
123          foreach $candidate (@candidates)      foreach $called_pos (@candidates) {
124          {          $expectation = 0;  #...Initialize expacted value to zero
125    
126            #...Loop over possible START candidate decisions ("actions")
127            foreach $candidate (@candidates) {
128    
129              $unnormed_prob = $unnormed_prob{$candidate};              $unnormed_prob = $unnormed_prob{$candidate};
130    
131              if    ($called_pos < $candidate)              if    ($called_pos < $candidate) {
132              {                  #...Penalize score if decision call is longer than assumed "state of nature"
133                    #   (i.e., decision is to the left of (smaller than) "state of nature"),
134                    #   and weight by probability that assumed "state of nature" is the true START
135                    #   (Arguably should use score for "state of nature," not "decision" call):
136                  $expectation += $unnormed_prob * $long_call_weight  * $scores{$candidate};                  $expectation += $unnormed_prob * $long_call_weight  * $scores{$candidate};
137              }              }
138              elsif ($called_pos > $candidate)              elsif ($called_pos > $candidate) {
139              {                  #...Penalize score if decision call is shorter than assumed "state of nature"
140                    #   (i.e., decision is to the right of (larger than) "state of nature"),
141                    #   and weight by probability that assumed "state of nature" is the true START
142                    #   (Arguably should use score for "state of nature," not "decision" call):
143                  $expectation += $unnormed_prob * $short_call_weight * $scores{$candidate};                  $expectation += $unnormed_prob * $short_call_weight * $scores{$candidate};
144              }              }
145              else # ($called_pos == $candidate)              else
146              {              {
147                    #...$called_pos == $candidate, so accept unpenalized score
148                    #   when decision and "state of nature" are identical, weighted
149                    #   by probability that assumed "state of nature" is the true START:
150                  $expectation += $unnormed_prob * $scores{$candidate};                  $expectation += $unnormed_prob * $scores{$candidate};
151              }              }
152          }          }
153          $expectation /= $normalization;          $expectation /= $normalization;   #...Normalize expected score
154    
155            #...If expected score of decision is better than best seen so far,
156            #   accept it as new best score:
157          if ($expectation > $best_expect)          if ($expectation > $best_expect)
158          {          {
159              $best_pos    = $called_pos;              $best_pos    = $called_pos;
160              $best_expect = $expectation;              $best_expect = $expectation;
161          }          }
162    
163            #...Copy original score to "RAW_SCORE" field,
164            #   and replace score with bayes expected score:
165          $calls{$called_pos}->{RAW_SCORE} = $calls{$called_pos}->{SCORE};          $calls{$called_pos}->{RAW_SCORE} = $calls{$called_pos}->{SCORE};
166          $calls{$called_pos}->{SCORE}     = $expectation;          $calls{$called_pos}->{SCORE}     = $expectation;
167      }      }
168    
169      if ($orf_beg < $orf_end)  
170      {      #...Convert offset with best bayes expected score to new START coordinate:
171        if ($orf_beg < $orf_end) {
172          $new_start = $orf_beg + $best_pos - 1;          $new_start = $orf_beg + $best_pos - 1;
173      }      }
174      else      else {
     {  
175          $new_start = $orf_beg - $best_pos + 1;          $new_start = $orf_beg - $best_pos + 1;
176      }      }
177    
178        #...Add best result to header:
179      $header_fields{NEW_START_POS} = $best_pos;      $header_fields{NEW_START_POS} = $best_pos;
180      $header_fields{NEW_START_LOC} = "$header_fields{CONTIG}\_$new_start\_$orf_end";      $header_fields{NEW_START_LOC} = "$header_fields{CONTIG}\_$new_start\_$orf_end";
181    
182        #...Print out header key/value pairs:
183      while (($k, $v) = each %header_fields) { print "$k=$v\n"; }      while (($k, $v) = each %header_fields) { print "$k=$v\n"; }
184      print "//\n";      print "//\n";   #...print subrecord separator...
185    
186        #...Print out key/value pairs for re-scored calls, sorted by new scores:
187      foreach $candidate (sort { $calls{$b}->{SCORE} <=> $calls{$a}->{SCORE} } keys %calls)      foreach $candidate (sort { $calls{$b}->{SCORE} <=> $calls{$a}->{SCORE} } keys %calls)
188      {      {
189          $call = $calls{$candidate};          $call = $calls{$candidate};
190          while (($k, $v) = each %$call) { print "$k=$v\n"; }          while (($k, $v) = each %$call) { print "$k=$v\n"; }
191          print "//\n";          print "//\n";
192      }      }
193    
194        #...Print "End of Record" marker:
195      print "///\n";      print "///\n";
196  }  }
197  print STDERR "\n\n";  print STDERR "\n\n";
198    
199    
200  sub binom  
201  {  #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
202    #...Recursive computation of generalized binomial coefficients
203    #   extrapolated to "negative" arguments using gamma-function identities,
204    #   and using various identities to make the calculation more efficient:
205    #-----------------------------------------------------------------------
206    sub binom {
207      my ($n, $k) = @_;      my ($n, $k) = @_;
208      my ($sign, $binom);      my ($sign, $binom);
209    

Legend:
Removed from v.1.2  
changed lines
  Added in v.1.3

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3