[Bio] / FigKernelScripts / assess_gene_call_quality.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/assess_gene_call_quality.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.10 - (view) (download) (as text)

1 : overbeek 1.1 # -*- perl -*-
2 :    
3 :     use FIG;
4 : overbeek 1.3 use GenomeMeta;
5 : overbeek 1.1 use strict;
6 :    
7 :     $0 =~ m/([^\/]+)$/;
8 :     my $this_tool_name = $1;
9 : overbeek 1.10 my $usage = "$this_tool_name [-h(elp)] [-no_fatal] [-meta=meta.file] [-parms=minlen,rna,conv,div,samestrand] OrgDir > fatal_errs 2> warnings";
10 : overbeek 1.1
11 : overbeek 1.3 if ((not @ARGV) || ($ARGV[0] =~ m/^-{1,2}h(elp)?/)) {
12 : overbeek 1.1 die "\n usage: $usage\n\n";
13 :     }
14 :    
15 : overbeek 1.10 my $no_fatal = "";
16 :     my $meta_file = "";
17 : overbeek 1.1 my $overlap_parms = "";
18 : overbeek 1.10
19 :     my $FATAL = qq(FATAL);
20 : overbeek 1.1 while (@ARGV && ($ARGV[0] =~ m/^-/))
21 :     {
22 : overbeek 1.10 if ($ARGV[0] =~ m/^-{1,2}no_fatal/) {
23 :     $no_fatal = shift @ARGV;
24 :     $FATAL = qq(WARNING);
25 :     }
26 :     elsif ($ARGV[0] =~ m/^-{1,2}meta=(\S+)/) {
27 : overbeek 1.3 $meta_file = $1;
28 :     shift @ARGV;
29 :     }
30 :     elsif ($ARGV[0] =~ m/^-{1,2}parms=\d+,\d+,\d+,\d+,\d+/) {
31 : overbeek 1.1 $overlap_parms = shift @ARGV;
32 :     }
33 :     else {
34 :     die "Invalid arg $ARGV[0]\n\n usage: $usage\n\n";
35 :     }
36 :     }
37 :    
38 :     my $org_dir;
39 :     (($org_dir = shift @ARGV) && (-d $org_dir))
40 :     || die "OrgDir $org_dir does not exist\n usage: $usage\n\n";
41 :     $org_dir =~ s/\/$//;
42 :    
43 :     my $org_id;
44 :     if ($org_dir =~ m/(\d+\.\d+)$/) {
45 :     $org_id = $1;
46 :     }
47 :     else {
48 :     die "Could not extract a well-formed Org-ID from directory-name $org_dir";
49 :     }
50 :    
51 : overbeek 1.3 my $meta;
52 :     if ($meta_file)
53 :     {
54 : overbeek 1.8 $meta = GenomeMeta->new($org_id, $meta_file);
55 :     $meta->add_log_entry("qc", ["Assessing gene-call quality", $org_id, $org_dir]);
56 : overbeek 1.3 }
57 :    
58 : overbeek 1.1 my $taxonomy = (-s "$org_dir/TAXONOMY") ?
59 :     `cat $org_dir/TAXONOMY`
60 :     : die "\n No TAXONOMY file in $org_dir\n\n";
61 :    
62 : olson 1.6 if (($taxonomy !~ m/^\s*Bact/) && ($taxonomy !~ m/^\s*Arch/)) {
63 : overbeek 1.1 die "\n WARNING: This heuristic is sensible only for Bacteria or Archaea;\n"
64 :     , " TAXONOMY in $org_dir is:\n\n"
65 :     , " $taxonomy\n\n";
66 :     }
67 :    
68 :     my $contigs = "$org_dir/contigs";
69 :    
70 :     my $rna_tbl = (-s ($_ = "$org_dir/Features/rna/tbl")) ? $_ : "/dev/null";
71 :     my $peg_tbl = (-s ($_ = "$org_dir/Features/peg/tbl")) ? $_ : "/dev/null";
72 :     my $orf_tbl = (-s ($_ = "$org_dir/Features/orf/tbl")) ? $_ : "/dev/null";
73 :    
74 : olson 1.4 if (!open(SUMMARY, ">$org_dir/overlap.summary"))
75 :     {
76 :     my $err = "cannot open $org_dir/overlap.summary: $!";
77 : overbeek 1.7 $meta->add_log_entry($0, $err) if $meta_file;
78 : olson 1.4 die $err;
79 :     }
80 :    
81 :     my $cmd = "$FIG_Config::bin/make_overlap_report $overlap_parms $contigs $rna_tbl $peg_tbl $orf_tbl";
82 :    
83 : overbeek 1.7 $meta->add_log_entry($0, ['start', $cmd]) if $meta_file;
84 : olson 1.4 if (!open(OVER, "$cmd 2>&1 1> $org_dir/overlap.report |"))
85 :     {
86 :     my $err = "Execution of $cmd failed: $!";
87 : overbeek 1.7 $meta->add_log_entry($0, [$err, $cmd]) if $meta_file;
88 : olson 1.4 die $err;
89 :     }
90 :    
91 :     my @overlap_summary;
92 :    
93 :     while (<OVER>)
94 :     {
95 :     push(@overlap_summary, $_);
96 :     print SUMMARY $_;
97 :     }
98 :     close(SUMMARY);
99 :    
100 :     if (!close(OVER))
101 :     {
102 :     my $err = "close failed with \$!=$! \$?=$?";
103 : overbeek 1.7 $meta->add_log_entry($0, [$err, $cmd]) if $meta_file;
104 : olson 1.4 die "Execution of $cmd > $org_dir/overlap.summary failed: $err";
105 :     }
106 : overbeek 1.7 $meta->add_log_entry($0, ['end', $cmd]) if $meta_file;
107 :     $meta->add_log_entry($0, \@overlap_summary) if $meta_file;
108 : overbeek 1.1
109 :     my $num_features = 0;
110 :     my $bad_starts = 0;
111 :     my $bad_stops = 0;
112 :     my $too_short = 0;
113 :     my $rna_overlaps = 0;
114 :     my $same_stop = 0;
115 :     my $embedded = 0;
116 :     my $convergent = 0;
117 :     my $divergent = 0;
118 :     my $same_strand = 0;
119 :     my $impossible = 0;
120 :    
121 : overbeek 1.3 my $msg;
122 : overbeek 1.1 foreach my $line (@overlap_summary)
123 :     {
124 :     if ($line =~ m/^Number of features\:\s+(\d+)/) {
125 :     $num_features = $1;
126 :     }
127 :    
128 :     if ($line =~ m/^Bad START codons\:\s+(\d+)/) {
129 :     $bad_starts = $1;
130 :     }
131 :    
132 :     if ($line =~ m/^Bad STOP codons\:\s+(\d+)/) {
133 :     $bad_stops = $1;
134 :     }
135 :    
136 :     if ($line =~ m/^Too short\:\s+(\d+)/) {
137 :     $too_short = $1;
138 :     }
139 :    
140 :     if ($line =~ m/^RNA overlaps\:\s+(\d+)/) {
141 :     $rna_overlaps = $1;
142 :     }
143 :    
144 :     if ($line =~ m/^Same-STOP PEGs\:\s+(\d+)/) {
145 :     $same_stop = $1;
146 :     }
147 :    
148 :     if ($line =~ m/^Embedded PEGs\:\s+(\d+)/) {
149 :     $embedded = $1;
150 :     }
151 :    
152 :     if ($line =~ m/^Convergent overlaps\:\s+(\d+)/) {
153 :     $convergent = $1;
154 :     }
155 :    
156 :     if ($line =~ m/^Divergent overlaps\:\s+(\d+)/) {
157 :     $divergent = $1;
158 :     }
159 :    
160 :     if ($line =~ m/^Same-strand overlaps\:\s+(\d+)/) {
161 :     $same_strand = $1;
162 :     }
163 :    
164 :     if ($line =~ m/^Impossible overlaps\:\s+(\d+)/) {
165 :     $impossible = $1;
166 :     }
167 :     }
168 :    
169 :     my $num_missing = 0;
170 : olson 1.9 my @gaplen_histogram = `($FIG_Config::bin/find_gaps $org_id $contigs $rna_tbl $peg_tbl | $FIG_Config::bin/feature_length_histogram -nolabel) 2> /dev/null`;
171 : overbeek 1.1 foreach my $entry (@gaplen_histogram)
172 :     {
173 :     if ($entry =~ m/^(\d+)\t(\d+)/) {
174 :     my ($len, $num) = ($1, $2);
175 :     if ($len > 2000)
176 :     {
177 :     $num_missing += $num*$len;
178 :     }
179 :     }
180 :     else {
181 :     die "Malformed gap histogram entry: $entry";
182 :     }
183 :     }
184 :     $num_missing = int(0.5 + $num_missing/1000);
185 :    
186 :    
187 :     if ($num_features == 0) {
188 : overbeek 1.3 if ($meta_file) {
189 :     $meta->add_log_entry("qc", "$org_dir has no features called");
190 :    
191 :     $meta->set_metadata( qq(qc.Org-ID), [ qq(INFO), $org_id ] );
192 :     $meta->set_metadata( qq(qc.Num_features), [ qq(SCORE), 0 ] );
193 :     $meta->set_metadata( qq(qc.Num_tot), [ qq(SCORE), 999999999 ] );
194 :     $meta->set_metadata( qq(qc.Num_fatal), [ qq(SCORE), 999999999 ] );
195 :     $meta->set_metadata( qq(qc.Num_warn), [ qq(SCORE), 999999999 ] );
196 :     $meta->set_metadata( qq(qc.Possible_missing), [ qq(SCORE), $num_missing ] );
197 :     $meta->set_metadata( qq(qc.Pct_fatal), [ qq(SCORE), 100 ] );
198 :     $meta->set_metadata( qq(qc.Pct_warn), [ qq(SCORE), 100 ] );
199 :     $meta->set_metadata( qq(qc.Pct_missing), [ qq(SCORE), 100 ] );
200 :     }
201 :    
202 : overbeek 1.1 print STDOUT "# $org_dir has no features called\n"
203 :     , "INFO\tOrg-ID\t$org_id\n"
204 :     , "SCORE\tNum_features\t0\n"
205 :     , "SCORE\tNum_tot\t999999999\n"
206 :     , "SCORE\tNum_fatal\t999999999\n"
207 :     , "SCORE\tNum_warn\t999999999\n"
208 :     , "SCORE\tPossible_missing\t$num_missing\n"
209 :     , "SCORE\tPct_fatal\t100\n"
210 :     , "SCORE\tPct_warn\t100\n"
211 :     , "SCORE\tPct_missing\t100\n"
212 :     , "//\n";
213 : overbeek 1.3
214 : overbeek 1.1 exit(0);
215 :     }
216 :    
217 :     my $num_fatal = $rna_overlaps + $bad_starts + $bad_stops + $same_stop + $embedded + $impossible;
218 :     my $num_warn = $too_short + $convergent + $divergent + $same_strand;
219 : overbeek 1.10
220 :     if ($no_fatal) {
221 :     $num_warn += $num_fatal;
222 :     $num_fatal = 0;
223 :     }
224 :    
225 : overbeek 1.1 my $num_tot = $num_fatal + $num_warn;
226 :    
227 :     my ($pct_fatal, $pct_warn, $pct_tot, $pct_missing);
228 :     if ($num_features) {
229 :     $pct_fatal = sprintf "%3.1f", 100 * $num_fatal / $num_features;
230 :     $pct_warn = sprintf "%3.1f", 100 * $num_warn / $num_features;
231 :     $pct_tot = sprintf "%3.1f", 100 * $num_tot / $num_features;
232 :     $pct_missing = sprintf "%3.1f", 100 * $num_missing / $num_features;
233 :     }
234 :     else {
235 :     $num_fatal = $num_warn = $num_tot = 999999999;
236 :     $pct_fatal = $pct_warn = $pct_tot = $pct_missing = 100;
237 :     }
238 :    
239 :     my $recall = 0;
240 :     if ($num_missing > (0.1) * $num_features) {
241 :     $recall = 1;
242 :     }
243 :    
244 :     my $out_fh = \*STDERR;
245 : overbeek 1.10 if (($num_fatal && !$no_fatal) || ($num_tot >= (0.10) * $num_features)) {
246 : overbeek 1.1 $recall = 1;
247 :     $out_fh = \*STDOUT;
248 :     }
249 :    
250 : olson 1.5 #
251 :     # We want to always write output.
252 :     #
253 :     if (1 || $num_fatal || $num_warn || $recall) {
254 : overbeek 1.3 if ($meta_file) {
255 :     $meta->set_metadata( qq(qc.Org-ID), [ qq(INFO), $org_id ] );
256 :    
257 : overbeek 1.10 $meta->set_metadata( qq(qc.Num_features), [ qq(SCORE), $num_features ] );
258 :     $meta->set_metadata( qq(qc.Num_tot), [ qq(SCORE), $num_tot ] );
259 :     $meta->set_metadata( qq(qc.Num_fatal), [ qq(SCORE), $num_fatal ] );
260 :     $meta->set_metadata( qq(qc.Num_warn), [ qq(SCORE), $num_warn ] );
261 :     $meta->set_metadata( qq(qc.Possible_missing), [ qq(SCORE), $num_missing ] ) if $num_missing;
262 : overbeek 1.3
263 : overbeek 1.10 $meta->set_metadata( qq(qc.Pct_tot), [ qq(SCORE), $pct_tot ] );
264 :     $meta->set_metadata( qq(qc.Pct_fatal), [ qq(SCORE), $pct_fatal ] );
265 :     $meta->set_metadata( qq(qc.Pct_warn), [ qq(SCORE), $pct_warn ] );
266 :     $meta->set_metadata( qq(qc.Pct_missing), [ qq(SCORE), $pct_missing ] );# if $num_missing;
267 : overbeek 1.3
268 : overbeek 1.10 $meta->set_metadata( qq(qc.RNA_overlaps), [ qq($FATAL), $rna_overlaps ] );# if $rna_overlaps;
269 :     $meta->set_metadata( qq(qc.Bad_STARTs), [ qq($FATAL), $bad_starts ] );# if $bad_starts;
270 :     $meta->set_metadata( qq(qc.Bad_STOPs), [ qq($FATAL), $bad_stops ] );# if $bad_stops;
271 :     $meta->set_metadata( qq(qc.Embedded), [ qq($FATAL), $embedded ] );# if $embedded;
272 :     $meta->set_metadata( qq(qc.Impossible), [ qq($FATAL), $impossible ] );# if $impossible;
273 : overbeek 1.3
274 : olson 1.5 $meta->set_metadata( qq(qc.Too_short), [ qq(WARNING), $too_short ] );# if $too_short;
275 :     $meta->set_metadata( qq(qc.Convergent), [ qq(WARNING), $convergent ] );# if $convergent;
276 :     $meta->set_metadata( qq(qc.Divergent), [ qq(WARNING), $divergent ] );# if $divergent;
277 :     $meta->set_metadata( qq(qc.Same_strand), [ qq(WARNING), $same_strand ] );# if $same_strand;
278 : overbeek 1.3 }
279 :    
280 :     if ($num_fatal) {
281 :     $msg = "$org_dir contains $num_tot features with fatal errors or warnings"
282 :     . " ($pct_tot\%), out of $num_features features";
283 :     print $out_fh "# $msg\n";
284 :     $meta->add_log_entry("qc", $msg) if $meta_file;
285 :     }
286 :    
287 :     if ($num_warn) {
288 :     $msg = "$org_dir contains or may lack $num_warn features with warnings"
289 :     . " ($pct_warn\%), out of $num_features features";
290 :     print $out_fh "# $msg\n";
291 :     $meta->add_log_entry("qc", $msg) if $meta_file;
292 :     }
293 :    
294 :     if ($recall) {
295 :     $msg = "Recommend recalling PEGs, or at least deleting the features with fatal errors";
296 :     print $out_fh "# $msg\n";
297 :     $meta->add_log_entry("qc", $msg) if $meta_file;
298 :     }
299 : overbeek 1.1
300 :     print $out_fh "INFO\tOrg-ID\t$org_id\n";
301 :    
302 :     print $out_fh "SCORE\tNum_features\t$num_features\n";
303 :     print $out_fh "SCORE\tNum_tot\t$num_tot\n";
304 :     print $out_fh "SCORE\tNum_fatal\t$num_fatal\n";
305 :     print $out_fh "SCORE\tNum_warn\t$num_warn\n";
306 : overbeek 1.10 print $out_fh "SCORE\tPossible_missing\t$num_missing\n" ;# if $num_missing;
307 : overbeek 1.1
308 :     print $out_fh "SCORE\tPct_tot\t$pct_tot\n";
309 :     print $out_fh "SCORE\tPct_fatal\t$pct_fatal\n";
310 :     print $out_fh "SCORE\tPct_warn\t$pct_warn\n";
311 : overbeek 1.10 print $out_fh "SCORE\tPct_missing\t$pct_missing\n" ;# if $num_missing;
312 : overbeek 1.3
313 : overbeek 1.10 print $out_fh "$FATAL\tRNA_overlaps\t$rna_overlaps\n" ;# if $rna_overlaps;
314 :     print $out_fh "$FATAL\tBad_STARTs\t$bad_starts\n" ;# if $bad_starts;
315 :     print $out_fh "$FATAL\tBad_STOPs\t$bad_stops\n" ;# if $bad_stops;
316 :     print $out_fh "$FATAL\tSame_STOP\t$same_stop\n" ;# if $same_stop;
317 :     print $out_fh "$FATAL\tEmbedded\t$embedded\n" ;# if $embedded;
318 :     print $out_fh "$FATAL\tImpossible_overlaps\t$impossible\n";# if $impossible;
319 :    
320 :     print $out_fh "WARNING\tToo_short\t$too_short\n" ;# if $too_short;
321 :     print $out_fh "WARNING\tConvergent\t$convergent\n" ;# if $convergent;
322 :     print $out_fh "WARNING\tDivergent\t$divergent\n" ;# if $divergent;
323 :     print $out_fh "WARNING\tSame_strand\t$same_strand\n" ;# if $same_strand;
324 : overbeek 1.1
325 :     print $out_fh "//\n";
326 :     }
327 :    
328 :     exit(0);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3