[Bio] / FigKernelPackages / Quality.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/Quality.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : overbeek 1.1 #
2 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
3 :     # for Interpretations of Genomes. All Rights Reserved.
4 :     #
5 :     # This file is part of the SEED Toolkit.
6 :     #
7 :     # The SEED Toolkit is free software. You can redistribute
8 :     # it and/or modify it under the terms of the SEED Toolkit
9 :     # Public License.
10 :     #
11 :     # You should have received a copy of the SEED Toolkit Public License
12 :     # along with this program; if not write to the University of Chicago
13 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14 :     # Genomes at veronika@thefig.info or download a copy from
15 :     # http://www.theseed.org/LICENSE.TXT.
16 :     #
17 :    
18 :     package Quality;
19 :    
20 :     use strict;
21 :     use FIG;
22 :     use FIG_Config;
23 :    
24 : overbeek 1.3 use Carp;
25 :     use File::Basename;
26 :    
27 : overbeek 1.1 =head1 Routines for Quality Assessment and Repair
28 :    
29 :     =head3 assess_assembly_quality
30 :    
31 :     C<< &Quality::assess_assembly_quality($org_dir); >>
32 :    
33 :     C<< &Quality::assess_assembly_quality($org_dir, $estimated_read_length); >>
34 :    
35 :     Makes an I<extremely> crude and rather conservative estimate of assembly
36 :     coverage-depth and percent completeness, based on Lander-Waterman theory.
37 :    
38 :     The "Skeleton OrgDir" directory-path argument C<$org_dir> is mandatory,
39 :     and does not default to a directory in the SEED organism hierarchy.
40 :    
41 :     The routine assumes a read-length of 500 bp unless told otherwise;
42 :     set the optional argument C<$estimated_read_length> to C<100> for genomes
43 :     sequenced using the "454" technology.
44 :    
45 :     Returns a list of two strings: C<($depth, $completness)>.
46 :    
47 :     =cut
48 :    
49 :     sub assess_assembly_quality {
50 :     my ($org_dir, $estimated_read_length) = @_;
51 :    
52 :     if (!-d $org_dir) {
53 :     warn "OrgDir $org_dir does not exist";
54 :     return (undef, undef);
55 :     }
56 :    
57 :     if (not defined($estimated_read_length)) {
58 :     $estimated_read_length = 500;
59 :     }
60 :    
61 :     my $depth = 10.0;
62 :     my $completeness;
63 :     if (!-s "$org_dir/contigs") {
64 :     warn "Contigs file $org_dir/contigs does not exist";
65 :     return (undef, undef);
66 :     }
67 :     else {
68 :     my $summary = `sequence_length_histogram -null $org_dir/contigs 2>&1`;
69 :     if ($summary =~ m/There are (\d+) chars in (\d+) seq.*mean length = (\d+)/so) {
70 :     my ($chars, $num_seqs, $expect) = ($1, $2, $3);
71 : overbeek 1.3 if ($num_seqs == 1) {
72 :     return (10.0, 0.999954);
73 :     }
74 : overbeek 1.1
75 :     my $size = $chars;
76 :     my $eff_read_len = $estimated_read_length - 50;
77 :     for (my $i=0; $i < 500; ++$i) {
78 :     my $last = $depth;
79 :     my $n_reads = $depth * $size / $eff_read_len;
80 :     my $n_single = exp(-$depth) * ( ($n_reads - 2) * exp(-$depth) + 2.0 );
81 :    
82 :     $depth = log( 1.0 + $depth * ($size / $eff_read_len) / ($num_seqs + $n_single - 1) );
83 :     $completeness = (1.0 - exp(-$depth));
84 :     $size = $chars / $completeness;
85 :    
86 : overbeek 1.3 if ($completeness > 0.999955) {
87 :     $completeness = 0.999955;
88 :     $depth = 10.0;
89 :     last;
90 :     }
91 :    
92 : overbeek 1.1 last if ($depth == $last);
93 :     }
94 :     }
95 :     else {
96 :     warn "Could not parse sequence_length_histogram summary: $summary";
97 :     }
98 :     }
99 :    
100 :     $depth = sprintf "%.1f", $depth;
101 :     $completeness = sprintf "%.2f", (100.0 * $completeness);
102 :    
103 :     return ($depth, $completeness);
104 :     }
105 :    
106 :    
107 :    
108 :     =head3 assess_gene_call_quality
109 :    
110 :     C<< &Quality::assess_gene_call_quality($org_dir); >>
111 :    
112 :     This routine is a wrapper for the command-line tool C<assess_gene_call_quality>,
113 :     which catches some of the more common fatal errors and warnings for a genome
114 :     skeleton directory.
115 :    
116 :     The "Skeleton OrgDir" directory-path argument C<$org_dir> is mandatory,
117 :     and does not default to a directory in the SEED organism hierarchy.
118 :    
119 :     The routine returns a list of pointers to two hashes, C<($fatal, $warnings)>,
120 :     whose keys are the type of error in each class, and whose values are the
121 :     number of features having that type of error.
122 :    
123 :     As a side-effect, this routine creates three files in the skeleton OrgDir:
124 :    
125 : parrello 1.2 =over 4
126 :    
127 : overbeek 1.1 =item * C<quality.report>, which is a tab-seperated three-column file
128 :     output by the command-line tool C<assess_gene_call_quality>,
129 :     with entry format C<< KEY_TYPE\tKEY_NAME\tVALUE\n >>. The report is
130 :     headed by one or more "human-readable" comment lines beginning with the
131 :     '#' character, and terminated by the end-of-record indicator C<< //\n >>.
132 :    
133 :     =item * C<overlap.report>, which is a detailed, human-readable
134 :     (but not easily machine-parsible!) report that is output by the
135 :     command-line tool C<make_overlap_report>; this tool finds embedded or
136 :     overlapping feature pairs exceeding a set of user-definable thresholds,
137 :     as well as PEGs with invalid START or STOP codons, etc.
138 :     (The default thresholds are set to values that we consider unacceptable
139 :     for a SEED genome)
140 :    
141 :     =item * C<overlap.summary>, which is a short human-readbale summary
142 :     of the number of problematic features found by C<make_overlap_report>.
143 :    
144 : parrello 1.2 =back
145 :    
146 : overbeek 1.1 B<TODO:> C<assess_gene_call_quality> does not yet catch all the errors
147 :     that will be flagged as "FATAL" by the command-line tool C<verify_genome_directory>.
148 :    
149 :     =cut
150 :    
151 :     sub assess_gene_call_quality {
152 :     my ($org_dir) = @_;
153 :    
154 :     if (!-d $org_dir) {
155 :     warn "OrgDir $org_dir does not exist\n";
156 :     return (undef, undef);
157 :     }
158 : overbeek 1.3 my $parent = basename($org_dir) || confess "Could not extract parent of $org_dir";
159 : overbeek 1.1
160 :     my $fatal = {};
161 :     my $warnings = {};
162 : overbeek 1.3 if (system("assess_gene_call_quality --metas=$parent/meta.xml $org_dir > $org_dir/quality.report 2>&1")) {
163 :     warn "FAILED: assess_gene_call_quality $org_dir > $org_dir/quality.report 2>&1";
164 :     return (undef, undef);
165 : overbeek 1.1 }
166 :    
167 :     my @report = `cat $org_dir/quality.report`;
168 :     %$fatal = map { m/^\S+\t(\S+)\t(\S+)/; $1 => $2
169 :     } grep {
170 :     m/^FATAL/
171 :     } @report;
172 :    
173 :     %$warnings = map { m/^\S+\t(\S+)\t(\S+)/; $1 => $2
174 :     } grep {
175 :     m/^WARNING/
176 :     } @report;
177 :    
178 :     return ($fatal, $warnings);
179 :     }
180 :    
181 :    
182 :     =head3 remove_rna_overlaps
183 :    
184 :     C<< &Quality::remove_rna_overlaps($org_dir); >>
185 :    
186 :     C<< &Quality::remove_rna_overlaps($org_dir, $max_overlap); >>
187 :    
188 :     This routine is a wrapper for the command-line tool C<remove_rna_overlaps>,
189 :     which removes any PEGs from the file C<< $org_dir?features/peg/tbl >>
190 :     that overlap an RNA feature by more than some threshold, and then rebuilds
191 :     the PEG-translation FASTA file C<< $org_dir?features/peg/fasta >>.
192 :    
193 :     The "Skeleton OrgDir" directory-path argument C<$org_dir> is mandatory,
194 :     and does not default to a directory in the SEED organism hierarchy.
195 :    
196 :     The optional argument C<$max_overlap> resets the maximum number of base-pairs
197 :     that a PEG is allowed to overlap an RNA feature before it will be rejected.
198 :     (The default maximum overlap is 10 bp.)
199 :    
200 :     Returns C<1> on success, and C<undef> on failure.
201 :    
202 :     B<TODO:> The command-line tool C<remove_rna_overlaps> needs to be
203 :     re-written to only remove PEGs that overlap RNAs "non-removably," and
204 :     just re-call the STARTs for those PEGs that have "removable" overlaps.
205 :    
206 :     =cut
207 :    
208 :     sub remove_rna_overlaps {
209 :     my ($org_dir, $max_overlap) = @_;
210 :    
211 :     if (!-d $org_dir) {
212 :     warn "OrgDir $org_dir does not exist\n";
213 :     return undef;
214 :     }
215 :    
216 :     $max_overlap = $max_overlap ? qq(-max=$max_overlap) : qq();
217 :     if (system("remove_rna_overlaps $max_overlap $org_dir")) {
218 :     warn "FAILED: remove_rna_overlaps $max_overlap $org_dir";
219 :     return undef;
220 :     }
221 :    
222 : overbeek 1.3 &assess_gene_call_quality($org_dir) || confess "Could not re-assess call quality of $org_dir";
223 : overbeek 1.1 return 1;
224 :     }
225 :    
226 :    
227 :    
228 :     =head3 remove_embedded_pegs
229 :    
230 :     C<< &Quality::remove_embedded_pegs($org_dir); >>
231 :    
232 :     This routine is a wrapper for the command-line tool C<remove_embedded_pegs>,
233 :     which removes any PEGs from the file C<< $org_dir?features/peg/tbl >>
234 :     that are embedded inside other pegs, and then rebuilds the PEG-translation
235 :     FASTA file C<< $org_dir?features/peg/fasta >>.
236 :    
237 :     The "Skeleton OrgDir" directory-path argument C<$org_dir> is mandatory,
238 :     and does not default to a directory in the SEED organism hierarchy.
239 :    
240 :     The optional argument C<$max_overlap> resets the maximum number of base-pairs
241 :     that a PEG is allowed to overlap an RNA feature before it will be rejected.
242 :     (The default maximum overlap is 10 bp.)
243 :    
244 :     Returns C<1> on success, and C<undef> on failure.
245 :    
246 :     B<TODO:> The command-line tool C<remove_embedded_pegs> needs to be
247 :     re-written to remove PEGs more intelligently, based on which PEG
248 :     has less comparative support.
249 :    
250 :     =cut
251 :    
252 :     sub remove_embedded_pegs {
253 :     my ($org_dir) = @_;
254 :    
255 :     if (!-d $org_dir) {
256 :     warn "OrgDir $org_dir does not exist\n";
257 :     return undef;
258 :     }
259 :    
260 :     if (system("remove_embedded_pegs $org_dir")) {
261 :     warn "FAILED: remove_embedded_pegs $org_dir";
262 :     return undef;
263 :     }
264 :    
265 : overbeek 1.3 &assess_gene_call_quality($org_dir) || confess "Could not re-assess call quality of $org_dir";
266 : overbeek 1.1 return 1;
267 :     }
268 :    
269 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3