[Bio] / FigKernelPackages / Quality.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/Quality.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : overbeek 1.1 #
2 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
3 :     # for Interpretations of Genomes. All Rights Reserved.
4 :     #
5 :     # This file is part of the SEED Toolkit.
6 :     #
7 :     # The SEED Toolkit is free software. You can redistribute
8 :     # it and/or modify it under the terms of the SEED Toolkit
9 :     # Public License.
10 :     #
11 :     # You should have received a copy of the SEED Toolkit Public License
12 :     # along with this program; if not write to the University of Chicago
13 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14 :     # Genomes at veronika@thefig.info or download a copy from
15 :     # http://www.theseed.org/LICENSE.TXT.
16 :     #
17 :    
18 :     package Quality;
19 :    
20 :     use strict;
21 :     use FIG;
22 :     use FIG_Config;
23 :    
24 : overbeek 1.3 use Carp;
25 :     use File::Basename;
26 :    
27 : overbeek 1.1 =head1 Routines for Quality Assessment and Repair
28 :    
29 :     =head3 assess_assembly_quality
30 :    
31 :     C<< &Quality::assess_assembly_quality($org_dir); >>
32 :    
33 :     C<< &Quality::assess_assembly_quality($org_dir, $estimated_read_length); >>
34 :    
35 :     Makes an I<extremely> crude and rather conservative estimate of assembly
36 :     coverage-depth and percent completeness, based on Lander-Waterman theory.
37 :    
38 :     The "Skeleton OrgDir" directory-path argument C<$org_dir> is mandatory,
39 :     and does not default to a directory in the SEED organism hierarchy.
40 :    
41 :     The routine assumes a read-length of 500 bp unless told otherwise;
42 :     set the optional argument C<$estimated_read_length> to C<100> for genomes
43 :     sequenced using the "454" technology.
44 :    
45 : overbeek 1.4 Returns a list of two strings on success, C<($depth, $completness)>,
46 :     and an empty list on failure.
47 : overbeek 1.1
48 :     =cut
49 :    
50 :     sub assess_assembly_quality {
51 :     my ($org_dir, $estimated_read_length) = @_;
52 :    
53 :     if (!-d $org_dir) {
54 :     warn "OrgDir $org_dir does not exist";
55 : overbeek 1.4 return ();
56 : overbeek 1.1 }
57 :    
58 :     if (not defined($estimated_read_length)) {
59 :     $estimated_read_length = 500;
60 :     }
61 :    
62 :     my $depth = 10.0;
63 :     my $completeness;
64 :     if (!-s "$org_dir/contigs") {
65 :     warn "Contigs file $org_dir/contigs does not exist";
66 : overbeek 1.4 return ();
67 : overbeek 1.1 }
68 :     else {
69 :     my $summary = `sequence_length_histogram -null $org_dir/contigs 2>&1`;
70 :     if ($summary =~ m/There are (\d+) chars in (\d+) seq.*mean length = (\d+)/so) {
71 :     my ($chars, $num_seqs, $expect) = ($1, $2, $3);
72 : overbeek 1.3 if ($num_seqs == 1) {
73 :     return (10.0, 0.999954);
74 :     }
75 : overbeek 1.1
76 :     my $size = $chars;
77 :     my $eff_read_len = $estimated_read_length - 50;
78 :     for (my $i=0; $i < 500; ++$i) {
79 :     my $last = $depth;
80 :     my $n_reads = $depth * $size / $eff_read_len;
81 :     my $n_single = exp(-$depth) * ( ($n_reads - 2) * exp(-$depth) + 2.0 );
82 :    
83 :     $depth = log( 1.0 + $depth * ($size / $eff_read_len) / ($num_seqs + $n_single - 1) );
84 :     $completeness = (1.0 - exp(-$depth));
85 :     $size = $chars / $completeness;
86 :    
87 : overbeek 1.3 if ($completeness > 0.999955) {
88 :     $completeness = 0.999955;
89 :     $depth = 10.0;
90 :     last;
91 :     }
92 :    
93 : overbeek 1.1 last if ($depth == $last);
94 :     }
95 :     }
96 :     else {
97 :     warn "Could not parse sequence_length_histogram summary: $summary";
98 :     }
99 :     }
100 :    
101 :     $depth = sprintf "%.1f", $depth;
102 :     $completeness = sprintf "%.2f", (100.0 * $completeness);
103 :    
104 :     return ($depth, $completeness);
105 :     }
106 :    
107 :    
108 :    
109 :     =head3 assess_gene_call_quality
110 :    
111 :     C<< &Quality::assess_gene_call_quality($org_dir); >>
112 :    
113 :     This routine is a wrapper for the command-line tool C<assess_gene_call_quality>,
114 :     which catches some of the more common fatal errors and warnings for a genome
115 :     skeleton directory.
116 :    
117 :     The "Skeleton OrgDir" directory-path argument C<$org_dir> is mandatory,
118 :     and does not default to a directory in the SEED organism hierarchy.
119 :    
120 : overbeek 1.4 On success, the routine returns a list of pointers to two hashes, C<($fatal, $warnings)>,
121 : overbeek 1.1 whose keys are the type of error in each class, and whose values are the
122 : overbeek 1.4 number of features having that type of error; on failure, it returns the empty list.
123 : overbeek 1.1
124 :     As a side-effect, this routine creates three files in the skeleton OrgDir:
125 :    
126 : parrello 1.2 =over 4
127 :    
128 : overbeek 1.1 =item * C<quality.report>, which is a tab-seperated three-column file
129 :     output by the command-line tool C<assess_gene_call_quality>,
130 :     with entry format C<< KEY_TYPE\tKEY_NAME\tVALUE\n >>. The report is
131 :     headed by one or more "human-readable" comment lines beginning with the
132 :     '#' character, and terminated by the end-of-record indicator C<< //\n >>.
133 :    
134 :     =item * C<overlap.report>, which is a detailed, human-readable
135 :     (but not easily machine-parsible!) report that is output by the
136 :     command-line tool C<make_overlap_report>; this tool finds embedded or
137 :     overlapping feature pairs exceeding a set of user-definable thresholds,
138 :     as well as PEGs with invalid START or STOP codons, etc.
139 :     (The default thresholds are set to values that we consider unacceptable
140 :     for a SEED genome)
141 :    
142 :     =item * C<overlap.summary>, which is a short human-readbale summary
143 :     of the number of problematic features found by C<make_overlap_report>.
144 :    
145 : parrello 1.2 =back
146 :    
147 : overbeek 1.1 B<TODO:> C<assess_gene_call_quality> does not yet catch all the errors
148 :     that will be flagged as "FATAL" by the command-line tool C<verify_genome_directory>.
149 :    
150 :     =cut
151 :    
152 :     sub assess_gene_call_quality {
153 :     my ($org_dir) = @_;
154 :    
155 :     if (!-d $org_dir) {
156 :     warn "OrgDir $org_dir does not exist\n";
157 : overbeek 1.4 return ();
158 : overbeek 1.1 }
159 : overbeek 1.3 my $parent = basename($org_dir) || confess "Could not extract parent of $org_dir";
160 : overbeek 1.1
161 :     my $fatal = {};
162 :     my $warnings = {};
163 : olson 1.5 if (system("assess_gene_call_quality --meta=$parent/meta.xml $org_dir > $org_dir/quality.report 2>&1")) {
164 : overbeek 1.3 warn "FAILED: assess_gene_call_quality $org_dir > $org_dir/quality.report 2>&1";
165 : overbeek 1.4 return ();
166 : overbeek 1.1 }
167 :    
168 :     my @report = `cat $org_dir/quality.report`;
169 :     %$fatal = map { m/^\S+\t(\S+)\t(\S+)/; $1 => $2
170 :     } grep {
171 :     m/^FATAL/
172 :     } @report;
173 :    
174 :     %$warnings = map { m/^\S+\t(\S+)\t(\S+)/; $1 => $2
175 :     } grep {
176 :     m/^WARNING/
177 :     } @report;
178 :    
179 :     return ($fatal, $warnings);
180 :     }
181 :    
182 :    
183 :     =head3 remove_rna_overlaps
184 :    
185 :     C<< &Quality::remove_rna_overlaps($org_dir); >>
186 :    
187 :     C<< &Quality::remove_rna_overlaps($org_dir, $max_overlap); >>
188 :    
189 :     This routine is a wrapper for the command-line tool C<remove_rna_overlaps>,
190 :     which removes any PEGs from the file C<< $org_dir?features/peg/tbl >>
191 :     that overlap an RNA feature by more than some threshold, and then rebuilds
192 :     the PEG-translation FASTA file C<< $org_dir?features/peg/fasta >>.
193 :    
194 :     The "Skeleton OrgDir" directory-path argument C<$org_dir> is mandatory,
195 :     and does not default to a directory in the SEED organism hierarchy.
196 :    
197 :     The optional argument C<$max_overlap> resets the maximum number of base-pairs
198 :     that a PEG is allowed to overlap an RNA feature before it will be rejected.
199 :     (The default maximum overlap is 10 bp.)
200 :    
201 :     Returns C<1> on success, and C<undef> on failure.
202 :    
203 :     B<TODO:> The command-line tool C<remove_rna_overlaps> needs to be
204 :     re-written to only remove PEGs that overlap RNAs "non-removably," and
205 :     just re-call the STARTs for those PEGs that have "removable" overlaps.
206 :    
207 :     =cut
208 :    
209 :     sub remove_rna_overlaps {
210 :     my ($org_dir, $max_overlap) = @_;
211 :    
212 :     if (!-d $org_dir) {
213 :     warn "OrgDir $org_dir does not exist\n";
214 :     return undef;
215 :     }
216 :    
217 :     $max_overlap = $max_overlap ? qq(-max=$max_overlap) : qq();
218 :     if (system("remove_rna_overlaps $max_overlap $org_dir")) {
219 :     warn "FAILED: remove_rna_overlaps $max_overlap $org_dir";
220 :     return undef;
221 :     }
222 :    
223 : overbeek 1.3 &assess_gene_call_quality($org_dir) || confess "Could not re-assess call quality of $org_dir";
224 : overbeek 1.1 return 1;
225 :     }
226 :    
227 :    
228 :    
229 :     =head3 remove_embedded_pegs
230 :    
231 :     C<< &Quality::remove_embedded_pegs($org_dir); >>
232 :    
233 :     This routine is a wrapper for the command-line tool C<remove_embedded_pegs>,
234 :     which removes any PEGs from the file C<< $org_dir?features/peg/tbl >>
235 :     that are embedded inside other pegs, and then rebuilds the PEG-translation
236 :     FASTA file C<< $org_dir?features/peg/fasta >>.
237 :    
238 :     The "Skeleton OrgDir" directory-path argument C<$org_dir> is mandatory,
239 :     and does not default to a directory in the SEED organism hierarchy.
240 :    
241 :     The optional argument C<$max_overlap> resets the maximum number of base-pairs
242 :     that a PEG is allowed to overlap an RNA feature before it will be rejected.
243 :     (The default maximum overlap is 10 bp.)
244 :    
245 :     Returns C<1> on success, and C<undef> on failure.
246 :    
247 :     B<TODO:> The command-line tool C<remove_embedded_pegs> needs to be
248 :     re-written to remove PEGs more intelligently, based on which PEG
249 :     has less comparative support.
250 :    
251 :     =cut
252 :    
253 :     sub remove_embedded_pegs {
254 :     my ($org_dir) = @_;
255 :    
256 :     if (!-d $org_dir) {
257 :     warn "OrgDir $org_dir does not exist\n";
258 :     return undef;
259 :     }
260 :    
261 :     if (system("remove_embedded_pegs $org_dir")) {
262 :     warn "FAILED: remove_embedded_pegs $org_dir";
263 :     return undef;
264 :     }
265 :    
266 : overbeek 1.3 &assess_gene_call_quality($org_dir) || confess "Could not re-assess call quality of $org_dir";
267 : overbeek 1.1 return 1;
268 :     }
269 :    
270 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3