[Bio] / FigKernelPackages / Quality.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/Quality.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : overbeek 1.1 #
2 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
3 :     # for Interpretations of Genomes. All Rights Reserved.
4 :     #
5 :     # This file is part of the SEED Toolkit.
6 :     #
7 :     # The SEED Toolkit is free software. You can redistribute
8 :     # it and/or modify it under the terms of the SEED Toolkit
9 :     # Public License.
10 :     #
11 :     # You should have received a copy of the SEED Toolkit Public License
12 :     # along with this program; if not write to the University of Chicago
13 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14 :     # Genomes at veronika@thefig.info or download a copy from
15 :     # http://www.theseed.org/LICENSE.TXT.
16 :     #
17 :    
18 :     package Quality;
19 :    
20 :     use strict;
21 :     use FIG;
22 :     use FIG_Config;
23 :    
24 :     =head1 Routines for Quality Assessment and Repair
25 :    
26 :     =head3 assess_assembly_quality
27 :    
28 :     C<< &Quality::assess_assembly_quality($org_dir); >>
29 :    
30 :     C<< &Quality::assess_assembly_quality($org_dir, $estimated_read_length); >>
31 :    
32 :     Makes an I<extremely> crude and rather conservative estimate of assembly
33 :     coverage-depth and percent completeness, based on Lander-Waterman theory.
34 :    
35 :     The "Skeleton OrgDir" directory-path argument C<$org_dir> is mandatory,
36 :     and does not default to a directory in the SEED organism hierarchy.
37 :    
38 :     The routine assumes a read-length of 500 bp unless told otherwise;
39 :     set the optional argument C<$estimated_read_length> to C<100> for genomes
40 :     sequenced using the "454" technology.
41 :    
42 :     Returns a list of two strings: C<($depth, $completness)>.
43 :    
44 :     =cut
45 :    
46 :     sub assess_assembly_quality {
47 :     my ($org_dir, $estimated_read_length) = @_;
48 :    
49 :     if (!-d $org_dir) {
50 :     warn "OrgDir $org_dir does not exist";
51 :     return (undef, undef);
52 :     }
53 :    
54 :     if (not defined($estimated_read_length)) {
55 :     $estimated_read_length = 500;
56 :     }
57 :    
58 :     my $depth = 10.0;
59 :     my $completeness;
60 :     if (!-s "$org_dir/contigs") {
61 :     warn "Contigs file $org_dir/contigs does not exist";
62 :     return (undef, undef);
63 :     }
64 :     else {
65 :     my $summary = `sequence_length_histogram -null $org_dir/contigs 2>&1`;
66 :     if ($summary =~ m/There are (\d+) chars in (\d+) seq.*mean length = (\d+)/so) {
67 :     my ($chars, $num_seqs, $expect) = ($1, $2, $3);
68 :    
69 :     my $size = $chars;
70 :     my $eff_read_len = $estimated_read_length - 50;
71 :     for (my $i=0; $i < 500; ++$i) {
72 :     my $last = $depth;
73 :     my $n_reads = $depth * $size / $eff_read_len;
74 :     my $n_single = exp(-$depth) * ( ($n_reads - 2) * exp(-$depth) + 2.0 );
75 :    
76 :     $depth = log( 1.0 + $depth * ($size / $eff_read_len) / ($num_seqs + $n_single - 1) );
77 :     $completeness = (1.0 - exp(-$depth));
78 :     $size = $chars / $completeness;
79 :    
80 :     last if ($depth == $last);
81 :     }
82 :     }
83 :     else {
84 :     warn "Could not parse sequence_length_histogram summary: $summary";
85 :     }
86 :     }
87 :    
88 :     $depth = sprintf "%.1f", $depth;
89 :     $completeness = sprintf "%.2f", (100.0 * $completeness);
90 :    
91 :     return ($depth, $completeness);
92 :     }
93 :    
94 :    
95 :    
96 :     =head3 assess_gene_call_quality
97 :    
98 :     C<< &Quality::assess_gene_call_quality($org_dir); >>
99 :    
100 :     This routine is a wrapper for the command-line tool C<assess_gene_call_quality>,
101 :     which catches some of the more common fatal errors and warnings for a genome
102 :     skeleton directory.
103 :    
104 :     The "Skeleton OrgDir" directory-path argument C<$org_dir> is mandatory,
105 :     and does not default to a directory in the SEED organism hierarchy.
106 :    
107 :     The routine returns a list of pointers to two hashes, C<($fatal, $warnings)>,
108 :     whose keys are the type of error in each class, and whose values are the
109 :     number of features having that type of error.
110 :    
111 :     As a side-effect, this routine creates three files in the skeleton OrgDir:
112 :    
113 : parrello 1.2 =over 4
114 :    
115 : overbeek 1.1 =item * C<quality.report>, which is a tab-seperated three-column file
116 :     output by the command-line tool C<assess_gene_call_quality>,
117 :     with entry format C<< KEY_TYPE\tKEY_NAME\tVALUE\n >>. The report is
118 :     headed by one or more "human-readable" comment lines beginning with the
119 :     '#' character, and terminated by the end-of-record indicator C<< //\n >>.
120 :    
121 :     =item * C<overlap.report>, which is a detailed, human-readable
122 :     (but not easily machine-parsible!) report that is output by the
123 :     command-line tool C<make_overlap_report>; this tool finds embedded or
124 :     overlapping feature pairs exceeding a set of user-definable thresholds,
125 :     as well as PEGs with invalid START or STOP codons, etc.
126 :     (The default thresholds are set to values that we consider unacceptable
127 :     for a SEED genome)
128 :    
129 :     =item * C<overlap.summary>, which is a short human-readbale summary
130 :     of the number of problematic features found by C<make_overlap_report>.
131 :    
132 : parrello 1.2 =back
133 :    
134 : overbeek 1.1 B<TODO:> C<assess_gene_call_quality> does not yet catch all the errors
135 :     that will be flagged as "FATAL" by the command-line tool C<verify_genome_directory>.
136 :    
137 :     =cut
138 :    
139 :     sub assess_gene_call_quality {
140 :     my ($org_dir) = @_;
141 :    
142 :     if (!-d $org_dir) {
143 :     warn "OrgDir $org_dir does not exist\n";
144 :     return (undef, undef);
145 :     }
146 :    
147 :     my $fatal = {};
148 :     my $warnings = {};
149 :     if ( (!-f "$org_dir/quality.report")
150 :     || ((-M "$org_dir/quality.report") > (-M "$org_dir/contigs"))
151 :     ) {
152 :     if (system("assess_gene_call_quality $org_dir > $org_dir/quality.report 2>&1")) {
153 :     warn "FAILED: assess_gene_call_quality $org_dir > $org_dir/quality.report 2>&1";
154 :     return (undef, undef);
155 :     }
156 :     }
157 :    
158 :     my @report = `cat $org_dir/quality.report`;
159 :     %$fatal = map { m/^\S+\t(\S+)\t(\S+)/; $1 => $2
160 :     } grep {
161 :     m/^FATAL/
162 :     } @report;
163 :    
164 :     %$warnings = map { m/^\S+\t(\S+)\t(\S+)/; $1 => $2
165 :     } grep {
166 :     m/^WARNING/
167 :     } @report;
168 :    
169 :     return ($fatal, $warnings);
170 :     }
171 :    
172 :    
173 :     =head3 remove_rna_overlaps
174 :    
175 :     C<< &Quality::remove_rna_overlaps($org_dir); >>
176 :    
177 :     C<< &Quality::remove_rna_overlaps($org_dir, $max_overlap); >>
178 :    
179 :     This routine is a wrapper for the command-line tool C<remove_rna_overlaps>,
180 :     which removes any PEGs from the file C<< $org_dir?features/peg/tbl >>
181 :     that overlap an RNA feature by more than some threshold, and then rebuilds
182 :     the PEG-translation FASTA file C<< $org_dir?features/peg/fasta >>.
183 :    
184 :     The "Skeleton OrgDir" directory-path argument C<$org_dir> is mandatory,
185 :     and does not default to a directory in the SEED organism hierarchy.
186 :    
187 :     The optional argument C<$max_overlap> resets the maximum number of base-pairs
188 :     that a PEG is allowed to overlap an RNA feature before it will be rejected.
189 :     (The default maximum overlap is 10 bp.)
190 :    
191 :     Returns C<1> on success, and C<undef> on failure.
192 :    
193 :     B<TODO:> The command-line tool C<remove_rna_overlaps> needs to be
194 :     re-written to only remove PEGs that overlap RNAs "non-removably," and
195 :     just re-call the STARTs for those PEGs that have "removable" overlaps.
196 :    
197 :     =cut
198 :    
199 :     sub remove_rna_overlaps {
200 :     my ($org_dir, $max_overlap) = @_;
201 :    
202 :     if (!-d $org_dir) {
203 :     warn "OrgDir $org_dir does not exist\n";
204 :     return undef;
205 :     }
206 :    
207 :     $max_overlap = $max_overlap ? qq(-max=$max_overlap) : qq();
208 :     if (system("remove_rna_overlaps $max_overlap $org_dir")) {
209 :     warn "FAILED: remove_rna_overlaps $max_overlap $org_dir";
210 :     return undef;
211 :     }
212 :    
213 :     return 1;
214 :     }
215 :    
216 :    
217 :    
218 :     =head3 remove_embedded_pegs
219 :    
220 :     C<< &Quality::remove_embedded_pegs($org_dir); >>
221 :    
222 :     This routine is a wrapper for the command-line tool C<remove_embedded_pegs>,
223 :     which removes any PEGs from the file C<< $org_dir?features/peg/tbl >>
224 :     that are embedded inside other pegs, and then rebuilds the PEG-translation
225 :     FASTA file C<< $org_dir?features/peg/fasta >>.
226 :    
227 :     The "Skeleton OrgDir" directory-path argument C<$org_dir> is mandatory,
228 :     and does not default to a directory in the SEED organism hierarchy.
229 :    
230 :     The optional argument C<$max_overlap> resets the maximum number of base-pairs
231 :     that a PEG is allowed to overlap an RNA feature before it will be rejected.
232 :     (The default maximum overlap is 10 bp.)
233 :    
234 :     Returns C<1> on success, and C<undef> on failure.
235 :    
236 :     B<TODO:> The command-line tool C<remove_embedded_pegs> needs to be
237 :     re-written to remove PEGs more intelligently, based on which PEG
238 :     has less comparative support.
239 :    
240 :     =cut
241 :    
242 :     sub remove_embedded_pegs {
243 :     my ($org_dir) = @_;
244 :    
245 :     if (!-d $org_dir) {
246 :     warn "OrgDir $org_dir does not exist\n";
247 :     return undef;
248 :     }
249 :    
250 :     if (system("remove_embedded_pegs $org_dir")) {
251 :     warn "FAILED: remove_embedded_pegs $org_dir";
252 :     return undef;
253 :     }
254 :    
255 :     return 1;
256 :     }
257 :    
258 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3