[Bio] / FigKernelPackages / GenomeChecker.pm Repository:
ViewVC logotype

Annotation of /FigKernelPackages/GenomeChecker.pm

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #
2 :     # Copyright (c) 2003-2015 University of Chicago and Fellowship
3 :     # for Interpretations of Genomes. All Rights Reserved.
4 :     #
5 :     # This file is part of the SEED Toolkit.
6 :     #
7 :     # The SEED Toolkit is free software. You can redistribute
8 :     # it and/or modify it under the terms of the SEED Toolkit
9 :     # Public License.
10 :     #
11 :     # You should have received a copy of the SEED Toolkit Public License
12 :     # along with this program; if not write to the University of Chicago
13 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
14 :     # Genomes at veronika@thefig.info or download a copy from
15 :     # http://www.theseed.org/LICENSE.TXT.
16 :     #
17 :    
18 :    
19 :     package GenomeChecker;
20 :    
21 :     use strict;
22 :     use warnings;
23 :     use RoleParse;
24 :     use ScriptUtils;
25 :     use Stats;
26 :     use FIG_Config;
27 :     use SeedUtils qw();
28 :     use Math::Round;
29 :    
30 :     =head1 Data Structures to Check Genomes for Completeness
31 :    
32 :     This object manages data structures to check genome for completeness and contamination. The algorithm is simple, and
33 :     is based on files created by the scripts L<taxon_analysis.pl>, L<group_marker_roles.pl>, and L<compute_taxon_map.pl>
34 :     plus the global role-mapping file C<roles.in.subsystems>.
35 :    
36 :     For each taxonomic group, there is a list of roles that appear singly in 97% of the genomes in that taxonomic grouping
37 :     and an associated weight for each. Completeness is measured by the weighted percent of those roles that actually occur.
38 :     Contamination is measured by the weighted number of duplicates found.
39 :    
40 :     This object contains the following fields.
41 :    
42 :     =over 4
43 :    
44 :     =item taxNames
45 :    
46 :     A hash mapping taxonomic group IDs to names.
47 :    
48 :     =item roleMap
49 :    
50 :     A hash mapping role checksums to role IDs.
51 :    
52 :     =item nameMap
53 :    
54 :     A hash mapping role IDs to names.
55 :    
56 :     =item stats
57 :    
58 :     A L<Stats> object for tracking statistics.
59 :    
60 :     =item logH
61 :    
62 :     Handle of a log file for status messages, or C<undef> if no status messages should be produced.
63 :    
64 :     =item roleLists
65 :    
66 :     A hash mapping each taxonomic group ID to a hash of the identifying role IDs and their weights. (In other words, the key of
67 :     the sub-hash is role ID and the value is the role weight).
68 :    
69 :     =item taxSizes
70 :    
71 :     A hash mapping each taxonomic group ID to its total role weight.
72 :    
73 :     =back
74 :    
75 :     =cut
76 :    
77 :     =head2 Special Methods
78 :    
79 :     =head3 new
80 :    
81 :     my $checker = GenomeChecker->new($checkDir, %options);
82 :    
83 :     Create a new genome-checker object.
84 :    
85 :     =over 4
86 :    
87 :     =item checkDir
88 :    
89 :     The name of a directory containing the main input file-- C<weighted.tbl> from L<p3-taxon-analysis.pl> and L<group_marker_roles.pl>.
90 :    
91 :     =item options
92 :    
93 :     A hash containing zero or more of the following keys.
94 :    
95 :     =over 8
96 :    
97 :     =item rolesInSubsystems
98 :    
99 :     The name of the file containing the master list of roles. This is a tab-delimited file with no headers. The first column of each
100 :     record is the role ID, the second is the role checksum, and the third is the role name. The default file is C<roles.in.subsystems>
101 :     in the global data directory.
102 :    
103 :     =item roleHashes
104 :    
105 :     If specified, overrides B<rolesInSubsystems>. A 2-tuple containing (0) a reference to a hash that maps role IDs to names,
106 :     and (1) a reference to a hash that maps role checksums to role IDs.
107 :    
108 :     =item logH
109 :    
110 :     Open handle for an output stream to contain log messages. The default is to not write log messages.
111 :    
112 :     =item stats
113 :    
114 :     A L<Stats> object for tracking statistics. If none is specified, one will be created.
115 :    
116 :     =back
117 :    
118 :     =back
119 :    
120 :     =cut
121 :    
122 :     sub new {
123 :     my ($class, $checkDir, %options) = @_;
124 :     # Process the options.
125 :     my $stats = $options{stats} // Stats->new();
126 :     my $logH = $options{logH};
127 :     my $roleFile = $options{rolesInSubsystems} // "$FIG_Config::global/roles.in.subsystems";
128 :     # We will track the roles of interest in here. When we read roles.in.subsystems we will fill in the role names.
129 :     my %nameMap;
130 :     # This will map role checksums to role IDs.
131 :     my %roleMap;
132 :     # This will be set to TRUE if we already have the name and role maps from the client.
133 :     my $preLoaded;
134 :     # These will be the pointers to the name and role maps. If the client specified the role hashes, we put them in here.
135 :     my ($nameMap, $roleMap) = (\%nameMap, \%roleMap);
136 :     if ($options{roleHashes}) {
137 :     $nameMap = $options{roleHashes}[0];
138 :     $roleMap = $options{roleHashes}[1];
139 :     $preLoaded = 1;
140 :     }
141 :     # This will be our map of taxonomic group IDs to role hashes.
142 :     my %roleLists;
143 :     # This will map group IDs to total weight.
144 :     my %taxSizes;
145 :     # This will map group IDs to names.
146 :     my %taxNames;
147 :     # This will map taxon IDs to group IDs.
148 :     my %taxonMap;
149 :     # Create and bless the object.
150 :     my $retVal = { taxNames => \%taxNames, roleMap => $roleMap,
151 :     nameMap => $nameMap, taxSizes => \%taxSizes, stats => $stats,
152 :     roleLists => \%roleLists, logH => $logH };
153 :     bless $retVal, $class;
154 :     # Get the roles.tbl file.
155 :     $retVal->Log("Processing weighted.tbl.\n");
156 :     open(my $rh, "<$checkDir/weighted.tbl") || die "Could not open weighted.tbl in $checkDir: $!";
157 :     # Loop through the taxonomic groups.
158 :     while (! eof $rh) {
159 :     my ($taxon, $size, $name) = ScriptUtils::get_line($rh);
160 :     $taxNames{$taxon} = $name;
161 :     $taxSizes{$taxon} = $size;
162 :     $stats->Add(taxGroupIn => 1);
163 :     # Now we loop through the roles.
164 :     my $done;
165 :     my %weights;
166 :     while (! eof $rh && ! $done) {
167 :     my ($role, $weight) = ScriptUtils::get_line($rh);
168 :     if ($role eq '//') {
169 :     $done = 1;
170 :     } else {
171 :     # We need to track the roles in the name map as well as the group's role hash.
172 :     # Later the name map is used to fill in the role names for the roles we need.
173 :     $nameMap{$role} = $role;
174 :     $weights{$role} = $weight;
175 :     $stats->Add(taxRoleIn => 1);
176 :     }
177 :     }
178 :     $roleLists{$taxon} = \%weights;
179 :     }
180 :     close $rh; undef $rh;
181 :     my $markerCount = scalar keys %nameMap;
182 :     $retVal->Log("$markerCount marker roles found in " . scalar(keys %roleLists) . " taxonomic groups.\n");
183 :     # If we are NOT preloaded, we need to create the role-parsing hashes.
184 :     if (! $preLoaded) {
185 :     # Now we need to know the name and checksum of each marker role. This is found in roles.in.subsystems.
186 :     $retVal->Log("Processing $roleFile.\n");
187 :     open($rh, "<$roleFile") || die "Could not open $roleFile: $!";
188 :     # Loop through the roles.
189 :     while (! eof $rh) {
190 :     my ($role, $checksum, $name) = ScriptUtils::get_line($rh);
191 :     if ($nameMap{$role}) {
192 :     $stats->Add(roleNamed => 1);
193 :     $nameMap{$role} = $name;
194 :     $roleMap{$checksum} = $role;
195 :     } else {
196 :     $stats->Add(roleNotUsed => 1);
197 :     }
198 :     }
199 :     close $rh; undef $rh;
200 :     # Verify we got all the roles.
201 :     my $roleCount = scalar(keys %roleMap);
202 :     if ($roleCount != $markerCount) {
203 :     die "$markerCount role markers in roles.tbl, but only $roleCount were present in $roleFile.";
204 :     }
205 :     } else {
206 :     # Here we are pre-loaded. Verify that we have all the roles we need.
207 :     my $notFound;
208 :     for my $role (keys %nameMap) {
209 :     if (! $nameMap->{$role}) {
210 :     $notFound++;
211 :     }
212 :     }
213 :     if ($notFound) {
214 :     die "$notFound roles missing from pre-loaded role hashes.";
215 :     }
216 :     }
217 :     # Return the object created.
218 :     return $retVal;
219 :     }
220 :    
221 :    
222 :     =head3 Log
223 :    
224 :     $checker->Log($message);
225 :    
226 :     Write a message to the log stream, if it exists.
227 :    
228 :     =over 4
229 :    
230 :     =item message
231 :    
232 :     Message string to write.
233 :    
234 :     =back
235 :    
236 :     =cut
237 :    
238 :     sub Log {
239 :     my ($self, $message) = @_;
240 :     my $logH = $self->{logH};
241 :     if ($logH) {
242 :     print $logH $message;
243 :     }
244 :     }
245 :    
246 :     =head2 Query Methods
247 :    
248 :     =head3 role_name
249 :    
250 :     my $name = $checker->role_name($role);
251 :    
252 :     Return the name of a role given its ID. If the role is not in the role hash, the role ID itself will be returned.
253 :    
254 :     =over 4
255 :    
256 :     =item role
257 :    
258 :     ID of a role.
259 :    
260 :     =item RETURN
261 :    
262 :     Return the name of the role.
263 :    
264 :     =back
265 :    
266 :     =cut
267 :    
268 :     sub role_name {
269 :     my ($self, $role) = @_;
270 :     return $self->{nameMap}{$role} // $role;
271 :     }
272 :    
273 :     =head3 roles_of_function
274 :    
275 :     my @roles = $checker->roles_of_function($function);
276 :    
277 :     Return the IDs of the roles found in the specified function. This may be an empty list.
278 :    
279 :     =over 4
280 :    
281 :     =item function
282 :    
283 :     A functional assignment description.
284 :    
285 :     =item RETURN
286 :    
287 :     Returns a list of the IDs for the roles found in the function.
288 :    
289 :     =back
290 :    
291 :     =cut
292 :    
293 :     sub roles_of_function {
294 :     my ($self, $function) = @_;
295 :     # Get the role map.
296 :     my $roleMap = $self->{roleMap};
297 :     # Divide the function into roles.
298 :     my @roles = SeedUtils::roles_of_function($function // '');
299 :     # Convert each role into an ID.
300 :     my @retVal;
301 :     for my $role (@roles) {
302 :     my $check = RoleParse::Checksum($role);
303 :     my $id = $roleMap->{$check};
304 :     if ($id) {
305 :     push @retVal, $id;
306 :     }
307 :     }
308 :     # Return the roles found.
309 :     return @retVal;
310 :     }
311 :    
312 :    
313 :     =head3 taxon_data
314 :    
315 :     my ($name, $roleHash) = $checker->taxon_data($taxonID);
316 :    
317 :     Return the name and the hash of universal roles for a taxonomic group.
318 :    
319 :     =over 4
320 :    
321 :     =item taxonID
322 :    
323 :     The taxonomic ID of the group whose data is desired.
324 :    
325 :     =item RETURN
326 :    
327 :     Returns a two-element list containing the name of the taxonomic group and a reference to a hash mapping each universal role to its weight. If the taxonomic group
328 :     is not in this object, undefined values will be returned.
329 :    
330 :     =back
331 :    
332 :     =cut
333 :    
334 :     sub taxon_data {
335 :     my ($self, $taxonID) = @_;
336 :     # These will be the return values.
337 :     my ($name, $roleHash);
338 :     # Look for the group name.
339 :     $name = $self->{taxNames}{$taxonID};
340 :     if ($name) {
341 :     # We found it, so get the role hash, too.
342 :     $roleHash = $self->{roleLists}{$taxonID};
343 :     }
344 :     # Return the results.
345 :     return ($name, $roleHash);
346 :     }
347 :    
348 :    
349 :     =head2 Public Manipulation Methods
350 :    
351 :     =head3 Check
352 :    
353 :     my $dataH = $checker->Check($geo);
354 :    
355 :     Check a L<GEO> for completeness and contamination. A hash of the problematic roles will be returned as well.
356 :    
357 :     =over 4
358 :    
359 :     =item geo
360 :    
361 :     The L<GenomeEvalObject> to be checked.
362 :    
363 :     =item RETURN
364 :    
365 :     Returns a reference to a hash with the following keys.
366 :    
367 :     =over 8
368 :    
369 :     =item complete
370 :    
371 :     The percent completeness.
372 :    
373 :     =item contam
374 :    
375 :     The percent contamination.
376 :    
377 :     =item extra
378 :    
379 :     The percent of extra genomes. This is an attempt to mimic the checkM contamination value.
380 :    
381 :     =item roleData
382 :    
383 :     Reference to a hash mapping each marker role ID to the number of times it was found.
384 :    
385 :     =item taxon
386 :    
387 :     Name of the taxonomic group used.
388 :    
389 :     =back
390 :    
391 :     =back
392 :    
393 :     =cut
394 :    
395 :     sub Check {
396 :     my ($self, $geo) = @_;
397 :     # These will be the return values.
398 :     my ($complete, $contam, $multi);
399 :     my $taxGroup = 'root';
400 :     # Get the statistics object.
401 :     my $stats = $self->{stats};
402 :     # This hash will count the roles.
403 :     my %roleData;
404 :     # Get the role map. We use this to compute role IDs from role names.
405 :     my $roleMap = $self->{roleMap};
406 :     # Get the hash of role lists.
407 :     my $roleLists = $self->{roleLists};
408 :     # Compute the appropriate taxonomic group for this GTO and get its role list.
409 :     my $lineage = $geo->lineage || [];
410 :     my @taxons = @$lineage;
411 :     my $groupID;
412 :     my $taxon = $geo->taxon;
413 :     while (! $groupID && (my $taxon1 = pop @taxons)) {
414 :     if ($roleLists->{$taxon1}) {
415 :     $groupID = $taxon1;
416 :     }
417 :     }
418 :     if (! $taxon) {
419 :     # No taxonomic data at all.
420 :     $self->Log("No taxonomic information available for " . $geo->id . "\n");
421 :     } elsif (! defined $groupID) {
422 :     # Taxonomic data, but no group. We give up.
423 :     $self->Log("No taxonomic group in database that includes $taxon.\n");
424 :     } else {
425 :     # Get the group data.
426 : parrello 1.2 my $roleHash;
427 :     ($taxGroup, $roleHash) = $self->taxon_data($groupID);
428 : parrello 1.1 $self->Log("Group $groupID: $taxGroup selected for $taxon.\n");
429 :     # Fill the roleData hash from the role list.
430 :     %roleData = map { $_ => 0 } keys %$roleHash;
431 :     my $markers = scalar keys %roleData;
432 :     my $size = $self->{taxSizes}{$groupID};
433 :     $self->Log("$markers markers with total weight $size for group $groupID: $self->{taxNames}{$groupID}.\n");
434 :     # Get the role counts for the genome.
435 :     my $countsH = $geo->roleCounts;
436 :     # Now we count the markers.
437 :     my ($found, $extra, $total) = (0, 0, 0);
438 :     for my $roleID (keys %roleData) {
439 :     my $count = $countsH->{$roleID} // 0;
440 :     $roleData{$roleID} = $count;
441 :     if ($count >= 1) {
442 :     my $weight = $roleHash->{$roleID};
443 :     $found += $weight;
444 :     $extra += ($count - 1) * $weight;
445 :     }
446 :     }
447 :     # Compute the percentages.
448 :     $complete = $found * 100 / $size;
449 :     $contam = ($extra > 0 ? $extra * 100 / ($found + $extra) : 0);
450 :     $multi = $extra * 100 / $size;
451 :     }
452 :     # Return the results.
453 :     my $retVal = { complete => $complete, contam => $contam, multi => $multi, roleData => \%roleData, taxon => $taxGroup };
454 :     return $retVal;
455 :     }
456 :    
457 :     =head3 Check2
458 :    
459 :     my ($complete, $contam, $taxon, $seedFlag) = $checker->Check2($geo, $oh);
460 :    
461 :     This performs the same job as L</Check> (evaluating a genome for completeness and contamination),
462 :     but it returns a list of the key metrics in printable form. If an output file handle is
463 :     provided, it will also write the results to the output file.
464 :    
465 :     =over 4
466 :    
467 :     =item geo
468 :    
469 :     The L<GEO> to be checked.
470 :    
471 :     =item oh (optional)
472 :    
473 :     If specified, an open file handle to which the output should be written. This consists of the labeled metrics
474 :     followed by the (role, predicted, actual) tuples in tab-delimited format.
475 :    
476 :     =item RETURN
477 :    
478 :     Returns a list containing the percent completeness, percent contamination, the name of the taxonomic grouping
479 :     used, and a flag that is 'Y' if the seed protein is good and 'N' otherwise. The two percentages will be rounded
480 :     to the nearest tenth of a percent.
481 :    
482 :     =back
483 :    
484 :     =cut
485 :    
486 :     sub Check2 {
487 :     my ($self, $geo, $oh) = @_;
488 :     # Get the stats object.
489 :     my $stats = $self->{stats};
490 :     # Do the evaluation and format the output values.
491 :     my $evalH = $self->Check($geo);
492 :     my ($complete, $contam, $taxon) = (0, 100, 'N/F');
493 :     if (defined $evalH->{complete}) {
494 :     $complete = Math::Round::nearest(0.1, $evalH->{complete} // 0);
495 :     $contam = Math::Round::nearest(0.1, $evalH->{contam} // 100);
496 :     $taxon = $evalH->{taxon};
497 :     }
498 :     my $seedFlag = ($geo->good_seed ? 'Y' : '');
499 :     my $roleH = $evalH->{roleData};
500 :     # Update the statistics.
501 :     if (! $roleH) {
502 :     $stats->Add(evalGFailed => 1);
503 :     } else {
504 :     $stats->Add(genomeComplete => 1) if GEO::completeX($complete);
505 :     $stats->Add(genomeClean => 1) if GEO::contamX($contam);
506 :     }
507 :     if ($seedFlag) {
508 :     $stats->Add(genomeGoodSeed => 1);
509 :     } else {
510 :     $stats->Add(genomeBadSeed => 1);
511 :     }
512 :     if ($oh) {
513 :     # Output the check results.
514 :     print $oh "Good Seed: $seedFlag\n";
515 :     print $oh "Completeness: $complete\n";
516 :     print $oh "Contamination: $contam\n";
517 :     print $oh "Group: $taxon\n";
518 :     # Now output the role counts.
519 :     if ($roleH) {
520 :     for my $role (sort keys %$roleH) {
521 :     my $count = $roleH->{$role};
522 :     print $oh "$role\t1\t$count\n";
523 :     }
524 :     }
525 :     }
526 :     return ($complete, $contam, $taxon, $seedFlag);
527 :     }
528 :    
529 :    
530 :     1;

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3