[Bio] / FigKernelScripts / complete_genomes.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/complete_genomes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.7 - (view) (download) (as text)

1 : redwards 1.1 # -*- perl -*-
2 : olson 1.7 #
3 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     #
18 :    
19 : redwards 1.1
20 :     =pod
21 :    
22 :     =head1 complete_genomes.pl
23 :    
24 :     This script will try and identify those genomes that are "complete". It is based on a heuristic developed by Gary Olsen, see http://fogbugz.nmpdr.org/default.php?pg=pgEditBug&command=view&ixBug=94 for more details.
25 :    
26 :     The basic rule of thumb is that more than 70% of the nucleotides in total must be in contigs > 20 kb.
27 :    
28 :     =cut
29 :    
30 :     use strict;
31 :     use FIG;
32 :    
33 :    
34 :     my $usage=<<EOF;
35 :    
36 :     $0 [-e|-b] -v
37 :    
38 :    
39 :     One of either -e or -b must be provided
40 :     -e check eukaryotes
41 :     -b check bacteria
42 : redwards 1.3 -a check archea
43 :     -z check all genomes
44 : redwards 1.2
45 : redwards 1.1 -v verbose output
46 :    
47 :     EOF
48 :    
49 :    
50 :     # figure out any command line options
51 :     my ($check_domain, $verbose)=(undef, 0);
52 :    
53 :     while (@ARGV)
54 :     {
55 :     my $try=shift(@ARGV);
56 : redwards 1.2 if ($try eq "-e") {$check_domain="Eukaryota"}
57 : redwards 1.1 elsif ($try eq "-b") {$check_domain="Bacteria"}
58 : redwards 1.3 elsif ($try eq "-a") {$check_domain="Archaea"}
59 :     elsif ($try eq "-z") {$check_domain="all"}
60 : redwards 1.1 elsif ($try eq "-v") {$verbose=1}
61 :     }
62 :    
63 :    
64 :     die $usage unless ($check_domain);
65 :    
66 :     my $fig=new FIG;
67 :    
68 :     # first find out how many nucleotides we have, and how many contigs we have > 20 kb
69 :    
70 : redwards 1.4 print join "\t", "Genome ID", "Genome", "Domain", "Number of contigs", "Number of contigs > 20 kb", "Total length", "Total length in contigs > 20kb", "Fraction of bases in contigs > 20kb", "Complete", "Status", "Conflict", "\n";
71 : redwards 1.1
72 : redwards 1.2 if ($check_domain eq "all") {$check_domain=''}
73 : redwards 1.1 foreach my $genome ($fig->genomes(undef, undef, $check_domain))
74 :     {
75 :     my ($totallength, $contigs, $length_big, $number_big)=(0,0,0,0);
76 :     print STDERR "Checking ", $fig->genus_species($genome), "\n" if ($verbose);
77 :     foreach my $contig ($fig->all_contigs($genome))
78 :     {
79 :     my $len=$fig->contig_ln($genome, $contig);
80 :     $contigs++;
81 :     $totallength+=$len;
82 :     if ($len > 20000) {$length_big += $len; $number_big++}
83 :     }
84 :    
85 :     # check to see whether the genome has the tag "status"
86 : redwards 1.5 my @statii=$fig->get_attributes($genome, "STATUS");
87 :     my $status='';
88 : redwards 1.6 $statii[0] && ($status=$statii[0]->[2]);
89 : redwards 1.4 my $conflict='';
90 :    
91 :     # rules for the conflicts:
92 :     # 1. >70% of sequence in contigs >20kb is complete
93 :     # 2. status should be "complete"
94 :     # 3. complete should be true
95 : redwards 1.1
96 : redwards 1.4 if ($length_big/$totallength > 0.70)
97 :     {
98 :     # this should be considered complete
99 :     $conflict = 1 if (lc($status) ne "complete");
100 :     $conflict = 1 if (!$fig->is_complete($genome));
101 :     }
102 :     else
103 :     {
104 :     # this should not be complete
105 :     $conflict = 1 if (lc($status) ne "incomplete");
106 :     $conflict = 1 if ($fig->is_complete($genome));
107 :     }
108 :    
109 :    
110 :    
111 :     print join "\t", $genome, $fig->genus_species($genome), $fig->genome_domain($genome), $contigs, $number_big, $totallength, $length_big, $length_big/$totallength, $fig->is_complete($genome), $status, $conflict, "\n";
112 : redwards 1.1 }
113 :    
114 :    
115 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3