[Bio] / FigKernelScripts / assess_completeness.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/assess_completeness.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.9 - (view) (download) (as text)

1 : golsen 1.1 # -*- perl -*-
2 : olson 1.7 #
3 :     # Copyright (c) 2003-2006 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     #
18 :    
19 : golsen 1.1
20 :     use strict;
21 :     use Carp;
22 :     use Data::Dumper;
23 :    
24 : overbeek 1.3 use FIG; my $fig = new FIG;
25 :    
26 : golsen 1.1 my $usage = "assess_completeness [-f min_fraction] [-s min_size] genome_id ...";
27 :    
28 :     # assess_completeness
29 :     #
30 :     # Reads contigs files for a genome and writes a file called PROBABLY_COMPLETE
31 :     # to the genome directory if the genome is judged to be complete by the
32 :     # criterion described below.
33 :     #
34 :     # A simple but remarkably robust heuristic is used. The program calculates the
35 :     # fraction of the nucleotides in contigs of greater than or equal to 20,000 bp.
36 :     # Emperically, values of 0.7 or better correspond to effectively complete
37 :     # genomes (the vast majority of genes can be found and the data quality tends
38 :     # to be very good). The threshold can be set by the -f option.
39 :     #
40 :    
41 : overbeek 1.3 my $minfrac = 0.7;
42 :     my $minlen = 20000;
43 : overbeek 1.2 my $minsize = 300000;
44 : golsen 1.1
45 :     my ( $flag, $val );
46 :     while ( @ARGV && ( ( $flag, $val ) = $ARGV[0] =~ /^\-(.)(.*)/ ) )
47 :     {
48 :     shift;
49 :     if ( ( $flag eq "f" ) && ( $val ||= shift @ARGV ) )
50 :     {
51 :     $minfrac = $val;
52 :     }
53 : overbeek 1.3 elsif ( ( $flag eq "l" ) && ( $val ||= shift @ARGV ) )
54 :     {
55 :     $minlen = $val;
56 :     }
57 : overbeek 1.2 elsif ( ( $flag eq "s" ) && ( $val ||= shift @ARGV ) )
58 : golsen 1.1 {
59 :     $minsize = $val;
60 :     }
61 :     else
62 :     {
63 :     print STDERR "Problem with flag '$flag'\n";
64 :     die "Usage: $usage\n";
65 :     }
66 :     }
67 :    
68 :     @ARGV and ( $minfrac > 0 ) and ( $minsize > 0 ) or die "Usage: $usage\n";
69 :    
70 :     my $orgdir = "$FIG_Config::organisms";
71 : overbeek 1.6 my $genomedir;
72 : golsen 1.1
73 :     my ( $genome, $ttlen, $inbig, $contig, $len );
74 :    
75 :     foreach $genome ( @ARGV )
76 :     {
77 : overbeek 1.8 print STDERR "." if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} == 1));
78 : overbeek 1.6
79 :     if (-d $genome)
80 :     {
81 :     $genomedir = $genome;
82 :     } else {
83 :     $genomedir = "$orgdir/$genome"; #...Fall back to installed organisms...
84 :     }
85 :    
86 : overbeek 1.8 if (opendir( GENOME, $genomedir ) )
87 :     {
88 :     print STDERR "Checking genomedir=$genomedir\n" if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1));
89 :     }
90 :     else
91 : golsen 1.1 {
92 :     print STDERR "Skipping $genome; could not open directory $genomedir\n";
93 :     next;
94 :     }
95 :    
96 : efrank 1.5 #The next line requires the org to have already been installed. But
97 :     #add_genome calls assess_completeness as part of the load!
98 :     #
99 :     #next unless ($fig->is_prokaryotic($genome) || $fig->is_eukaryotic($genome));
100 :     #
101 :     #So, intead, go directly to the TAXONOMY file and make the check ourselves:
102 :    
103 :     my $taxonomy = `cat "$genomedir/TAXONOMY"`;
104 : gdpusch 1.9 if ($taxonomy =~ /^\s*(Archaea|Bacteria|Eukaryota)/) {
105 : overbeek 1.8 print STDERR "TAXONOMY=$taxonomy"
106 :     if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1));
107 :     } else {
108 :     print STDERR "Taxonomy $taxonomy is not an A,B,E genome --- skipping\n"
109 :     if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1));
110 :     next;
111 :     }
112 :    
113 : golsen 1.1 $ttlen = $inbig = 0;
114 :     foreach $contig ( grep { $_ =~ /^contigs\d*$/ } readdir( GENOME ) )
115 :     {
116 : overbeek 1.8 open( LENGTHS, "fastasize < $genomedir/$contig |" ) || die "Could not pipe-open fastasize < $genomedir/$contig";
117 :    
118 : golsen 1.1 while ( defined( $_ = <LENGTHS> ) )
119 :     {
120 :     chomp;
121 :     if ( ( $len ) = $_ =~ /\s(\d+)$/ )
122 :     {
123 :     $ttlen += $len;
124 : overbeek 1.3 if ($len >= $minlen) { $inbig += $len }
125 : golsen 1.1 }
126 :     }
127 : overbeek 1.8 close( LENGTHS ) || die "Could not close pipe fastasize < $genomedir/$contig";
128 :     }
129 :    
130 :     if (not $ttlen)
131 :     {
132 :     print STDERR "Could not get total contig lengths from $genomedir/$contig --- skipping"
133 :     if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1));
134 :     next;
135 : golsen 1.1 }
136 :    
137 : overbeek 1.6 print STDERR "$genomedir:\tttlen = $ttlen,\tinbig = $inbig,\tfrac = ", int(0.5 + 10000 * $inbig / $ttlen)/100, "%\n";
138 : overbeek 1.4
139 : overbeek 1.3 if (($ttlen >= $minsize) && ($inbig >= $minfrac *$ttlen) )
140 : golsen 1.1 {
141 : overbeek 1.8 print STDERR "Write-opening $genomedir/PROBABLY_COMPLETE\n"
142 :     if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1));
143 :     open ( COMP, ">$genomedir/PROBABLY_COMPLETE" ) || die "Could not write-open $genomedir/PROBABLY_COMPLETE\n";
144 : golsen 1.1 my $frac = sprintf "%.2f", 100* $inbig / $ttlen;
145 : overbeek 1.3 print COMP "Judged to be complete because $frac% of the nucleotides are in contigs >= $minlen bp\n";
146 : golsen 1.1 close COMP;
147 :     }
148 :     else
149 :     {
150 : overbeek 1.3 if (-e "$genomedir/PROBABLY_COMPLETE")
151 :     {
152 : overbeek 1.8 print STDERR "Contigs $genomedir/contigs failed to pass cuts --- deleting existing file $genomedir/PROBABLY_COMPLETE\n"
153 :     if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} > 1));
154 : overbeek 1.3 unlink "$genomedir/PROBABLY_COMPLETE"
155 :     || die "Failed attempt to remove $genomedir/PROBABLY_COMPLETE\n";
156 :     }
157 : golsen 1.1 }
158 :     }
159 : overbeek 1.8
160 :     print STDERR "\n" if $ENV{VERBOSE};
161 :     print STDERR "\n" if (defined($ENV{VERBOSE}) && ($ENV{VERBOSE} == 1));
162 : golsen 1.1
163 : overbeek 1.3 exit(0);

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3