[Bio] / FigKernelScripts / find_close_genomes.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/find_close_genomes.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/env perl
2 :     #
3 :     # Copyright (c) 2003-2015 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     #
18 :    
19 :    
20 :     use strict;
21 :     use warnings;
22 :     use ScriptUtils;
23 :     use KmerRepGenomes;
24 :     my $have_shrub;
25 :     eval {
26 :     require Shrub;
27 :     $have_shrub = 1;
28 :     };
29 :    
30 :     =head1 Find Close Genomes Using Kmers
31 :    
32 :     find_close_genomes.pl [ options ] <fastaFile
33 :    
34 :     This script uses a kmer database to find the genomes closest to the contigs in a FASTA file. The kmer database
35 :     must be built from the Shrub. This script eschews that option so that it will compile and run in a SEED
36 :     environment as well as SEEDtk.
37 :    
38 :     =head2 Parameters
39 :    
40 :     The standard input is a FASTA file containing the contigs to process. The standard input can be specified as a
41 :     command-line option using L<ScriptUtils/ih_options>. The other command-line options include those from
42 :     L<Shrub::script_options> (if in the SEEDtk environment) as well as the following.
43 :    
44 :     =over 4
45 :    
46 :     =item kmerFile
47 :    
48 :     Name of a json file containing the kmer database. This is a required parameter. If the file does not exist, and we
49 :     are running in the SEEDtk environment, the kmer database will be built on the fly. If it does not exist and we are
50 :     running in the SEED environment, it is an error.
51 :    
52 :     =item kmerSize
53 :    
54 :     The size of a protein kmer. The default is C<10>. This option is only used in the SEEDtk environment and only if no kmer
55 :     database file exists.
56 :    
57 :     =item minHits
58 :    
59 :     The minimum number of hits for a genome to be considered relevant. The default is C<400>. This option is only used
60 :     in the SEEDtk environment and only if no database file exists.
61 :    
62 :     =item maxFound
63 :    
64 :     The maximum number of genomes in which a kmer can be found before it is considered common. Common kmers are removed from
65 :     the hash. This option is only used in the SEEDtk environment and only if no database file exists.
66 :    
67 :     =item priv
68 :    
69 : parrello 1.2 The privilege level at which the roles of a protein should be assessed. The default is C<1>. This option is only used in
70 :     the SEEDtk environment and only if no database file exists.
71 : parrello 1.1
72 :     =item output
73 :    
74 :     The name of an output file. This will be a tab-delimited file, each record containing (0) a genome ID, and (1) a
75 :     count of the number of kmer hits. If this parameter is omitted, the information will be written to the standard
76 :     output, but will be mixed with status messages.
77 :    
78 :     =back
79 :    
80 :     =cut
81 :    
82 :     # Decide which options make sense here.
83 :     my @opts = (ScriptUtils::ih_options(),
84 :     ['minhits=i', 'minimum number of kmer hits for a genome to be considered useful', { default => 400 }],
85 :     ['output|o=s', 'genome output file']);
86 :     if ($have_shrub) {
87 :     push @opts, ['maxfound=i', 'maximum number of kmer occurrences before it is considered common', { default => 10 }],
88 :     ['kmersize|k=i', 'kmer size (protein)', { default => 10 }],
89 :     ['priv=i', 'privilege level for role associations', { default => 1 }],
90 :     ['kmerFile|f=s', 'kmer database file name', { default => "$FIG_Config::global/reps_kmer.json" }],
91 :     Shrub::script_options();
92 :     } else {
93 :     push @opts, ['kmerFile|f=s', 'kmer database file name', { required => 1 }],
94 :     }
95 :    
96 :     # Get the command-line parameters.
97 :     my $opt = ScriptUtils::Opts('', @opts);
98 :     # This will point to the shrub database if we need it.
99 :     my $shrub;
100 :     # This will contain our options for the call to the kmer facility.
101 :     my %options = (minHits => $opt->minhits);
102 :     # This will contain the representative roles for the call to the kmer facility.
103 :     my @repRoles;
104 :     # Check the kmer database file.
105 :     my $kmerFile = $opt->kmerfile;
106 :     if (! -f $kmerFile) {
107 :     # Here we need to build a kmer database.
108 :     if (! $have_shrub) {
109 :     die "Kmer database must be pre-built in the SEED environment.";
110 :     } else {
111 :     # Set up to build the database.
112 :     $shrub = Shrub->new_for_script($opt);
113 :     $options{kmerSize} = $opt->kmersize;
114 :     $options{priv} = $opt->priv;
115 :     $options{maxFound} = $opt->maxfound;
116 :     # Get the representative roles.
117 :     require FIG_Config;
118 :     open(my $rh, "<$FIG_Config::global/rep_roles.tbl") || die "Could not open representative roles file.";
119 :     @repRoles = map { $_ =~ /^(\S+)/; $1 } <$rh>;
120 :     }
121 :     }
122 :     # Create the kmer processing object.
123 :     my $kmerThing = KmerRepGenomes->new($shrub, $kmerFile, \@repRoles, %options);
124 :     # Open the input file.
125 :     my $ih = ScriptUtils::IH($opt->input);
126 :     # Find the close genomes.
127 :     my $genomeH = $kmerThing->FindGenomes($ih);
128 :     # Open the output file.
129 :     my $oh;
130 :     if ($opt->output) {
131 :     open($oh, '>', $opt->output) || die "Could not open output file: $!";
132 :     } else {
133 :     $oh = \*STDOUT;
134 :     }
135 :     # Write out the genomes found.
136 :     print "Writing genomes found.\n\n";
137 :     my @genomes = sort { $genomeH->{$b}[0] <=> $genomeH->{$a}[0] } keys %$genomeH;
138 :     for my $genome (@genomes) {
139 :     print $oh join("\t", $genome, @{$genomeH->{$genome}}) . "\n";
140 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3