[Bio] / FigKernelScripts / build_sample_sigs.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/build_sample_sigs.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/env perl
2 :     #
3 :     # Copyright (c) 2003-2015 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     #
18 :    
19 :    
20 :     use strict;
21 :     use warnings;
22 :     use FIG_Config;
23 :     use ScriptUtils;
24 :     use KmerDb;
25 :     use FastA;
26 :     use Stats;
27 :     use P3Utils;
28 :     use P3DataAPI;
29 :    
30 :     =head1 Create Signature DNA Kmers for Representative Genomes
31 :    
32 :     build_sample_sigs.pl [ options ] repDir
33 :    
34 :     This script creates a signature kmer database for representative genomes.
35 :    
36 :     =head2 Parameters
37 :    
38 :     The positional parameter is the name of the representative-genome directory.
39 :    
40 :     The command-line options are the following.
41 :    
42 :     =over 4
43 :    
44 :     =item kmer
45 :    
46 :     Kmer size to use, in base pairs. The default is C<23>.
47 :    
48 :     =item maxFound
49 :    
50 :     The maximum number of times a kmer can be found before it is considered common. Common kmers are removed from the hash. The
51 :     default is C<10>.
52 :    
53 :     =item indicative
54 :    
55 :     If specified, the kmers will be indicative, rather than signature. Normally, only kmers unique to a genome will be put in
56 :     the database. In indicative mode, all uncommon kmers are kept.
57 :    
58 :     =item reduced
59 :    
60 :     If specified, the kmers will be based solely on the seed proteins. If this is the case, there must be a C<6.1.1.20.dna.fasta>
61 :     file in the representative-genomes directory.
62 :    
63 :     =item outFile
64 :    
65 :     The name to give to the output file. The default is I<type>I<scope>C<.>I<KK>C<.json>, where I<type> is C<sig> for signatures
66 :     and C<kmer> in B<indicative> mode, I<KK> is the kmer size, and I<scope> is C<Prot> in reduced mode and empty otherwise.
67 :    
68 :     =item subset
69 :    
70 :     If specified, the name of a file containing genome IDs in the first column. Only the genomes in the subset will be used.
71 :    
72 :     =back
73 :    
74 :     =cut
75 :    
76 :     $| = 1;
77 :     # Get the command-line parameters.
78 :     my $opt = P3Utils::script_opts('repDir',
79 :     ['kmer|K=i', 'kmer size to use', { default => 23 }],
80 :     ['indicative|I', 'indicative instead of signature kmers'],
81 :     ['maxFound|m=i', 'maximum number of occurrences before a kmer is considered common', { default => 10 }],
82 :     ['reduced', 'process the seed proteins only'],
83 :     ['outfile=s', 'name to give to the output file'],
84 :     ['subset=s', 'if specified, the name of a file containing a restricted set of genome IDs']
85 :     );
86 :     my $stats = Stats->new();
87 :     # Get the representative genome directory.
88 :     my ($repDir) = @ARGV;
89 :     if (! $repDir) {
90 :     die "No representative genome directory specified.";
91 :     } elsif (! -s "$repDir/complete.genomes") {
92 :     die "$repDir does not have a complete.genomes file.";
93 :     }
94 :     # Get the options.
95 :     my $K = $opt->kmer;
96 :     my $maxFound = $opt->maxfound;
97 :     my $indicMode = $opt->indicative;
98 :     my $reduced = $opt->reduced;
99 :     print "Kmer size is $K.\n";
100 :     # Compute the output file name.
101 :     my $outFile = $opt->outfile;
102 :     if (! $outFile) {
103 :     $outFile = "$repDir/" . join('.', ($indicMode ? 'kmer' : 'sig') . ($reduced ? 'Prot' : ''), $K, 'json');
104 :     }
105 :     if ($reduced && ! -s "$repDir/6.1.1.20.dna.fasta") {
106 :     die "$repDir does not have a DNA FASTA file.";
107 :     }
108 :     # Connect to the database.
109 :     my $p3 = P3DataAPI->new();
110 :     # Create the kmer database.
111 :     my $kmerDb = KmerDb->new(kmerSize => $K, maxFound => $maxFound, mirror => 1);
112 :     # Check for the subset file.
113 :     my %subset;
114 :     my $subFile = $opt->subset;
115 :     if ($subFile) {
116 :     open(my $ih, '<', $subFile) || die "Could not open subset file: $!";
117 :     while (! eof $ih) {
118 :     my $line = <$ih>;
119 :     if ($line =~ /(\d+\.\d+)/) {
120 :     $subset{$1} = 1;
121 :     }
122 :     }
123 :     }
124 :     # Read in the genome IDs and compute the names.
125 :     my %genomes;
126 :     open(my $gh, "<$repDir/complete.genomes") || die "Could not open complete.genomes: $!";
127 :     while (! eof $gh) {
128 :     my $line = <$gh>;
129 :     if ($line =~ /^(\d+\.\d+)\t(.+)/) {
130 :     my ($genome, $name) = ($1, $2);
131 :     if ($subFile && ! $subset{$genome}) {
132 :     $stats->Add(genomeSkipped => 1);
133 :     } else {
134 :     $kmerDb->AddGroup($genome, $name);
135 :     $genomes{$genome} = $name;
136 :     $stats->Add(genomeIn => 1);
137 :     }
138 :     }
139 :     }
140 :     print "Genome names stored.\n";
141 :     close $gh;
142 :     # Loop through the DNA sequences.
143 :     if ($reduced) {
144 :     # Reduced mode. Read the DNA fasta file.
145 :     my $fh = FastA->new("$repDir/6.1.1.20.dna.fasta");
146 :     while ($fh->next) {
147 :     my $id = $fh->id;
148 :     if ($id =~ /(\d+\.\d+)/) {
149 :     my $genome = $1;
150 :     if ($genomes{$genome}) {
151 :     $kmerDb->AddSequence($genome, $fh->left);
152 :     $stats->Add(sequenceIn => 1);
153 :     }
154 :     }
155 :     }
156 :     } else {
157 :     # Full mode: read the DNA sequences.
158 :     for my $genome (sort keys %genomes) {
159 :     my $name = $kmerDb->name($genome);
160 :     print "Processing $genome $name.\n";
161 :     $stats->Add(genomeScanned => 1);
162 :     my $seqs = P3Utils::get_data($p3, contig => [['eq', 'genome_id', $genome]], ['sequence']);
163 :     for my $seq (@$seqs) {
164 :     my $sequence = $seq->[0];
165 :     $kmerDb->AddSequence($genome, $sequence);
166 :     $stats->Add(sequenceIn => 1);
167 :     }
168 :     }
169 :     }
170 :     # Now we finalize depending on the mode.
171 :     if ($indicMode) {
172 :     print "Finalizing indicators.\n";
173 :     $kmerDb->Finalize();
174 :     } else {
175 :     print "Finalizing discriminators.\n";
176 :     $kmerDb->ComputeDiscriminators();
177 :     }
178 :     # Write the output file.
179 :     print "Saving database as $outFile.\n";
180 :     $kmerDb->Save($outFile);
181 :     print "All done.\n" . $stats->Show();

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3