[Bio] / FigKernelScripts / collate_reads.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/collate_reads.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 #!/usr/bin/env perl
2 :     #
3 :     # Copyright (c) 2003-2015 University of Chicago and Fellowship
4 :     # for Interpretations of Genomes. All Rights Reserved.
5 :     #
6 :     # This file is part of the SEED Toolkit.
7 :     #
8 :     # The SEED Toolkit is free software. You can redistribute
9 :     # it and/or modify it under the terms of the SEED Toolkit
10 :     # Public License.
11 :     #
12 :     # You should have received a copy of the SEED Toolkit Public License
13 :     # along with this program; if not write to the University of Chicago
14 :     # at info@ci.uchicago.edu or the Fellowship for Interpretation of
15 :     # Genomes at veronika@thefig.info or download a copy from
16 :     # http://www.theseed.org/LICENSE.TXT.
17 :     #
18 :    
19 :    
20 :     use strict;
21 :     use warnings;
22 :     use FIG_Config;
23 :     use ScriptUtils;
24 :     use FastA;
25 :     use FastQ;
26 :     use File::Copy::Recursive;
27 :     use KmerDb;
28 :     use Stats;
29 :    
30 :     =head1 Collate Reads Into Possible Bins
31 :    
32 :     collate_reads.pl [ options ] workDir kmerFile fq1 fq2
33 :    
34 :     This script processes unassembled reads and attempts to collate them into bins based on signature kmers. The reads
35 :     are read individually, and a stride-based check used to look for signature kmers. The reads with signature kmers
36 :     are collected in a separate output stream for each potential bin.
37 :    
38 :     The output files will be labeled C<bin1>, C<bin2>, ... and so forth. The index for the first bin can be set using
39 :     the C<--idx> parameter.
40 :    
41 :     =head2 Parameters
42 :    
43 :     The positional parameters are the name of a working directory, the name of a kmer database, and the names
44 :     of the input files. The input files can either be a single FASTA file or a pair of paired-end FASTQ files. The number
45 :     of parameters will determine how the files are treated.
46 :    
47 :     The command-line options are the following.
48 :    
49 :     =over 4
50 :    
51 :     =item stride
52 :    
53 :     The number of positions to skip when checking kmers in a read. The default is C<10>. Use C<1> to insure every possible read
54 :     substring is checked.
55 :    
56 :     =item min
57 :    
58 :     Minimum number of kmer hits to consider a read worth including. The default is C<1>.
59 :    
60 :     =item idx
61 :    
62 :     The index number to give to the first bin produced.
63 :    
64 :     =back
65 :    
66 :     =cut
67 :    
68 :     $| = 1;
69 :     # Get the command-line parameters.
70 :     my $opt = ScriptUtils::Opts('workDir kmerFile fq1 fq2',
71 :     ['stride=i', 'distance between kmer checks in the read', { default => 10 }],
72 :     ['min|m=i', 'minimum number of hits required to bin', { default => 1 }],
73 :     ['idx|i=i', 'index of first bin', { default => 1 }]
74 :     );
75 :     # Get the parameters.
76 :     my ($workDir, $kmerFile, @inFiles) = @ARGV;
77 :     if (! $workDir) {
78 :     die "No output directory specified.";
79 :     } elsif (! -d $workDir) {
80 :     print "Creating output directory $workDir.\n";
81 :     File::Copy::Recursive::pathmk($workDir) || die "Could not create $workDir: $!";
82 :     } elsif (! $kmerFile) {
83 :     die "No kmer file specified.";
84 :     } elsif (! -s $kmerFile) {
85 :     die "Kmer file $kmerFile not found or empty.";
86 :     } elsif (! @inFiles) {
87 :     die "No input files specified.";
88 :     }
89 :     my $stats = Stats->new();
90 :     # Get the options.
91 :     my $min = $opt->min;
92 :     my $stride = $opt->stride;
93 :     my $idx = $opt->idx;
94 :     # Load the kmer database.
95 :     print "Loading kmer database.\n";
96 :     my $kmerDb = KmerDb->new(json => $kmerFile);
97 :     # Create the reader.
98 :     my $ih;
99 :     if (@inFiles == 2) {
100 :     $ih = FastQ->new(@inFiles);
101 :     } else {
102 :     $ih = FastA->new(@inFiles);
103 :     }
104 :     # Create the output bins.
105 :     my $groups = $kmerDb->all_groups();
106 :     print scalar(@$groups) . " bins will be produced, starting with bin$idx.\n";
107 :     my %bins;
108 :     for my $group (@$groups) {
109 :     my $name = $kmerDb->name($group);
110 :     print "bin$idx is $group $name.\n";
111 :     my $oh = $ih->Out("$workDir/bin$idx");
112 :     $bins{$group} = $oh;
113 :     }
114 :     # Now read the input, counting bins and writing the output.
115 :     my ($progress, $binned) = (0, 0);
116 :     while ($ih->next) {
117 :     $stats->Add(readIn => 1);
118 :     my @seqs = $ih->seqs;
119 :     my %counts;
120 :     for my $seq (@seqs) {
121 :     $kmerDb->count_hits($seq, \%counts, undef, $stride);
122 :     }
123 :     # Write the sequence to every bin that exceeds the minimum number of hits.
124 :     my $used;
125 :     for my $group (keys %counts) {
126 :     my $hits = $counts{$group};
127 :     $stats->Add(hitsFound => $hits);
128 :     if ($hits >= $min) {
129 :     $bins{$group}->Write($ih);
130 :     $stats->Add("reads-$group" => 1);
131 :     $used = 1;
132 :     } else {
133 :     $stats->Add(lowHits => 1);
134 :     }
135 :     }
136 :     if ($used) {
137 :     $stats->Add(readUsed => 1);
138 :     $binned++;
139 :     }
140 :     $progress++;
141 :     print "$progress reads processed, $binned binned.\n" if $progress % 100000 == 0;
142 :     }
143 :     # Close all the files.
144 :     print "Closing output files.\n";
145 :     for my $group (keys %bins) {
146 :     $bins{$group}->Close();
147 :     }
148 :     print "All done.\n" . $stats->Show();

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3