44 |
|
|
45 |
If the sequence ID is already present in the input file, the name of the column containing the sequence ID. |
If the sequence ID is already present in the input file, the name of the column containing the sequence ID. |
46 |
|
|
47 |
|
=item verbose |
48 |
|
|
49 |
|
If specified, status messages will be written to STDERR. |
50 |
|
|
51 |
=back |
=back |
52 |
|
|
53 |
=cut |
=cut |
55 |
use strict; |
use strict; |
56 |
use P3DataAPI; |
use P3DataAPI; |
57 |
use P3Utils; |
use P3Utils; |
58 |
|
use Math::Round; |
59 |
|
|
60 |
# Get the command-line options. |
# Get the command-line options. |
61 |
my $opt = P3Utils::script_opts('catCol', P3Utils::delim_options(), P3Utils::col_options(), P3Utils::ih_options(), |
my $opt = P3Utils::script_opts('catCol', P3Utils::delim_options(), P3Utils::col_options(), P3Utils::ih_options(), |
62 |
['minCount|mincount|min|m=i', 'minimum occurrence count', { default => 5 }], |
['minCount|mincount|min|m=i', 'minimum occurrence count', { default => 5 }], |
63 |
['maxGap|maxgap|maxG|maxg|g=i', 'maximum feature gap', { default => 2000 }], |
['maxGap|maxgap|maxG|maxg|g=i', 'maximum feature gap', { default => 2000 }], |
64 |
['location|loc|l=s', 'index (1-based) or name of column containing feature location (if any)'], |
['location|loc|l=s', 'index (1-based) or name of column containing feature location (if any)'], |
65 |
['sequence|seq|s=s', 'index (1-based) or name of column containing the ID of the contig containing the feature'] |
['sequence|seq|s=s', 'index (1-based) or name of column containing the ID of the contig containing the feature'], |
66 |
|
['verbose|v', 'display progress messages on STDERR'], |
67 |
); |
); |
68 |
# This is keyed on genomeID:sequenceID and will contain the list of features for each sequence, in the form |
# This is keyed on genomeID:sequenceID and will contain the list of features for each sequence, in the form |
69 |
# [category, start, end]. |
# [category, start, end]. |
73 |
my $seqCol = $opt->sequence; |
my $seqCol = $opt->sequence; |
74 |
my $maxGap = $opt->maxgap; |
my $maxGap = $opt->maxgap; |
75 |
my $minCount = $opt->mincount; |
my $minCount = $opt->mincount; |
76 |
|
my $debug = $opt->verbose; |
77 |
# Get the category column. |
# Get the category column. |
78 |
my ($catCol) = @ARGV; |
my ($catCol) = @ARGV; |
79 |
if (! defined $catCol) { |
if (! defined $catCol) { |
101 |
push @selects, 'start', 'end'; |
push @selects, 'start', 'end'; |
102 |
$queryNeeded = 1; |
$queryNeeded = 1; |
103 |
} |
} |
104 |
|
if ($queryNeeded && $debug) { |
105 |
|
print STDERR "PATRIC queries will be needed.\n"; |
106 |
|
} |
107 |
# Find the category column. |
# Find the category column. |
108 |
$catCol = P3Utils::find_column($catCol, $outHeaders); |
$catCol = P3Utils::find_column($catCol, $outHeaders); |
109 |
# Form the full header set and write it out. |
# Form the full header set and write it out. |
110 |
if (! $opt->nohead) { |
if (! $opt->nohead) { |
111 |
my @headers = ("$outHeaders->[$catCol]1", "$outHeaders->[$catCol]2", 'count'); |
my @headers = ("$outHeaders->[$catCol]1", "$outHeaders->[$catCol]2", 'count', 'percent'); |
112 |
P3Utils::print_cols(\@headers); |
P3Utils::print_cols(\@headers); |
113 |
} |
} |
114 |
|
# These are used for status messages. |
115 |
|
my $count = 0; |
116 |
|
my $period = ($queryNeeded ? 1000 : 100000); |
117 |
|
# This counts the category occurrences. |
118 |
|
my %catCounts; |
119 |
# Loop through the input. |
# Loop through the input. |
120 |
while (! eof $ih) { |
while (! eof $ih) { |
121 |
my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt); |
my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt); |
129 |
for my $couplet (@$couplets) { |
for my $couplet (@$couplets) { |
130 |
my ($fid, $line) = @$couplet; |
my ($fid, $line) = @$couplet; |
131 |
my $category = $line->[$catCol]; |
my $category = $line->[$catCol]; |
132 |
|
$count++; |
133 |
if ($category) { |
if ($category) { |
134 |
my ($start, $end, $sequence); |
my ($start, $end, $sequence); |
135 |
my $fidData = $rows{$fid}; |
my $fidData = $rows{$fid}; |
160 |
push @{$contigs{"$genomeID:$sequence"}}, [$category, $start, $end]; |
push @{$contigs{"$genomeID:$sequence"}}, [$category, $start, $end]; |
161 |
} |
} |
162 |
} |
} |
163 |
|
print STDERR "$count features processed.\n" if $debug && $count % $period == 0; |
164 |
} |
} |
165 |
# Now we have category and position data for each feature sorted by sequence. |
# Now we have category and position data for each feature sorted by sequence. |
166 |
# For each list, we sort by start position and figure out what qualifies as a couple. |
# For each list, we sort by start position and figure out what qualifies as a couple. |
167 |
# The couples are counted in this hash, which is keyed by "element1\telement2". |
# The couples are counted in this hash, which is keyed by "element1\telement2". |
168 |
my %couples; |
my %couples; |
169 |
for my $contig (keys %contigs) { |
print STDERR scalar(keys %contigs) . " contigs ready to examine.\n" if $debug; |
170 |
|
for my $contig (sort keys %contigs) { |
171 |
my @features = sort { $a->[1] <=> $b->[1] } @{$contigs{$contig}}; |
my @features = sort { $a->[1] <=> $b->[1] } @{$contigs{$contig}}; |
172 |
|
print STDERR "Processing $contig with " . scalar(@features) . " features.\n" if $debug; |
173 |
# We process one feature at a time, and stop when there are none left |
# We process one feature at a time, and stop when there are none left |
174 |
# to couple with it. |
# to couple with it. |
175 |
my $feat = shift @features; |
my $feat = shift @features; |
176 |
while (scalar @features) { |
while (scalar @features) { |
177 |
my $cat1 = $feat->[0]; |
my $cat1 = $feat->[0]; |
178 |
|
$catCounts{$cat1}++; |
179 |
# Compute the latest start position that qualifies as a couple. |
# Compute the latest start position that qualifies as a couple. |
180 |
my $limit = $feat->[2] + $maxGap; |
my $limit = $feat->[2] + $maxGap; |
181 |
# Loop through the remaining features until we hit the limit. |
# Loop through the remaining features until we hit the limit. |
190 |
} |
} |
191 |
} |
} |
192 |
# Sort the couples and output them. |
# Sort the couples and output them. |
193 |
|
print STDERR "Writing couples.\n" if $debug; |
194 |
my @couples = sort { $couples{$b} <=> $couples{$a} } grep { $couples{$_} >= $minCount } keys %couples; |
my @couples = sort { $couples{$b} <=> $couples{$a} } grep { $couples{$_} >= $minCount } keys %couples; |
195 |
for my $couple (@couples) { |
for my $couple (@couples) { |
196 |
P3Utils::print_cols([$couple, $couples{$couple}]); |
my ($cat1, $cat2) = split /\t/, $couple; |
197 |
|
my $count = $couples{$couple}; |
198 |
|
my $total = ($catCounts{$cat1} + $catCounts{$cat2}) / 2; |
199 |
|
P3Utils::print_cols([$couple, $count, Math::Round::nearest(0.01, $count * 100 / $total)]); |
200 |
} |
} |