[Bio] / FigKernelScripts / p3-find-couples.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/p3-find-couples.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : parrello 1.1 =head1 Find Physically Coupled Categories (e.g. Roles, Families)
2 :    
3 :     p3-find-couples.pl [options] catCol
4 :    
5 :     This script performes the general function of finding categories of features that tend to be physically coupled,
6 :     that is, commonly occurring in close proximity on the contigs. It can be
7 :     used to find coupled protein families, coupled roles, coupled functional assignments, or any number of things.
8 :     The input (key) column should contain feature IDs. The I<category column> specified as a parameter should identify
9 :     the column that contains the feature classification of interest. This could be the feature's role, its global protein
10 :     family (or other type of protein family), or anything of importance that groups similar features. The output will
11 :     display pairs of these categories that tend to occur phyiscally close together on the chromosome. So, for example,
12 :     if the category column contained roles, this program would output role couples. If the category column contained
13 :     global protein families, this program would output protein family couples.
14 :    
15 : parrello 1.3 A blank value in the category column will cause the input line to be ignored.
16 :    
17 : parrello 1.1 The output will be three columns-- the two category IDs and the number of times the couple occurred.
18 :    
19 :     =head2 Parameters
20 :    
21 :     The positional parameter is the index (1-based) or name of the column containing the category information.
22 :    
23 :     The standard input can be overriddn using the options in L<P3Utils/ih_options>.
24 :    
25 :     Additional command-line options are those given in L<P3Utils/col_options> (to specify the column containing
26 :     feature IDs) plus the following.
27 :    
28 :     =over 4
29 :    
30 :     =item minCount
31 :    
32 :     The minimum number of times a couple must occur to be considered significant. The default is C<5>.
33 :    
34 :     =item maxGap
35 :    
36 :     The maximum number of base pairs allowed between two features in the same cluster. The default is C<2000>.
37 :    
38 :     =item location
39 :    
40 :     If the feature location is already present in the input file, the name of the column containing the feature location.
41 : parrello 1.4 The location should be in the form of a start and end with two dots in between, the format used in GenBank and PATRIC.
42 : parrello 1.1
43 :     =item sequence
44 :    
45 :     If the sequence ID is already present in the input file, the name of the column containing the sequence ID.
46 :    
47 : parrello 1.5 =item verbose
48 :    
49 :     If specified, status messages will be written to STDERR.
50 :    
51 : parrello 1.1 =back
52 :    
53 :     =cut
54 :    
55 :     use strict;
56 :     use P3DataAPI;
57 :     use P3Utils;
58 : parrello 1.5 use Math::Round;
59 : parrello 1.1
60 :     # Get the command-line options.
61 :     my $opt = P3Utils::script_opts('catCol', P3Utils::delim_options(), P3Utils::col_options(), P3Utils::ih_options(),
62 :     ['minCount|mincount|min|m=i', 'minimum occurrence count', { default => 5 }],
63 :     ['maxGap|maxgap|maxG|maxg|g=i', 'maximum feature gap', { default => 2000 }],
64 :     ['location|loc|l=s', 'index (1-based) or name of column containing feature location (if any)'],
65 : parrello 1.5 ['sequence|seq|s=s', 'index (1-based) or name of column containing the ID of the contig containing the feature'],
66 :     ['verbose|v', 'display progress messages on STDERR'],
67 : parrello 1.1 );
68 :     # This is keyed on genomeID:sequenceID and will contain the list of features for each sequence, in the form
69 :     # [category, start, end].
70 :     my %contigs;
71 :     # Get the options.
72 :     my $locCol = $opt->location;
73 :     my $seqCol = $opt->sequence;
74 :     my $maxGap = $opt->maxgap;
75 :     my $minCount = $opt->mincount;
76 : parrello 1.5 my $debug = $opt->verbose;
77 : parrello 1.1 # Get the category column.
78 :     my ($catCol) = @ARGV;
79 :     if (! defined $catCol) {
80 :     die "No category column specified.";
81 :     }
82 :     # Get access to PATRIC.
83 :     my $p3 = P3DataAPI->new();
84 :     # Open the input file.
85 :     my $ih = P3Utils::ih($opt);
86 :     # Read the incoming headers.
87 :     my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt);
88 :     # Find the location and sequence columns. An undef for either means we need to get its data from the database,
89 :     # in which case the field name goes in the select list.
90 :     my @selects = 'patric_id';
91 :     my $queryNeeded;
92 :     if (defined $seqCol) {
93 :     $seqCol = P3Utils::find_column($seqCol, $outHeaders);
94 :     } else {
95 :     push @selects, 'sequence_id';
96 :     $queryNeeded = 1;
97 :     }
98 :     if (defined $locCol) {
99 :     $locCol = P3Utils::find_column($locCol, $outHeaders);
100 :     } else {
101 :     push @selects, 'start', 'end';
102 :     $queryNeeded = 1;
103 :     }
104 : parrello 1.5 if ($queryNeeded && $debug) {
105 :     print STDERR "PATRIC queries will be needed.\n";
106 :     }
107 : parrello 1.1 # Find the category column.
108 :     $catCol = P3Utils::find_column($catCol, $outHeaders);
109 :     # Form the full header set and write it out.
110 :     if (! $opt->nohead) {
111 : parrello 1.5 my @headers = ("$outHeaders->[$catCol]1", "$outHeaders->[$catCol]2", 'count', 'percent');
112 : parrello 1.1 P3Utils::print_cols(\@headers);
113 :     }
114 : parrello 1.5 # These are used for status messages.
115 :     my $count = 0;
116 :     my $period = ($queryNeeded ? 1000 : 100000);
117 :     # This counts the category occurrences.
118 :     my %catCounts;
119 : parrello 1.1 # Loop through the input.
120 :     while (! eof $ih) {
121 :     my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt);
122 :     # Do we need to read from the database?
123 :     my %rows;
124 :     if ($queryNeeded) {
125 :     my $inString = '(' . join(',', map { $_->[0] } @$couplets) . ')';
126 :     %rows = map { $_->{patric_id} => $_ } $p3->query(genome_feature => [select => @selects], [in => 'patric_id', $inString]);
127 :     }
128 :     # Now we run through the couplets, putting the [category, start, end] tuples in the hash.
129 :     for my $couplet (@$couplets) {
130 :     my ($fid, $line) = @$couplet;
131 :     my $category = $line->[$catCol];
132 : parrello 1.5 $count++;
133 : parrello 1.3 if ($category) {
134 :     my ($start, $end, $sequence);
135 :     my $fidData = $rows{$fid};
136 :     # Here we get the start and end.
137 :     if (defined $locCol) {
138 :     my $loc = $line->[$locCol];
139 :     if ($loc =~ /(\d+)\.\.(\d+)/) {
140 :     ($start, $end) = ($1, $2);
141 :     } else {
142 :     die "Invalid location string \'$loc\'.";
143 :     }
144 :     } elsif (! $fidData) {
145 :     die "$fid not found in PATRIC.";
146 :     } else {
147 :     ($start, $end) = ($fidData->{start}, $fidData->{end});
148 :     }
149 :     # Here we get the sequence ID.
150 :     if (defined $seqCol) {
151 :     $sequence = $line->[$seqCol];
152 :     } elsif (! $fidData) {
153 :     die "$fid not found in PATRIC.";
154 : parrello 1.1 } else {
155 : parrello 1.3 $sequence = $fidData->{sequence_id};
156 : parrello 1.1 }
157 : parrello 1.3 # Compute the genome ID.
158 :     my ($genomeID) = ($fid =~ /(\d+\.\d+)/);
159 :     # Put the feature in the hash.
160 :     push @{$contigs{"$genomeID:$sequence"}}, [$category, $start, $end];
161 : parrello 1.1 }
162 :     }
163 : parrello 1.5 print STDERR "$count features processed.\n" if $debug && $count % $period == 0;
164 : parrello 1.1 }
165 :     # Now we have category and position data for each feature sorted by sequence.
166 :     # For each list, we sort by start position and figure out what qualifies as a couple.
167 :     # The couples are counted in this hash, which is keyed by "element1\telement2".
168 :     my %couples;
169 : parrello 1.5 print STDERR scalar(keys %contigs) . " contigs ready to examine.\n" if $debug;
170 :     for my $contig (sort keys %contigs) {
171 : parrello 1.1 my @features = sort { $a->[1] <=> $b->[1] } @{$contigs{$contig}};
172 : parrello 1.5 print STDERR "Processing $contig with " . scalar(@features) . " features.\n" if $debug;
173 : parrello 1.1 # We process one feature at a time, and stop when there are none left
174 :     # to couple with it.
175 :     my $feat = shift @features;
176 :     while (scalar @features) {
177 :     my $cat1 = $feat->[0];
178 : parrello 1.5 $catCounts{$cat1}++;
179 : parrello 1.1 # Compute the latest start position that qualifies as a couple.
180 :     my $limit = $feat->[2] + $maxGap;
181 :     # Loop through the remaining features until we hit the limit.
182 :     for my $other (@features) { last if $other->[1] > $limit;
183 :     if ($cat1 ne $other->[0]) {
184 :     my $couple = join("\t", sort ($cat1, $other->[0]));
185 :     $couples{$couple}++;
186 :     }
187 :     }
188 :     # Get the next feature.
189 :     $feat = shift @features;
190 :     }
191 :     }
192 :     # Sort the couples and output them.
193 : parrello 1.5 print STDERR "Writing couples.\n" if $debug;
194 : parrello 1.1 my @couples = sort { $couples{$b} <=> $couples{$a} } grep { $couples{$_} >= $minCount } keys %couples;
195 :     for my $couple (@couples) {
196 : parrello 1.5 my ($cat1, $cat2) = split /\t/, $couple;
197 :     my $count = $couples{$couple};
198 :     my $total = ($catCounts{$cat1} + $catCounts{$cat2}) / 2;
199 :     P3Utils::print_cols([$couple, $count, Math::Round::nearest(0.01, $count * 100 / $total)]);
200 : parrello 1.1 }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3