[Bio] / FigKernelScripts / p3-find-couples.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/p3-find-couples.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : parrello 1.1 =head1 Find Physically Coupled Categories (e.g. Roles, Families)
2 :    
3 :     p3-find-couples.pl [options] catCol
4 :    
5 :     This script performes the general function of finding categories of features that tend to be physically coupled,
6 :     that is, commonly occurring in close proximity on the contigs. It can be
7 :     used to find coupled protein families, coupled roles, coupled functional assignments, or any number of things.
8 :     The input (key) column should contain feature IDs. The I<category column> specified as a parameter should identify
9 :     the column that contains the feature classification of interest. This could be the feature's role, its global protein
10 :     family (or other type of protein family), or anything of importance that groups similar features. The output will
11 :     display pairs of these categories that tend to occur phyiscally close together on the chromosome. So, for example,
12 :     if the category column contained roles, this program would output role couples. If the category column contained
13 :     global protein families, this program would output protein family couples.
14 :    
15 : parrello 1.3 A blank value in the category column will cause the input line to be ignored.
16 :    
17 : parrello 1.1 The output will be three columns-- the two category IDs and the number of times the couple occurred.
18 :    
19 :     =head2 Parameters
20 :    
21 :     The positional parameter is the index (1-based) or name of the column containing the category information.
22 :    
23 :     The standard input can be overriddn using the options in L<P3Utils/ih_options>.
24 :    
25 :     Additional command-line options are those given in L<P3Utils/col_options> (to specify the column containing
26 :     feature IDs) plus the following.
27 :    
28 :     =over 4
29 :    
30 :     =item minCount
31 :    
32 :     The minimum number of times a couple must occur to be considered significant. The default is C<5>.
33 :    
34 :     =item maxGap
35 :    
36 :     The maximum number of base pairs allowed between two features in the same cluster. The default is C<2000>.
37 :    
38 :     =item location
39 :    
40 :     If the feature location is already present in the input file, the name of the column containing the feature location.
41 : parrello 1.4 The location should be in the form of a start and end with two dots in between, the format used in GenBank and PATRIC.
42 : parrello 1.1
43 :     =item sequence
44 :    
45 :     If the sequence ID is already present in the input file, the name of the column containing the sequence ID.
46 :    
47 :     =back
48 :    
49 :     =cut
50 :    
51 :     use strict;
52 :     use P3DataAPI;
53 :     use P3Utils;
54 :    
55 :    
56 :     # Get the command-line options.
57 :     my $opt = P3Utils::script_opts('catCol', P3Utils::delim_options(), P3Utils::col_options(), P3Utils::ih_options(),
58 :     ['minCount|mincount|min|m=i', 'minimum occurrence count', { default => 5 }],
59 :     ['maxGap|maxgap|maxG|maxg|g=i', 'maximum feature gap', { default => 2000 }],
60 :     ['location|loc|l=s', 'index (1-based) or name of column containing feature location (if any)'],
61 :     ['sequence|seq|s=s', 'index (1-based) or name of column containing the ID of the contig containing the feature']
62 :     );
63 :     # This is keyed on genomeID:sequenceID and will contain the list of features for each sequence, in the form
64 :     # [category, start, end].
65 :     my %contigs;
66 :     # Get the options.
67 :     my $locCol = $opt->location;
68 :     my $seqCol = $opt->sequence;
69 :     my $maxGap = $opt->maxgap;
70 :     my $minCount = $opt->mincount;
71 :     # Get the category column.
72 :     my ($catCol) = @ARGV;
73 :     if (! defined $catCol) {
74 :     die "No category column specified.";
75 :     }
76 :     # Get access to PATRIC.
77 :     my $p3 = P3DataAPI->new();
78 :     # Open the input file.
79 :     my $ih = P3Utils::ih($opt);
80 :     # Read the incoming headers.
81 :     my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt);
82 :     # Find the location and sequence columns. An undef for either means we need to get its data from the database,
83 :     # in which case the field name goes in the select list.
84 :     my @selects = 'patric_id';
85 :     my $queryNeeded;
86 :     if (defined $seqCol) {
87 :     $seqCol = P3Utils::find_column($seqCol, $outHeaders);
88 :     } else {
89 :     push @selects, 'sequence_id';
90 :     $queryNeeded = 1;
91 :     }
92 :     if (defined $locCol) {
93 :     $locCol = P3Utils::find_column($locCol, $outHeaders);
94 :     } else {
95 :     push @selects, 'start', 'end';
96 :     $queryNeeded = 1;
97 :     }
98 :     # Find the category column.
99 :     $catCol = P3Utils::find_column($catCol, $outHeaders);
100 :     # Form the full header set and write it out.
101 :     if (! $opt->nohead) {
102 :     my @headers = ("$outHeaders->[$catCol]1", "$outHeaders->[$catCol]2", 'count');
103 :     P3Utils::print_cols(\@headers);
104 :     }
105 :     # Loop through the input.
106 :     while (! eof $ih) {
107 :     my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt);
108 :     # Do we need to read from the database?
109 :     my %rows;
110 :     if ($queryNeeded) {
111 :     my $inString = '(' . join(',', map { $_->[0] } @$couplets) . ')';
112 :     %rows = map { $_->{patric_id} => $_ } $p3->query(genome_feature => [select => @selects], [in => 'patric_id', $inString]);
113 :     }
114 :     # Now we run through the couplets, putting the [category, start, end] tuples in the hash.
115 :     for my $couplet (@$couplets) {
116 :     my ($fid, $line) = @$couplet;
117 :     my $category = $line->[$catCol];
118 : parrello 1.3 if ($category) {
119 :     my ($start, $end, $sequence);
120 :     my $fidData = $rows{$fid};
121 :     # Here we get the start and end.
122 :     if (defined $locCol) {
123 :     my $loc = $line->[$locCol];
124 :     if ($loc =~ /(\d+)\.\.(\d+)/) {
125 :     ($start, $end) = ($1, $2);
126 :     } else {
127 :     die "Invalid location string \'$loc\'.";
128 :     }
129 :     } elsif (! $fidData) {
130 :     die "$fid not found in PATRIC.";
131 :     } else {
132 :     ($start, $end) = ($fidData->{start}, $fidData->{end});
133 :     }
134 :     # Here we get the sequence ID.
135 :     if (defined $seqCol) {
136 :     $sequence = $line->[$seqCol];
137 :     } elsif (! $fidData) {
138 :     die "$fid not found in PATRIC.";
139 : parrello 1.1 } else {
140 : parrello 1.3 $sequence = $fidData->{sequence_id};
141 : parrello 1.1 }
142 : parrello 1.3 # Compute the genome ID.
143 :     my ($genomeID) = ($fid =~ /(\d+\.\d+)/);
144 :     # Put the feature in the hash.
145 :     push @{$contigs{"$genomeID:$sequence"}}, [$category, $start, $end];
146 : parrello 1.1 }
147 :     }
148 :     }
149 :     # Now we have category and position data for each feature sorted by sequence.
150 :     # For each list, we sort by start position and figure out what qualifies as a couple.
151 :     # The couples are counted in this hash, which is keyed by "element1\telement2".
152 :     my %couples;
153 :     for my $contig (keys %contigs) {
154 :     my @features = sort { $a->[1] <=> $b->[1] } @{$contigs{$contig}};
155 :     # We process one feature at a time, and stop when there are none left
156 :     # to couple with it.
157 :     my $feat = shift @features;
158 :     while (scalar @features) {
159 :     my $cat1 = $feat->[0];
160 :     # Compute the latest start position that qualifies as a couple.
161 :     my $limit = $feat->[2] + $maxGap;
162 :     # Loop through the remaining features until we hit the limit.
163 :     for my $other (@features) { last if $other->[1] > $limit;
164 :     if ($cat1 ne $other->[0]) {
165 :     my $couple = join("\t", sort ($cat1, $other->[0]));
166 :     $couples{$couple}++;
167 :     }
168 :     }
169 :     # Get the next feature.
170 :     $feat = shift @features;
171 :     }
172 :     }
173 :     # Sort the couples and output them.
174 :     my @couples = sort { $couples{$b} <=> $couples{$a} } grep { $couples{$_} >= $minCount } keys %couples;
175 :     for my $couple (@couples) {
176 : parrello 1.2 P3Utils::print_cols([$couple, $couples{$couple}]);
177 : parrello 1.1 }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3