[Bio] / FigKernelScripts / p3-find-couples.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/p3-find-couples.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : parrello 1.1 =head1 Find Physically Coupled Categories (e.g. Roles, Families)
2 :    
3 :     p3-find-couples.pl [options] catCol
4 :    
5 :     This script performes the general function of finding categories of features that tend to be physically coupled,
6 :     that is, commonly occurring in close proximity on the contigs. It can be
7 :     used to find coupled protein families, coupled roles, coupled functional assignments, or any number of things.
8 :     The input (key) column should contain feature IDs. The I<category column> specified as a parameter should identify
9 :     the column that contains the feature classification of interest. This could be the feature's role, its global protein
10 :     family (or other type of protein family), or anything of importance that groups similar features. The output will
11 :     display pairs of these categories that tend to occur phyiscally close together on the chromosome. So, for example,
12 :     if the category column contained roles, this program would output role couples. If the category column contained
13 :     global protein families, this program would output protein family couples.
14 :    
15 : parrello 1.3 A blank value in the category column will cause the input line to be ignored.
16 :    
17 : parrello 1.1 The output will be three columns-- the two category IDs and the number of times the couple occurred.
18 :    
19 :     =head2 Parameters
20 :    
21 :     The positional parameter is the index (1-based) or name of the column containing the category information.
22 :    
23 :     The standard input can be overriddn using the options in L<P3Utils/ih_options>.
24 :    
25 :     Additional command-line options are those given in L<P3Utils/col_options> (to specify the column containing
26 :     feature IDs) plus the following.
27 :    
28 :     =over 4
29 :    
30 :     =item minCount
31 :    
32 :     The minimum number of times a couple must occur to be considered significant. The default is C<5>.
33 :    
34 :     =item maxGap
35 :    
36 :     The maximum number of base pairs allowed between two features in the same cluster. The default is C<2000>.
37 :    
38 :     =item location
39 :    
40 :     If the feature location is already present in the input file, the name of the column containing the feature location.
41 :    
42 :     =item sequence
43 :    
44 :     If the sequence ID is already present in the input file, the name of the column containing the sequence ID.
45 :    
46 :     =back
47 :    
48 :     =cut
49 :    
50 :     use strict;
51 :     use P3DataAPI;
52 :     use P3Utils;
53 :    
54 :    
55 :     # Get the command-line options.
56 :     my $opt = P3Utils::script_opts('catCol', P3Utils::delim_options(), P3Utils::col_options(), P3Utils::ih_options(),
57 :     ['minCount|mincount|min|m=i', 'minimum occurrence count', { default => 5 }],
58 :     ['maxGap|maxgap|maxG|maxg|g=i', 'maximum feature gap', { default => 2000 }],
59 :     ['location|loc|l=s', 'index (1-based) or name of column containing feature location (if any)'],
60 :     ['sequence|seq|s=s', 'index (1-based) or name of column containing the ID of the contig containing the feature']
61 :     );
62 :     # This is keyed on genomeID:sequenceID and will contain the list of features for each sequence, in the form
63 :     # [category, start, end].
64 :     my %contigs;
65 :     # Get the options.
66 :     my $locCol = $opt->location;
67 :     my $seqCol = $opt->sequence;
68 :     my $maxGap = $opt->maxgap;
69 :     my $minCount = $opt->mincount;
70 :     # Get the category column.
71 :     my ($catCol) = @ARGV;
72 :     if (! defined $catCol) {
73 :     die "No category column specified.";
74 :     }
75 :     # Get access to PATRIC.
76 :     my $p3 = P3DataAPI->new();
77 :     # Open the input file.
78 :     my $ih = P3Utils::ih($opt);
79 :     # Read the incoming headers.
80 :     my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt);
81 :     # Find the location and sequence columns. An undef for either means we need to get its data from the database,
82 :     # in which case the field name goes in the select list.
83 :     my @selects = 'patric_id';
84 :     my $queryNeeded;
85 :     if (defined $seqCol) {
86 :     $seqCol = P3Utils::find_column($seqCol, $outHeaders);
87 :     } else {
88 :     push @selects, 'sequence_id';
89 :     $queryNeeded = 1;
90 :     }
91 :     if (defined $locCol) {
92 :     $locCol = P3Utils::find_column($locCol, $outHeaders);
93 :     } else {
94 :     push @selects, 'start', 'end';
95 :     $queryNeeded = 1;
96 :     }
97 :     # Find the category column.
98 :     $catCol = P3Utils::find_column($catCol, $outHeaders);
99 :     # Form the full header set and write it out.
100 :     if (! $opt->nohead) {
101 :     my @headers = ("$outHeaders->[$catCol]1", "$outHeaders->[$catCol]2", 'count');
102 :     P3Utils::print_cols(\@headers);
103 :     }
104 :     # Loop through the input.
105 :     while (! eof $ih) {
106 :     my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt);
107 :     # Do we need to read from the database?
108 :     my %rows;
109 :     if ($queryNeeded) {
110 :     my $inString = '(' . join(',', map { $_->[0] } @$couplets) . ')';
111 :     %rows = map { $_->{patric_id} => $_ } $p3->query(genome_feature => [select => @selects], [in => 'patric_id', $inString]);
112 :     }
113 :     # Now we run through the couplets, putting the [category, start, end] tuples in the hash.
114 :     for my $couplet (@$couplets) {
115 :     my ($fid, $line) = @$couplet;
116 :     my $category = $line->[$catCol];
117 : parrello 1.3 if ($category) {
118 :     my ($start, $end, $sequence);
119 :     my $fidData = $rows{$fid};
120 :     # Here we get the start and end.
121 :     if (defined $locCol) {
122 :     my $loc = $line->[$locCol];
123 :     if ($loc =~ /(\d+)\.\.(\d+)/) {
124 :     ($start, $end) = ($1, $2);
125 :     } else {
126 :     die "Invalid location string \'$loc\'.";
127 :     }
128 :     } elsif (! $fidData) {
129 :     die "$fid not found in PATRIC.";
130 :     } else {
131 :     ($start, $end) = ($fidData->{start}, $fidData->{end});
132 :     }
133 :     # Here we get the sequence ID.
134 :     if (defined $seqCol) {
135 :     $sequence = $line->[$seqCol];
136 :     } elsif (! $fidData) {
137 :     die "$fid not found in PATRIC.";
138 : parrello 1.1 } else {
139 : parrello 1.3 $sequence = $fidData->{sequence_id};
140 : parrello 1.1 }
141 : parrello 1.3 # Compute the genome ID.
142 :     my ($genomeID) = ($fid =~ /(\d+\.\d+)/);
143 :     # Put the feature in the hash.
144 :     push @{$contigs{"$genomeID:$sequence"}}, [$category, $start, $end];
145 : parrello 1.1 }
146 :     }
147 :     }
148 :     # Now we have category and position data for each feature sorted by sequence.
149 :     # For each list, we sort by start position and figure out what qualifies as a couple.
150 :     # The couples are counted in this hash, which is keyed by "element1\telement2".
151 :     my %couples;
152 :     for my $contig (keys %contigs) {
153 :     my @features = sort { $a->[1] <=> $b->[1] } @{$contigs{$contig}};
154 :     # We process one feature at a time, and stop when there are none left
155 :     # to couple with it.
156 :     my $feat = shift @features;
157 :     while (scalar @features) {
158 :     my $cat1 = $feat->[0];
159 :     # Compute the latest start position that qualifies as a couple.
160 :     my $limit = $feat->[2] + $maxGap;
161 :     # Loop through the remaining features until we hit the limit.
162 :     for my $other (@features) { last if $other->[1] > $limit;
163 :     if ($cat1 ne $other->[0]) {
164 :     my $couple = join("\t", sort ($cat1, $other->[0]));
165 :     $couples{$couple}++;
166 :     }
167 :     }
168 :     # Get the next feature.
169 :     $feat = shift @features;
170 :     }
171 :     }
172 :     # Sort the couples and output them.
173 :     my @couples = sort { $couples{$b} <=> $couples{$a} } grep { $couples{$_} >= $minCount } keys %couples;
174 :     for my $couple (@couples) {
175 : parrello 1.2 P3Utils::print_cols([$couple, $couples{$couple}]);
176 : parrello 1.1 }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3