[Bio] / FigKernelScripts / p3-find-couples.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/p3-find-couples.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.1 - (view) (download) (as text)

1 : parrello 1.1 =head1 Find Physically Coupled Categories (e.g. Roles, Families)
2 :    
3 :     p3-find-couples.pl [options] catCol
4 :    
5 :     This script performes the general function of finding categories of features that tend to be physically coupled,
6 :     that is, commonly occurring in close proximity on the contigs. It can be
7 :     used to find coupled protein families, coupled roles, coupled functional assignments, or any number of things.
8 :     The input (key) column should contain feature IDs. The I<category column> specified as a parameter should identify
9 :     the column that contains the feature classification of interest. This could be the feature's role, its global protein
10 :     family (or other type of protein family), or anything of importance that groups similar features. The output will
11 :     display pairs of these categories that tend to occur phyiscally close together on the chromosome. So, for example,
12 :     if the category column contained roles, this program would output role couples. If the category column contained
13 :     global protein families, this program would output protein family couples.
14 :    
15 :     The output will be three columns-- the two category IDs and the number of times the couple occurred.
16 :    
17 :     =head2 Parameters
18 :    
19 :     The positional parameter is the index (1-based) or name of the column containing the category information.
20 :    
21 :     The standard input can be overriddn using the options in L<P3Utils/ih_options>.
22 :    
23 :     Additional command-line options are those given in L<P3Utils/col_options> (to specify the column containing
24 :     feature IDs) plus the following.
25 :    
26 :     =over 4
27 :    
28 :     =item minCount
29 :    
30 :     The minimum number of times a couple must occur to be considered significant. The default is C<5>.
31 :    
32 :     =item maxGap
33 :    
34 :     The maximum number of base pairs allowed between two features in the same cluster. The default is C<2000>.
35 :    
36 :     =item location
37 :    
38 :     If the feature location is already present in the input file, the name of the column containing the feature location.
39 :    
40 :     =item sequence
41 :    
42 :     If the sequence ID is already present in the input file, the name of the column containing the sequence ID.
43 :    
44 :     =back
45 :    
46 :     =cut
47 :    
48 :     use strict;
49 :     use P3DataAPI;
50 :     use P3Utils;
51 :    
52 :    
53 :     # Get the command-line options.
54 :     my $opt = P3Utils::script_opts('catCol', P3Utils::delim_options(), P3Utils::col_options(), P3Utils::ih_options(),
55 :     ['minCount|mincount|min|m=i', 'minimum occurrence count', { default => 5 }],
56 :     ['maxGap|maxgap|maxG|maxg|g=i', 'maximum feature gap', { default => 2000 }],
57 :     ['location|loc|l=s', 'index (1-based) or name of column containing feature location (if any)'],
58 :     ['sequence|seq|s=s', 'index (1-based) or name of column containing the ID of the contig containing the feature']
59 :     );
60 :     # This is keyed on genomeID:sequenceID and will contain the list of features for each sequence, in the form
61 :     # [category, start, end].
62 :     my %contigs;
63 :     # Get the options.
64 :     my $locCol = $opt->location;
65 :     my $seqCol = $opt->sequence;
66 :     my $maxGap = $opt->maxgap;
67 :     my $minCount = $opt->mincount;
68 :     # Get the category column.
69 :     my ($catCol) = @ARGV;
70 :     if (! defined $catCol) {
71 :     die "No category column specified.";
72 :     }
73 :     # Get access to PATRIC.
74 :     my $p3 = P3DataAPI->new();
75 :     # Open the input file.
76 :     my $ih = P3Utils::ih($opt);
77 :     # Read the incoming headers.
78 :     my ($outHeaders, $keyCol) = P3Utils::process_headers($ih, $opt);
79 :     # Find the location and sequence columns. An undef for either means we need to get its data from the database,
80 :     # in which case the field name goes in the select list.
81 :     my @selects = 'patric_id';
82 :     my $queryNeeded;
83 :     if (defined $seqCol) {
84 :     $seqCol = P3Utils::find_column($seqCol, $outHeaders);
85 :     } else {
86 :     push @selects, 'sequence_id';
87 :     $queryNeeded = 1;
88 :     }
89 :     if (defined $locCol) {
90 :     $locCol = P3Utils::find_column($locCol, $outHeaders);
91 :     } else {
92 :     push @selects, 'start', 'end';
93 :     $queryNeeded = 1;
94 :     }
95 :     # Find the category column.
96 :     $catCol = P3Utils::find_column($catCol, $outHeaders);
97 :     # Form the full header set and write it out.
98 :     if (! $opt->nohead) {
99 :     my @headers = ("$outHeaders->[$catCol]1", "$outHeaders->[$catCol]2", 'count');
100 :     P3Utils::print_cols(\@headers);
101 :     }
102 :     # Loop through the input.
103 :     while (! eof $ih) {
104 :     my $couplets = P3Utils::get_couplets($ih, $keyCol, $opt);
105 :     # Do we need to read from the database?
106 :     my %rows;
107 :     if ($queryNeeded) {
108 :     my $inString = '(' . join(',', map { $_->[0] } @$couplets) . ')';
109 :     %rows = map { $_->{patric_id} => $_ } $p3->query(genome_feature => [select => @selects], [in => 'patric_id', $inString]);
110 :     }
111 :     # Now we run through the couplets, putting the [category, start, end] tuples in the hash.
112 :     for my $couplet (@$couplets) {
113 :     my ($fid, $line) = @$couplet;
114 :     my $category = $line->[$catCol];
115 :     my ($start, $end, $sequence);
116 :     my $fidData = $rows{$fid};
117 :     # Here we get the start and end.
118 :     if (defined $locCol) {
119 :     my $loc = $line->[$locCol];
120 :     if ($loc =~ /(\d+)\.\.(\d+)/) {
121 :     ($start, $end) = ($1, $2);
122 :     } else {
123 :     die "Invalid location string \'$loc\'.";
124 :     }
125 :     } elsif (! $fidData) {
126 :     die "$fid not found in PATRIC.";
127 :     } else {
128 :     ($start, $end) = ($fidData->{start}, $fidData->{end});
129 :     }
130 :     # Here we get the sequence ID.
131 :     if (defined $seqCol) {
132 :     $sequence = $line->[$seqCol];
133 :     } elsif (! $fidData) {
134 :     die "$fid not found in PATRIC.";
135 :     } else {
136 :     $sequence = $fidData->{sequence_id};
137 :     }
138 :     # Compute the genome ID.
139 :     my ($genomeID) = ($fid =~ /(\d+\.\d+)/);
140 :     # Put the feature in the hash.
141 :     push @{$contigs{"$genomeID:$sequence"}}, [$category, $start, $end];
142 :     }
143 :     }
144 :     # Now we have category and position data for each feature sorted by sequence.
145 :     # For each list, we sort by start position and figure out what qualifies as a couple.
146 :     # The couples are counted in this hash, which is keyed by "element1\telement2".
147 :     my %couples;
148 :     for my $contig (keys %contigs) {
149 :     my @features = sort { $a->[1] <=> $b->[1] } @{$contigs{$contig}};
150 :     # We process one feature at a time, and stop when there are none left
151 :     # to couple with it.
152 :     my $feat = shift @features;
153 :     while (scalar @features) {
154 :     my $cat1 = $feat->[0];
155 :     # Compute the latest start position that qualifies as a couple.
156 :     my $limit = $feat->[2] + $maxGap;
157 :     # Loop through the remaining features until we hit the limit.
158 :     for my $other (@features) { last if $other->[1] > $limit;
159 :     if ($cat1 ne $other->[0]) {
160 :     my $couple = join("\t", sort ($cat1, $other->[0]));
161 :     $couples{$couple}++;
162 :     }
163 :     }
164 :     # Get the next feature.
165 :     $feat = shift @features;
166 :     }
167 :     }
168 :     # Sort the couples and output them.
169 :     my @couples = sort { $couples{$b} <=> $couples{$a} } grep { $couples{$_} >= $minCount } keys %couples;
170 :     for my $couple (@couples) {
171 :     print "$couple\t$couples{$couple}\n";
172 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3