[Bio] / FigKernelScripts / p3-generate-close-roles.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/p3-generate-close-roles.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.3 - (view) (download) (as text)

1 : parrello 1.1 =head1 Find Roles That Occur Close Together
2 :    
3 :     p3-generate-close-roles.pl [options] <roles.tbl >pairs.tbl
4 :    
5 :     This script is part of a pipeline to compute functionally-coupled roles. It takes a file of locations and roles, then
6 :     outputs a file of pairs of roles with the number of times features containing those two roles occur close together on
7 :     the chromosome. Such roles typically have related functions in a genome.
8 :    
9 :     The input file must contain the following four fields.
10 :    
11 :     =over 4
12 :    
13 :     =item 1
14 :    
15 :     genome ID
16 :    
17 :     =item 2
18 :    
19 :     contig (sequence) ID
20 :    
21 :     =item 3
22 :    
23 :     location in the sequence
24 :    
25 :     =item 4
26 :    
27 :     functional role
28 :    
29 :     =back
30 :    
31 :     The default script assumes the four columns are in that order. This can all be overridden with command-line options.
32 :    
33 :     The input file must be sorted by genome ID and then by sequence ID within genome ID. Otherwise, the results will be
34 :     incorrect. Use L<p3-sort.pl> to sort the file.
35 :    
36 :     The location is a PATRIC location string, either of the form I<start>C<..>I<end> or C<complement(>I<left>C<..>I<right>C<)>.
37 :     Given a set of genome IDs in the file C<genomes.tbl>, you can generate the proper file using the following pipe.
38 :    
39 :     p3-get-genome-features --attr sequence_id --attr location --attr product <genomes.tbl | p3-function-to-role
40 :    
41 :     (If PATRIC does not yet have roles defined, you will need to use an additional command-line option on L<p3-function-to-role.pl>.)
42 :    
43 :     =head2 Parameters
44 :    
45 :     There are no positional parameters.
46 :    
47 : parrello 1.2 The standard input can be overriddn using the options in L<P3Utils/ih_options>.
48 : parrello 1.1
49 :     Additional command-line options are
50 :    
51 :     =over 4
52 :    
53 :     =item genome
54 :    
55 :     The index (1-based) or name of the column containing the genome ID. The default is C<1>.
56 :    
57 :     =item sequence
58 :    
59 :     The index (1-based) or name of the column containing the sequence ID. The default is C<2>.
60 :    
61 :     =item location
62 :    
63 :     The index (1-based) or name of the column containing the location string. The default is C<3>.
64 :    
65 :     =item role
66 :    
67 :     The index (1-based) or name of the column containing the role description. The default is C<4>.
68 :    
69 :     =item maxGap
70 :    
71 :     The maximum space between two features considered close. The default is C<2000>.
72 :    
73 :     =item minOcc
74 :    
75 :     The minimum number of occurrences for a pair to be considered significant. The default is C<4>.
76 :    
77 :     =back
78 :    
79 : parrello 1.3 =head3 Example
80 :    
81 :     This command is shown in the tutorial p3_common_tasks.html
82 :    
83 :     p3-get-genome-features --eq feature_type,CDS --attr sequence_id --attr location --attr product &lt;genomes.tbl | p3-function-to-role | p3-generate-close-roles
84 :     role1 role2 count
85 :     Transposase, IS3/IS911 family Mobile element protein 33
86 :     Mobile element protein Mobile element protein 29
87 :     Lead, cadmium, zinc and mercury transporting ATPase (EC 3.6.3.3) (EC 3.6.3.5) Copper-translocating P-type ATPase (EC 3.6.3.4) 25
88 :     Potassium efflux system KefA protein Small-conductance mechanosensitive channel 13
89 :     ...
90 :    
91 : parrello 1.1 =cut
92 :    
93 :     use strict;
94 :     use P3Utils;
95 :    
96 :     # Get the command-line options.
97 :     my $opt = P3Utils::script_opts('', P3Utils::ih_options(),
98 :     ['genome|g=s', 'index (1-based) or name of the genome ID column', { default => 1 }],
99 :     ['sequence|seq|s=s', 'index (1-based) or name of the sequence ID column', { default => 2 }],
100 :     ['location|loc|l=s', 'index (1-based) or name of the location column', { default => 3 }],
101 :     ['role|R=s', 'index (1-based) or name of the role column', { default => 4 }],
102 :     ['maxGap|max-gap|gap=i', 'maximum permissible gap between close features', { default => 2000 }],
103 :     ['minOcc|min-occ|occ=i', 'minimum number of occurrences for a significant pair', { default => 4 }],
104 :     );
105 :     # Get the options.
106 :     my $maxGap = $opt->maxgap;
107 :     my $minOcc = $opt->minocc;
108 :     # Open the input file.
109 :     my $ih = P3Utils::ih($opt);
110 :     # Read the incoming headers and compute the critical column indices.
111 :     my ($headers, $cols) = P3Utils::find_headers($ih, roles => $opt->genome, $opt->sequence, $opt->location, $opt->role);
112 :     # This hash maps a role description to a role index. The list maps the other direction.
113 :     my %roleMap;
114 :     my @roleList;
115 :     # This hash maps a role pair (represented as a comma-separate list of role indices) to a count.
116 :     my %pairCounts;
117 :     # This is a list of [roleIndex, start, end] tuples for the current contig.
118 :     my @roleTuples;
119 :     # This is the current genome ID.
120 :     my $genomeID = '';
121 :     # This is the current sequence ID.
122 :     my $sequenceID = '';
123 :     # Loop through the input.
124 :     while (! eof $ih) {
125 :     my ($genome, $sequence, $loc, $role) = P3Utils::get_cols($ih, $cols);
126 :     if ($genomeID ne $genome || $sequenceID ne $sequence) {
127 :     if ($genomeID) {
128 :     # We have finished the previous contig, so process it.
129 :     process_batch(\@roleTuples, \%pairCounts, $maxGap);
130 :     }
131 :     # Restart for a new contig.
132 :     ($genomeID, $sequenceID) = ($genome, $sequence);
133 :     @roleTuples = ();
134 :     }
135 :     # Compute the role index.
136 :     my $roleID = $roleMap{$role};
137 :     if (! defined $roleID) {
138 :     $roleID = scalar @roleList;
139 :     $roleMap{$role} = scalar @roleList;
140 :     push @roleList, $role;
141 :     }
142 :     # Parse the location and store the tuple.
143 :     if ($loc =~ /(\d+)[><]?\.\.[><]?(\d+)/) {
144 :     push @roleTuples, [$roleID, $1, $2];
145 :     } else {
146 :     die "Invalid location string \"$loc\".";
147 :     }
148 :     }
149 :     # Process the residual batch.
150 :     process_batch(\@roleTuples, \%pairCounts, $maxGap);
151 :     # Sort to compute the significant pairs.
152 :     my @pairs = sort { $pairCounts{$b} <=> $pairCounts{$a} } keys %pairCounts;
153 :     # Now we can output the results.
154 :     P3Utils::print_cols(['role1', 'role2', 'count']);
155 :     for my $pair (@pairs) {
156 :     my ($role1, $role2) = map { $roleList[$_] } split /,/, $pair;
157 :     my $count = $pairCounts{$pair};
158 :     if ($count >= $minOcc) {
159 :     P3Utils::print_cols([$role1, $role2, $count]);
160 :     }
161 :     }
162 :    
163 :     # Process a batch of role tuples to produce pair counts.
164 :     sub process_batch {
165 :     my ($roleTuples, $pairCounts, $maxGap) = @_;
166 :     # Sort the role tuples by start location.
167 :     my @sorted = sort { $a->[1] <=> $b->[1] } @$roleTuples;
168 :     # Loop through the sorted list.
169 :     while (@sorted) {
170 :     my $first = shift @sorted;
171 :     my ($role1, undef, $end) = @$first;
172 :     # We now know for a fact that every feature left in sorted starts to the right of the $role1 feature.
173 :     # We pair with everything that starts before the gap distance after our end.
174 :     $end += $maxGap;
175 :     for my $other (@sorted) { last if $other->[1] > $end;
176 :     my $pair = join(',', sort ($role1, $other->[0]));
177 :     $pairCounts{$pair}++;
178 :     }
179 :     }
180 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3