[Bio] / FigKernelScripts / p3-generate-close-roles.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/p3-generate-close-roles.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.2 - (view) (download) (as text)

1 : parrello 1.1 =head1 Find Roles That Occur Close Together
2 :    
3 :     p3-generate-close-roles.pl [options] <roles.tbl >pairs.tbl
4 :    
5 :     This script is part of a pipeline to compute functionally-coupled roles. It takes a file of locations and roles, then
6 :     outputs a file of pairs of roles with the number of times features containing those two roles occur close together on
7 :     the chromosome. Such roles typically have related functions in a genome.
8 :    
9 :     The input file must contain the following four fields.
10 :    
11 :     =over 4
12 :    
13 :     =item 1
14 :    
15 :     genome ID
16 :    
17 :     =item 2
18 :    
19 :     contig (sequence) ID
20 :    
21 :     =item 3
22 :    
23 :     location in the sequence
24 :    
25 :     =item 4
26 :    
27 :     functional role
28 :    
29 :     =back
30 :    
31 :     The default script assumes the four columns are in that order. This can all be overridden with command-line options.
32 :    
33 :     The input file must be sorted by genome ID and then by sequence ID within genome ID. Otherwise, the results will be
34 :     incorrect. Use L<p3-sort.pl> to sort the file.
35 :    
36 :     The location is a PATRIC location string, either of the form I<start>C<..>I<end> or C<complement(>I<left>C<..>I<right>C<)>.
37 :     Given a set of genome IDs in the file C<genomes.tbl>, you can generate the proper file using the following pipe.
38 :    
39 :     p3-get-genome-features --attr sequence_id --attr location --attr product <genomes.tbl | p3-function-to-role
40 :    
41 :     (If PATRIC does not yet have roles defined, you will need to use an additional command-line option on L<p3-function-to-role.pl>.)
42 :    
43 :     =head2 Parameters
44 :    
45 :     There are no positional parameters.
46 :    
47 : parrello 1.2 The standard input can be overriddn using the options in L<P3Utils/ih_options>.
48 : parrello 1.1
49 :     Additional command-line options are
50 :    
51 :     =over 4
52 :    
53 :     =item genome
54 :    
55 :     The index (1-based) or name of the column containing the genome ID. The default is C<1>.
56 :    
57 :     =item sequence
58 :    
59 :     The index (1-based) or name of the column containing the sequence ID. The default is C<2>.
60 :    
61 :     =item location
62 :    
63 :     The index (1-based) or name of the column containing the location string. The default is C<3>.
64 :    
65 :     =item role
66 :    
67 :     The index (1-based) or name of the column containing the role description. The default is C<4>.
68 :    
69 :     =item maxGap
70 :    
71 :     The maximum space between two features considered close. The default is C<2000>.
72 :    
73 :     =item minOcc
74 :    
75 :     The minimum number of occurrences for a pair to be considered significant. The default is C<4>.
76 :    
77 :     =back
78 :    
79 :     =cut
80 :    
81 :     use strict;
82 :     use P3Utils;
83 :    
84 :     # Get the command-line options.
85 :     my $opt = P3Utils::script_opts('', P3Utils::ih_options(),
86 :     ['genome|g=s', 'index (1-based) or name of the genome ID column', { default => 1 }],
87 :     ['sequence|seq|s=s', 'index (1-based) or name of the sequence ID column', { default => 2 }],
88 :     ['location|loc|l=s', 'index (1-based) or name of the location column', { default => 3 }],
89 :     ['role|R=s', 'index (1-based) or name of the role column', { default => 4 }],
90 :     ['maxGap|max-gap|gap=i', 'maximum permissible gap between close features', { default => 2000 }],
91 :     ['minOcc|min-occ|occ=i', 'minimum number of occurrences for a significant pair', { default => 4 }],
92 :     );
93 :     # Get the options.
94 :     my $maxGap = $opt->maxgap;
95 :     my $minOcc = $opt->minocc;
96 :     # Open the input file.
97 :     my $ih = P3Utils::ih($opt);
98 :     # Read the incoming headers and compute the critical column indices.
99 :     my ($headers, $cols) = P3Utils::find_headers($ih, roles => $opt->genome, $opt->sequence, $opt->location, $opt->role);
100 :     # This hash maps a role description to a role index. The list maps the other direction.
101 :     my %roleMap;
102 :     my @roleList;
103 :     # This hash maps a role pair (represented as a comma-separate list of role indices) to a count.
104 :     my %pairCounts;
105 :     # This is a list of [roleIndex, start, end] tuples for the current contig.
106 :     my @roleTuples;
107 :     # This is the current genome ID.
108 :     my $genomeID = '';
109 :     # This is the current sequence ID.
110 :     my $sequenceID = '';
111 :     # Loop through the input.
112 :     while (! eof $ih) {
113 :     my ($genome, $sequence, $loc, $role) = P3Utils::get_cols($ih, $cols);
114 :     if ($genomeID ne $genome || $sequenceID ne $sequence) {
115 :     if ($genomeID) {
116 :     # We have finished the previous contig, so process it.
117 :     process_batch(\@roleTuples, \%pairCounts, $maxGap);
118 :     }
119 :     # Restart for a new contig.
120 :     ($genomeID, $sequenceID) = ($genome, $sequence);
121 :     @roleTuples = ();
122 :     }
123 :     # Compute the role index.
124 :     my $roleID = $roleMap{$role};
125 :     if (! defined $roleID) {
126 :     $roleID = scalar @roleList;
127 :     $roleMap{$role} = scalar @roleList;
128 :     push @roleList, $role;
129 :     }
130 :     # Parse the location and store the tuple.
131 :     if ($loc =~ /(\d+)[><]?\.\.[><]?(\d+)/) {
132 :     push @roleTuples, [$roleID, $1, $2];
133 :     } else {
134 :     die "Invalid location string \"$loc\".";
135 :     }
136 :     }
137 :     # Process the residual batch.
138 :     process_batch(\@roleTuples, \%pairCounts, $maxGap);
139 :     # Sort to compute the significant pairs.
140 :     my @pairs = sort { $pairCounts{$b} <=> $pairCounts{$a} } keys %pairCounts;
141 :     # Now we can output the results.
142 :     P3Utils::print_cols(['role1', 'role2', 'count']);
143 :     for my $pair (@pairs) {
144 :     my ($role1, $role2) = map { $roleList[$_] } split /,/, $pair;
145 :     my $count = $pairCounts{$pair};
146 :     if ($count >= $minOcc) {
147 :     P3Utils::print_cols([$role1, $role2, $count]);
148 :     }
149 :     }
150 :    
151 :     # Process a batch of role tuples to produce pair counts.
152 :     sub process_batch {
153 :     my ($roleTuples, $pairCounts, $maxGap) = @_;
154 :     # Sort the role tuples by start location.
155 :     my @sorted = sort { $a->[1] <=> $b->[1] } @$roleTuples;
156 :     # Loop through the sorted list.
157 :     while (@sorted) {
158 :     my $first = shift @sorted;
159 :     my ($role1, undef, $end) = @$first;
160 :     # We now know for a fact that every feature left in sorted starts to the right of the $role1 feature.
161 :     # We pair with everything that starts before the gap distance after our end.
162 :     $end += $maxGap;
163 :     for my $other (@sorted) { last if $other->[1] > $end;
164 :     my $pair = join(',', sort ($role1, $other->[0]));
165 :     $pairCounts{$pair}++;
166 :     }
167 :     }
168 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3