[Bio] / FigKernelScripts / svr_find_clusters_relevant_to_reaction.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/svr_find_clusters_relevant_to_reaction.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.5 - (view) (download) (as text)

1 : overbeek 1.1 use strict;
2 : overbeek 1.4 use SeedEnv;
3 : parrello 1.2 use ScriptThing;
4 : overbeek 1.1 use Data::Dumper;
5 :     use Carp;
6 :    
7 :     #
8 :     # This is a SAS Component
9 :     #
10 :    
11 :    
12 :     =head1 svr_find_clusters_relevant_to_reaction
13 :    
14 :     Find clusters potentially relevant to a search for a "missing gene"
15 :    
16 :     ------
17 :    
18 :     Example:
19 :    
20 :     svr_find_clusters_relevant_to_reaction -g 83333.1 < unconnected.reactions
21 :    
22 :     would take as input a file containing reacions that are believed to
23 :     be present in the genome 83333.1, but cannot yet be connected to specific genes.
24 :    
25 :     ------
26 :    
27 :     The standard input should be a tab-separated table (i.e., each line
28 :     is a tab-separated set of fields). Normally, the last field in each
29 :     line would contain a role for which relevant clusters are desired.
30 :     If some other column contains the roles, use
31 :    
32 :     -c N
33 :    
34 : overbeek 1.5 where N is the column (from 1) that contains the reaction in each case.
35 : overbeek 1.1
36 :     This is a pipe command. The input is taken from the standard input, and the
37 :     output is to the standard output.
38 :    
39 :     =head2 Command-Line Options
40 :    
41 :     =over 4
42 :    
43 :     =item -c Column
44 :    
45 : overbeek 1.5 This is used only if the column containing reactions is not the last.
46 : overbeek 1.1
47 :     =item -g Genome
48 :    
49 :     This normally specifies the genome for which relevant clusters are
50 :     sought. If it is an integer, then each line of input is thought of as
51 :     a genome-role pair, and the integer specifies the column in each input
52 :     line that will contain the genome. The -c parameter is the column
53 :     that will be used as a role.
54 :    
55 :     =item -d MaxSteps [default is 1]
56 :    
57 :     This parameter gives the maximum number of steps (i.e., the "radius") the
58 :     program can take to create the neighborhood of a reaction
59 :    
60 : parrello 1.3 =item -i inputFile
61 :    
62 :     If specified, the name of the input file; otherwise, the input will be taken from STDIN.
63 :    
64 : overbeek 1.1 =back
65 :    
66 :     =head2 Output Format
67 :    
68 :     The standard output is a tab-delimited file. It consists of the input
69 :     file with an extra column added. The extra column will contain a list
70 :     of two or more comma-separated PEGs. There may be more than one
71 :     output line for a single input (if multiple clusters are detected within the
72 :     genome).
73 :    
74 :     =cut
75 :    
76 :     use Getopt::Long;
77 : parrello 1.3 # use Tracer; ##HACK
78 : overbeek 1.1
79 :     my $usage = "usage: svr_find_clusters_relevant_to_reaction -g Genome [-c column]";
80 :    
81 :     my $column;
82 :     my $genome;
83 : parrello 1.3 my $inFileName;
84 : overbeek 1.1 my $dist = 1;
85 : parrello 1.3 my $url;
86 : overbeek 1.1
87 :     my $rc = GetOptions('c=i' => \$column,
88 :     'd=i' => \$dist,
89 : parrello 1.3 'g=s' => \$genome,
90 :     'i=s' => \$inFileName,
91 :     'u=s' => \$url,
92 : overbeek 1.1 );
93 :    
94 :     if ((! $rc) || (! defined($genome))) { print STDERR $usage; exit }
95 : parrello 1.3 # Allowing the specification of an input file on the command line enables this script to run
96 :     # in the real-time debugger on Bruce's laptop.
97 :     my $ih;
98 :     if ($inFileName) {
99 :     open($ih, "<$inFileName") || die "Could not open $inFileName: $!\n";
100 :     } else {
101 :     $ih = \*STDIN;
102 :     }
103 :     my $sapO = SAPserver->new(url => $url);
104 :     # Determine how we're going to find the genome ID.
105 :     my $genomeColumn;
106 :     if ($genome =~ /^\d+$/) {
107 :     # Here the genome is taken from an input column.
108 :     $genomeColumn = $genome;
109 :     }
110 : overbeek 1.1
111 : parrello 1.3 # The main loop pulls out batches of input 100 lines at a time. The input is formatted into
112 :     # 2-tuples of the form (keyColumn, inputLine). Note the use of the $column parameter to get
113 :     # the key from the correct column.
114 :     while (my @tuples = ScriptThing::GetBatch($ih, 100, $column)) {
115 :     # Get a list of the reactions in this batch.
116 :     my @reactionList = map { $_->[0] } @tuples;
117 :     # Get a hash that maps each reaction to a sub-hash of its neighbors.
118 :     my $neighH = $sapO->reaction_neighbors( -ids => \@reactionList, -depth => $dist );
119 :     # Now we need a list of all the reactions we've found. This includes the incoming reactions
120 :     # and all their neighbors. To prevent duplicates, we build the list using a hash.
121 :     my %all_reactions = map { $_ => 1 } map { ($_, keys %{$neighH->{$_}}) } @reactionList;
122 :     # Get a hash that maps each reaction to its roles.
123 :     my $reactions2rolesH = $sapO->reactions_to_roles( -ids => [keys(%all_reactions)]);
124 :     # Now loop through the tuples in this batch.
125 :     for my $tuple (@tuples) {
126 :     my ($reaction, $line) = @$tuple;
127 :     # Compute the genome ID.
128 :     my $genome1;
129 :     if ($genomeColumn) {
130 :     my @cols = split /\t/, $line;
131 :     $genome1 = $cols[$genomeColumn - 1];
132 :     } else {
133 :     $genome1 = $genome;
134 :     }
135 :     # Get all the reactions in the neighborhood of the incoming reaction. This always
136 :     # includes the incoming reaction itself, since its in the neighborhood at a distance
137 :     # of 0.
138 :     my @reactions = keys(%{$neighH->{$reaction}});
139 : overbeek 1.4 # $_ = @reactions; print STDERR "$_ reactions in neighborhood of $reaction\n";
140 : parrello 1.3 # Get all the roles for these reactions.
141 :     my %tmp = map { $_ => 1 } map { @{$reactions2rolesH->{$_}} } @reactions;
142 :     my @roles = keys(%tmp);
143 : overbeek 1.5 next if (@roles == 0);
144 : parrello 1.3 # $_ = @roles; print STDERR "$_ roles in neighborhood\n";
145 :     # Find all the features in the genome of interest for those roles.
146 :     my $role_to_pegsH = $sapO->occ_of_role( -roles => \@roles, -genomes => [$genome1] );
147 :     my %pegsH = map { $_ => 1 } map {@{$role_to_pegsH->{$_}}} keys(%$role_to_pegsH);
148 :     my @pegs = keys(%pegsH);
149 :     # $_ = @pegs; print STDERR "$_ pegs\n";
150 :     # Now we want to find the features with these roles that are physically close to
151 :     # each other.
152 :     my $locH = $sapO->fid_locations( -ids => \@pegs, -boundaries => 1 );
153 :     my @pegs_with_locs = sort { ($a->[1] cmp $b->[1]) or ($a->[2] <=> $b->[2]) }
154 :     map { my $peg = $_; my $loc = $locH->{$peg};
155 :     ($loc =~ /^\d+\.\d+:(\S+)_(\d+)([-+])(\d+)/) ? [$peg,$1,$2,$3,$4] : () } @pegs;
156 :     my $clusters = &clustered_pegs(\@pegs_with_locs);
157 :     foreach my $cluster (@$clusters)
158 :     {
159 :     print "$line\t" . join(",",@$cluster) . "\n";
160 :     }
161 : overbeek 1.1 }
162 : parrello 1.3
163 :     }
164 : overbeek 1.1
165 :     sub clustered_pegs {
166 :     my($pegs_with_locs) = @_;
167 :    
168 :     my $clusters = [];
169 :     my $i;
170 :     $i = 0;
171 :     while ($i < (@$pegs_with_locs - 1))
172 :     {
173 :     my $j;
174 :     for ($j=$i+1; ($j < @$pegs_with_locs) && (&gap_sz($pegs_with_locs->[$j-1],$pegs_with_locs->[$j]) <= 3000); $j++) {}
175 :     if ($j > $i+1)
176 :     {
177 :     push(@$clusters,[map { $_->[0] } @{$pegs_with_locs}[$i..$j-1]]);
178 :     }
179 :     $i = $j;
180 :     }
181 :     return $clusters;
182 :     }
183 :    
184 :     sub gap_sz {
185 :     my($x,$y) = @_;
186 :    
187 :     if ($x->[1] ne $y->[1]) { return 1000000 }
188 :     my $min = ($x->[3] eq "+") ? ($x->[2] + $x->[4]) : $x->[2];
189 :     my $max = ($y->[3] eq "+") ? $y->[2] : ($y->[2] - $y->[4]);
190 :     return abs($max - $min);
191 :     }
192 :    

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3