[Bio] / FigKernelScripts / rapid_subsystem_inference_batch.pl Repository:
ViewVC logotype

Annotation of /FigKernelScripts/rapid_subsystem_inference_batch.pl

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1.4 - (view) (download) (as text)

1 : olson 1.1 ########################################################################
2 :     #
3 :    
4 :    
5 :     use strict;
6 :     use Data::Dumper;
7 : olson 1.2 use Getopt::Long;
8 : olson 1.1
9 :     use SeedUtils;
10 :     use FIG;
11 :     my $fig = new FIG;
12 :    
13 : olson 1.2 my $missing_genomes_file;
14 : olson 1.3 my $verbose;
15 : olson 1.1
16 : olson 1.3 my $usage = "usage: rapid_subsystem_inference_batch [--verbose] [--missing-genomes filename] [evidence-log] < input-def";
17 : olson 1.2
18 : olson 1.3 my $rc = GetOptions("missing-genomes=s" => \$missing_genomes_file,
19 :     "verbose" => \$verbose);
20 : olson 1.2
21 :     $rc && (@ARGV == 0 or @ARGV == 1) or die $usage;
22 : olson 1.1
23 :     my $evlogFile = shift;
24 :    
25 :     my $evlogFH;
26 :    
27 :     if ($evlogFile)
28 :     {
29 :     $evlogFH = new FileHandle(">>$evlogFile");
30 :     $evlogFH or warn "Cannot open evidence log $evlogFile for append: $!";
31 :     }
32 :     else
33 :     {
34 :     $evlogFH = new FileHandle(">/dev/null");
35 :     }
36 :    
37 : olson 1.2 my $missing_genomes_fh;
38 :     if (defined($missing_genomes_file))
39 :     {
40 :     open($missing_genomes_fh, ">", $missing_genomes_file) or die "Cannot open $missing_genomes_file: $!";
41 :     }
42 :    
43 : olson 1.1 my %ss_data;
44 :    
45 :     my $res = $fig->db_handle->SQL(qq(SELECT subsystem, genome, variant
46 :     FROM subsystem_genome_variant));
47 :     for my $ent (@$res)
48 :     {
49 :     my($ss, $genome, $variant) = @$ent;
50 :     $ss_data{$ss}->{variant}->{$genome} = $variant;
51 :     }
52 :    
53 : olson 1.2 $res = $fig->db_handle->SQL(qq(SELECT subsystem, role
54 :     FROM subsystem_nonaux_role));
55 : olson 1.1 for my $ent (@$res)
56 :     {
57 : olson 1.2 my($ss, $role) = @$ent;
58 :     push(@{$ss_data{$ss}->{roles}}, $role);
59 :     }
60 :    
61 :     my %genome_roles;
62 :     my $sth = $fig->db_handle->{_dbh}->prepare(qq(SELECT subsystem, genome, role
63 :     FROM subsystem_genome_role),
64 :     { mysql_use_result => 1 });
65 :     $sth->execute();
66 :     while (my $ent = $sth->fetchrow_arrayref())
67 :     {
68 : olson 1.1 my($ss, $genome, $role) = @$ent;
69 : olson 1.2 push(@{$genome_roles{$ss}->{$genome}}, $role);
70 :     #push(@{$ss_data{$ss}->{roles}->{$genome}}, $role);
71 : olson 1.1 }
72 : olson 1.2 $sth->finish();
73 : olson 1.1
74 :     for my $ss (keys %ss_data)
75 :     {
76 : olson 1.2 my $rhash = $genome_roles{$ss};
77 : olson 1.1 while (my($genome, $roles) = each %$rhash)
78 :     {
79 :     my $vcode = $ss_data{$ss}->{variant}->{$genome};
80 : olson 1.2 my $key = join("\t", sort @$roles);
81 : olson 1.4 if ($key eq '')
82 :     {
83 :     print STDERR "Null key from $ss $genome $vcode\n";
84 :     next;
85 :     }
86 : olson 1.2 $ss_data{$ss}->{vcodes}->{$key}->{$vcode}++;
87 : olson 1.1 }
88 :     }
89 : olson 1.2 undef %genome_roles;
90 : olson 1.1
91 : olson 1.2 #print STDERR Dumper(\%ss_data);
92 :     #print "dumped\n";
93 : olson 1.1 while (<STDIN>)
94 :     {
95 :     chomp;
96 :     my($funcs_file, $ss_out, $bindings_out) = split(/\t/);
97 :    
98 :     my ($func_fh, $ss_fh, $bindings_fh);
99 :     open($func_fh, "<", $funcs_file) or die "Cannot read $funcs_file: $!";
100 :     open($ss_fh, ">", $ss_out) or die "Cannot write $ss_out: $!";
101 :     open($bindings_fh, ">", $bindings_out) or die "Cannot write $bindings_out: $!";
102 :    
103 : olson 1.3 print STDERR "Process $funcs_file\n" if $verbose;
104 : olson 1.4 print $evlogFH "assigned_functions\t$funcs_file\n";
105 : olson 1.2 process_file($func_fh, $ss_fh, $bindings_fh, $missing_genomes_fh);
106 : olson 1.1
107 :     close($bindings_fh);
108 :     close($ss_fh);
109 :     }
110 :    
111 : olson 1.2 close $missing_genomes_fh if $missing_genomes_fh;
112 :    
113 : olson 1.1 sub process_file
114 :     {
115 : olson 1.2 my($func_fh, $ss_fh, $bindings_fh, $missing_genomes_fh) = @_;
116 : olson 1.1
117 :     #
118 :     # Read the proposed function annotations from the given fh. Split
119 :     # assignments that are joined with ; or @ into separate roles.
120 :     #
121 :     # Store the role => peg bindings in the %bindings hash.
122 :     #
123 :    
124 :     my %bindings;
125 :     my %funcs;
126 : olson 1.2 my $genome;
127 :     my $errs;
128 : olson 1.1 while (defined($_ = <$func_fh>)) ### Keep just the last assignment in the file
129 :     {
130 : olson 1.2 chomp;
131 :     my($peg, $func) = split(/\t/);
132 :     my $this = &FIG::genome_of($peg);
133 :     if (defined($genome))
134 :     {
135 :     if ($this ne $genome)
136 :     {
137 :     warn "Invalid peg $peg: Does not match previous genome $genome\n";
138 :     $errs++;
139 :     next;
140 :     }
141 :     }
142 :     else
143 : olson 1.1 {
144 : olson 1.2 $genome = $this;
145 : olson 1.1 }
146 : olson 1.2
147 :     $funcs{$peg} = $func;
148 :     }
149 :     if ($errs)
150 :     {
151 :     warn "Errors found scanning input\n";
152 :     return;
153 : olson 1.1 }
154 :    
155 :     foreach my $peg (keys(%funcs))
156 :     {
157 :     my $func = $funcs{$peg};
158 :     my @roles = &SeedUtils::roles_of_function($func);
159 :     foreach my $role (@roles)
160 :     {
161 :     $bindings{$role}->{$peg} = 1;
162 :     }
163 :     }
164 :    
165 : olson 1.2 my %ss_vc;
166 :    
167 :     for my $subsys_name (sort grep { $fig->usable_subsystem($_) } keys %ss_data)
168 : olson 1.1 {
169 : olson 1.3 # print STDERR "Process $subsys_name\n";
170 : olson 1.4 print $evlogFH "subsystem\t$subsys_name\n";
171 : olson 1.2
172 : olson 1.1 my $ss_data = $ss_data{$subsys_name};
173 :     my $variant = $ss_data->{variant};
174 : olson 1.2 my $roles = $ss_data->{roles};
175 :     my $vcodes = $ss_data->{vcodes};
176 :    
177 :     #
178 :     # Compute the signature for the genome in question based
179 :     # on the %bindings hash we computed ealier.
180 :     #
181 :    
182 :     my @roles_in_this_genome = grep { exists($bindings{$_}) } @$roles;
183 :    
184 :     my $key = join("\t",sort @roles_in_this_genome);
185 :    
186 :     #
187 :     # Attempt to match this signature with one already present in the subsystem.
188 :     #
189 :    
190 :     my $n;
191 :     my $bestN = 0;
192 :     my $bestK = undef;
193 :     my $matches = $vcodes->{$key};
194 :    
195 :     if ($matches)
196 :     {
197 :     #
198 :     # We found an exact match.
199 :     #
200 :     print $evlogFH join("\t", "exact_match", $subsys_name, join(",", sort keys %$matches), $key), "\n";
201 :     }
202 :     else
203 : olson 1.1 {
204 :     #
205 : olson 1.2 # No exact mactch
206 : olson 1.1 #
207 : olson 1.2 foreach my $key1 (sort keys(%$vcodes))
208 : olson 1.1 {
209 : olson 1.2 #
210 :     # Recall: $key is the key we are trying to match
211 :     # $key1 is the key we are currently examining.
212 :    
213 :     if (&not_minus_1($vcodes->{$key1}) &&
214 :     (length($key) > length($key1)) &&
215 :     ($n = &contains($key,$key1)) &&
216 :     ($n > $bestN))
217 : olson 1.1 {
218 : olson 1.2 $bestN = $n;
219 :     $bestK = $key1;
220 : olson 1.1 }
221 :     }
222 : olson 1.2 if ($bestK)
223 : olson 1.1 {
224 : olson 1.2 $matches = $vcodes->{$bestK};
225 :     print $evlogFH join("\t", "partial_match", $subsys_name, join(",", sort keys %$matches), $key), "\n" if $evlogFH;
226 : olson 1.1 }
227 : olson 1.2 }
228 :    
229 :     if (defined($matches))
230 :     {
231 :     my @vcs = sort { ($vcodes->{$b} <=> $vcodes->{$a}) or ($b cmp $a) } keys(%$matches);
232 :     my $best = $vcs[0];
233 :    
234 :     $ss_vc{$subsys_name} = $best;
235 :     print $evlogFH join("\t", "best_subsys", $subsys_name, $best), "\n";
236 :     print $ss_fh join("\t",($subsys_name,$best)),"\n";
237 :    
238 :     foreach my $role (sort @$roles)
239 : olson 1.1 {
240 : olson 1.2 if (defined(my $bhash = $bindings{$role}))
241 : olson 1.1 {
242 : olson 1.2 foreach my $peg (sort { &FIG::by_fig_id($a, $b) } keys(%{$bhash}))
243 : olson 1.1 {
244 : olson 1.2 print $bindings_fh join("\t",($subsys_name,$role,$peg)),"\n";
245 : olson 1.1 }
246 :     }
247 :     }
248 : olson 1.2 }
249 :     }
250 :     if ($missing_genomes_fh && $fig->is_prokaryotic($genome))
251 :     {
252 :     #
253 :     # Determine if this genome is already present in
254 :     # any of the subsystems we found a variant for it in.
255 :     my @ss = keys %ss_vc;
256 :     if (@ss)
257 :     {
258 :     my $in = join(",", map { "?" } @ss);
259 :     my $res = $fig->db_handle->SQL(qq(SELECT subsystem, variant
260 :     FROM subsystem_genome_variant
261 :     WHERE subsystem IN ($in) AND
262 :     genome = ?), undef,
263 :     @ss, $genome);
264 : olson 1.3 #print "Search for $genome in @ss\n";
265 : olson 1.2 for my $ent (@$res)
266 : olson 1.1 {
267 : olson 1.2 my($ss, $v) = @$ent;
268 :     delete $ss_vc{$ss};
269 : olson 1.1 }
270 : olson 1.2 print $missing_genomes_fh join("\t", $genome, $_, $ss_vc{$_}), "\n" foreach sort keys %ss_vc;
271 : olson 1.1 }
272 :     }
273 :     }
274 :    
275 :     # returns undef iff $k2 is not a subset of $k1. If it is, it returns the size of $k2
276 :     sub contains {
277 :     my($k1,$k2) = @_;
278 :    
279 :     my %s1 = map { $_ => 1 } split(/\t/,$k1);
280 :     my @s2 = split(/\t/,$k2);
281 :     my $i;
282 :     for ($i=0; ($i < @s2) && $s1{$s2[$i]}; $i++) {}
283 :     return ($i < @s2) ? undef : scalar @s2;
284 :     }
285 :    
286 :     sub not_minus_1 {
287 :     my($hits) = @_;
288 :    
289 :     my @poss = keys(%$hits);
290 :     my $i;
291 :     for ($i=0; ($i < @poss) && ($poss[$i] eq "-1"); $i++) {}
292 :     return ($i < @poss);
293 :     }

MCS Webmaster
ViewVC Help
Powered by ViewVC 1.0.3