Parent Directory
|
Revision Log
Revision 1.79 - (view) (download) (as text)
1 : | efrank | 1.1 | package FIG; |
2 : | |||
3 : | use DBrtns; | ||
4 : | use Sim; | ||
5 : | use Blast; | ||
6 : | use FIG_Config; | ||
7 : | overbeek | 1.36 | use tree_utilities; |
8 : | olson | 1.79 | |
9 : | # | ||
10 : | # Conditionally evaluate this in case its prerequisites are not available. | ||
11 : | # | ||
12 : | |||
13 : | our $ClearinghouseOK = eval { | ||
14 : | require Clearinghouse; | ||
15 : | }; | ||
16 : | efrank | 1.1 | |
17 : | olson | 1.10 | use IO::Socket; |
18 : | |||
19 : | efrank | 1.1 | use FileHandle; |
20 : | |||
21 : | use Carp; | ||
22 : | use Data::Dumper; | ||
23 : | overbeek | 1.25 | use Time::Local; |
24 : | efrank | 1.1 | |
25 : | use strict; | ||
26 : | use Fcntl qw/:flock/; # import LOCK_* constants | ||
27 : | |||
28 : | sub new { | ||
29 : | my($class) = @_; | ||
30 : | |||
31 : | my $rdbH = new DBrtns; | ||
32 : | bless { | ||
33 : | _dbf => $rdbH, | ||
34 : | }, $class; | ||
35 : | } | ||
36 : | |||
37 : | sub DESTROY { | ||
38 : | my($self) = @_; | ||
39 : | my($rdbH); | ||
40 : | |||
41 : | if ($rdbH = $self->db_handle) | ||
42 : | { | ||
43 : | $rdbH->DESTROY; | ||
44 : | } | ||
45 : | } | ||
46 : | |||
47 : | overbeek | 1.7 | sub delete_genomes { |
48 : | my($self,$genomes) = @_; | ||
49 : | my $tmpD = "$FIG_Config::temp/tmp.deleted.$$"; | ||
50 : | my $tmp_Data = "$FIG_Config::temp/Data.$$"; | ||
51 : | |||
52 : | my %to_del = map { $_ => 1 } @$genomes; | ||
53 : | open(TMP,">$tmpD") || die "could not open $tmpD"; | ||
54 : | |||
55 : | my $genome; | ||
56 : | foreach $genome ($self->genomes) | ||
57 : | { | ||
58 : | if (! $to_del{$genome}) | ||
59 : | { | ||
60 : | print TMP "$genome\n"; | ||
61 : | } | ||
62 : | } | ||
63 : | close(TMP); | ||
64 : | |||
65 : | &run("extract_genomes $tmpD $FIG_Config::data $tmp_Data"); | ||
66 : | overbeek | 1.47 | |
67 : | # &run("mv $FIG_Config::data $FIG_Config::data.deleted; mv $tmp_Data $FIG_Config::data; fig load_all; rm -rf $FIG_Config::data.deleted"); | ||
68 : | |||
69 : | &run("mv $FIG_Config::data $FIG_Config::data.deleted"); | ||
70 : | &run("mv $tmp_Data $FIG_Config::data"); | ||
71 : | &run("fig load_all"); | ||
72 : | &run("rm -rf $FIG_Config::data.deleted"); | ||
73 : | overbeek | 1.7 | } |
74 : | |||
75 : | efrank | 1.1 | sub add_genome { |
76 : | my($self,$genomeF) = @_; | ||
77 : | |||
78 : | my $rc = 0; | ||
79 : | overbeek | 1.7 | if (($genomeF =~ /((.*\/)?(\d+\.\d+))$/) && (! -d "$FIG_Config::organisms/$3")) |
80 : | efrank | 1.1 | { |
81 : | my $genome = $3; | ||
82 : | my @errors = `$FIG_Config::bin/verify_genome_directory $genomeF`; | ||
83 : | if (@errors == 0) | ||
84 : | { | ||
85 : | disz | 1.60 | &run("cp -r $genomeF $FIG_Config::organisms; chmod -R 2777 $FIG_Config::organisms/$genome"); |
86 : | chmod 02777, "$FIG_Config::organisms/$genomeF"; | ||
87 : | overbeek | 1.18 | &run("index_contigs $genome"); |
88 : | &run("compute_genome_counts $genome"); | ||
89 : | efrank | 1.1 | &run("load_features $genome"); |
90 : | $rc = 1; | ||
91 : | if (-s "$FIG_Config::organisms/$genome/Features/peg/fasta") | ||
92 : | { | ||
93 : | &run("index_translations $genome"); | ||
94 : | my @tmp = `cut -f1 $FIG_Config::organisms/$genome/Features/peg/tbl`; | ||
95 : | golsen | 1.44 | chomp @tmp; |
96 : | overbeek | 1.7 | &run("cat $FIG_Config::organisms/$genome/Features/peg/fasta >> $FIG_Config::data/Global/nr"); |
97 : | &make_similarities(\@tmp); | ||
98 : | efrank | 1.1 | } |
99 : | if ((-s "$FIG_Config::organisms/$genome/assigned_functions") || | ||
100 : | (-d "$FIG_Config::organisms/$genome/UserModels")) | ||
101 : | { | ||
102 : | &run("add_assertions_of_function $genome"); | ||
103 : | } | ||
104 : | } | ||
105 : | } | ||
106 : | return $rc; | ||
107 : | } | ||
108 : | |||
109 : | sub make_similarities { | ||
110 : | my($fids) = @_; | ||
111 : | my $fid; | ||
112 : | |||
113 : | open(TMP,">>$FIG_Config::global/queued_similarities") | ||
114 : | || die "could not open $FIG_Config::global/queued_similarities"; | ||
115 : | foreach $fid (@$fids) | ||
116 : | { | ||
117 : | print TMP "$fid\n"; | ||
118 : | } | ||
119 : | close(TMP); | ||
120 : | olson | 1.10 | } |
121 : | |||
122 : | sub get_local_hostname { | ||
123 : | olson | 1.52 | |
124 : | # | ||
125 : | # See if there is a FIGdisk/config/hostname file. If there | ||
126 : | # is, force the hostname to be that. | ||
127 : | # | ||
128 : | |||
129 : | my $hostfile = "$FIG_Config::fig_disk/config/hostname"; | ||
130 : | if (-f $hostfile) | ||
131 : | { | ||
132 : | my $fh; | ||
133 : | if (open($fh, $hostfile)) | ||
134 : | { | ||
135 : | my $hostname = <$fh>; | ||
136 : | chomp($hostname); | ||
137 : | return $hostname; | ||
138 : | } | ||
139 : | } | ||
140 : | |||
141 : | olson | 1.10 | # |
142 : | # First check to see if we our hostname is correct. | ||
143 : | # | ||
144 : | # Map it to an IP address, and try to bind to that ip. | ||
145 : | # | ||
146 : | |||
147 : | my $tcp = getprotobyname('tcp'); | ||
148 : | |||
149 : | my $hostname = `hostname`; | ||
150 : | golsen | 1.44 | chomp($hostname); |
151 : | olson | 1.10 | |
152 : | my @hostent = gethostbyname($hostname); | ||
153 : | |||
154 : | if (@hostent > 0) | ||
155 : | { | ||
156 : | my $sock; | ||
157 : | my $ip = $hostent[4]; | ||
158 : | |||
159 : | socket($sock, PF_INET, SOCK_STREAM, $tcp); | ||
160 : | if (bind($sock, sockaddr_in(0, $ip))) | ||
161 : | { | ||
162 : | # | ||
163 : | # It worked. Reverse-map back to a hopefully fqdn. | ||
164 : | # | ||
165 : | |||
166 : | my @rev = gethostbyaddr($ip, AF_INET); | ||
167 : | if (@rev > 0) | ||
168 : | { | ||
169 : | olson | 1.28 | my $host = $rev[0]; |
170 : | # | ||
171 : | # Check to see if we have a FQDN. | ||
172 : | # | ||
173 : | |||
174 : | if ($host =~ /\./) | ||
175 : | { | ||
176 : | # | ||
177 : | # Good. | ||
178 : | # | ||
179 : | return $host; | ||
180 : | } | ||
181 : | else | ||
182 : | { | ||
183 : | # | ||
184 : | # We didn't get a fqdn; bail and return the IP address. | ||
185 : | # | ||
186 : | return get_hostname_by_adapter() | ||
187 : | } | ||
188 : | olson | 1.10 | } |
189 : | else | ||
190 : | { | ||
191 : | return inet_ntoa($ip); | ||
192 : | } | ||
193 : | } | ||
194 : | else | ||
195 : | { | ||
196 : | # | ||
197 : | # Our hostname must be wrong; we can't bind to the IP | ||
198 : | # address it maps to. | ||
199 : | # Return the name associated with the adapter. | ||
200 : | # | ||
201 : | return get_hostname_by_adapter() | ||
202 : | } | ||
203 : | } | ||
204 : | else | ||
205 : | { | ||
206 : | # | ||
207 : | # Our hostname isn't known to DNS. This isn't good. | ||
208 : | # Return the name associated with the adapter. | ||
209 : | # | ||
210 : | return get_hostname_by_adapter() | ||
211 : | } | ||
212 : | } | ||
213 : | |||
214 : | sub get_hostname_by_adapter { | ||
215 : | # | ||
216 : | # Attempt to determine our local hostname based on the | ||
217 : | # network environment. | ||
218 : | # | ||
219 : | # This implementation reads the routing table for the default route. | ||
220 : | # We then look at the interface config for the interface that holds the default. | ||
221 : | # | ||
222 : | # | ||
223 : | # Linux routing table: | ||
224 : | # [olson@yips 0.0.0]$ netstat -rn | ||
225 : | # Kernel IP routing table | ||
226 : | # Destination Gateway Genmask Flags MSS Window irtt Iface | ||
227 : | # 140.221.34.32 0.0.0.0 255.255.255.224 U 0 0 0 eth0 | ||
228 : | # 169.254.0.0 0.0.0.0 255.255.0.0 U 0 0 0 eth0 | ||
229 : | # 127.0.0.0 0.0.0.0 255.0.0.0 U 0 0 0 lo | ||
230 : | # 0.0.0.0 140.221.34.61 0.0.0.0 UG 0 0 0 eth0 | ||
231 : | # | ||
232 : | # Mac routing table: | ||
233 : | # | ||
234 : | # bash-2.05a$ netstat -rn | ||
235 : | # Routing tables | ||
236 : | # | ||
237 : | # Internet: | ||
238 : | # Destination Gateway Flags Refs Use Netif Expire | ||
239 : | # default 140.221.11.253 UGSc 12 120 en0 | ||
240 : | # 127.0.0.1 127.0.0.1 UH 16 8415486 lo0 | ||
241 : | # 140.221.8/22 link#4 UCS 12 0 en0 | ||
242 : | # 140.221.8.78 0:6:5b:f:51:c4 UHLW 0 183 en0 408 | ||
243 : | # 140.221.8.191 0:3:93:84:ab:e8 UHLW 0 92 en0 622 | ||
244 : | # 140.221.8.198 0:e0:98:8e:36:e2 UHLW 0 5 en0 691 | ||
245 : | # 140.221.9.6 0:6:5b:f:51:d6 UHLW 1 63 en0 1197 | ||
246 : | # 140.221.10.135 0:d0:59:34:26:34 UHLW 2 2134 en0 1199 | ||
247 : | # 140.221.10.152 0:30:1b:b0:ec:dd UHLW 1 137 en0 1122 | ||
248 : | # 140.221.10.153 127.0.0.1 UHS 0 0 lo0 | ||
249 : | # 140.221.11.37 0:9:6b:53:4e:4b UHLW 1 624 en0 1136 | ||
250 : | # 140.221.11.103 0:30:48:22:59:e6 UHLW 3 973 en0 1016 | ||
251 : | # 140.221.11.224 0:a:95:6f:7:10 UHLW 1 1 en0 605 | ||
252 : | # 140.221.11.237 0:1:30:b8:80:c0 UHLW 0 0 en0 1158 | ||
253 : | # 140.221.11.250 0:1:30:3:1:0 UHLW 0 0 en0 1141 | ||
254 : | # 140.221.11.253 0:d0:3:e:70:a UHLW 13 0 en0 1199 | ||
255 : | # 169.254 link#4 UCS 0 0 en0 | ||
256 : | # | ||
257 : | # Internet6: | ||
258 : | # Destination Gateway Flags Netif Expire | ||
259 : | # UH lo0 | ||
260 : | # fe80::%lo0/64 Uc lo0 | ||
261 : | # link#1 UHL lo0 | ||
262 : | # fe80::%en0/64 link#4 UC en0 | ||
263 : | # 0:a:95:a8:26:68 UHL lo0 | ||
264 : | # ff01::/32 U lo0 | ||
265 : | # ff02::%lo0/32 UC lo0 | ||
266 : | # ff02::%en0/32 link#4 UC en0 | ||
267 : | |||
268 : | my($fh); | ||
269 : | |||
270 : | if (!open($fh, "netstat -rn |")) | ||
271 : | { | ||
272 : | warn "Cannot run netstat to determine local IP address\n"; | ||
273 : | return "localhost"; | ||
274 : | } | ||
275 : | |||
276 : | my $interface_name; | ||
277 : | |||
278 : | while (<$fh>) | ||
279 : | { | ||
280 : | my @cols = split(); | ||
281 : | |||
282 : | if ($cols[0] eq "default" || $cols[0] eq "0.0.0.0") | ||
283 : | { | ||
284 : | $interface_name = $cols[$#cols]; | ||
285 : | } | ||
286 : | } | ||
287 : | close($fh); | ||
288 : | |||
289 : | olson | 1.11 | # print "Default route on $interface_name\n"; |
290 : | olson | 1.10 | |
291 : | # | ||
292 : | # Find ifconfig. | ||
293 : | # | ||
294 : | |||
295 : | my $ifconfig; | ||
296 : | |||
297 : | for my $dir ((split(":", $ENV{PATH}), "/sbin", "/usr/sbin")) | ||
298 : | { | ||
299 : | if (-x "$dir/ifconfig") | ||
300 : | { | ||
301 : | $ifconfig = "$dir/ifconfig"; | ||
302 : | last; | ||
303 : | } | ||
304 : | } | ||
305 : | |||
306 : | if ($ifconfig eq "") | ||
307 : | { | ||
308 : | warn "Ifconfig not found\n"; | ||
309 : | return "localhost"; | ||
310 : | } | ||
311 : | olson | 1.11 | # print "Foudn $ifconfig\n"; |
312 : | olson | 1.10 | |
313 : | if (!open($fh, "$ifconfig $interface_name |")) | ||
314 : | { | ||
315 : | warn "Could not run $ifconfig: $!\n"; | ||
316 : | return "localhost"; | ||
317 : | } | ||
318 : | |||
319 : | my $ip; | ||
320 : | while (<$fh>) | ||
321 : | { | ||
322 : | # | ||
323 : | # Mac: | ||
324 : | # inet 140.221.10.153 netmask 0xfffffc00 broadcast 140.221.11.255 | ||
325 : | # Linux: | ||
326 : | # inet addr:140.221.34.37 Bcast:140.221.34.63 Mask:255.255.255.224 | ||
327 : | # | ||
328 : | |||
329 : | chomp; | ||
330 : | s/^\s*//; | ||
331 : | |||
332 : | olson | 1.11 | # print "Have '$_'\n"; |
333 : | olson | 1.10 | if (/inet\s+addr:(\d+\.\d+\.\d+\.\d+)\s+/) |
334 : | { | ||
335 : | # | ||
336 : | # Linux hit. | ||
337 : | # | ||
338 : | $ip = $1; | ||
339 : | olson | 1.11 | # print "Got linux $ip\n"; |
340 : | olson | 1.10 | last; |
341 : | } | ||
342 : | elsif (/inet\s+(\d+\.\d+\.\d+\.\d+)\s+/) | ||
343 : | { | ||
344 : | # | ||
345 : | # Mac hit. | ||
346 : | # | ||
347 : | $ip = $1; | ||
348 : | olson | 1.11 | # print "Got mac $ip\n"; |
349 : | olson | 1.10 | last; |
350 : | } | ||
351 : | } | ||
352 : | close($fh); | ||
353 : | |||
354 : | if ($ip eq "") | ||
355 : | { | ||
356 : | warn "Didn't find an IP\n"; | ||
357 : | return "localhost"; | ||
358 : | } | ||
359 : | |||
360 : | return $ip; | ||
361 : | efrank | 1.1 | } |
362 : | |||
363 : | olson | 1.38 | sub get_seed_id { |
364 : | # | ||
365 : | # Retrieve the seed identifer from FIGdisk/config/seed_id. | ||
366 : | # | ||
367 : | # If it's not there, create one, and make it readonly. | ||
368 : | # | ||
369 : | |||
370 : | my $id; | ||
371 : | my $id_file = "$FIG_Config::fig_disk/config/seed_id"; | ||
372 : | if (! -f $id_file) | ||
373 : | { | ||
374 : | my $newid = `uuidgen`; | ||
375 : | if (!$newid) | ||
376 : | { | ||
377 : | die "Cannot run uuidgen: $!"; | ||
378 : | } | ||
379 : | |||
380 : | chomp($newid); | ||
381 : | my $fh = new FileHandle(">$id_file"); | ||
382 : | if (!$fh) | ||
383 : | { | ||
384 : | die "error creating $id_file: $!"; | ||
385 : | } | ||
386 : | print $fh "$newid\n"; | ||
387 : | $fh->close(); | ||
388 : | chmod(0444, $id_file); | ||
389 : | } | ||
390 : | my $fh = new FileHandle("<$id_file"); | ||
391 : | $id = <$fh>; | ||
392 : | chomp($id); | ||
393 : | return $id; | ||
394 : | } | ||
395 : | |||
396 : | efrank | 1.1 | sub cgi_url { |
397 : | return &plug_url($FIG_Config::cgi_url); | ||
398 : | } | ||
399 : | |||
400 : | sub temp_url { | ||
401 : | return &plug_url($FIG_Config::temp_url); | ||
402 : | } | ||
403 : | |||
404 : | sub plug_url { | ||
405 : | my($url) = @_; | ||
406 : | |||
407 : | golsen | 1.44 | my $name; |
408 : | |||
409 : | # Revised by GJO | ||
410 : | # First try to get url from the current http request | ||
411 : | |||
412 : | if ( defined( $ENV{ 'HTTP_HOST' } ) # This is where $cgi->url gets its value | ||
413 : | && ( $name = $ENV{ 'HTTP_HOST' } ) | ||
414 : | && ( $url =~ s~^http://[^/]*~http://$name~ ) # ~ is delimiter | ||
415 : | ) {} | ||
416 : | |||
417 : | # Otherwise resort to alternative sources | ||
418 : | |||
419 : | elsif ( ( $name = &get_local_hostname ) | ||
420 : | && ( $url =~ s~^http://[^/]*~http://$name~ ) # ~ is delimiter | ||
421 : | ) {} | ||
422 : | |||
423 : | efrank | 1.1 | return $url; |
424 : | } | ||
425 : | |||
426 : | =pod | ||
427 : | |||
428 : | =head1 hiding/caching in a FIG object | ||
429 : | |||
430 : | We save the DB handle, cache taxonomies, and put a few other odds and ends in the | ||
431 : | FIG object. We expect users to invoke these services using the object $fig constructed | ||
432 : | using: | ||
433 : | |||
434 : | use FIG; | ||
435 : | my $fig = new FIG; | ||
436 : | |||
437 : | $fig is then used as the basic mechanism for accessing FIG services. It is, of course, | ||
438 : | just a hash that is used to retain/cache data. The most commonly accessed item is the | ||
439 : | DB filehandle, which is accessed via $self->db_handle. | ||
440 : | |||
441 : | We cache genus/species expansions, taxonomies, distances (very crudely estimated) estimated | ||
442 : | between genomes, and a variety of other things. I am not sure that using cached/2 was a | ||
443 : | good idea, but I did it. | ||
444 : | |||
445 : | =cut | ||
446 : | |||
447 : | sub db_handle { | ||
448 : | my($self) = @_; | ||
449 : | |||
450 : | return $self->{_dbf}; | ||
451 : | } | ||
452 : | |||
453 : | sub cached { | ||
454 : | my($self,$what) = @_; | ||
455 : | |||
456 : | my $x = $self->{$what}; | ||
457 : | if (! $x) | ||
458 : | { | ||
459 : | $x = $self->{$what} = {}; | ||
460 : | } | ||
461 : | return $x; | ||
462 : | } | ||
463 : | |||
464 : | ################ Basic Routines [ existed since WIT ] ########################## | ||
465 : | |||
466 : | |||
467 : | =pod | ||
468 : | |||
469 : | =head1 min | ||
470 : | |||
471 : | usage: $n = &FIG::min(@x) | ||
472 : | |||
473 : | Assumes @x contains numeric values. Returns the minimum of the values. | ||
474 : | |||
475 : | =cut | ||
476 : | |||
477 : | sub min { | ||
478 : | my(@x) = @_; | ||
479 : | my($min,$i); | ||
480 : | |||
481 : | (@x > 0) || return undef; | ||
482 : | $min = $x[0]; | ||
483 : | for ($i=1; ($i < @x); $i++) | ||
484 : | { | ||
485 : | $min = ($min > $x[$i]) ? $x[$i] : $min; | ||
486 : | } | ||
487 : | return $min; | ||
488 : | } | ||
489 : | |||
490 : | =pod | ||
491 : | |||
492 : | =head1 max | ||
493 : | |||
494 : | usage: $n = &FIG::max(@x) | ||
495 : | |||
496 : | Assumes @x contains numeric values. Returns the maximum of the values. | ||
497 : | |||
498 : | =cut | ||
499 : | |||
500 : | sub max { | ||
501 : | my(@x) = @_; | ||
502 : | my($max,$i); | ||
503 : | |||
504 : | (@x > 0) || return undef; | ||
505 : | $max = $x[0]; | ||
506 : | for ($i=1; ($i < @x); $i++) | ||
507 : | { | ||
508 : | $max = ($max < $x[$i]) ? $x[$i] : $max; | ||
509 : | } | ||
510 : | return $max; | ||
511 : | } | ||
512 : | |||
513 : | =pod | ||
514 : | |||
515 : | =head1 between | ||
516 : | |||
517 : | usage: &FIG::between($x,$y,$z) | ||
518 : | |||
519 : | Returns true iff $y is between $x and $z. | ||
520 : | |||
521 : | =cut | ||
522 : | |||
523 : | sub between { | ||
524 : | my($x,$y,$z) = @_; | ||
525 : | |||
526 : | if ($x < $z) | ||
527 : | { | ||
528 : | return (($x <= $y) && ($y <= $z)); | ||
529 : | } | ||
530 : | else | ||
531 : | { | ||
532 : | return (($x >= $y) && ($y >= $z)); | ||
533 : | } | ||
534 : | } | ||
535 : | |||
536 : | =pod | ||
537 : | |||
538 : | =head1 standard_genetic_code | ||
539 : | |||
540 : | usage: $code = &FIG::standard_genetic_code() | ||
541 : | |||
542 : | Routines like "translate" can take a "genetic code" as an argument. I implemented such | ||
543 : | codes using hashes that assumed uppercase DNA triplets as keys. | ||
544 : | |||
545 : | =cut | ||
546 : | |||
547 : | sub standard_genetic_code { | ||
548 : | |||
549 : | my $code = {}; | ||
550 : | |||
551 : | $code->{"AAA"} = "K"; | ||
552 : | $code->{"AAC"} = "N"; | ||
553 : | $code->{"AAG"} = "K"; | ||
554 : | $code->{"AAT"} = "N"; | ||
555 : | $code->{"ACA"} = "T"; | ||
556 : | $code->{"ACC"} = "T"; | ||
557 : | $code->{"ACG"} = "T"; | ||
558 : | $code->{"ACT"} = "T"; | ||
559 : | $code->{"AGA"} = "R"; | ||
560 : | $code->{"AGC"} = "S"; | ||
561 : | $code->{"AGG"} = "R"; | ||
562 : | $code->{"AGT"} = "S"; | ||
563 : | $code->{"ATA"} = "I"; | ||
564 : | $code->{"ATC"} = "I"; | ||
565 : | $code->{"ATG"} = "M"; | ||
566 : | $code->{"ATT"} = "I"; | ||
567 : | $code->{"CAA"} = "Q"; | ||
568 : | $code->{"CAC"} = "H"; | ||
569 : | $code->{"CAG"} = "Q"; | ||
570 : | $code->{"CAT"} = "H"; | ||
571 : | $code->{"CCA"} = "P"; | ||
572 : | $code->{"CCC"} = "P"; | ||
573 : | $code->{"CCG"} = "P"; | ||
574 : | $code->{"CCT"} = "P"; | ||
575 : | $code->{"CGA"} = "R"; | ||
576 : | $code->{"CGC"} = "R"; | ||
577 : | $code->{"CGG"} = "R"; | ||
578 : | $code->{"CGT"} = "R"; | ||
579 : | $code->{"CTA"} = "L"; | ||
580 : | $code->{"CTC"} = "L"; | ||
581 : | $code->{"CTG"} = "L"; | ||
582 : | $code->{"CTT"} = "L"; | ||
583 : | $code->{"GAA"} = "E"; | ||
584 : | $code->{"GAC"} = "D"; | ||
585 : | $code->{"GAG"} = "E"; | ||
586 : | $code->{"GAT"} = "D"; | ||
587 : | $code->{"GCA"} = "A"; | ||
588 : | $code->{"GCC"} = "A"; | ||
589 : | $code->{"GCG"} = "A"; | ||
590 : | $code->{"GCT"} = "A"; | ||
591 : | $code->{"GGA"} = "G"; | ||
592 : | $code->{"GGC"} = "G"; | ||
593 : | $code->{"GGG"} = "G"; | ||
594 : | $code->{"GGT"} = "G"; | ||
595 : | $code->{"GTA"} = "V"; | ||
596 : | $code->{"GTC"} = "V"; | ||
597 : | $code->{"GTG"} = "V"; | ||
598 : | $code->{"GTT"} = "V"; | ||
599 : | $code->{"TAA"} = "*"; | ||
600 : | $code->{"TAC"} = "Y"; | ||
601 : | $code->{"TAG"} = "*"; | ||
602 : | $code->{"TAT"} = "Y"; | ||
603 : | $code->{"TCA"} = "S"; | ||
604 : | $code->{"TCC"} = "S"; | ||
605 : | $code->{"TCG"} = "S"; | ||
606 : | $code->{"TCT"} = "S"; | ||
607 : | $code->{"TGA"} = "*"; | ||
608 : | $code->{"TGC"} = "C"; | ||
609 : | $code->{"TGG"} = "W"; | ||
610 : | $code->{"TGT"} = "C"; | ||
611 : | $code->{"TTA"} = "L"; | ||
612 : | $code->{"TTC"} = "F"; | ||
613 : | $code->{"TTG"} = "L"; | ||
614 : | $code->{"TTT"} = "F"; | ||
615 : | |||
616 : | return $code; | ||
617 : | } | ||
618 : | |||
619 : | =pod | ||
620 : | |||
621 : | =head1 translate | ||
622 : | |||
623 : | usage: $aa_seq = &FIG::translate($dna_seq,$code,$fix_start); | ||
624 : | |||
625 : | If $code is undefined, I use the standard genetic code. If $fix_start is true, I | ||
626 : | will translate initial TTG or GTG to 'M'. | ||
627 : | |||
628 : | =cut | ||
629 : | |||
630 : | sub translate { | ||
631 : | my( $dna,$code,$start) = @_; | ||
632 : | my( $i,$j,$ln ); | ||
633 : | my( $x,$y ); | ||
634 : | my( $prot ); | ||
635 : | |||
636 : | if (! defined($code)) | ||
637 : | { | ||
638 : | $code = &FIG::standard_genetic_code; | ||
639 : | } | ||
640 : | $ln = length($dna); | ||
641 : | $prot = "X" x ($ln/3); | ||
642 : | $dna =~ tr/a-z/A-Z/; | ||
643 : | |||
644 : | for ($i=0,$j=0; ($i < ($ln-2)); $i += 3,$j++) | ||
645 : | { | ||
646 : | $x = substr($dna,$i,3); | ||
647 : | if ($y = $code->{$x}) | ||
648 : | { | ||
649 : | substr($prot,$j,1) = $y; | ||
650 : | } | ||
651 : | } | ||
652 : | |||
653 : | if (($start) && ($ln >= 3) && (substr($dna,0,3) =~ /^[GT]TG$/)) | ||
654 : | { | ||
655 : | substr($prot,0,1) = 'M'; | ||
656 : | } | ||
657 : | return $prot; | ||
658 : | } | ||
659 : | |||
660 : | =pod | ||
661 : | |||
662 : | =head1 reverse_comp and rev_comp | ||
663 : | |||
664 : | usage: $dnaR = &FIG::reverse_comp($dna) or | ||
665 : | $dnaRP = &FIG::rev_comp($seqP) | ||
666 : | |||
667 : | In WIT, we implemented reverse complement passing a pointer to a sequence and returning | ||
668 : | a pointer to a sequence. In most cases the pointers are a pain (although in a few they | ||
669 : | are just what is needed). Hence, I kept both versions of the function to allow you | ||
670 : | to use whichever you like. Use rev_comp only for long strings where passing pointers is a | ||
671 : | reasonable effeciency issue. | ||
672 : | |||
673 : | =cut | ||
674 : | |||
675 : | sub reverse_comp { | ||
676 : | my($seq) = @_; | ||
677 : | |||
678 : | return ${&rev_comp(\$seq)}; | ||
679 : | } | ||
680 : | |||
681 : | sub rev_comp { | ||
682 : | my( $seqP ) = @_; | ||
683 : | my( $rev ); | ||
684 : | |||
685 : | $rev = reverse( $$seqP ); | ||
686 : | $rev =~ tr/a-z/A-Z/; | ||
687 : | $rev =~ tr/ACGTUMRWSYKBDHV/TGCAAKYWSRMVHDB/; | ||
688 : | return \$rev; | ||
689 : | } | ||
690 : | |||
691 : | =pod | ||
692 : | |||
693 : | =head1 verify_dir | ||
694 : | |||
695 : | usage: &FIG::verify_dir($dir) | ||
696 : | |||
697 : | Makes sure that $dir exists. If it has to create it, it sets permissions to 0777. | ||
698 : | |||
699 : | =cut | ||
700 : | |||
701 : | sub verify_dir { | ||
702 : | my($dir) = @_; | ||
703 : | |||
704 : | if (-d $dir) { return } | ||
705 : | if ($dir =~ /^(.*)\/[^\/]+$/) | ||
706 : | { | ||
707 : | &verify_dir($1); | ||
708 : | } | ||
709 : | mkdir($dir,0777) || die "could not make $dir"; | ||
710 : | disz | 1.60 | chmod 02777,$dir; |
711 : | efrank | 1.1 | } |
712 : | |||
713 : | =pod | ||
714 : | |||
715 : | =head1 run | ||
716 : | |||
717 : | usage: &FIG::run($cmd) | ||
718 : | |||
719 : | Runs $cmd and fails (with trace) if the command fails. | ||
720 : | |||
721 : | =cut | ||
722 : | |||
723 : | mkubal | 1.53 | |
724 : | efrank | 1.1 | sub run { |
725 : | my($cmd) = @_; | ||
726 : | |||
727 : | golsen | 1.44 | # my @tmp = `date`; chomp @tmp; print STDERR "$tmp[0]: running $cmd\n"; |
728 : | efrank | 1.1 | (system($cmd) == 0) || confess "FAILED: $cmd"; |
729 : | } | ||
730 : | |||
731 : | gdpusch | 1.45 | |
732 : | |||
733 : | =pod | ||
734 : | |||
735 : | =head1 read_fasta_record(\*FILEHANDLE) | ||
736 : | |||
737 : | Usage: ( $seq_id, $sequence, $comment ) = &read_fasta_record(\*FILEHANDLE); | ||
738 : | |||
739 : | Function: Reads a FASTA-formatted sequence file one record at a time. | ||
740 : | The input filehandle defaults to STDIN if not specified. | ||
741 : | Returns a sequence ID, a pointer to the sequence, and an optional | ||
742 : | record comment (NOTE: Record comments are deprecated, as some tools | ||
743 : | such as BLAST do not handle them gracefully). Returns an empty list | ||
744 : | if attempting to read a record results in an undefined value | ||
745 : | (e.g., due to reaching the EOF). | ||
746 : | |||
747 : | Author: Gordon D. Pusch | ||
748 : | |||
749 : | Date: 2004-Feb-18 | ||
750 : | |||
751 : | =cut | ||
752 : | |||
753 : | sub read_fasta_record | ||
754 : | { | ||
755 : | my ($file_handle) = @_; | ||
756 : | gdpusch | 1.46 | my ( $old_end_of_record, $fasta_record, @lines, $head, $sequence, $seq_id, $comment, @parsed_fasta_record ); |
757 : | gdpusch | 1.45 | |
758 : | if (not defined($file_handle)) { $file_handle = \*STDIN; } | ||
759 : | |||
760 : | $old_end_of_record = $/; | ||
761 : | $/ = "\n>"; | ||
762 : | |||
763 : | if (defined($fasta_record = <$file_handle>)) | ||
764 : | { | ||
765 : | chomp $fasta_record; | ||
766 : | @lines = split( /\n/, $fasta_record ); | ||
767 : | $head = shift @lines; | ||
768 : | $head =~ s/^>?//; | ||
769 : | $head =~ m/^(\S+)/; | ||
770 : | $seq_id = $1; | ||
771 : | |||
772 : | if ($head =~ m/^\S+\s+(.*)$/) { $comment = $1; } else { $comment = ""; } | ||
773 : | |||
774 : | $sequence = join( "", @lines ); | ||
775 : | |||
776 : | @parsed_fasta_record = ( $seq_id, \$sequence, $comment ); | ||
777 : | } | ||
778 : | else | ||
779 : | { | ||
780 : | @parsed_fasta_record = (); | ||
781 : | } | ||
782 : | |||
783 : | $/ = $old_end_of_record; | ||
784 : | |||
785 : | return @parsed_fasta_record; | ||
786 : | } | ||
787 : | |||
788 : | |||
789 : | efrank | 1.1 | =pod |
790 : | |||
791 : | =head1 display_id_and_seq | ||
792 : | |||
793 : | usage: &FIG::display_id_and_seq($id_and_comment,$seqP,$fh) | ||
794 : | |||
795 : | This command has always been used to put out fasta sequences. Note that it | ||
796 : | takes a pointer to the sequence. $fh is optional and defalts to STDOUT. | ||
797 : | |||
798 : | =cut | ||
799 : | |||
800 : | mkubal | 1.53 | |
801 : | efrank | 1.1 | sub display_id_and_seq { |
802 : | my( $id, $seq, $fh ) = @_; | ||
803 : | |||
804 : | if (! defined($fh) ) { $fh = \*STDOUT; } | ||
805 : | |||
806 : | print $fh ">$id\n"; | ||
807 : | &display_seq($seq, $fh); | ||
808 : | } | ||
809 : | |||
810 : | sub display_seq { | ||
811 : | my ( $seq, $fh ) = @_; | ||
812 : | my ( $i, $n, $ln ); | ||
813 : | |||
814 : | if (! defined($fh) ) { $fh = \*STDOUT; } | ||
815 : | |||
816 : | $n = length($$seq); | ||
817 : | # confess "zero-length sequence ???" if ( (! defined($n)) || ($n == 0) ); | ||
818 : | for ($i=0; ($i < $n); $i += 60) | ||
819 : | { | ||
820 : | if (($i + 60) <= $n) | ||
821 : | { | ||
822 : | $ln = substr($$seq,$i,60); | ||
823 : | } | ||
824 : | else | ||
825 : | { | ||
826 : | $ln = substr($$seq,$i,($n-$i)); | ||
827 : | } | ||
828 : | print $fh "$ln\n"; | ||
829 : | } | ||
830 : | } | ||
831 : | |||
832 : | ########## I commented the pods on the following routines out, since they should not | ||
833 : | ########## be part of the SOAP/WSTL interface | ||
834 : | #=pod | ||
835 : | # | ||
836 : | #=head1 file2N | ||
837 : | # | ||
838 : | #usage: $n = $fig->file2N($file) | ||
839 : | # | ||
840 : | #In some of the databases I need to store filenames, which can waste a lot of | ||
841 : | #space. Hence, I maintain a database for converting filenames to/from integers. | ||
842 : | # | ||
843 : | #=cut | ||
844 : | # | ||
845 : | sub file2N { | ||
846 : | my($self,$file) = @_; | ||
847 : | my($relational_db_response); | ||
848 : | |||
849 : | my $rdbH = $self->db_handle; | ||
850 : | |||
851 : | if (($relational_db_response = $rdbH->SQL("SELECT fileno FROM file_table WHERE ( file = \'$file\')")) && | ||
852 : | (@$relational_db_response == 1)) | ||
853 : | { | ||
854 : | return $relational_db_response->[0]->[0]; | ||
855 : | } | ||
856 : | elsif (($relational_db_response = $rdbH->SQL("SELECT MAX(fileno) FROM file_table ")) && (@$relational_db_response == 1) && ($relational_db_response->[0]->[0])) | ||
857 : | { | ||
858 : | my $fileno = $relational_db_response->[0]->[0] + 1; | ||
859 : | if ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', $fileno )")) | ||
860 : | { | ||
861 : | return $fileno; | ||
862 : | } | ||
863 : | } | ||
864 : | elsif ($rdbH->SQL("INSERT INTO file_table ( file, fileno ) VALUES ( \'$file\', 1 )")) | ||
865 : | { | ||
866 : | return 1; | ||
867 : | } | ||
868 : | return undef; | ||
869 : | } | ||
870 : | |||
871 : | #=pod | ||
872 : | # | ||
873 : | #=head1 N2file | ||
874 : | # | ||
875 : | #usage: $filename = $fig->N2file($n) | ||
876 : | # | ||
877 : | #In some of the databases I need to store filenames, which can waste a lot of | ||
878 : | #space. Hence, I maintain a database for converting filenames to/from integers. | ||
879 : | # | ||
880 : | #=cut | ||
881 : | # | ||
882 : | sub N2file { | ||
883 : | my($self,$fileno) = @_; | ||
884 : | my($relational_db_response); | ||
885 : | |||
886 : | my $rdbH = $self->db_handle; | ||
887 : | |||
888 : | if (($relational_db_response = $rdbH->SQL("SELECT file FROM file_table WHERE ( fileno = $fileno )")) && | ||
889 : | (@$relational_db_response == 1)) | ||
890 : | { | ||
891 : | return $relational_db_response->[0]->[0]; | ||
892 : | } | ||
893 : | return undef; | ||
894 : | } | ||
895 : | |||
896 : | |||
897 : | #=pod | ||
898 : | # | ||
899 : | #=head1 openF | ||
900 : | # | ||
901 : | #usage: $fig->openF($filename) | ||
902 : | # | ||
903 : | #Parts of the system rely on accessing numerous different files. The most obvious case is | ||
904 : | #the situation with similarities. It is important that the system be able to run in cases in | ||
905 : | #which an arbitrary number of files cannot be open simultaneously. This routine (with closeF) is | ||
906 : | #a hack to handle this. I should probably just pitch them and insist that the OS handle several | ||
907 : | #hundred open filehandles. | ||
908 : | # | ||
909 : | #=cut | ||
910 : | # | ||
911 : | sub openF { | ||
912 : | my($self,$file) = @_; | ||
913 : | my($fxs,$x,@fxs,$fh); | ||
914 : | |||
915 : | $fxs = $self->cached('_openF'); | ||
916 : | if ($x = $fxs->{$file}) | ||
917 : | { | ||
918 : | $x->[1] = time(); | ||
919 : | return $x->[0]; | ||
920 : | } | ||
921 : | |||
922 : | @fxs = keys(%$fxs); | ||
923 : | if (defined($fh = new FileHandle "<$file")) | ||
924 : | { | ||
925 : | if (@fxs >= 200) | ||
926 : | { | ||
927 : | @fxs = sort { $fxs->{$a}->[1] <=> $fxs->{$b}->[1] } @fxs; | ||
928 : | $x = $fxs->{$fxs[0]}; | ||
929 : | undef $x->[0]; | ||
930 : | delete $fxs->{$fxs[0]}; | ||
931 : | } | ||
932 : | $fxs->{$file} = [$fh,time()]; | ||
933 : | return $fh; | ||
934 : | } | ||
935 : | return undef; | ||
936 : | } | ||
937 : | |||
938 : | #=pod | ||
939 : | # | ||
940 : | #=head1 closeF | ||
941 : | # | ||
942 : | #usage: $fig->closeF($filename) | ||
943 : | # | ||
944 : | #Parts of the system rely on accessing numerous different files. The most obvious case is | ||
945 : | #the situation with similarities. It is important that the system be able to run in cases in | ||
946 : | #which an arbitrary number of files cannot be open simultaneously. This routine (with openF) is | ||
947 : | #a hack to handle this. I should probably just pitch them and insist that the OS handle several | ||
948 : | #hundred open filehandles. | ||
949 : | # | ||
950 : | #=cut | ||
951 : | # | ||
952 : | sub closeF { | ||
953 : | my($self,$file) = @_; | ||
954 : | my($fxs,$x); | ||
955 : | |||
956 : | if (($fxs = $self->{_openF}) && | ||
957 : | ($x = $fxs->{$file})) | ||
958 : | { | ||
959 : | undef $x->[0]; | ||
960 : | delete $fxs->{$file}; | ||
961 : | } | ||
962 : | } | ||
963 : | |||
964 : | =pod | ||
965 : | |||
966 : | =head1 ec_name | ||
967 : | |||
968 : | usage: $enzymatic_function = $fig->ec_name($ec) | ||
969 : | |||
970 : | Returns enzymatic name for EC. | ||
971 : | |||
972 : | =cut | ||
973 : | |||
974 : | sub ec_name { | ||
975 : | my($self,$ec) = @_; | ||
976 : | |||
977 : | ($ec =~ /^\d+\.\d+\.\d+\.\d+$/) || return ""; | ||
978 : | my $rdbH = $self->db_handle; | ||
979 : | my $relational_db_response = $rdbH->SQL("SELECT name FROM ec_names WHERE ( ec = \'$ec\' )"); | ||
980 : | |||
981 : | return (@$relational_db_response == 1) ? $relational_db_response->[0]->[0] : ""; | ||
982 : | return ""; | ||
983 : | } | ||
984 : | |||
985 : | =pod | ||
986 : | |||
987 : | =head1 all_roles | ||
988 : | |||
989 : | usage: @roles = $fig->all_roles | ||
990 : | |||
991 : | mkubal | 1.54 | Supposed to return all known roles. For now, we get all ECs with "names". |
992 : | efrank | 1.1 | |
993 : | =cut | ||
994 : | |||
995 : | sub all_roles { | ||
996 : | my($self) = @_; | ||
997 : | |||
998 : | my $rdbH = $self->db_handle; | ||
999 : | my $relational_db_response = $rdbH->SQL("SELECT ec,name FROM ec_names"); | ||
1000 : | |||
1001 : | return @$relational_db_response; | ||
1002 : | } | ||
1003 : | |||
1004 : | =pod | ||
1005 : | |||
1006 : | =head1 expand_ec | ||
1007 : | |||
1008 : | usage: $expanded_ec = $fig->expand_ec($ec) | ||
1009 : | |||
1010 : | Expands "1.1.1.1" to "1.1.1.1 - alcohol dehydrogenase" or something like that. | ||
1011 : | |||
1012 : | =cut | ||
1013 : | |||
1014 : | sub expand_ec { | ||
1015 : | my($self,$ec) = @_; | ||
1016 : | my($name); | ||
1017 : | |||
1018 : | return ($name = $self->ec_name($ec)) ? "$ec - $name" : $ec; | ||
1019 : | } | ||
1020 : | |||
1021 : | |||
1022 : | =pod | ||
1023 : | |||
1024 : | =head1 clean_tmp | ||
1025 : | |||
1026 : | usage: &FIG::clean_tmp | ||
1027 : | |||
1028 : | We store temporary files in $FIG_Config::temp. There are specific classes of files | ||
1029 : | that are created and should be saved for at least a few days. This routine can be | ||
1030 : | invoked to clean out those that are over two days old. | ||
1031 : | |||
1032 : | =cut | ||
1033 : | |||
1034 : | sub clean_tmp { | ||
1035 : | |||
1036 : | my($file); | ||
1037 : | if (opendir(TMP,"$FIG_Config::temp")) | ||
1038 : | { | ||
1039 : | # change the pattern to pick up other files that need to be cleaned up | ||
1040 : | my @temp = grep { $_ =~ /^(Geno|tmp)/ } readdir(TMP); | ||
1041 : | foreach $file (@temp) | ||
1042 : | { | ||
1043 : | if (-M "$FIG_Config::temp/$file" > 2) | ||
1044 : | { | ||
1045 : | unlink("$FIG_Config::temp/$file"); | ||
1046 : | } | ||
1047 : | } | ||
1048 : | } | ||
1049 : | } | ||
1050 : | |||
1051 : | ################ Routines to process genomes and genome IDs ########################## | ||
1052 : | |||
1053 : | |||
1054 : | =pod | ||
1055 : | |||
1056 : | =head1 genomes | ||
1057 : | |||
1058 : | usage: @genome_ids = $fig->genomes; | ||
1059 : | |||
1060 : | Genomes are assigned ids of the form X.Y where X is the taxonomic id maintained by | ||
1061 : | NCBI for the species (not the specific strain), and Y is a sequence digit assigned to | ||
1062 : | this particular genome (as one of a set with the same genus/species). Genomes also | ||
1063 : | have versions, but that is a separate issue. | ||
1064 : | |||
1065 : | =cut | ||
1066 : | |||
1067 : | sub genomes { | ||
1068 : | overbeek | 1.13 | my($self,$complete,$restrictions) = @_; |
1069 : | |||
1070 : | my $rdbH = $self->db_handle; | ||
1071 : | |||
1072 : | my @where = (); | ||
1073 : | if ($complete) | ||
1074 : | { | ||
1075 : | push(@where,"( complete = \'1\' )") | ||
1076 : | } | ||
1077 : | |||
1078 : | if ($restrictions) | ||
1079 : | { | ||
1080 : | push(@where,"( restrictions = \'1\' )") | ||
1081 : | } | ||
1082 : | |||
1083 : | my $relational_db_response; | ||
1084 : | if (@where > 0) | ||
1085 : | { | ||
1086 : | my $where = join(" AND ",@where); | ||
1087 : | $relational_db_response = $rdbH->SQL("SELECT genome FROM genome where $where"); | ||
1088 : | } | ||
1089 : | else | ||
1090 : | { | ||
1091 : | $relational_db_response = $rdbH->SQL("SELECT genome FROM genome"); | ||
1092 : | } | ||
1093 : | my @genomes = sort { $a <=> $b } map { $_->[0] } @$relational_db_response; | ||
1094 : | efrank | 1.1 | return @genomes; |
1095 : | } | ||
1096 : | |||
1097 : | efrank | 1.2 | sub genome_counts { |
1098 : | overbeek | 1.13 | my($self,$complete) = @_; |
1099 : | my($x,$relational_db_response); | ||
1100 : | efrank | 1.2 | |
1101 : | overbeek | 1.13 | my $rdbH = $self->db_handle; |
1102 : | |||
1103 : | if ($complete) | ||
1104 : | { | ||
1105 : | $relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome where complete = '1'"); | ||
1106 : | } | ||
1107 : | else | ||
1108 : | { | ||
1109 : | $relational_db_response = $rdbH->SQL("SELECT genome,maindomain FROM genome"); | ||
1110 : | } | ||
1111 : | |||
1112 : | my ($a,$b,$e,$v) = (0,0,0,0); | ||
1113 : | if (@$relational_db_response > 0) | ||
1114 : | efrank | 1.2 | { |
1115 : | overbeek | 1.13 | foreach $x (@$relational_db_response) |
1116 : | efrank | 1.2 | { |
1117 : | overbeek | 1.13 | if ($x->[1] =~ /^a/i) { $a++ } |
1118 : | elsif ($x->[1] =~ /^b/i) { $b++ } | ||
1119 : | elsif ($x->[1] =~ /^e/i) { $e++ } | ||
1120 : | elsif ($x->[1] =~ /^v/i) { $v++ } | ||
1121 : | efrank | 1.2 | } |
1122 : | } | ||
1123 : | overbeek | 1.13 | |
1124 : | efrank | 1.2 | return ($a,$b,$e,$v); |
1125 : | } | ||
1126 : | |||
1127 : | efrank | 1.1 | =pod |
1128 : | |||
1129 : | =head1 genome_version | ||
1130 : | |||
1131 : | usage: $version = $fig->genome_version($genome_id); | ||
1132 : | |||
1133 : | Versions are incremented for major updates. They are put in as major | ||
1134 : | updates of the form 1.0, 2.0, ... | ||
1135 : | |||
1136 : | Users may do local "editing" of the DNA for a genome, but when they do, | ||
1137 : | they increment the digits to the right of the decimal. Two genomes remain | ||
1138 : | comparable only if the versions match identically. Hence, minor updating should be | ||
1139 : | committed only by the person/group responsible for updating that genome. | ||
1140 : | |||
1141 : | We can, of course, identify which genes are identical between any two genomes (by matching | ||
1142 : | the DNA or amino acid sequences). However, the basic intent of the system is to | ||
1143 : | support editing by the main group issuing periodic major updates. | ||
1144 : | |||
1145 : | =cut | ||
1146 : | |||
1147 : | sub genome_version { | ||
1148 : | my($self,$genome) = @_; | ||
1149 : | |||
1150 : | my(@tmp); | ||
1151 : | if ((-s "$FIG_Config::organisms/$genome/VERSION") && | ||
1152 : | (@tmp = `cat $FIG_Config::organisms/$genome/VERSION`) && | ||
1153 : | ($tmp[0] =~ /^(\d+(\.\d+)?)$/)) | ||
1154 : | { | ||
1155 : | return $1; | ||
1156 : | } | ||
1157 : | return undef; | ||
1158 : | } | ||
1159 : | |||
1160 : | =pod | ||
1161 : | |||
1162 : | =head1 genus_species | ||
1163 : | |||
1164 : | usage: $gs = $fig->genus_species($genome_id) | ||
1165 : | |||
1166 : | Returns the genus and species (and strain if that has been properly recorded) | ||
1167 : | in a printable form. | ||
1168 : | |||
1169 : | =cut | ||
1170 : | |||
1171 : | sub genus_species { | ||
1172 : | my ($self,$genome) = @_; | ||
1173 : | overbeek | 1.13 | my $ans; |
1174 : | efrank | 1.1 | |
1175 : | my $genus_species = $self->cached('_genus_species'); | ||
1176 : | if (! ($ans = $genus_species->{$genome})) | ||
1177 : | { | ||
1178 : | overbeek | 1.13 | my $rdbH = $self->db_handle; |
1179 : | my $relational_db_response = $rdbH->SQL("SELECT genome,gname FROM genome"); | ||
1180 : | my $pair; | ||
1181 : | foreach $pair (@$relational_db_response) | ||
1182 : | efrank | 1.1 | { |
1183 : | overbeek | 1.13 | $genus_species->{$pair->[0]} = $pair->[1]; |
1184 : | efrank | 1.1 | } |
1185 : | overbeek | 1.13 | $ans = $genus_species->{$genome}; |
1186 : | efrank | 1.1 | } |
1187 : | return $ans; | ||
1188 : | } | ||
1189 : | |||
1190 : | =pod | ||
1191 : | |||
1192 : | =head1 org_of | ||
1193 : | |||
1194 : | usage: $org = $fig->org_of($prot_id) | ||
1195 : | |||
1196 : | In the case of external proteins, we can usually determine an organism, but not | ||
1197 : | anything more precise than genus/species (and often not that). This routine takes | ||
1198 : | efrank | 1.2 | a protein ID (which may be a feature ID) and returns "the organism". |
1199 : | efrank | 1.1 | |
1200 : | =cut | ||
1201 : | |||
1202 : | sub org_of { | ||
1203 : | my($self,$prot_id) = @_; | ||
1204 : | my $relational_db_response; | ||
1205 : | my $rdbH = $self->db_handle; | ||
1206 : | |||
1207 : | if ($prot_id =~ /^fig\|/) | ||
1208 : | { | ||
1209 : | return $self->genus_species($self->genome_of($prot_id)); | ||
1210 : | } | ||
1211 : | |||
1212 : | if (($relational_db_response = $rdbH->SQL("SELECT org FROM external_orgs WHERE ( prot = \'$prot_id\' )")) && | ||
1213 : | (@$relational_db_response >= 1)) | ||
1214 : | { | ||
1215 : | return $relational_db_response->[0]->[0]; | ||
1216 : | } | ||
1217 : | return ""; | ||
1218 : | } | ||
1219 : | |||
1220 : | =pod | ||
1221 : | |||
1222 : | =head1 abbrev | ||
1223 : | |||
1224 : | usage: $abbreviated_name = $fig->abbrev($genome_name) | ||
1225 : | |||
1226 : | For alignments and such, it is very useful to be able to produce an abbreviation of genus/species. | ||
1227 : | That's what this does. Note that multiple genus/species might reduce to the same abbreviation, so | ||
1228 : | be careful (disambiguate them, if you must). | ||
1229 : | |||
1230 : | =cut | ||
1231 : | |||
1232 : | sub abbrev { | ||
1233 : | my($genome_name) = @_; | ||
1234 : | |||
1235 : | $genome_name =~ s/^(\S{3})\S+/$1./; | ||
1236 : | $genome_name =~ s/^(\S+\s+\S{4})\S+/$1./; | ||
1237 : | if (length($genome_name) > 13) | ||
1238 : | { | ||
1239 : | $genome_name = substr($genome_name,0,13); | ||
1240 : | } | ||
1241 : | return $genome_name; | ||
1242 : | } | ||
1243 : | |||
1244 : | ################ Routines to process Features and Feature IDs ########################## | ||
1245 : | |||
1246 : | =pod | ||
1247 : | |||
1248 : | =head1 ftype | ||
1249 : | |||
1250 : | usage: $type = &FIG::ftype($fid) | ||
1251 : | |||
1252 : | Returns the type of a feature, given the feature ID. This just amounts | ||
1253 : | to lifting it out of the feature ID, since features have IDs of tghe form | ||
1254 : | |||
1255 : | fig|x.y.f.n | ||
1256 : | |||
1257 : | where | ||
1258 : | x.y is the genome ID | ||
1259 : | f is the type pf feature | ||
1260 : | n is an integer that is unique within the genome/type | ||
1261 : | |||
1262 : | =cut | ||
1263 : | |||
1264 : | sub ftype { | ||
1265 : | my($feature_id) = @_; | ||
1266 : | |||
1267 : | if ($feature_id =~ /^fig\|\d+\.\d+\.([^\.]+)/) | ||
1268 : | { | ||
1269 : | return $1; | ||
1270 : | } | ||
1271 : | return undef; | ||
1272 : | } | ||
1273 : | |||
1274 : | =pod | ||
1275 : | |||
1276 : | =head1 genome_of | ||
1277 : | |||
1278 : | usage: $genome_id = $fig->genome_of($fid) | ||
1279 : | |||
1280 : | This just extracts the genome ID from a feature ID. | ||
1281 : | |||
1282 : | =cut | ||
1283 : | |||
1284 : | |||
1285 : | sub genome_of { | ||
1286 : | my $prot_id = (@_ == 1) ? $_[0] : $_[1]; | ||
1287 : | |||
1288 : | if ($prot_id =~ /^fig\|(\d+\.\d+)/) { return $1; } | ||
1289 : | return undef; | ||
1290 : | } | ||
1291 : | |||
1292 : | =pod | ||
1293 : | |||
1294 : | =head1 by_fig_id | ||
1295 : | |||
1296 : | usage: @sorted_by_fig_id = sort { &FIG::by_fig_id($a,$b) } @fig_ids | ||
1297 : | |||
1298 : | This is a bit of a clutzy way to sort a list of FIG feature IDs, but it works. | ||
1299 : | |||
1300 : | =cut | ||
1301 : | |||
1302 : | sub by_fig_id { | ||
1303 : | my($a,$b) = @_; | ||
1304 : | my($g1,$g2,$t1,$t2,$n1,$n2); | ||
1305 : | if (($a =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g1,$t1,$n1) = ($1,$2,$3)) && | ||
1306 : | ($b =~ /^fig\|(\d+\.\d+).([^\.]+)\.(\d+)$/) && (($g2,$t2,$n2) = ($1,$2,$3))) | ||
1307 : | { | ||
1308 : | ($g1 <=> $g2) or ($t1 cmp $t2) or ($n1 <=> $n2); | ||
1309 : | } | ||
1310 : | else | ||
1311 : | { | ||
1312 : | $a cmp $b; | ||
1313 : | } | ||
1314 : | } | ||
1315 : | |||
1316 : | =pod | ||
1317 : | |||
1318 : | =head1 genes_in_region | ||
1319 : | |||
1320 : | usage: ($features_in_region,$beg1,$end1) = $fig->genes_in_region($genome,$contig,$beg,$end) | ||
1321 : | |||
1322 : | It is often important to be able to find the genes that occur in a specific region on | ||
1323 : | a chromosome. This routine is designed to provide this information. It returns all genes | ||
1324 : | that overlap the region ($genome,$contig,$beg,$end). $beg1 is set to the minimum coordinate of | ||
1325 : | the returned genes (which may be before the given region), and $end1 the maximum coordinate. | ||
1326 : | |||
1327 : | The routine assumes that genes are not more than 10000 bases long, which is certainly not true | ||
1328 : | in eukaryotes. Hence, in euks you may well miss genes that overlap the boundaries of the specified | ||
1329 : | region (sorry). | ||
1330 : | |||
1331 : | =cut | ||
1332 : | |||
1333 : | |||
1334 : | sub genes_in_region { | ||
1335 : | my($self,$genome,$contig,$beg,$end) = @_; | ||
1336 : | my($x,$relational_db_response,$feature_id,$b1,$e1,@feat,@tmp,$l,$u); | ||
1337 : | |||
1338 : | my $pad = 10000; | ||
1339 : | my $rdbH = $self->db_handle; | ||
1340 : | |||
1341 : | my $minV = $beg - $pad; | ||
1342 : | my $maxV = $end + $pad; | ||
1343 : | if (($relational_db_response = $rdbH->SQL("SELECT id FROM features | ||
1344 : | WHERE ( minloc > $minV ) AND ( minloc < $maxV ) AND (maxloc < $maxV) AND | ||
1345 : | ( genome = \'$genome\' ) AND ( contig = \'$contig\' );")) && | ||
1346 : | (@$relational_db_response >= 1)) | ||
1347 : | { | ||
1348 : | @tmp = sort { ($a->[1] cmp $b->[1]) or | ||
1349 : | ($a->[2] <=> $b->[2]) or | ||
1350 : | ($a->[3] <=> $b->[3]) | ||
1351 : | } | ||
1352 : | map { $feature_id = $_->[0]; | ||
1353 : | $x = $self->feature_location($feature_id); | ||
1354 : | $x ? [$feature_id,&boundaries_of($x)] : () | ||
1355 : | } @$relational_db_response; | ||
1356 : | |||
1357 : | |||
1358 : | ($l,$u) = (10000000000,0); | ||
1359 : | foreach $x (@tmp) | ||
1360 : | { | ||
1361 : | ($feature_id,undef,$b1,$e1) = @$x; | ||
1362 : | if (&between($beg,&min($b1,$e1),$end) || &between(&min($b1,$e1),$beg,&max($b1,$e1))) | ||
1363 : | { | ||
1364 : | push(@feat,$feature_id); | ||
1365 : | $l = &min($l,&min($b1,$e1)); | ||
1366 : | $u = &max($u,&max($b1,$e1)); | ||
1367 : | } | ||
1368 : | } | ||
1369 : | (@feat <= 0) || return ([@feat],$l,$u); | ||
1370 : | } | ||
1371 : | return ([],$l,$u); | ||
1372 : | } | ||
1373 : | |||
1374 : | sub close_genes { | ||
1375 : | my($self,$fid,$dist) = @_; | ||
1376 : | |||
1377 : | my $loc = $self->feature_location($fid); | ||
1378 : | if ($loc) | ||
1379 : | { | ||
1380 : | my($contig,$beg,$end) = &FIG::boundaries_of($loc); | ||
1381 : | if ($contig && $beg && $end) | ||
1382 : | { | ||
1383 : | my $min = &min($beg,$end) - $dist; | ||
1384 : | my $max = &max($beg,$end) + $dist; | ||
1385 : | my $feat; | ||
1386 : | ($feat,undef,undef) = $self->genes_in_region(&FIG::genome_of($fid),$contig,$min,$max); | ||
1387 : | return @$feat; | ||
1388 : | } | ||
1389 : | } | ||
1390 : | return (); | ||
1391 : | } | ||
1392 : | |||
1393 : | |||
1394 : | =pod | ||
1395 : | |||
1396 : | =head1 feature_location | ||
1397 : | |||
1398 : | usage: $loc = $fig->feature_location($fid) OR | ||
1399 : | @loc = $fig->feature_location($fid) | ||
1400 : | |||
1401 : | The location of a feature in a scalar context is | ||
1402 : | |||
1403 : | contig_b1_e1,contig_b2_e2,... [one contig_b_e for each exon] | ||
1404 : | |||
1405 : | In a list context it is | ||
1406 : | |||
1407 : | (contig_b1_e1,contig_b2_e2,...) | ||
1408 : | |||
1409 : | =cut | ||
1410 : | |||
1411 : | sub feature_location { | ||
1412 : | my($self,$feature_id) = @_; | ||
1413 : | my($relational_db_response,$locations,$location); | ||
1414 : | |||
1415 : | $locations = $self->cached('_location'); | ||
1416 : | if (! ($location = $locations->{$feature_id})) | ||
1417 : | { | ||
1418 : | my $rdbH = $self->db_handle; | ||
1419 : | if (($relational_db_response = $rdbH->SQL("SELECT location FROM features WHERE ( id = \'$feature_id\' )")) && | ||
1420 : | (@$relational_db_response == 1)) | ||
1421 : | { | ||
1422 : | $locations->{$feature_id} = $location = $relational_db_response->[0]->[0]; | ||
1423 : | } | ||
1424 : | } | ||
1425 : | |||
1426 : | if ($location) | ||
1427 : | { | ||
1428 : | return wantarray() ? split(/,/,$location) : $location; | ||
1429 : | } | ||
1430 : | return undef; | ||
1431 : | } | ||
1432 : | |||
1433 : | =pod | ||
1434 : | |||
1435 : | =head1 boundaries_of | ||
1436 : | |||
1437 : | usage: ($contig,$beg,$end) = $fig->boundaries_of($loc) | ||
1438 : | |||
1439 : | The location of a feature in a scalar context is | ||
1440 : | |||
1441 : | contig_b1_e1,contig_b2_e2,... [one contig_b_e for each exon] | ||
1442 : | |||
1443 : | This routine takes as input such a location and reduces it to a single | ||
1444 : | description of the entire region containing the gene. | ||
1445 : | |||
1446 : | =cut | ||
1447 : | |||
1448 : | sub boundaries_of { | ||
1449 : | my($location) = (@_ == 1) ? $_[0] : $_[1]; | ||
1450 : | my($contigQ); | ||
1451 : | |||
1452 : | if (defined($location)) | ||
1453 : | { | ||
1454 : | my @exons = split(/,/,$location); | ||
1455 : | my($contig,$beg,$end); | ||
1456 : | if (($exons[0] =~ /^(\S+)_(\d+)_\d+$/) && | ||
1457 : | (($contig,$beg) = ($1,$2)) && ($contigQ = quotemeta $contig) && | ||
1458 : | ($exons[$#exons] =~ /^$contigQ\_\d+_(\d+)$/) && | ||
1459 : | ($end = $1)) | ||
1460 : | { | ||
1461 : | return ($contig,$beg,$end); | ||
1462 : | } | ||
1463 : | } | ||
1464 : | return undef; | ||
1465 : | } | ||
1466 : | |||
1467 : | |||
1468 : | =pod | ||
1469 : | |||
1470 : | =head1 all_features | ||
1471 : | |||
1472 : | usage: $fig->all_features($genome,$type) | ||
1473 : | |||
1474 : | Returns a list of all feature IDs of a specified type in the designated genome. You would | ||
1475 : | usually use just | ||
1476 : | |||
1477 : | $fig->pegs_of($genome) or | ||
1478 : | $fig->rnas_of($genome) | ||
1479 : | |||
1480 : | which simply invoke this routine. | ||
1481 : | |||
1482 : | =cut | ||
1483 : | |||
1484 : | sub all_features { | ||
1485 : | my($self,$genome,$type) = @_; | ||
1486 : | |||
1487 : | my $rdbH = $self->db_handle; | ||
1488 : | my $relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE (genome = \'$genome\' AND (type = \'$type\'))"); | ||
1489 : | |||
1490 : | if (@$relational_db_response > 0) | ||
1491 : | { | ||
1492 : | return map { $_->[0] } @$relational_db_response; | ||
1493 : | } | ||
1494 : | return (); | ||
1495 : | } | ||
1496 : | |||
1497 : | |||
1498 : | =pod | ||
1499 : | |||
1500 : | =head1 all_pegs_of | ||
1501 : | |||
1502 : | usage: $fig->all_pegs_of($genome) | ||
1503 : | |||
1504 : | Returns a list of all PEGs in the specified genome. Note that order is not | ||
1505 : | specified. | ||
1506 : | |||
1507 : | =cut | ||
1508 : | |||
1509 : | sub pegs_of { | ||
1510 : | my($self,$genome) = @_; | ||
1511 : | |||
1512 : | return $self->all_features($genome,"peg"); | ||
1513 : | } | ||
1514 : | |||
1515 : | |||
1516 : | =pod | ||
1517 : | |||
1518 : | =head1 all_rnas_of | ||
1519 : | |||
1520 : | usage: $fig->all_rnas($genome) | ||
1521 : | |||
1522 : | Returns a list of all RNAs for the given genome. | ||
1523 : | |||
1524 : | =cut | ||
1525 : | |||
1526 : | sub rnas_of { | ||
1527 : | my($self,$genome) = @_; | ||
1528 : | |||
1529 : | return $self->all_features($genome,"rna"); | ||
1530 : | } | ||
1531 : | |||
1532 : | =pod | ||
1533 : | |||
1534 : | =head1 feature_aliases | ||
1535 : | |||
1536 : | usage: @aliases = $fig->feature_aliases($fid) OR | ||
1537 : | $aliases = $fig->feature_aliases($fid) | ||
1538 : | |||
1539 : | Returns a list of aliases (gene IDs, arbitrary numbers assigned by authors, etc.) for the feature. | ||
1540 : | These must come from the tbl files, so add them there if you want to see them here. | ||
1541 : | |||
1542 : | In a scalar context, the aliases come back with commas separating them. | ||
1543 : | |||
1544 : | =cut | ||
1545 : | |||
1546 : | sub feature_aliases { | ||
1547 : | my($self,$feature_id) = @_; | ||
1548 : | my($rdbH,$relational_db_response,$aliases); | ||
1549 : | |||
1550 : | $rdbH = $self->db_handle; | ||
1551 : | if (($relational_db_response = $rdbH->SQL("SELECT aliases FROM features WHERE ( id = \'$feature_id\' )")) && | ||
1552 : | (@$relational_db_response == 1)) | ||
1553 : | { | ||
1554 : | $aliases = $relational_db_response->[0]->[0]; | ||
1555 : | } | ||
1556 : | return $aliases ? (wantarray ? split(/,/,$aliases) : $aliases) : undef; | ||
1557 : | } | ||
1558 : | |||
1559 : | =pod | ||
1560 : | |||
1561 : | overbeek | 1.34 | =head1 by_alias |
1562 : | |||
1563 : | usage: $peg = $fig->by_alias($alias) | ||
1564 : | |||
1565 : | Returns a FIG id if the alias can be converted. Right now we convert aliases | ||
1566 : | of the form NP_* (RefSeq IDs) or gi|* (GenBank IDs) | ||
1567 : | |||
1568 : | =cut | ||
1569 : | |||
1570 : | sub by_alias { | ||
1571 : | my($self,$alias) = @_; | ||
1572 : | my($rdbH,$relational_db_response,$peg); | ||
1573 : | |||
1574 : | $peg = ""; | ||
1575 : | $rdbH = $self->db_handle; | ||
1576 : | if (($relational_db_response = $rdbH->SQL("SELECT id FROM ext_alias WHERE ( alias = \'$alias\' )")) && | ||
1577 : | (@$relational_db_response == 1)) | ||
1578 : | { | ||
1579 : | $peg = $relational_db_response->[0]->[0]; | ||
1580 : | } | ||
1581 : | return $peg; | ||
1582 : | } | ||
1583 : | |||
1584 : | =pod | ||
1585 : | |||
1586 : | efrank | 1.1 | =head1 possibly_truncated |
1587 : | |||
1588 : | usage: $fig->possibly_truncated($fid) | ||
1589 : | |||
1590 : | Returns true iff the feature occurs near the end of a contig. | ||
1591 : | |||
1592 : | =cut | ||
1593 : | |||
1594 : | sub possibly_truncated { | ||
1595 : | my($self,$feature_id) = @_; | ||
1596 : | my($loc); | ||
1597 : | |||
1598 : | if ($loc = $self->feature_location($feature_id)) | ||
1599 : | { | ||
1600 : | my $genome = &genome_of($feature_id); | ||
1601 : | my ($contig,$beg,$end) = &boundaries_of($loc); | ||
1602 : | if ((! $self->near_end($genome,$contig,$beg)) && (! $self->near_end($genome,$contig,$end))) | ||
1603 : | { | ||
1604 : | return 0; | ||
1605 : | } | ||
1606 : | } | ||
1607 : | return 1; | ||
1608 : | } | ||
1609 : | |||
1610 : | sub near_end { | ||
1611 : | my($self,$genome,$contig,$x) = @_; | ||
1612 : | |||
1613 : | return (($x < 300) || ($x > ($self->contig_ln($genome,$contig) - 300))); | ||
1614 : | } | ||
1615 : | |||
1616 : | overbeek | 1.27 | sub is_real_feature { |
1617 : | my($self,$fid) = @_; | ||
1618 : | my($relational_db_response); | ||
1619 : | |||
1620 : | my $rdbH = $self->db_handle; | ||
1621 : | return (($relational_db_response = $rdbH->SQL("SELECT id FROM features WHERE ( id = \'$fid\' )")) && | ||
1622 : | mkubal | 1.53 | (@$relational_db_response == 1)) ? 1 : 0; |
1623 : | overbeek | 1.27 | } |
1624 : | |||
1625 : | efrank | 1.1 | ################ Routines to process functional coupling for PEGs ########################## |
1626 : | |||
1627 : | =pod | ||
1628 : | |||
1629 : | =head1 coupling_and_evidence | ||
1630 : | |||
1631 : | usage: @coupling_data = $fig->coupling_and_evidence($fid,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) | ||
1632 : | |||
1633 : | A computation of couplings and evidence starts with a given peg and produces a list of | ||
1634 : | 3-tuples. Each 3-tuple is of the form | ||
1635 : | |||
1636 : | [Score,CoupledToFID,Evidence] | ||
1637 : | |||
1638 : | Evidence is a list of 2-tuples of FIDs that are close in other genomes (producing | ||
1639 : | a "pair of close homologs" of [$peg,CoupledToFID]). The maximum score for a single | ||
1640 : | PCH is 1, but "Score" is the sum of the scores for the entire set of PCHs. | ||
1641 : | |||
1642 : | If $keep_record is true, the system records the information, asserting coupling for each | ||
1643 : | of the pairs in the set of evidence, and asserting a pin from the given $fd through all | ||
1644 : | of the PCH entries used in forming the score. | ||
1645 : | |||
1646 : | =cut | ||
1647 : | |||
1648 : | sub coupling_and_evidence { | ||
1649 : | my($self,$feature_id,$bound,$sim_cutoff,$coupling_cutoff,$keep_record) = @_; | ||
1650 : | my($neighbors,$neigh,$similar1,$similar2,@hits,$sc,$ev,$genome1); | ||
1651 : | |||
1652 : | if ($feature_id =~ /^fig\|(\d+\.\d+)/) | ||
1653 : | { | ||
1654 : | $genome1 = $1; | ||
1655 : | } | ||
1656 : | |||
1657 : | my($contig,$beg,$end) = &FIG::boundaries_of($self->feature_location($feature_id)); | ||
1658 : | if (! $contig) { return () } | ||
1659 : | |||
1660 : | ($neighbors,undef,undef) = $self->genes_in_region(&genome_of($feature_id), | ||
1661 : | $contig, | ||
1662 : | &min($beg,$end) - $bound, | ||
1663 : | &max($beg,$end) + $bound); | ||
1664 : | if (@$neighbors == 0) { return () } | ||
1665 : | $similar1 = $self->acceptably_close($feature_id,$sim_cutoff); | ||
1666 : | @hits = (); | ||
1667 : | |||
1668 : | foreach $neigh (grep { $_ =~ /peg/ } @$neighbors) | ||
1669 : | { | ||
1670 : | next if ($neigh eq $feature_id); | ||
1671 : | $similar2 = $self->acceptably_close($neigh,$sim_cutoff); | ||
1672 : | ($sc,$ev) = $self->coupling_ev($genome1,$similar1,$similar2,$bound); | ||
1673 : | if ($sc >= $coupling_cutoff) | ||
1674 : | { | ||
1675 : | push(@hits,[$sc,$neigh,$ev]); | ||
1676 : | } | ||
1677 : | } | ||
1678 : | if ($keep_record) | ||
1679 : | { | ||
1680 : | $self->add_chr_clusters_and_pins($feature_id,\@hits); | ||
1681 : | } | ||
1682 : | return sort { $b->[0] <=> $a->[0] } @hits; | ||
1683 : | } | ||
1684 : | |||
1685 : | overbeek | 1.35 | sub fast_coupling { |
1686 : | my($self,$peg,$bound,$coupling_cutoff) = @_; | ||
1687 : | my($genome,$genome1,$genome2,$peg1,$peg2,$peg3,%maps,$loc,$loc1,$loc2,$loc3); | ||
1688 : | my($pairs,$sc,%ev); | ||
1689 : | |||
1690 : | my @ans = (); | ||
1691 : | |||
1692 : | $genome = &genome_of($peg); | ||
1693 : | foreach $peg1 ($self->in_pch_pin_with($peg)) | ||
1694 : | { | ||
1695 : | $peg1 =~ s/,.*$//; | ||
1696 : | if ($peg ne $peg1) | ||
1697 : | { | ||
1698 : | $genome1 = &genome_of($peg1); | ||
1699 : | $maps{$peg}->{$genome1} = $peg1; | ||
1700 : | } | ||
1701 : | } | ||
1702 : | |||
1703 : | $loc = [&boundaries_of(scalar $self->feature_location($peg))]; | ||
1704 : | foreach $peg1 ($self->in_cluster_with($peg)) | ||
1705 : | { | ||
1706 : | if ($peg ne $peg1) | ||
1707 : | { | ||
1708 : | # print STDERR "peg1=$peg1\n"; | ||
1709 : | $loc1 = [&boundaries_of(scalar $self->feature_location($peg1))]; | ||
1710 : | if (&close_enough($loc,$loc1,$bound)) | ||
1711 : | { | ||
1712 : | foreach $peg2 ($self->in_pch_pin_with($peg1)) | ||
1713 : | { | ||
1714 : | $genome2 = &genome_of($peg2); | ||
1715 : | if (($peg3 = $maps{$peg}->{$genome2}) && ($peg2 ne $peg3)) | ||
1716 : | { | ||
1717 : | $loc2 = [&boundaries_of(scalar $self->feature_location($peg2))]; | ||
1718 : | $loc3 = [&boundaries_of(scalar $self->feature_location($peg3))]; | ||
1719 : | if (&close_enough($loc2,$loc3,$bound)) | ||
1720 : | { | ||
1721 : | push(@{$ev{$peg1}},[$peg3,$peg2]); | ||
1722 : | } | ||
1723 : | } | ||
1724 : | } | ||
1725 : | } | ||
1726 : | } | ||
1727 : | } | ||
1728 : | foreach $peg1 (keys(%ev)) | ||
1729 : | { | ||
1730 : | $pairs = $ev{$peg1}; | ||
1731 : | overbeek | 1.43 | $sc = $self->score([$peg,map { $_->[0] } @$pairs]); |
1732 : | overbeek | 1.35 | if ($sc >= $coupling_cutoff) |
1733 : | { | ||
1734 : | push(@ans,[$sc,$peg1]); | ||
1735 : | } | ||
1736 : | } | ||
1737 : | return sort { $b->[0] <=> $a->[0] } @ans; | ||
1738 : | } | ||
1739 : | |||
1740 : | |||
1741 : | sub score { | ||
1742 : | overbeek | 1.43 | my($self,$pegs) = @_; |
1743 : | overbeek | 1.51 | my(@ids); |
1744 : | overbeek | 1.35 | |
1745 : | overbeek | 1.51 | if ($self->{_no9s_scoring}) |
1746 : | { | ||
1747 : | @ids = map { $self->maps_to_id($_) } grep { $_ !~ /^fig\|999999/ } @$pegs; | ||
1748 : | } | ||
1749 : | else | ||
1750 : | { | ||
1751 : | @ids = map { $self->maps_to_id($_) } @$pegs; | ||
1752 : | } | ||
1753 : | overbeek | 1.43 | return &score1($self,\@ids) - 1; |
1754 : | } | ||
1755 : | |||
1756 : | sub score1 { | ||
1757 : | my($self,$pegs) = @_; | ||
1758 : | my($sim); | ||
1759 : | my($first,@rest) = @$pegs; | ||
1760 : | my $count = 1; | ||
1761 : | my %hits = map { $_ => 1 } @rest; | ||
1762 : | my @ordered = sort { $b->[0] <=> $a->[0] } | ||
1763 : | map { $sim = $_; [$sim->iden,$sim->id2] } | ||
1764 : | grep { $hits{$_->id2} } | ||
1765 : | $self->sims($first,1000,1,"raw"); | ||
1766 : | overbeek | 1.76 | my %ordered = map { $_->[1] => 1 } @ordered; |
1767 : | foreach $_ (@rest) | ||
1768 : | { | ||
1769 : | if (! $ordered{$_}) | ||
1770 : | { | ||
1771 : | push(@ordered,[0,$_]); | ||
1772 : | } | ||
1773 : | } | ||
1774 : | |||
1775 : | overbeek | 1.43 | while ((@ordered > 0) && ($ordered[0]->[0] >= 97)) |
1776 : | overbeek | 1.35 | { |
1777 : | overbeek | 1.43 | shift @ordered ; |
1778 : | } | ||
1779 : | while (@ordered > 0) | ||
1780 : | { | ||
1781 : | my $start = $ordered[0]->[0]; | ||
1782 : | $_ = shift @ordered; | ||
1783 : | my @sub = ( $_->[1] ); | ||
1784 : | while ((@ordered > 0) && ($ordered[0]->[0] > ($start-3))) | ||
1785 : | overbeek | 1.35 | { |
1786 : | overbeek | 1.43 | $_ = shift @ordered; |
1787 : | push(@sub, $_->[1]); | ||
1788 : | overbeek | 1.35 | } |
1789 : | |||
1790 : | overbeek | 1.43 | if (@sub == 1) |
1791 : | { | ||
1792 : | $count++; | ||
1793 : | } | ||
1794 : | else | ||
1795 : | { | ||
1796 : | $count += &score1($self,\@sub); | ||
1797 : | } | ||
1798 : | overbeek | 1.35 | } |
1799 : | overbeek | 1.43 | return $count; |
1800 : | overbeek | 1.35 | } |
1801 : | |||
1802 : | efrank | 1.1 | =pod |
1803 : | |||
1804 : | =head1 add_chr_clusters_and_pins | ||
1805 : | |||
1806 : | usage: $fig->add_chr_clusters_and_pins($peg,$hits) | ||
1807 : | |||
1808 : | The system supports retaining data relating to functional coupling. If a user | ||
1809 : | computes evidence once and then saves it with this routine, data relating to | ||
1810 : | both "the pin" and the "clusters" (in all of the organisms supporting the | ||
1811 : | functional coupling) will be saved. | ||
1812 : | |||
1813 : | $hits must be a pointer to a list of 3-tuples of the sort returned by | ||
1814 : | $fig->coupling_and_evidence. | ||
1815 : | |||
1816 : | =cut | ||
1817 : | |||
1818 : | sub add_chr_clusters_and_pins { | ||
1819 : | my($self,$peg,$hits) = @_; | ||
1820 : | my(@clusters,@pins,$x,$sc,$neigh,$pairs,$y,@corr,@orgs,%projection); | ||
1821 : | my($genome,$cluster,$pin,$peg2); | ||
1822 : | |||
1823 : | if (@$hits > 0) | ||
1824 : | { | ||
1825 : | @clusters = (); | ||
1826 : | @pins = (); | ||
1827 : | push(@clusters,[$peg,map { $_->[1] } @$hits]); | ||
1828 : | foreach $x (@$hits) | ||
1829 : | { | ||
1830 : | ($sc,$neigh,$pairs) = @$x; | ||
1831 : | push(@pins,[$neigh,map { $_->[1] } @$pairs]); | ||
1832 : | foreach $y (@$pairs) | ||
1833 : | { | ||
1834 : | $peg2 = $y->[0]; | ||
1835 : | if ($peg2 =~ /^fig\|(\d+\.\d+)/) | ||
1836 : | { | ||
1837 : | $projection{$1}->{$peg2} = 1; | ||
1838 : | } | ||
1839 : | } | ||
1840 : | } | ||
1841 : | @corr = (); | ||
1842 : | @orgs = keys(%projection); | ||
1843 : | if (@orgs > 0) | ||
1844 : | { | ||
1845 : | foreach $genome (sort { $a <=> $b } @orgs) | ||
1846 : | { | ||
1847 : | push(@corr,sort { &FIG::by_fig_id($a,$b) } keys(%{$projection{$genome}})); | ||
1848 : | } | ||
1849 : | push(@pins,[$peg,@corr]); | ||
1850 : | } | ||
1851 : | |||
1852 : | foreach $cluster (@clusters) | ||
1853 : | { | ||
1854 : | $self->add_chromosomal_cluster($cluster); | ||
1855 : | } | ||
1856 : | |||
1857 : | foreach $pin (@pins) | ||
1858 : | { | ||
1859 : | $self->add_pch_pin($pin); | ||
1860 : | } | ||
1861 : | } | ||
1862 : | } | ||
1863 : | |||
1864 : | sub coupling_ev { | ||
1865 : | my($self,$genome1,$sim1,$sim2,$bound) = @_; | ||
1866 : | my($ev,$sc,$i,$j); | ||
1867 : | |||
1868 : | $ev = []; | ||
1869 : | $sc = 0; | ||
1870 : | |||
1871 : | $i = 0; | ||
1872 : | $j = 0; | ||
1873 : | while (($i < @$sim1) && ($j < @$sim2)) | ||
1874 : | { | ||
1875 : | if ($sim1->[$i]->[0] < $sim2->[$j]->[0]) | ||
1876 : | { | ||
1877 : | $i++; | ||
1878 : | } | ||
1879 : | elsif ($sim1->[$i]->[0] > $sim2->[$j]->[0]) | ||
1880 : | { | ||
1881 : | $j++; | ||
1882 : | } | ||
1883 : | else | ||
1884 : | { | ||
1885 : | $sc += $self->accumulate_ev($genome1,$sim1->[$i]->[1],$sim2->[$j]->[1],$bound,$ev); | ||
1886 : | $i++; | ||
1887 : | $j++; | ||
1888 : | } | ||
1889 : | } | ||
1890 : | overbeek | 1.43 | return ($self->score([map { $_->[0] } @$ev]),$ev); |
1891 : | efrank | 1.1 | } |
1892 : | |||
1893 : | sub accumulate_ev { | ||
1894 : | my($self,$genome1,$feature_ids1,$feature_ids2,$bound,$ev) = @_; | ||
1895 : | overbeek | 1.43 | my($genome2,@locs1,@locs2,$i,$j,$x); |
1896 : | efrank | 1.1 | |
1897 : | if ((@$feature_ids1 == 0) || (@$feature_ids2 == 0)) { return 0 } | ||
1898 : | |||
1899 : | $feature_ids1->[0] =~ /^fig\|(\d+\.\d+)/; | ||
1900 : | $genome2 = $1; | ||
1901 : | @locs1 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids1; | ||
1902 : | @locs2 = map { $x = $self->feature_location($_); $x ? [&boundaries_of($x)] : () } @$feature_ids2; | ||
1903 : | |||
1904 : | for ($i=0; ($i < @$feature_ids1); $i++) | ||
1905 : | { | ||
1906 : | for ($j=0; ($j < @$feature_ids2); $j++) | ||
1907 : | { | ||
1908 : | if (($feature_ids1->[$i] ne $feature_ids2->[$j]) && | ||
1909 : | &close_enough($locs1[$i],$locs2[$j],$bound)) | ||
1910 : | { | ||
1911 : | push(@$ev,[$feature_ids1->[$i],$feature_ids2->[$j]]); | ||
1912 : | } | ||
1913 : | } | ||
1914 : | } | ||
1915 : | } | ||
1916 : | |||
1917 : | sub close_enough { | ||
1918 : | my($locs1,$locs2,$bound) = @_; | ||
1919 : | |||
1920 : | # print STDERR &Dumper(["close enough",$locs1,$locs2]); | ||
1921 : | return (($locs1->[0] eq $locs2->[0]) && (abs((($locs1->[1]+$locs1->[2])/2) - (($locs2->[1]+$locs2->[2])/2)) <= $bound)); | ||
1922 : | } | ||
1923 : | |||
1924 : | sub acceptably_close { | ||
1925 : | my($self,$feature_id,$sim_cutoff) = @_; | ||
1926 : | my(%by_org,$id2,$genome,$sim); | ||
1927 : | |||
1928 : | my($ans) = []; | ||
1929 : | |||
1930 : | overbeek | 1.31 | foreach $sim ($self->sims($feature_id,1000,$sim_cutoff,"fig")) |
1931 : | efrank | 1.1 | { |
1932 : | $id2 = $sim->id2; | ||
1933 : | if ($id2 =~ /^fig\|(\d+\.\d+)/) | ||
1934 : | { | ||
1935 : | my $genome = $1; | ||
1936 : | overbeek | 1.51 | if (! $self->is_eukaryotic($genome)) |
1937 : | efrank | 1.1 | { |
1938 : | push(@{$by_org{$genome}},$id2); | ||
1939 : | } | ||
1940 : | } | ||
1941 : | } | ||
1942 : | foreach $genome (sort { $a <=> $b } keys(%by_org)) | ||
1943 : | { | ||
1944 : | push(@$ans,[$genome,$by_org{$genome}]); | ||
1945 : | } | ||
1946 : | return $ans; | ||
1947 : | } | ||
1948 : | |||
1949 : | ################ Translations of PEGsand External Protein Sequences ########################## | ||
1950 : | |||
1951 : | |||
1952 : | =pod | ||
1953 : | |||
1954 : | =head1 translatable | ||
1955 : | |||
1956 : | usage: $fig->translatable($prot_id) | ||
1957 : | |||
1958 : | The system takes any number of sources of protein sequences as input (and builds an nr | ||
1959 : | for the purpose of computing similarities). For each of these input fasta files, it saves | ||
1960 : | (in the DB) a filename, seek address and length so that it can go get the translation if | ||
1961 : | needed. This routine simply returns true iff info on the translation exists. | ||
1962 : | |||
1963 : | =cut | ||
1964 : | |||
1965 : | sub translatable { | ||
1966 : | my($self,$prot) = @_; | ||
1967 : | |||
1968 : | return &translation_length($self,$prot) ? 1 : 0; | ||
1969 : | } | ||
1970 : | |||
1971 : | |||
1972 : | =pod | ||
1973 : | |||
1974 : | =head1 translation_length | ||
1975 : | |||
1976 : | usage: $len = $fig->translation_length($prot_id) | ||
1977 : | |||
1978 : | The system takes any number of sources of protein sequences as input (and builds an nr | ||
1979 : | for the purpose of computing similarities). For each of these input fasta files, it saves | ||
1980 : | (in the DB) a filename, seek address and length so that it can go get the translation if | ||
1981 : | needed. This routine returns the length of a translation. This does not require actually | ||
1982 : | retrieving the translation. | ||
1983 : | |||
1984 : | =cut | ||
1985 : | |||
1986 : | sub translation_length { | ||
1987 : | my($self,$prot) = @_; | ||
1988 : | |||
1989 : | $prot =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/; | ||
1990 : | my $rdbH = $self->db_handle; | ||
1991 : | my $relational_db_response = $rdbH->SQL("SELECT slen FROM protein_sequence_seeks | ||
1992 : | WHERE id = \'$prot\' "); | ||
1993 : | |||
1994 : | return (@$relational_db_response == 1) ? $relational_db_response->[0]->[0] : undef; | ||
1995 : | } | ||
1996 : | |||
1997 : | |||
1998 : | =pod | ||
1999 : | |||
2000 : | =head1 get_translation | ||
2001 : | |||
2002 : | usage: $translation = $fig->get_translation($prot_id) | ||
2003 : | |||
2004 : | The system takes any number of sources of protein sequences as input (and builds an nr | ||
2005 : | for the purpose of computing similarities). For each of these input fasta files, it saves | ||
2006 : | (in the DB) a filename, seek address and length so that it can go get the translation if | ||
2007 : | needed. This routine returns a protein sequence. | ||
2008 : | |||
2009 : | =cut | ||
2010 : | |||
2011 : | sub get_translation { | ||
2012 : | my($self,$id) = @_; | ||
2013 : | my($rdbH,$relational_db_response,$fileN,$file,$fh,$seek,$ln,$tran); | ||
2014 : | |||
2015 : | $rdbH = $self->db_handle; | ||
2016 : | $id =~ s/^([^\|]+\|[^\|]+)\|.*$/$1/; | ||
2017 : | |||
2018 : | $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len FROM protein_sequence_seeks WHERE id = \'$id\' "); | ||
2019 : | |||
2020 : | if ($relational_db_response && @$relational_db_response == 1) | ||
2021 : | { | ||
2022 : | ($fileN,$seek,$ln) = @{$relational_db_response->[0]}; | ||
2023 : | if (($fh = $self->openF($self->N2file($fileN))) && | ||
2024 : | ($ln > 10)) | ||
2025 : | { | ||
2026 : | seek($fh,$seek,0); | ||
2027 : | read($fh,$tran,$ln-1); | ||
2028 : | $tran =~ s/\s//g; | ||
2029 : | return $tran; | ||
2030 : | } | ||
2031 : | } | ||
2032 : | return ''; | ||
2033 : | } | ||
2034 : | |||
2035 : | =pod | ||
2036 : | |||
2037 : | =head1 mapped_prot_ids | ||
2038 : | |||
2039 : | usage: @mapped = $fig->mapped_prot_ids($prot) | ||
2040 : | |||
2041 : | This routine is at the heart of maintaining synonyms for protein sequences. The system | ||
2042 : | determines which protein sequences are "essentially the same". These may differ in length | ||
2043 : | (presumably due to miscalled starts), but the tails are identical (and the heads are not "too" extended). | ||
2044 : | Anyway, the set of synonyms is returned as a list of 2-tuples [Id,length] sorted | ||
2045 : | by length. | ||
2046 : | |||
2047 : | =cut | ||
2048 : | |||
2049 : | sub mapped_prot_ids { | ||
2050 : | my($self,$id) = @_; | ||
2051 : | |||
2052 : | my $rdbH = $self->db_handle; | ||
2053 : | my $relational_db_response = $rdbH->SQL("SELECT maps_to FROM peg_synonyms WHERE syn_id = \'$id\' "); | ||
2054 : | if ($relational_db_response && (@$relational_db_response == 1)) | ||
2055 : | { | ||
2056 : | $id = $relational_db_response->[0]->[0]; | ||
2057 : | } | ||
2058 : | |||
2059 : | $relational_db_response = $rdbH->SQL("SELECT syn_id,syn_ln,maps_to_ln FROM peg_synonyms WHERE maps_to = \'$id\' "); | ||
2060 : | if ($relational_db_response && (@$relational_db_response > 0)) | ||
2061 : | { | ||
2062 : | return ([$id,$relational_db_response->[0]->[2]],map { [$_->[0],$_->[1]] } @$relational_db_response); | ||
2063 : | } | ||
2064 : | else | ||
2065 : | { | ||
2066 : | return ([$id,$self->translation_length($id)]); | ||
2067 : | } | ||
2068 : | overbeek | 1.14 | } |
2069 : | |||
2070 : | sub maps_to_id { | ||
2071 : | my($self,$id) = @_; | ||
2072 : | |||
2073 : | my $rdbH = $self->db_handle; | ||
2074 : | my $relational_db_response = $rdbH->SQL("SELECT maps_to FROM peg_synonyms WHERE syn_id = \'$id\' "); | ||
2075 : | return ($relational_db_response && (@$relational_db_response == 1)) ? $relational_db_response->[0]->[0] : $id; | ||
2076 : | efrank | 1.1 | } |
2077 : | |||
2078 : | ################ Assignments of Function to PEGs ########################## | ||
2079 : | |||
2080 : | =pod | ||
2081 : | |||
2082 : | =head1 function_of | ||
2083 : | |||
2084 : | usage: @functions = $fig->function_of($peg) OR | ||
2085 : | $function = $fig->function_of($peg,$user) | ||
2086 : | |||
2087 : | In a list context, you get back a list of 2-tuples. Each 2-tuple is of the | ||
2088 : | form [MadeBy,Function]. | ||
2089 : | |||
2090 : | In a scalar context, | ||
2091 : | |||
2092 : | 1. user is "master" if not specified | ||
2093 : | 2. function returned is the user's, if one exists; otherwise, master's, if one exists | ||
2094 : | |||
2095 : | In a scalar context, you get just the function. | ||
2096 : | |||
2097 : | =cut | ||
2098 : | |||
2099 : | # Note that we do not return confidence. I propose a separate function to get both | ||
2100 : | # function and confidence | ||
2101 : | # | ||
2102 : | sub function_of { | ||
2103 : | my($self,$id,$user) = @_; | ||
2104 : | my($relational_db_response,@tmp,$entry,$i); | ||
2105 : | my $wantarray = wantarray(); | ||
2106 : | my $rdbH = $self->db_handle; | ||
2107 : | |||
2108 : | if (($id =~ /^fig\|(\d+\.\d+\.peg\.\d+)/) && ($wantarray || $user)) | ||
2109 : | { | ||
2110 : | if (($relational_db_response = $rdbH->SQL("SELECT made_by,assigned_function FROM assigned_functions WHERE ( prot = \'$id\' )")) && | ||
2111 : | (@$relational_db_response >= 1)) | ||
2112 : | { | ||
2113 : | @tmp = sort { $a->[0] cmp $b->[0] } map { [$_->[0],$_->[1]] } @$relational_db_response; | ||
2114 : | for ($i=0; ($i < @tmp) && ($tmp[$i]->[0] ne "master"); $i++) {} | ||
2115 : | if ($i < @tmp) | ||
2116 : | { | ||
2117 : | $entry = splice(@tmp,$i,1); | ||
2118 : | unshift @tmp, ($entry); | ||
2119 : | } | ||
2120 : | |||
2121 : | my $val; | ||
2122 : | if ($wantarray) { return @tmp } | ||
2123 : | elsif ($user && ($val = &extract_by_who(\@tmp,$user))) { return $val } | ||
2124 : | elsif ($user && ($val = &extract_by_who(\@tmp,"master"))) { return $val } | ||
2125 : | else { return "" } | ||
2126 : | } | ||
2127 : | } | ||
2128 : | else | ||
2129 : | { | ||
2130 : | if (($relational_db_response = $rdbH->SQL("SELECT assigned_function FROM assigned_functions WHERE ( prot = \'$id\' AND made_by = \'master\' )")) && | ||
2131 : | (@$relational_db_response >= 1)) | ||
2132 : | { | ||
2133 : | return $wantarray ? (["master",$relational_db_response->[0]->[0]]) : $relational_db_response->[0]->[0]; | ||
2134 : | } | ||
2135 : | } | ||
2136 : | |||
2137 : | return $wantarray ? () : ""; | ||
2138 : | } | ||
2139 : | |||
2140 : | =pod | ||
2141 : | |||
2142 : | =head1 translated_function_of | ||
2143 : | |||
2144 : | usage: $function = $fig->translated_function_of($peg,$user) | ||
2145 : | |||
2146 : | You get just the translated function. | ||
2147 : | |||
2148 : | =cut | ||
2149 : | |||
2150 : | sub translated_function_of { | ||
2151 : | my($self,$id,$user) = @_; | ||
2152 : | |||
2153 : | my $func = $self->function_of($id,$user); | ||
2154 : | if ($func) | ||
2155 : | { | ||
2156 : | $func = $self->translate_function($func); | ||
2157 : | } | ||
2158 : | return $func; | ||
2159 : | } | ||
2160 : | |||
2161 : | |||
2162 : | sub extract_by_who { | ||
2163 : | my($xL,$who) = @_; | ||
2164 : | my($i); | ||
2165 : | |||
2166 : | for ($i=0; ($i < @$xL) && ($xL->[$i]->[0] ne $who); $i++) {} | ||
2167 : | return ($i < @$xL) ? $xL->[$i]->[1] : ""; | ||
2168 : | } | ||
2169 : | |||
2170 : | |||
2171 : | =pod | ||
2172 : | |||
2173 : | =head1 translate_function | ||
2174 : | |||
2175 : | usage: $translated_func = $fig->translate_function($func) | ||
2176 : | |||
2177 : | Translates a function based on the function.synonyms table. | ||
2178 : | |||
2179 : | =cut | ||
2180 : | |||
2181 : | sub translate_function { | ||
2182 : | my($self,$function) = @_; | ||
2183 : | |||
2184 : | my ($tran,$from,$to,$line); | ||
2185 : | if (! ($tran = $self->{_function_translation})) | ||
2186 : | { | ||
2187 : | $tran = {}; | ||
2188 : | if (open(TMP,"<$FIG_Config::global/function.synonyms")) | ||
2189 : | { | ||
2190 : | while (defined($line = <TMP>)) | ||
2191 : | { | ||
2192 : | golsen | 1.44 | chomp $line; |
2193 : | efrank | 1.1 | ($from,$to) = split(/\t/,$line); |
2194 : | $tran->{$from} = $to; | ||
2195 : | } | ||
2196 : | close(TMP); | ||
2197 : | } | ||
2198 : | overbeek | 1.22 | foreach $from (keys(%$tran)) |
2199 : | { | ||
2200 : | $to = $tran->{$from}; | ||
2201 : | if ($tran->{$to}) | ||
2202 : | { | ||
2203 : | delete $tran->{$from}; | ||
2204 : | } | ||
2205 : | } | ||
2206 : | efrank | 1.1 | $self->{_function_translation} = $tran; |
2207 : | } | ||
2208 : | overbeek | 1.4 | |
2209 : | while ($to = $tran->{$function}) | ||
2210 : | { | ||
2211 : | $function = $to; | ||
2212 : | } | ||
2213 : | return $function; | ||
2214 : | efrank | 1.1 | } |
2215 : | |||
2216 : | =pod | ||
2217 : | |||
2218 : | =head1 assign_function | ||
2219 : | |||
2220 : | usage: $fig->assign_function($peg,$user,$function,$confidence) | ||
2221 : | |||
2222 : | Assigns a function. Note that confidence can (and should be if unusual) included. | ||
2223 : | Note that no annotation is written. This should normally be done in a separate | ||
2224 : | call of the form | ||
2225 : | |||
2226 : | |||
2227 : | |||
2228 : | =cut | ||
2229 : | |||
2230 : | sub assign_function { | ||
2231 : | my($self,$peg,$user,$function,$confidence) = @_; | ||
2232 : | my($role,$roleQ); | ||
2233 : | |||
2234 : | my $rdbH = $self->db_handle; | ||
2235 : | $confidence = $confidence ? $confidence : ""; | ||
2236 : | my $genome = $self->genome_of($peg); | ||
2237 : | |||
2238 : | $rdbH->SQL("DELETE FROM assigned_functions WHERE ( prot = \'$peg\' AND made_by = \'$user\' )"); | ||
2239 : | |||
2240 : | my $funcQ = quotemeta $function; | ||
2241 : | $rdbH->SQL("INSERT INTO assigned_functions ( prot, made_by, assigned_function, quality, org ) VALUES ( \'$peg\', \'$user\', \'$funcQ\', \'$confidence\', \'$genome\' )"); | ||
2242 : | $rdbH->SQL("DELETE FROM roles WHERE ( prot = \'$peg\' AND made_by = \'$user\' )"); | ||
2243 : | |||
2244 : | foreach $role (&roles_of_function($function)) | ||
2245 : | { | ||
2246 : | $roleQ = quotemeta $role; | ||
2247 : | $rdbH->SQL("INSERT INTO roles ( prot, role, made_by, org ) VALUES ( \'$peg\', '$roleQ\', \'$user\', \'$genome\' )"); | ||
2248 : | } | ||
2249 : | |||
2250 : | &verify_dir("$FIG_Config::organisms/$genome/UserModels"); | ||
2251 : | if ($user ne "master") | ||
2252 : | { | ||
2253 : | &verify_dir("$FIG_Config::organisms/$genome/UserModels/$user"); | ||
2254 : | } | ||
2255 : | |||
2256 : | overbeek | 1.66 | my $file; |
2257 : | if ((($user eq "master") && ($file = "$FIG_Config::organisms/$genome/assigned_functions") && open(TMP,">>$file")) || | ||
2258 : | (($user ne "master") && ($file = "$FIG_Config::organisms/$genome/UserModels/$user/assigned_functions") && open(TMP,">>$file"))) | ||
2259 : | efrank | 1.1 | { |
2260 : | flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions"; | ||
2261 : | seek(TMP,0,2) || confess "failed to seek to the end of the file"; | ||
2262 : | print TMP "$peg\t$function\t$confidence\n"; | ||
2263 : | close(TMP); | ||
2264 : | overbeek | 1.66 | chmod(0777,$file); |
2265 : | efrank | 1.1 | return 1; |
2266 : | } | ||
2267 : | return 0; | ||
2268 : | } | ||
2269 : | |||
2270 : | sub hypo { | ||
2271 : | my $x = (@_ == 1) ? $_[0] : $_[1]; | ||
2272 : | |||
2273 : | overbeek | 1.23 | if (! $x) { return 1 } |
2274 : | if ($x =~ /hypoth/i) { return 1 } | ||
2275 : | if ($x =~ /conserved protein/i) { return 1 } | ||
2276 : | overbeek | 1.63 | if ($x =~ /gene product/i) { return 1 } |
2277 : | if ($x =~ /interpro/i) { return 1 } | ||
2278 : | if ($x =~ /B[sl][lr]\d/i) { return 1 } | ||
2279 : | if ($x =~ /^U\d/) { return 1 } | ||
2280 : | if ($x =~ /^orf/i) { return 1 } | ||
2281 : | if ($x =~ /uncharacterized/i) { return 1 } | ||
2282 : | if ($x =~ /psedogene/i) { return 1 } | ||
2283 : | if ($x =~ /^predicted/i) { return 1 } | ||
2284 : | if ($x =~ /AGR_/) { return 1 } | ||
2285 : | overbeek | 1.51 | if ($x =~ /similar to/i) { return 1 } |
2286 : | overbeek | 1.63 | if ($x =~ /similarity/i) { return 1 } |
2287 : | if ($x =~ /glimmer/i) { return 1 } | ||
2288 : | overbeek | 1.23 | if ($x =~ /unknown/i) { return 1 } |
2289 : | return 0; | ||
2290 : | efrank | 1.1 | } |
2291 : | |||
2292 : | ############################ Similarities ############################### | ||
2293 : | |||
2294 : | =pod | ||
2295 : | |||
2296 : | =head1 sims | ||
2297 : | |||
2298 : | usage: @sims = $fig->sims($peg,$maxN,$maxP,$select) | ||
2299 : | |||
2300 : | Returns a list of similarities for $peg such that | ||
2301 : | |||
2302 : | there will be at most $maxN similarities, | ||
2303 : | |||
2304 : | each similarity will have a P-score <= $maxP, and | ||
2305 : | |||
2306 : | $select gives processing instructions: | ||
2307 : | |||
2308 : | "raw" means that the similarities will not be expanded (by far fastest option) | ||
2309 : | "fig" means return only similarities to fig genes | ||
2310 : | "all" means that you want all the expanded similarities. | ||
2311 : | |||
2312 : | By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant | ||
2313 : | protein collection, and converting it to a set of similarities (one for each of the | ||
2314 : | proteins that are essentially identical to the representative in the nr). | ||
2315 : | |||
2316 : | =cut | ||
2317 : | |||
2318 : | sub sims { | ||
2319 : | overbeek | 1.29 | my ($self,$id,$maxN,$maxP,$select,$max_expand) = @_; |
2320 : | efrank | 1.1 | my($sim); |
2321 : | overbeek | 1.29 | $max_expand = defined($max_expand) ? $max_expand : $maxN; |
2322 : | efrank | 1.1 | |
2323 : | my @sims = (); | ||
2324 : | my @maps_to = $self->mapped_prot_ids($id); | ||
2325 : | if (@maps_to > 0) | ||
2326 : | { | ||
2327 : | my $rep_id = $maps_to[0]->[0]; | ||
2328 : | my @entry = grep { $_->[0] eq $id } @maps_to; | ||
2329 : | if ((@entry == 1) && defined($entry[0]->[1])) | ||
2330 : | { | ||
2331 : | if ((! defined($maps_to[0]->[1])) || | ||
2332 : | (! defined($entry[0]->[1]))) | ||
2333 : | { | ||
2334 : | print STDERR &Dumper(\@maps_to,\@entry); | ||
2335 : | confess "bad"; | ||
2336 : | } | ||
2337 : | my $delta = $maps_to[0]->[1] - $entry[0]->[1]; | ||
2338 : | my @raw_sims = &get_raw_sims($self,$rep_id,$maxN,$maxP); | ||
2339 : | efrank | 1.2 | if ($id ne $rep_id) |
2340 : | efrank | 1.1 | { |
2341 : | efrank | 1.2 | foreach $sim (@raw_sims) |
2342 : | { | ||
2343 : | efrank | 1.1 | |
2344 : | $sim->[0] = $id; | ||
2345 : | $sim->[6] -= $delta; | ||
2346 : | $sim->[7] -= $delta; | ||
2347 : | } | ||
2348 : | } | ||
2349 : | efrank | 1.2 | unshift(@raw_sims,bless([$id,$rep_id,100.00,undef,undef,undef,1,$entry[0]->[1],$delta+1,$maps_to[0]->[1],0.0,,undef,$entry[0]->[1],$maps_to[0]->[1],"blastp",0,0],'Sim')); |
2350 : | overbeek | 1.37 | @sims = grep { $_->id1 ne $_->id2 } &expand_raw_sims($self,\@raw_sims,$maxP,$select,0,$max_expand+1); |
2351 : | efrank | 1.1 | } |
2352 : | } | ||
2353 : | return @sims; | ||
2354 : | } | ||
2355 : | |||
2356 : | sub expand_raw_sims { | ||
2357 : | overbeek | 1.29 | my($self,$raw_sims,$maxP,$select,$dups,$max_expand) = @_; |
2358 : | efrank | 1.1 | my($sim,$id2,%others,$x); |
2359 : | |||
2360 : | my @sims = (); | ||
2361 : | foreach $sim (@$raw_sims) | ||
2362 : | { | ||
2363 : | next if ($sim->psc > $maxP); | ||
2364 : | $id2 = $sim->id2; | ||
2365 : | next if ($others{$id2} && (! $dups)); | ||
2366 : | $others{$id2} = 1; | ||
2367 : | overbeek | 1.37 | if (($select && ($select eq "raw")) || ($max_expand <= 0)) |
2368 : | efrank | 1.1 | { |
2369 : | push(@sims,$sim); | ||
2370 : | } | ||
2371 : | else | ||
2372 : | { | ||
2373 : | my @relevant; | ||
2374 : | overbeek | 1.29 | $max_expand--; |
2375 : | |||
2376 : | efrank | 1.1 | my @maps_to = $self->mapped_prot_ids($id2); |
2377 : | if ((! $select) || ($select eq "fig")) | ||
2378 : | { | ||
2379 : | @relevant = grep { $_->[0] =~ /^fig/ } @maps_to; | ||
2380 : | } | ||
2381 : | elsif ($select && ($select =~ /^ext/i)) | ||
2382 : | { | ||
2383 : | @relevant = grep { $_->[0] !~ /^fig/ } @maps_to; | ||
2384 : | } | ||
2385 : | else | ||
2386 : | { | ||
2387 : | @relevant = @maps_to; | ||
2388 : | } | ||
2389 : | |||
2390 : | foreach $x (@relevant) | ||
2391 : | { | ||
2392 : | my $sim1 = [@$sim]; | ||
2393 : | my($x_id,$x_ln) = @$x; | ||
2394 : | defined($x_ln) || confess "x_ln id2=$id2 x_id=$x_id"; | ||
2395 : | defined($maps_to[0]->[1]) || confess "maps_to"; | ||
2396 : | my $delta2 = $maps_to[0]->[1] - $x_ln; | ||
2397 : | $sim1->[1] = $x_id; | ||
2398 : | $sim1->[8] -= $delta2; | ||
2399 : | $sim1->[9] -= $delta2; | ||
2400 : | bless($sim1,"Sim"); | ||
2401 : | push(@sims,$sim1); | ||
2402 : | } | ||
2403 : | } | ||
2404 : | } | ||
2405 : | return @sims; | ||
2406 : | } | ||
2407 : | |||
2408 : | sub get_raw_sims { | ||
2409 : | my($self,$rep_id,$maxN,$maxP) = @_; | ||
2410 : | my(@sims,$seek,$fileN,$ln,$fh,$file,$readN,$readC,@lines,$i,$sim); | ||
2411 : | my($sim_chunk,$psc,$id2); | ||
2412 : | |||
2413 : | $maxN = $maxN ? $maxN : 500; | ||
2414 : | |||
2415 : | @sims = (); | ||
2416 : | my $rdbH = $self->db_handle; | ||
2417 : | my $relational_db_response = $rdbH->SQL("SELECT seek, fileN, len FROM sim_seeks WHERE id = \'$rep_id\' "); | ||
2418 : | foreach $sim_chunk (@$relational_db_response) | ||
2419 : | { | ||
2420 : | ($seek,$fileN,$ln) = @$sim_chunk; | ||
2421 : | $file = $self->N2file($fileN); | ||
2422 : | $fh = $self->openF($file); | ||
2423 : | if (! $fh) | ||
2424 : | { | ||
2425 : | confess "could not open sims for $file"; | ||
2426 : | } | ||
2427 : | seek($fh,$seek,0); | ||
2428 : | $readN = read($fh,$readC,$ln-1); | ||
2429 : | ($readN == ($ln-1)) | ||
2430 : | || confess "could not read the block of sims at $seek for $ln - 1 characters; $readN actually read from $file\n$readC"; | ||
2431 : | @lines = grep { | ||
2432 : | (@$_ == 15) && | ||
2433 : | ($_->[12] =~ /^\d+$/) && | ||
2434 : | ($_->[13] =~ /^\d+$/) && | ||
2435 : | ($_->[6] =~ /^\d+$/) && | ||
2436 : | ($_->[7] =~ /^\d+$/) && | ||
2437 : | ($_->[8] =~ /^\d+$/) && | ||
2438 : | ($_->[9] =~ /^\d+$/) && | ||
2439 : | ($_->[2] =~ /^[0-9.]+$/) && | ||
2440 : | ($_->[10] =~ /^[0-9.e-]+$/) | ||
2441 : | } | ||
2442 : | map { [split(/\t/,$_),"blastp"] } | ||
2443 : | split(/\n/,$readC); | ||
2444 : | |||
2445 : | @lines = sort { $a->[10] <=> $b->[10] } @lines; | ||
2446 : | |||
2447 : | for ($i=0; ($i < @lines); $i++) | ||
2448 : | { | ||
2449 : | $psc = $lines[$i]->[10]; | ||
2450 : | $id2 = $lines[$i]->[1]; | ||
2451 : | if ($maxP >= $psc) | ||
2452 : | { | ||
2453 : | $sim = $lines[$i]; | ||
2454 : | bless($sim,"Sim"); | ||
2455 : | push(@sims,$sim); | ||
2456 : | if (@sims == $maxN) { return @sims } | ||
2457 : | } | ||
2458 : | } | ||
2459 : | } | ||
2460 : | return @sims; | ||
2461 : | } | ||
2462 : | |||
2463 : | overbeek | 1.73 | sub bbhs { |
2464 : | my($self,$peg,$cutoff) = @_; | ||
2465 : | overbeek | 1.74 | my($sim,$peg2,$genome2,$i,@sims2,%seen); |
2466 : | overbeek | 1.73 | |
2467 : | $cutoff = defined($cutoff) ? $cutoff : 1.0e-10; | ||
2468 : | my @bbhs = (); | ||
2469 : | |||
2470 : | my $genome1 = $self->genome_of($peg); | ||
2471 : | overbeek | 1.74 | $seen{$genome1} = 1; |
2472 : | overbeek | 1.73 | foreach $sim ($self->sims($peg,10000,$cutoff,"fig")) |
2473 : | { | ||
2474 : | $peg2 = $sim->id2; | ||
2475 : | $genome2 = $self->genome_of($peg2); | ||
2476 : | overbeek | 1.74 | next if ($seen{$genome2}); |
2477 : | $seen{$genome2} = 1; | ||
2478 : | overbeek | 1.73 | @sims2 = $self->sims($peg2,10000,$cutoff,"fig"); |
2479 : | for ($i=0; ($i < @sims2) && ($self->genome_of($sims2[$i]->id2) ne $genome1); $i++) {} | ||
2480 : | if (($i < @sims2) && ($sims2[$i]->id2 eq $peg)) | ||
2481 : | { | ||
2482 : | push(@bbhs,[$peg2,$sim->psc]); | ||
2483 : | } | ||
2484 : | } | ||
2485 : | return @bbhs; | ||
2486 : | } | ||
2487 : | |||
2488 : | efrank | 1.1 | =pod |
2489 : | |||
2490 : | =head1 dsims | ||
2491 : | |||
2492 : | usage: @sims = $fig->dsims($peg,$maxN,$maxP,$select) | ||
2493 : | |||
2494 : | Returns a list of similarities for $peg such that | ||
2495 : | |||
2496 : | there will be at most $maxN similarities, | ||
2497 : | |||
2498 : | each similarity will have a P-score <= $maxP, and | ||
2499 : | |||
2500 : | $select gives processing instructions: | ||
2501 : | |||
2502 : | "raw" means that the similarities will not be expanded (by far fastest option) | ||
2503 : | "fig" means return only similarities to fig genes | ||
2504 : | "all" means that you want all the expanded similarities. | ||
2505 : | |||
2506 : | By "expanded", we refer to taking a "raw similarity" against an entry in the non-redundant | ||
2507 : | protein collection, and converting it to a set of similarities (one for each of the | ||
2508 : | proteins that are essentially identical to the representative in the nr). | ||
2509 : | |||
2510 : | The "dsims" or "dynamic sims" are not precomputed. They are computed using a heuristic which | ||
2511 : | is much faster than blast, but misses some similarities. Essentially, you have an "index" or | ||
2512 : | representative sequences, a quick blast is done against it, and if there are any hits these are | ||
2513 : | used to indicate which sub-databases to blast against. | ||
2514 : | |||
2515 : | =cut | ||
2516 : | |||
2517 : | sub dsims { | ||
2518 : | my($self,$id,$seq,$maxN,$maxP,$select) = @_; | ||
2519 : | my($sim,$sub_dir,$db,$hit,@hits,%in); | ||
2520 : | |||
2521 : | my @index = &blastit($id,$seq,"$FIG_Config::global/SimGen/exemplar.fasta",1.0e-3); | ||
2522 : | foreach $sim (@index) | ||
2523 : | { | ||
2524 : | if ($sim->id2 =~ /_(\d+)$/) | ||
2525 : | { | ||
2526 : | $in{$1}++; | ||
2527 : | } | ||
2528 : | } | ||
2529 : | |||
2530 : | @hits = (); | ||
2531 : | foreach $db (keys(%in)) | ||
2532 : | { | ||
2533 : | $sub_dir = $db % 1000; | ||
2534 : | push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/AccessSets/$sub_dir/$db",$maxP)); | ||
2535 : | |||
2536 : | } | ||
2537 : | |||
2538 : | if (@hits == 0) | ||
2539 : | { | ||
2540 : | push(@hits,&blastit($id,$seq,"$FIG_Config::global/SimGen/nohit.fasta",$maxP)); | ||
2541 : | } | ||
2542 : | |||
2543 : | @hits = sort { ($a->psc <=> $b->psc) or ($a->iden cmp $b->iden) } grep { $_->id2 ne $id } @hits; | ||
2544 : | if ($maxN && ($maxN < @hits)) { $#hits = $maxN - 1 } | ||
2545 : | overbeek | 1.69 | return &expand_raw_sims($self,\@hits,$maxP,$select); |
2546 : | efrank | 1.1 | } |
2547 : | |||
2548 : | sub blastit { | ||
2549 : | my($id,$seq,$db,$maxP) = @_; | ||
2550 : | |||
2551 : | if (! $maxP) { $maxP = 1.0e-5 } | ||
2552 : | my $tmp = &Blast::blastp([[$id,$seq]],$db,"-e $maxP"); | ||
2553 : | my $tmp1 = $tmp->{$id}; | ||
2554 : | if ($tmp1) | ||
2555 : | { | ||
2556 : | return @$tmp1; | ||
2557 : | } | ||
2558 : | return (); | ||
2559 : | } | ||
2560 : | |||
2561 : | overbeek | 1.33 | sub related_by_func_sim { |
2562 : | my($self,$peg,$user) = @_; | ||
2563 : | my($func,$sim,$id2,%related); | ||
2564 : | |||
2565 : | if (($func = $self->function_of($peg,$user)) && (! &FIG::hypo($func))) | ||
2566 : | { | ||
2567 : | foreach $sim ($self->sims($peg,500,1,"fig",500)) | ||
2568 : | { | ||
2569 : | $id2 = $sim->id2; | ||
2570 : | if ($func eq $self->function_of($id2,$user)) | ||
2571 : | { | ||
2572 : | $related{$id2} = 1; | ||
2573 : | } | ||
2574 : | } | ||
2575 : | } | ||
2576 : | return keys(%related); | ||
2577 : | } | ||
2578 : | |||
2579 : | efrank | 1.1 | ################################# chromosomal clusters #################################### |
2580 : | |||
2581 : | =pod | ||
2582 : | |||
2583 : | =head1 in_cluster_with | ||
2584 : | |||
2585 : | usage: @pegs = $fig->in_cluster_with($peg) | ||
2586 : | |||
2587 : | Returns the set of pegs that are thought to be clustered with $peg (on the | ||
2588 : | chromosome). | ||
2589 : | |||
2590 : | =cut | ||
2591 : | |||
2592 : | sub in_cluster_with { | ||
2593 : | my($self,$peg) = @_; | ||
2594 : | my($set,$id,%in); | ||
2595 : | |||
2596 : | return $self->in_set_with($peg,"chromosomal_clusters","cluster_id"); | ||
2597 : | } | ||
2598 : | |||
2599 : | =pod | ||
2600 : | |||
2601 : | =head1 add_chromosomal_clusters | ||
2602 : | |||
2603 : | usage: $fig->add_chromosomal_clusters($file) | ||
2604 : | |||
2605 : | The given file is supposed to contain one predicted chromosomal cluster per line (either | ||
2606 : | comma or tab separated pegs). These will be added (to the extent they are new) to those | ||
2607 : | already in $FIG_Config::global/chromosomal_clusters. | ||
2608 : | |||
2609 : | =cut | ||
2610 : | |||
2611 : | |||
2612 : | sub add_chromosomal_clusters { | ||
2613 : | my($self,$file) = @_; | ||
2614 : | my($set,$added); | ||
2615 : | |||
2616 : | open(TMPCLUST,"<$file") | ||
2617 : | || die "aborted"; | ||
2618 : | while (defined($set = <TMPCLUST>)) | ||
2619 : | { | ||
2620 : | print STDERR "."; | ||
2621 : | golsen | 1.44 | chomp $set; |
2622 : | efrank | 1.1 | $added += $self->add_chromosomal_cluster([split(/[\t,]+/,$set)]); |
2623 : | } | ||
2624 : | close(TMPCLUST); | ||
2625 : | |||
2626 : | if ($added) | ||
2627 : | { | ||
2628 : | my $rdbH = $self->db_handle; | ||
2629 : | $self->export_set("chromosomal_clusters","cluster_id","$FIG_Config::global/chromosomal_clusters"); | ||
2630 : | return 1; | ||
2631 : | } | ||
2632 : | return 0; | ||
2633 : | } | ||
2634 : | |||
2635 : | #=pod | ||
2636 : | # | ||
2637 : | #=head1 export_chromosomal_clusters | ||
2638 : | # | ||
2639 : | #usage: $fig->export_chromosomal_clusters | ||
2640 : | # | ||
2641 : | #Invoking this routine writes the set of chromosomal clusters as known in the | ||
2642 : | #relational DB back to $FIG_Config::global/chromosomal_clusters. | ||
2643 : | # | ||
2644 : | #=cut | ||
2645 : | # | ||
2646 : | sub export_chromosomal_clusters { | ||
2647 : | my($self) = @_; | ||
2648 : | |||
2649 : | $self->export_set("chromosomal_clusters","cluster_id","$FIG_Config::global/chromosomal_clusters"); | ||
2650 : | } | ||
2651 : | |||
2652 : | sub add_chromosomal_cluster { | ||
2653 : | my($self,$ids) = @_; | ||
2654 : | my($id,$set,%existing,%in,$new,$existing,$new_id); | ||
2655 : | |||
2656 : | # print STDERR "adding cluster ",join(",",@$ids),"\n"; | ||
2657 : | foreach $id (@$ids) | ||
2658 : | { | ||
2659 : | foreach $set ($self->in_sets($id,"chromosomal_clusters","cluster_id")) | ||
2660 : | { | ||
2661 : | $existing{$set} = 1; | ||
2662 : | foreach $id ($self->ids_in_set($set,"chromosomal_clusters","cluster_id")) | ||
2663 : | { | ||
2664 : | $in{$id} = 1; | ||
2665 : | } | ||
2666 : | } | ||
2667 : | } | ||
2668 : | # print &Dumper(\%existing,\%in); | ||
2669 : | |||
2670 : | $new = 0; | ||
2671 : | foreach $id (@$ids) | ||
2672 : | { | ||
2673 : | if (! $in{$id}) | ||
2674 : | { | ||
2675 : | $in{$id} = 1; | ||
2676 : | $new++; | ||
2677 : | } | ||
2678 : | } | ||
2679 : | # print STDERR "$new new ids\n"; | ||
2680 : | if ($new) | ||
2681 : | { | ||
2682 : | foreach $existing (keys(%existing)) | ||
2683 : | { | ||
2684 : | $self->delete_set($existing,"chromosomal_clusters","cluster_id"); | ||
2685 : | } | ||
2686 : | $new_id = $self->next_set("chromosomal_clusters","cluster_id"); | ||
2687 : | # print STDERR "adding new cluster $new_id\n"; | ||
2688 : | $self->insert_set($new_id,[keys(%in)],"chromosomal_clusters","cluster_id"); | ||
2689 : | return 1; | ||
2690 : | } | ||
2691 : | return 0; | ||
2692 : | } | ||
2693 : | |||
2694 : | ################################# PCH pins #################################### | ||
2695 : | |||
2696 : | =pod | ||
2697 : | |||
2698 : | =head1 in_pch_pin_with | ||
2699 : | |||
2700 : | usage: $fig->in_pch_pin_with($peg) | ||
2701 : | |||
2702 : | Returns the set of pegs that are believed to be "pinned" to $peg (in the | ||
2703 : | sense that PCHs occur containing these pegs over significant phylogenetic | ||
2704 : | distances). | ||
2705 : | |||
2706 : | =cut | ||
2707 : | |||
2708 : | sub in_pch_pin_with { | ||
2709 : | my($self,$peg) = @_; | ||
2710 : | my($set,$id,%in); | ||
2711 : | |||
2712 : | return $self->in_set_with($peg,"pch_pins","pin"); | ||
2713 : | } | ||
2714 : | |||
2715 : | =pod | ||
2716 : | |||
2717 : | =head1 add_pch_pins | ||
2718 : | |||
2719 : | usage: $fig->add_pch_pins($file) | ||
2720 : | |||
2721 : | The given file is supposed to contain one set of pinned pegs per line (either | ||
2722 : | comma or tab seprated pegs). These will be added (to the extent they are new) to those | ||
2723 : | already in $FIG_Config::global/pch_pins. | ||
2724 : | |||
2725 : | =cut | ||
2726 : | |||
2727 : | sub add_pch_pins { | ||
2728 : | my($self,$file) = @_; | ||
2729 : | my($set,$added); | ||
2730 : | |||
2731 : | open(TMPCLUST,"<$file") | ||
2732 : | || die "aborted"; | ||
2733 : | while (defined($set = <TMPCLUST>)) | ||
2734 : | { | ||
2735 : | print STDERR "."; | ||
2736 : | golsen | 1.44 | chomp $set; |
2737 : | efrank | 1.1 | my @tmp = split(/[\t,]+/,$set); |
2738 : | if (@tmp < 200) | ||
2739 : | { | ||
2740 : | $added += $self->add_pch_pin([@tmp]); | ||
2741 : | } | ||
2742 : | } | ||
2743 : | close(TMPCLUST); | ||
2744 : | |||
2745 : | if ($added) | ||
2746 : | { | ||
2747 : | my $rdbH = $self->db_handle; | ||
2748 : | $self->export_set("pch_pins","pin","$FIG_Config::global/pch_pins"); | ||
2749 : | return 1; | ||
2750 : | } | ||
2751 : | return 0; | ||
2752 : | } | ||
2753 : | |||
2754 : | sub export_pch_pins { | ||
2755 : | my($self) = @_; | ||
2756 : | |||
2757 : | $self->export_set("pch_pins","pin","$FIG_Config::global/pch_pins"); | ||
2758 : | } | ||
2759 : | |||
2760 : | sub add_pch_pin { | ||
2761 : | my($self,$ids) = @_; | ||
2762 : | my($id,$set,%existing,%in,$new,$existing,$new_id); | ||
2763 : | |||
2764 : | # print STDERR "adding cluster ",join(",",@$ids),"\n"; | ||
2765 : | foreach $id (@$ids) | ||
2766 : | { | ||
2767 : | foreach $set ($self->in_sets($id,"pch_pins","pin")) | ||
2768 : | { | ||
2769 : | $existing{$set} = 1; | ||
2770 : | foreach $id ($self->ids_in_set($set,"pch_pins","pin")) | ||
2771 : | { | ||
2772 : | $in{$id} = 1; | ||
2773 : | } | ||
2774 : | } | ||
2775 : | } | ||
2776 : | # print &Dumper(\%existing,\%in); | ||
2777 : | |||
2778 : | $new = 0; | ||
2779 : | foreach $id (@$ids) | ||
2780 : | { | ||
2781 : | if (! $in{$id}) | ||
2782 : | { | ||
2783 : | $in{$id} = 1; | ||
2784 : | $new++; | ||
2785 : | } | ||
2786 : | } | ||
2787 : | |||
2788 : | if ($new) | ||
2789 : | { | ||
2790 : | overbeek | 1.9 | if (keys(%in) < 300) |
2791 : | efrank | 1.1 | { |
2792 : | overbeek | 1.9 | foreach $existing (keys(%existing)) |
2793 : | { | ||
2794 : | $self->delete_set($existing,"pch_pins","pin"); | ||
2795 : | } | ||
2796 : | $new_id = $self->next_set("pch_pins","pin"); | ||
2797 : | # print STDERR "adding new pin $new_id\n"; | ||
2798 : | $self->insert_set($new_id,[keys(%in)],"pch_pins","pin"); | ||
2799 : | } | ||
2800 : | else | ||
2801 : | { | ||
2802 : | $new_id = $self->next_set("pch_pins","pin"); | ||
2803 : | # print STDERR "adding new pin $new_id\n"; | ||
2804 : | $self->insert_set($new_id,$ids,"pch_pins","pin"); | ||
2805 : | efrank | 1.1 | } |
2806 : | return 1; | ||
2807 : | } | ||
2808 : | return 0; | ||
2809 : | } | ||
2810 : | |||
2811 : | ################################# Annotations #################################### | ||
2812 : | |||
2813 : | =pod | ||
2814 : | |||
2815 : | =head1 add_annotation | ||
2816 : | |||
2817 : | usage: $fig->add_annotation($fid,$user,$annotation) | ||
2818 : | |||
2819 : | $annotation is added as a time-stamped annotation to $peg showing $user as the | ||
2820 : | individual who added the annotation. | ||
2821 : | |||
2822 : | =cut | ||
2823 : | |||
2824 : | sub add_annotation { | ||
2825 : | my($self,$feature_id,$user,$annotation) = @_; | ||
2826 : | my($genome); | ||
2827 : | |||
2828 : | # print STDERR "add: fid=$feature_id user=$user annotation=$annotation\n"; | ||
2829 : | if ($genome = $self->genome_of($feature_id)) | ||
2830 : | { | ||
2831 : | my $file = "$FIG_Config::organisms/$genome/annotations"; | ||
2832 : | my $fileno = $self->file2N($file); | ||
2833 : | my $time_made = time; | ||
2834 : | overbeek | 1.17 | my $ma = ($annotation =~ /^Set master function to/); |
2835 : | |||
2836 : | efrank | 1.1 | |
2837 : | if (open(TMP,">>$file")) | ||
2838 : | { | ||
2839 : | flock(TMP,LOCK_EX) || confess "cannot lock assigned_functions"; | ||
2840 : | seek(TMP,0,2) || confess "failed to seek to the end of the file"; | ||
2841 : | |||
2842 : | my $seek1 = tell TMP; | ||
2843 : | print TMP "$feature_id\n$time_made\n$user\n$annotation", (substr($annotation,-1) eq "\n") ? "" : "\n","//\n"; | ||
2844 : | my $seek2 = tell TMP; | ||
2845 : | close(TMP); | ||
2846 : | disz | 1.60 | chmod 02777, $file; |
2847 : | efrank | 1.1 | my $ln = $seek2 - $seek1; |
2848 : | my $rdbH = $self->db_handle; | ||
2849 : | overbeek | 1.17 | if ($rdbH->SQL("INSERT INTO annotation_seeks ( fid, dateof, who, ma, fileno, seek, len ) VALUES ( \'$feature_id\', $time_made, \'$user\', \'$ma\', $fileno, $seek1, $ln )")) |
2850 : | efrank | 1.1 | { |
2851 : | return 1; | ||
2852 : | } | ||
2853 : | } | ||
2854 : | } | ||
2855 : | return 0; | ||
2856 : | } | ||
2857 : | |||
2858 : | =pod | ||
2859 : | |||
2860 : | overbeek | 1.33 | =head1 merged_related_annotations |
2861 : | |||
2862 : | usage: @annotations = $fig->merged_related_annotations($fids) | ||
2863 : | |||
2864 : | The set of annotations of a set of PEGs ($fids) is returned as a list of 4-tuples. | ||
2865 : | Each entry in the list is of the form [$fid,$timestamp,$user,$annotation]. | ||
2866 : | |||
2867 : | =cut | ||
2868 : | |||
2869 : | sub merged_related_annotations { | ||
2870 : | my($self,$fids) = @_; | ||
2871 : | my($fid); | ||
2872 : | my(@ann) = (); | ||
2873 : | |||
2874 : | foreach $fid (@$fids) | ||
2875 : | { | ||
2876 : | push(@ann,$self->feature_annotations1($fid)); | ||
2877 : | } | ||
2878 : | return map { $_->[1] = localtime($_->[1]); $_ } sort { $a->[1] <=> $b->[1] } @ann; | ||
2879 : | } | ||
2880 : | |||
2881 : | =pod | ||
2882 : | |||
2883 : | efrank | 1.1 | =head1 feature_annotations |
2884 : | |||
2885 : | usage: @annotations = $fig->feature_annotations($fid) | ||
2886 : | |||
2887 : | The set of annotations of $fid is returned as a list of 4-tuples. Each entry in the list | ||
2888 : | is of the form [$fid,$timestamp,$user,$annotation]. | ||
2889 : | |||
2890 : | =cut | ||
2891 : | |||
2892 : | |||
2893 : | sub feature_annotations { | ||
2894 : | my($self,$feature_id) = @_; | ||
2895 : | overbeek | 1.33 | |
2896 : | return map { $_->[1] = localtime($_->[1]); $_ } $self->feature_annotations1($feature_id); | ||
2897 : | } | ||
2898 : | |||
2899 : | sub feature_annotations1 { | ||
2900 : | my($self,$feature_id) = @_; | ||
2901 : | overbeek | 1.16 | my($tuple,$fileN,$seek,$ln,$annotation,$feature_idQ); |
2902 : | efrank | 1.1 | my($file,$fh); |
2903 : | |||
2904 : | my $rdbH = $self->db_handle; | ||
2905 : | my $relational_db_response = $rdbH->SQL("SELECT fileno, seek, len FROM annotation_seeks WHERE fid = \'$feature_id\' "); | ||
2906 : | my @annotations = (); | ||
2907 : | |||
2908 : | foreach $tuple (@$relational_db_response) | ||
2909 : | { | ||
2910 : | ($fileN,$seek,$ln) = @$tuple; | ||
2911 : | overbeek | 1.16 | $annotation = $self->read_annotation($fileN,$seek,$ln); |
2912 : | $feature_idQ = quotemeta $feature_id; | ||
2913 : | if ($annotation =~ /^$feature_idQ\n(\d+)\n([^\n]+)\n(.*)/s) | ||
2914 : | efrank | 1.1 | { |
2915 : | overbeek | 1.16 | push(@annotations,[$feature_id,$1,$2,$3]); |
2916 : | efrank | 1.1 | } |
2917 : | overbeek | 1.16 | else |
2918 : | efrank | 1.1 | { |
2919 : | overbeek | 1.16 | print STDERR "malformed annotation\n$annotation\n"; |
2920 : | efrank | 1.1 | } |
2921 : | } | ||
2922 : | overbeek | 1.33 | return sort { $a->[1] <=> $b->[1] } @annotations; |
2923 : | overbeek | 1.16 | } |
2924 : | |||
2925 : | sub read_annotation { | ||
2926 : | my($self,$fileN,$seek,$ln) = @_; | ||
2927 : | my($readN,$readC); | ||
2928 : | |||
2929 : | my $file = $self->N2file($fileN); | ||
2930 : | my $fh = $self->openF($file); | ||
2931 : | if (! $fh) | ||
2932 : | { | ||
2933 : | confess "could not open annotations for $file"; | ||
2934 : | } | ||
2935 : | seek($fh,$seek,0); | ||
2936 : | overbeek | 1.24 | $readN = read($fh,$readC,$ln-3); |
2937 : | ($readN == ($ln-3)) | ||
2938 : | overbeek | 1.16 | || confess "could not read the block of annotations at $seek for $ln characters; $readN actually read from $file\n$readC"; |
2939 : | return $readC; | ||
2940 : | overbeek | 1.17 | } |
2941 : | |||
2942 : | overbeek | 1.21 | sub epoch_to_readable { |
2943 : | my($epoch) = @_; | ||
2944 : | |||
2945 : | my($sec,$min,$hr,$dd,$mm,$yr) = localtime($epoch); | ||
2946 : | $mm++; | ||
2947 : | $yr += 1900; | ||
2948 : | return "$mm-$dd-$yr:$hr:$min:$sec"; | ||
2949 : | } | ||
2950 : | |||
2951 : | overbeek | 1.17 | sub assignments_made { |
2952 : | my($self,$genomes,$who,$date) = @_; | ||
2953 : | my($relational_db_response,$entry,$fid,$fileno,$seek,$len,$ann); | ||
2954 : | overbeek | 1.30 | my($epoch_date,$when,%sofar,$x); |
2955 : | overbeek | 1.17 | |
2956 : | overbeek | 1.56 | if (! defined($genomes)) { $genomes = [$self->genomes] } |
2957 : | |||
2958 : | overbeek | 1.17 | my %genomes = map { $_ => 1 } @$genomes; |
2959 : | overbeek | 1.19 | if ($date =~ /^(\d{1,2})\/(\d{1,2})\/(\d{4})$/) |
2960 : | { | ||
2961 : | my($mm,$dd,$yyyy) = ($1,$2,$3); | ||
2962 : | $epoch_date = &Time::Local::timelocal(0,0,0,$dd,$mm-1,$yyyy-1900,0,0,0); | ||
2963 : | } | ||
2964 : | overbeek | 1.62 | elsif ($date =~ /^\d+$/) |
2965 : | { | ||
2966 : | $epoch_date = $date; | ||
2967 : | } | ||
2968 : | overbeek | 1.19 | else |
2969 : | { | ||
2970 : | $epoch_date = 0; | ||
2971 : | } | ||
2972 : | $epoch_date = defined($epoch_date) ? $epoch_date-1 : 0; | ||
2973 : | overbeek | 1.17 | my @assignments = (); |
2974 : | my $rdbH = $self->db_handle; | ||
2975 : | if ($who eq "master") | ||
2976 : | { | ||
2977 : | overbeek | 1.30 | $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE ((ma = \'1\') AND (dateof > $epoch_date))"); |
2978 : | overbeek | 1.17 | } |
2979 : | else | ||
2980 : | { | ||
2981 : | overbeek | 1.30 | $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE (( who = \'$who\' ) AND (dateof > $epoch_date))"); |
2982 : | overbeek | 1.17 | } |
2983 : | |||
2984 : | if ($relational_db_response && (@$relational_db_response > 0)) | ||
2985 : | { | ||
2986 : | foreach $entry (@$relational_db_response) | ||
2987 : | { | ||
2988 : | overbeek | 1.30 | ($fid,$when,$fileno,$seek,$len) = @$entry; |
2989 : | overbeek | 1.17 | if (($fid =~ /^fig\|(\d+\.\d+)/) && $genomes{$1}) |
2990 : | { | ||
2991 : | overbeek | 1.67 | if ($len < 4) |
2992 : | { | ||
2993 : | print STDERR "BAD: fid=$fid when=$when fileno=$fileno seek=$seek len=$len\n"; | ||
2994 : | next; | ||
2995 : | } | ||
2996 : | overbeek | 1.17 | $ann = $self->read_annotation($fileno,$seek,$len); |
2997 : | |||
2998 : | if (($ann =~ /^(fig\|\d+\.\d+\.peg\.\d+)\n(\d+)\n(\S+)\nSet ([^\n]*)function[^\n]*\n(\S[^\n]+\S)/s) && | ||
2999 : | (($who eq $3) || (($4 eq "master ") && ($who eq "master"))) && | ||
3000 : | overbeek | 1.19 | ($2 >= $epoch_date)) |
3001 : | overbeek | 1.17 | { |
3002 : | overbeek | 1.30 | if ((! $sofar{$1}) || (($x = $sofar{$1}) && ($when > $x->[0]))) |
3003 : | { | ||
3004 : | $sofar{$1} = [$when,$5]; | ||
3005 : | } | ||
3006 : | overbeek | 1.17 | } |
3007 : | } | ||
3008 : | } | ||
3009 : | } | ||
3010 : | overbeek | 1.30 | @assignments = map { $x = $sofar{$_}; [$_,$x->[1]] } keys(%sofar); |
3011 : | overbeek | 1.17 | return @assignments; |
3012 : | efrank | 1.1 | } |
3013 : | |||
3014 : | overbeek | 1.56 | sub annotations_made { |
3015 : | my($self,$genomes,$who,$date) = @_; | ||
3016 : | my($relational_db_response,$entry,$fid,$fileno,$seek,$len,$ann); | ||
3017 : | my($epoch_date,$when,@annotations); | ||
3018 : | |||
3019 : | if (! defined($genomes)) { $genomes = [$self->genomes] } | ||
3020 : | |||
3021 : | my %genomes = map { $_ => 1 } @$genomes; | ||
3022 : | if ($date =~ /^(\d{1,2})\/(\d{1,2})\/(\d{4})$/) | ||
3023 : | { | ||
3024 : | my($mm,$dd,$yyyy) = ($1,$2,$3); | ||
3025 : | $epoch_date = &Time::Local::timelocal(0,0,0,$dd,$mm-1,$yyyy-1900,0,0,0); | ||
3026 : | } | ||
3027 : | overbeek | 1.62 | elsif ($date =~ /^\d+$/) |
3028 : | { | ||
3029 : | $epoch_date = $date; | ||
3030 : | } | ||
3031 : | overbeek | 1.56 | else |
3032 : | { | ||
3033 : | $epoch_date = 0; | ||
3034 : | } | ||
3035 : | $epoch_date = defined($epoch_date) ? $epoch_date-1 : 0; | ||
3036 : | my @annotations = (); | ||
3037 : | my $rdbH = $self->db_handle; | ||
3038 : | if ($who eq "master") | ||
3039 : | { | ||
3040 : | $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE ((ma = \'1\') AND (dateof > $epoch_date))"); | ||
3041 : | } | ||
3042 : | else | ||
3043 : | { | ||
3044 : | $relational_db_response = $rdbH->SQL("SELECT fid, dateof, fileno, seek, len FROM annotation_seeks WHERE (( who = \'$who\' ) AND (dateof > $epoch_date))"); | ||
3045 : | } | ||
3046 : | |||
3047 : | if ($relational_db_response && (@$relational_db_response > 0)) | ||
3048 : | { | ||
3049 : | foreach $entry (@$relational_db_response) | ||
3050 : | { | ||
3051 : | ($fid,$when,$fileno,$seek,$len) = @$entry; | ||
3052 : | if (($fid =~ /^fig\|(\d+\.\d+)/) && $genomes{$1}) | ||
3053 : | { | ||
3054 : | $ann = $self->read_annotation($fileno,$seek,$len); | ||
3055 : | |||
3056 : | overbeek | 1.57 | if ($ann =~ /^(fig\|\d+\.\d+\.peg\.\d+)\n(\d+)\n(\S+)\n(.*\S)/s) |
3057 : | overbeek | 1.56 | { |
3058 : | push(@annotations,[$1,$2,$3,$4]); | ||
3059 : | } | ||
3060 : | } | ||
3061 : | } | ||
3062 : | } | ||
3063 : | return @annotations; | ||
3064 : | } | ||
3065 : | |||
3066 : | efrank | 1.1 | ################################# Indexing Features and Functional Roles #################################### |
3067 : | |||
3068 : | =pod | ||
3069 : | |||
3070 : | =head1 search_index | ||
3071 : | |||
3072 : | usage: ($pegs,$roles) = $fig->search_pattern($pattern) | ||
3073 : | |||
3074 : | All pegs that "match" $pattern are put into a list, and $pegs will be a | ||
3075 : | pointer to that list. | ||
3076 : | |||
3077 : | All roles that "match" $pattern are put into a list, and $roles will be a | ||
3078 : | pointer to that list. | ||
3079 : | |||
3080 : | The notion of "match $pattern" is intentionally left undefined. For now, you | ||
3081 : | will probably get only entries in which each word id $pattern occurs exactly, | ||
3082 : | but that is not a long term commitment. | ||
3083 : | |||
3084 : | =cut | ||
3085 : | |||
3086 : | sub search_index { | ||
3087 : | my($self,$pattern) = @_; | ||
3088 : | my($patternQ,@raw,@pegs,@roles); | ||
3089 : | |||
3090 : | &clean_tmp; | ||
3091 : | $patternQ = $pattern; | ||
3092 : | $patternQ =~ s/\s+/;/g; | ||
3093 : | $patternQ =~ s/\./\\./g; | ||
3094 : | |||
3095 : | # print STDERR "pattern=$pattern patternQ=$patternQ\n"; | ||
3096 : | @raw = `$FIG_Config::ext_bin/glimpse -y -H $FIG_Config::data/Indexes -i -w \'$patternQ\'`; | ||
3097 : | @pegs = sort { &FIG::by_fig_id($a->[0],$b->[0]) } | ||
3098 : | map { $_ =~ s/^\S+:\s+//; [split(/\t/,$_)] } | ||
3099 : | grep { $_ =~ /^\S+peg.index/ } @raw; | ||
3100 : | my %roles = map { $_ =~ s/^\S+:\s+//; $_ => 1} grep { $_ =~ /^\S+role.index/ } @raw; | ||
3101 : | @roles = sort keys(%roles); | ||
3102 : | |||
3103 : | return ([@pegs],[@roles]); | ||
3104 : | } | ||
3105 : | |||
3106 : | ################################# Loading Databases #################################### | ||
3107 : | |||
3108 : | |||
3109 : | #=pod | ||
3110 : | # | ||
3111 : | #=head1 load_all | ||
3112 : | # | ||
3113 : | #usage: load_all | ||
3114 : | # | ||
3115 : | #This function is supposed to reload all entries into the database and do | ||
3116 : | #whatever is required to properly support indexing of pegs and roles. | ||
3117 : | # | ||
3118 : | #=cut | ||
3119 : | |||
3120 : | sub load_all { | ||
3121 : | |||
3122 : | overbeek | 1.15 | &run("index_contigs"); |
3123 : | &run("compute_genome_counts"); | ||
3124 : | efrank | 1.1 | &run("load_features"); |
3125 : | &run("index_sims"); | ||
3126 : | &run("load_peg_mapping"); | ||
3127 : | &run("index_translations"); | ||
3128 : | &run("add_assertions_of_function"); | ||
3129 : | &run("load_protein_families"); | ||
3130 : | &run("load_external_orgs"); | ||
3131 : | &run("load_chromosomal_clusters"); | ||
3132 : | &run("load_pch_pins"); | ||
3133 : | &run("index_neighborhoods"); | ||
3134 : | &run("index_annotations"); | ||
3135 : | &run("load_ec_names"); | ||
3136 : | &run("load_kegg"); | ||
3137 : | overbeek | 1.35 | &run("load_distances"); |
3138 : | efrank | 1.1 | &run("make_indexes"); |
3139 : | overbeek | 1.70 | &run("format_peg_dbs"); |
3140 : | efrank | 1.1 | } |
3141 : | |||
3142 : | ################################# Automated Assignments #################################### | ||
3143 : | |||
3144 : | =pod | ||
3145 : | |||
3146 : | =head1 auto_assign | ||
3147 : | |||
3148 : | usage: $assignment = &FIG::auto_assign($peg,$seq) | ||
3149 : | |||
3150 : | This returns an automated assignment for $peg. $seq is optional; if it is not | ||
3151 : | present, then it is assumed that similarities already exist for $peg. $assignment is set | ||
3152 : | to either | ||
3153 : | |||
3154 : | Function | ||
3155 : | or | ||
3156 : | Function\tW | ||
3157 : | |||
3158 : | if it is felt that the assertion is pretty weak. | ||
3159 : | |||
3160 : | =cut | ||
3161 : | |||
3162 : | sub auto_assign { | ||
3163 : | my($peg,$seq) = @_; | ||
3164 : | |||
3165 : | overbeek | 1.71 | my $cmd = $seq ? "echo \"$peg\t$seq\" | $FIG_Config::bin/auto_assign | $FIG_Config::bin/make_calls" : "echo \"$peg\" | $FIG_Config::bin/auto_assign | $FIG_Config::bin/make_calls"; |
3166 : | efrank | 1.1 | # print STDERR $cmd; |
3167 : | my(@tmp) = `$cmd`; | ||
3168 : | if ((@tmp == 1) && ($tmp[0] =~ /^\S+\t(\S.*\S)/)) | ||
3169 : | { | ||
3170 : | return $1; | ||
3171 : | } | ||
3172 : | else | ||
3173 : | { | ||
3174 : | return "hypothetical protein"; | ||
3175 : | } | ||
3176 : | } | ||
3177 : | |||
3178 : | ################################# Protein Families #################################### | ||
3179 : | |||
3180 : | =pod | ||
3181 : | |||
3182 : | =head1 all_protein_families | ||
3183 : | |||
3184 : | usage: @all = $fig->all_protein_families | ||
3185 : | |||
3186 : | Returns a list of the ids of all of the protein families currently defined. | ||
3187 : | |||
3188 : | =cut | ||
3189 : | |||
3190 : | sub all_protein_families { | ||
3191 : | my($self) = @_; | ||
3192 : | |||
3193 : | return $self->all_sets("protein_families","family"); | ||
3194 : | } | ||
3195 : | |||
3196 : | =pod | ||
3197 : | |||
3198 : | =head1 ids_in_family | ||
3199 : | |||
3200 : | usage: @pegs = $fig->ids_in_family($family) | ||
3201 : | |||
3202 : | Returns a list of the pegs in $family. | ||
3203 : | |||
3204 : | =cut | ||
3205 : | |||
3206 : | sub ids_in_family { | ||
3207 : | my($self,$family) = @_; | ||
3208 : | |||
3209 : | return $self->ids_in_set($family,"protein_families","family"); | ||
3210 : | } | ||
3211 : | |||
3212 : | =pod | ||
3213 : | |||
3214 : | =head1 family_function | ||
3215 : | |||
3216 : | usage: $func = $fig->family_function($family) | ||
3217 : | |||
3218 : | Returns the putative function of all of the pegs in $family. Remember, we | ||
3219 : | are defining "protein family" as a set of homologous proteins that have the | ||
3220 : | same function. | ||
3221 : | |||
3222 : | =cut | ||
3223 : | |||
3224 : | sub family_function { | ||
3225 : | my($self,$family) = @_; | ||
3226 : | my($relational_db_response); | ||
3227 : | my $rdbH = $self->db_handle; | ||
3228 : | |||
3229 : | defined($family) || confess "family is missing"; | ||
3230 : | if (($relational_db_response = $rdbH->SQL("SELECT function FROM family_function WHERE ( family = $family)")) && | ||
3231 : | (@$relational_db_response >= 1)) | ||
3232 : | { | ||
3233 : | return $relational_db_response->[0]->[0]; | ||
3234 : | } | ||
3235 : | return ""; | ||
3236 : | } | ||
3237 : | |||
3238 : | =pod | ||
3239 : | |||
3240 : | =head1 sz_family | ||
3241 : | |||
3242 : | usage: $n = $fig->sz_family($family) | ||
3243 : | |||
3244 : | Returns the number of pegs in $family. | ||
3245 : | |||
3246 : | =cut | ||
3247 : | |||
3248 : | sub sz_family { | ||
3249 : | my($self,$family) = @_; | ||
3250 : | |||
3251 : | return $self->sz_set($family,"protein_families","family"); | ||
3252 : | } | ||
3253 : | |||
3254 : | =pod | ||
3255 : | |||
3256 : | =head1 in_family | ||
3257 : | |||
3258 : | usage: @pegs = $fig->in_family($family) | ||
3259 : | |||
3260 : | Returns the pegs in $family. | ||
3261 : | |||
3262 : | =cut | ||
3263 : | |||
3264 : | sub in_family { | ||
3265 : | my($self,$id) = @_; | ||
3266 : | |||
3267 : | my @in = $self->in_sets($id,"protein_families","family"); | ||
3268 : | return (@in > 0) ? $in[0] : ""; | ||
3269 : | } | ||
3270 : | |||
3271 : | ################################# Abstract Set Routines #################################### | ||
3272 : | |||
3273 : | sub all_sets { | ||
3274 : | my($self,$relation,$set_name) = @_; | ||
3275 : | my($relational_db_response); | ||
3276 : | |||
3277 : | my $rdbH = $self->db_handle; | ||
3278 : | |||
3279 : | if (($relational_db_response = $rdbH->SQL("SELECT DISTINCT $set_name FROM $relation")) && | ||
3280 : | (@$relational_db_response >= 1)) | ||
3281 : | { | ||
3282 : | return map { $_->[0] } @$relational_db_response; | ||
3283 : | } | ||
3284 : | return (); | ||
3285 : | } | ||
3286 : | |||
3287 : | sub next_set { | ||
3288 : | my($self,$relation,$set_name) = @_; | ||
3289 : | my($relational_db_response); | ||
3290 : | |||
3291 : | my $rdbH = $self->db_handle; | ||
3292 : | |||
3293 : | if (($relational_db_response = $rdbH->SQL("SELECT MAX($set_name) FROM $relation")) && | ||
3294 : | (@$relational_db_response == 1)) | ||
3295 : | { | ||
3296 : | return $relational_db_response->[0]->[0] + 1; | ||
3297 : | } | ||
3298 : | } | ||
3299 : | |||
3300 : | sub ids_in_set { | ||
3301 : | my($self,$which,$relation,$set_name) = @_; | ||
3302 : | my($relational_db_response); | ||
3303 : | |||
3304 : | my $rdbH = $self->db_handle; | ||
3305 : | if (defined($which) && ($which =~ /^\d+$/)) | ||
3306 : | { | ||
3307 : | if (($relational_db_response = $rdbH->SQL("SELECT id FROM $relation WHERE ( $set_name = $which)")) && | ||
3308 : | (@$relational_db_response >= 1)) | ||
3309 : | { | ||
3310 : | return sort { by_fig_id($a,$b) } map { $_->[0] } @$relational_db_response; | ||
3311 : | } | ||
3312 : | } | ||
3313 : | return (); | ||
3314 : | } | ||
3315 : | |||
3316 : | sub in_sets { | ||
3317 : | my($self,$id,$relation,$set_name) = @_; | ||
3318 : | my($relational_db_response); | ||
3319 : | |||
3320 : | my $rdbH = $self->db_handle; | ||
3321 : | |||
3322 : | if (($relational_db_response = $rdbH->SQL("SELECT $set_name FROM $relation WHERE ( id = \'$id\' )")) && | ||
3323 : | (@$relational_db_response >= 1)) | ||
3324 : | { | ||
3325 : | return map { $_->[0] } @$relational_db_response; | ||
3326 : | } | ||
3327 : | return (); | ||
3328 : | } | ||
3329 : | |||
3330 : | sub sz_set { | ||
3331 : | my($self,$which,$relation,$set_name) = @_; | ||
3332 : | my($relational_db_response); | ||
3333 : | |||
3334 : | my $rdbH = $self->db_handle; | ||
3335 : | |||
3336 : | if (($relational_db_response = $rdbH->SQL("SELECT COUNT(*) FROM $relation WHERE ( $set_name = $which)")) && | ||
3337 : | (@$relational_db_response == 1)) | ||
3338 : | { | ||
3339 : | return $relational_db_response->[0]->[0]; | ||
3340 : | } | ||
3341 : | return 0; | ||
3342 : | } | ||
3343 : | |||
3344 : | sub delete_set { | ||
3345 : | my($self,$set,$relation,$set_name) = @_; | ||
3346 : | |||
3347 : | # print STDERR "deleting set $set\n"; | ||
3348 : | my $rdbH = $self->db_handle; | ||
3349 : | |||
3350 : | return $rdbH->SQL("DELETE FROM $relation WHERE ( $set_name = $set )"); | ||
3351 : | } | ||
3352 : | |||
3353 : | sub insert_set { | ||
3354 : | my($self,$set,$ids,$relation,$set_name) = @_; | ||
3355 : | my($id); | ||
3356 : | |||
3357 : | # print STDERR "inserting set $set containing ",join(",",@$ids),"\n"; | ||
3358 : | my $rdbH = $self->db_handle; | ||
3359 : | |||
3360 : | overbeek | 1.23 | my @ids = grep { length($_) < 255 } @$ids; |
3361 : | if (@ids < 2) { return 0 } | ||
3362 : | |||
3363 : | efrank | 1.1 | my $rc = 1; |
3364 : | overbeek | 1.23 | foreach $id (@ids) |
3365 : | efrank | 1.1 | { |
3366 : | if (! $rdbH->SQL("INSERT INTO $relation ( $set_name,id ) VALUES ( $set,\'$id\' )")) | ||
3367 : | { | ||
3368 : | $rc = 0; | ||
3369 : | } | ||
3370 : | } | ||
3371 : | # print STDERR " rc=$rc\n"; | ||
3372 : | return $rc; | ||
3373 : | } | ||
3374 : | |||
3375 : | sub in_set_with { | ||
3376 : | my($self,$peg,$relation,$set_name) = @_; | ||
3377 : | my($set,$id,%in); | ||
3378 : | |||
3379 : | foreach $set ($self->in_sets($peg,$relation,$set_name)) | ||
3380 : | { | ||
3381 : | foreach $id ($self->ids_in_set($set,$relation,$set_name)) | ||
3382 : | { | ||
3383 : | $in{$id} = 1; | ||
3384 : | } | ||
3385 : | } | ||
3386 : | return sort { &by_fig_id($a,$b) } keys(%in); | ||
3387 : | } | ||
3388 : | |||
3389 : | |||
3390 : | sub export_set { | ||
3391 : | my($self,$relation,$set_name,$file) = @_; | ||
3392 : | my($pair); | ||
3393 : | |||
3394 : | my $rdbH = $self->db_handle; | ||
3395 : | my $relational_db_response = $rdbH->SQL("SELECT $set_name, id FROM $relation"); | ||
3396 : | |||
3397 : | open(TMP,">$file") | ||
3398 : | || die "could not open $file"; | ||
3399 : | flock(TMP,LOCK_EX) || confess "cannot lock $file"; | ||
3400 : | seek(TMP,0,2) || confess "failed to seek to the end of the file"; | ||
3401 : | |||
3402 : | foreach $pair (sort { ($a->[0] <=> $b->[0]) or &by_fig_id($a->[1],$b->[1]) } @$relational_db_response) | ||
3403 : | { | ||
3404 : | print TMP join("\t",@$pair),"\n"; | ||
3405 : | } | ||
3406 : | close(TMP); | ||
3407 : | return 1; | ||
3408 : | } | ||
3409 : | |||
3410 : | ################################# KEGG Stuff #################################### | ||
3411 : | |||
3412 : | |||
3413 : | =pod | ||
3414 : | |||
3415 : | =head1 all_compounds | ||
3416 : | |||
3417 : | usage: @compounds = $fig->all_compounds | ||
3418 : | |||
3419 : | Returns a list containing all of the KEGG compounds. | ||
3420 : | |||
3421 : | =cut | ||
3422 : | |||
3423 : | sub all_compounds { | ||
3424 : | my($self) = @_; | ||
3425 : | |||
3426 : | my $rdbH = $self->db_handle; | ||
3427 : | my $relational_db_response = $rdbH->SQL("SELECT DISTINCT cid FROM comp_name"); | ||
3428 : | if (@$relational_db_response > 0) | ||
3429 : | { | ||
3430 : | return sort map { $_->[0] } @$relational_db_response; | ||
3431 : | } | ||
3432 : | return (); | ||
3433 : | } | ||
3434 : | |||
3435 : | =pod | ||
3436 : | |||
3437 : | =head1 names_of_compound | ||
3438 : | |||
3439 : | usage: @names = $fig->names_of_compound | ||
3440 : | |||
3441 : | Returns a list containing all of the names assigned to the KEGG compounds. The list | ||
3442 : | will be ordered as given by KEGG. | ||
3443 : | |||
3444 : | =cut | ||
3445 : | |||
3446 : | sub names_of_compound { | ||
3447 : | my($self,$cid) = @_; | ||
3448 : | |||
3449 : | my $rdbH = $self->db_handle; | ||
3450 : | my $relational_db_response = $rdbH->SQL("SELECT pos,name FROM comp_name where cid = \'$cid\'"); | ||
3451 : | if (@$relational_db_response > 0) | ||
3452 : | { | ||
3453 : | return map { $_->[1] } sort { $a->[0] <=> $b->[0] } @$relational_db_response; | ||
3454 : | } | ||
3455 : | return (); | ||
3456 : | } | ||
3457 : | |||
3458 : | =pod | ||
3459 : | |||
3460 : | =head1 comp2react | ||
3461 : | |||
3462 : | |||
3463 : | usage: @rids = $fig->comp2react($cid) | ||
3464 : | |||
3465 : | Returns a list containing all of the reaction IDs for reactions that take $cid | ||
3466 : | as either a substrate or a product. | ||
3467 : | |||
3468 : | =cut | ||
3469 : | |||
3470 : | sub comp2react { | ||
3471 : | my($self,$cid) = @_; | ||
3472 : | |||
3473 : | my $rdbH = $self->db_handle; | ||
3474 : | my $relational_db_response = $rdbH->SQL("SELECT rid FROM reaction_to_compound where cid = \'$cid\'"); | ||
3475 : | if (@$relational_db_response > 0) | ||
3476 : | { | ||
3477 : | return sort map { $_->[0] } @$relational_db_response; | ||
3478 : | } | ||
3479 : | return (); | ||
3480 : | } | ||
3481 : | |||
3482 : | =pod | ||
3483 : | |||
3484 : | =head1 cas | ||
3485 : | |||
3486 : | usage: $cas = $fig->cas($cid) | ||
3487 : | |||
3488 : | Returns the CAS ID for the compound, if known. | ||
3489 : | |||
3490 : | =cut | ||
3491 : | |||
3492 : | sub cas { | ||
3493 : | my($self,$cid) = @_; | ||
3494 : | |||
3495 : | my $rdbH = $self->db_handle; | ||
3496 : | my $relational_db_response = $rdbH->SQL("SELECT cas FROM comp_cas where cid = \'$cid\'"); | ||
3497 : | if (@$relational_db_response == 1) | ||
3498 : | { | ||
3499 : | return $relational_db_response->[0]->[0]; | ||
3500 : | } | ||
3501 : | return ""; | ||
3502 : | } | ||
3503 : | |||
3504 : | =pod | ||
3505 : | |||
3506 : | =head1 cas_to_cid | ||
3507 : | |||
3508 : | usage: $cid = $fig->cas_to_cid($cas) | ||
3509 : | |||
3510 : | Returns the compound id (cid), given the CAS ID. | ||
3511 : | |||
3512 : | =cut | ||
3513 : | |||
3514 : | sub cas_to_cid { | ||
3515 : | my($self,$cas) = @_; | ||
3516 : | |||
3517 : | my $rdbH = $self->db_handle; | ||
3518 : | my $relational_db_response = $rdbH->SQL("SELECT cid FROM comp_cas where cas = \'$cas\'"); | ||
3519 : | if (@$relational_db_response == 1) | ||
3520 : | { | ||
3521 : | return $relational_db_response->[0]->[0]; | ||
3522 : | } | ||
3523 : | return ""; | ||
3524 : | } | ||
3525 : | |||
3526 : | =pod | ||
3527 : | |||
3528 : | =head1 all_reactions | ||
3529 : | |||
3530 : | usage: @rids = $fig->all_reactions | ||
3531 : | |||
3532 : | Returns a list containing all of the KEGG reaction IDs. | ||
3533 : | |||
3534 : | =cut | ||
3535 : | |||
3536 : | sub all_reactions { | ||
3537 : | my($self) = @_; | ||
3538 : | |||
3539 : | my $rdbH = $self->db_handle; | ||
3540 : | my $relational_db_response = $rdbH->SQL("SELECT DISTINCT rid FROM reaction_to_compound"); | ||
3541 : | if (@$relational_db_response > 0) | ||
3542 : | { | ||
3543 : | return sort map { $_->[0] } @$relational_db_response; | ||
3544 : | } | ||
3545 : | return (); | ||
3546 : | } | ||
3547 : | |||
3548 : | =pod | ||
3549 : | |||
3550 : | =head1 reversible | ||
3551 : | |||
3552 : | usage: $rev = $fig->reversible($rid) | ||
3553 : | |||
3554 : | Returns true iff the reactions had a "main direction" designated as "<=>"; | ||
3555 : | |||
3556 : | =cut | ||
3557 : | |||
3558 : | sub reversible { | ||
3559 : | my($self,$rid) = @_; | ||
3560 : | |||
3561 : | my $rdbH = $self->db_handle; | ||
3562 : | my $relational_db_response = $rdbH->SQL("SELECT reversible FROM reversible where rid = \'$rid\'"); | ||
3563 : | if (@$relational_db_response == 1) | ||
3564 : | { | ||
3565 : | return $relational_db_response->[0]->[0]; | ||
3566 : | } | ||
3567 : | return 1; | ||
3568 : | } | ||
3569 : | |||
3570 : | =pod | ||
3571 : | |||
3572 : | =head1 reaction2comp | ||
3573 : | |||
3574 : | usage: @tuples = $fig->reaction2comp($rid,$which) | ||
3575 : | |||
3576 : | Returns the "substrates" iff $which == 0. In any event (i.e., whether you ask for substrates | ||
3577 : | or products), you get back a list of 3-tuples. Each 3-tuple will contain | ||
3578 : | |||
3579 : | [$cid,$stoich,$main] | ||
3580 : | |||
3581 : | Stoichiometry is normally numeric, but can be things like "n" or "(n+1)". | ||
3582 : | $main is 1 iff the compound is considered "main" or "connectable". | ||
3583 : | |||
3584 : | =cut | ||
3585 : | |||
3586 : | sub reaction2comp { | ||
3587 : | my($self,$rid,$which) = @_; | ||
3588 : | |||
3589 : | my $rdbH = $self->db_handle; | ||
3590 : | my $relational_db_response = $rdbH->SQL("SELECT cid,stoich,main FROM reaction_to_compound where rid = \'$rid\' and setn = \'$which\'"); | ||
3591 : | if (@$relational_db_response > 0) | ||
3592 : | { | ||
3593 : | return sort { $a->[0] cmp $b->[0] } map { $_->[1] =~ s/\s+//g; $_ } @$relational_db_response; | ||
3594 : | } | ||
3595 : | return (); | ||
3596 : | } | ||
3597 : | |||
3598 : | =pod | ||
3599 : | |||
3600 : | =head1 catalyzed_by | ||
3601 : | |||
3602 : | usage: @ecs = $fig->catalyzed_by($rid) | ||
3603 : | |||
3604 : | Returns the ECs that are reputed to catalyze the reaction. Note that we are currently | ||
3605 : | just returning the ECs that KEGG gives. We need to handle the incompletely specified forms | ||
3606 : | (e.g., 1.1.1.-), but we do not do it yet. | ||
3607 : | |||
3608 : | =cut | ||
3609 : | |||
3610 : | sub catalyzed_by { | ||
3611 : | my($self,$rid) = @_; | ||
3612 : | |||
3613 : | my $rdbH = $self->db_handle; | ||
3614 : | my $relational_db_response = $rdbH->SQL("SELECT role FROM reaction_to_enzyme where rid = \'$rid\'"); | ||
3615 : | if (@$relational_db_response > 0) | ||
3616 : | { | ||
3617 : | return sort map { $_->[0] } @$relational_db_response; | ||
3618 : | } | ||
3619 : | return (); | ||
3620 : | } | ||
3621 : | |||
3622 : | =pod | ||
3623 : | |||
3624 : | =head1 catalyzes | ||
3625 : | |||
3626 : | usage: @ecs = $fig->catalyzes($role) | ||
3627 : | |||
3628 : | Returns the rids of the reactions catalyzed by the "role" (normally an EC). | ||
3629 : | |||
3630 : | =cut | ||
3631 : | |||
3632 : | sub catalyzes { | ||
3633 : | my($self,$role) = @_; | ||
3634 : | |||
3635 : | my $rdbH = $self->db_handle; | ||
3636 : | my $relational_db_response = $rdbH->SQL("SELECT rid FROM reaction_to_enzyme where role = \'$role\'"); | ||
3637 : | if (@$relational_db_response > 0) | ||
3638 : | { | ||
3639 : | return sort map { $_->[0] } @$relational_db_response; | ||
3640 : | } | ||
3641 : | return (); | ||
3642 : | } | ||
3643 : | |||
3644 : | |||
3645 : | =pod | ||
3646 : | |||
3647 : | =head1 displayable_reaction | ||
3648 : | |||
3649 : | usage: $display_format = $fig->displayable_reaction($rid) | ||
3650 : | |||
3651 : | Returns a string giving the displayable version of a reaction. | ||
3652 : | |||
3653 : | =cut | ||
3654 : | |||
3655 : | sub displayable_reaction { | ||
3656 : | my($self,$rid) = @_; | ||
3657 : | |||
3658 : | my @tmp = `grep $rid $FIG_Config::data/KEGG/reaction_name.lst`; | ||
3659 : | if (@tmp > 0) | ||
3660 : | { | ||
3661 : | golsen | 1.44 | chomp $tmp[0]; |
3662 : | efrank | 1.1 | return $tmp[0]; |
3663 : | } | ||
3664 : | return $rid; | ||
3665 : | } | ||
3666 : | |||
3667 : | =pod | ||
3668 : | |||
3669 : | =head1 all_maps | ||
3670 : | |||
3671 : | usage: @maps = $fig->all_maps | ||
3672 : | |||
3673 : | Returns a list containing all of the KEGG maps that the system knows about (the | ||
3674 : | maps need to be periodically updated). | ||
3675 : | |||
3676 : | =cut | ||
3677 : | |||
3678 : | sub all_maps { | ||
3679 : | my($self,$ec) = @_; | ||
3680 : | |||
3681 : | my $rdbH = $self->db_handle; | ||
3682 : | my $relational_db_response = $rdbH->SQL("SELECT DISTINCT map FROM ec_map "); | ||
3683 : | if (@$relational_db_response > 0) | ||
3684 : | { | ||
3685 : | return map { $_->[0] } @$relational_db_response; | ||
3686 : | } | ||
3687 : | return (); | ||
3688 : | } | ||
3689 : | |||
3690 : | =pod | ||
3691 : | |||
3692 : | =head1 ec_to_maps | ||
3693 : | |||
3694 : | usage: @maps = $fig->ec_to_maps($ec) | ||
3695 : | |||
3696 : | Returns the set of maps that contain $ec as a functional role. $ec is usually an EC number, | ||
3697 : | but in the more general case, it can be a functional role. | ||
3698 : | |||
3699 : | =cut | ||
3700 : | |||
3701 : | sub ec_to_maps { | ||
3702 : | my($self,$ec) = @_; | ||
3703 : | |||
3704 : | my $rdbH = $self->db_handle; | ||
3705 : | my $relational_db_response = $rdbH->SQL("SELECT map FROM ec_map WHERE ( ec = \'$ec\' )"); | ||
3706 : | if (@$relational_db_response > 0) | ||
3707 : | { | ||
3708 : | return map { $_->[0] } @$relational_db_response; | ||
3709 : | } | ||
3710 : | return (); | ||
3711 : | } | ||
3712 : | |||
3713 : | |||
3714 : | =pod | ||
3715 : | |||
3716 : | =head1 map_to_ecs | ||
3717 : | |||
3718 : | usage: @ecs = $fig->map_to_ecs($map) | ||
3719 : | |||
3720 : | Returns the set of functional roles (usually ECs) that are contained in the functionality | ||
3721 : | depicted by $map. | ||
3722 : | |||
3723 : | =cut | ||
3724 : | |||
3725 : | sub map_to_ecs { | ||
3726 : | my($self,$map) = @_; | ||
3727 : | |||
3728 : | my $rdbH = $self->db_handle; | ||
3729 : | my $relational_db_response = $rdbH->SQL("SELECT ec FROM ec_map WHERE ( map = \'$map\' )"); | ||
3730 : | if (@$relational_db_response > 0) | ||
3731 : | { | ||
3732 : | return map { $_->[0] } @$relational_db_response; | ||
3733 : | } | ||
3734 : | return (); | ||
3735 : | } | ||
3736 : | |||
3737 : | =pod | ||
3738 : | |||
3739 : | =head1 map_name | ||
3740 : | |||
3741 : | usage: $name = $fig->map_name($map) | ||
3742 : | |||
3743 : | Returns the descriptive name covering the functionality depicted by $map. | ||
3744 : | |||
3745 : | =cut | ||
3746 : | |||
3747 : | sub map_name { | ||
3748 : | my($self,$map) = @_; | ||
3749 : | |||
3750 : | my $rdbH = $self->db_handle; | ||
3751 : | my $relational_db_response = $rdbH->SQL("SELECT mapname FROM map_name WHERE ( map = \'$map\' )"); | ||
3752 : | if (@$relational_db_response == 1) | ||
3753 : | { | ||
3754 : | return $relational_db_response->[0]->[0]; | ||
3755 : | } | ||
3756 : | return ""; | ||
3757 : | } | ||
3758 : | |||
3759 : | ################################# Functional Roles #################################### | ||
3760 : | |||
3761 : | =pod | ||
3762 : | |||
3763 : | =head1 neighborhood_of_role | ||
3764 : | |||
3765 : | usage: @roles = $fig->neighborhood_of_role($role) | ||
3766 : | |||
3767 : | Returns a list of functional roles that we consider to be "the neighborhood" of $role. | ||
3768 : | |||
3769 : | =cut | ||
3770 : | |||
3771 : | sub neighborhood_of_role { | ||
3772 : | my($self,$role) = @_; | ||
3773 : | my($readC); | ||
3774 : | |||
3775 : | my $file = "$FIG_Config::global/role.neighborhoods"; | ||
3776 : | my $rdbH = $self->db_handle; | ||
3777 : | my $roleQ = quotemeta $role; | ||
3778 : | my $relational_db_response = $rdbH->SQL("SELECT seek, len FROM neigh_seeks WHERE role = \'$roleQ\' "); | ||
3779 : | if (@$relational_db_response == 1) | ||
3780 : | { | ||
3781 : | my($seek,$ln) = @{$relational_db_response->[0]}; | ||
3782 : | my $fh = $self->openF($file); | ||
3783 : | seek($fh,$seek,0); | ||
3784 : | my $readN = read($fh,$readC,$ln-1); | ||
3785 : | ($readN == ($ln-1)) | ||
3786 : | || confess "could not read the block of sims at $seek for $ln - 1 characters; $readN actually read from $file\n$readC"; | ||
3787 : | return grep { $_ && ($_ !~ /^\/\//) } split(/\n/,$readC); | ||
3788 : | } | ||
3789 : | return (); | ||
3790 : | } | ||
3791 : | |||
3792 : | =pod | ||
3793 : | |||
3794 : | =head1 roles_of_function | ||
3795 : | |||
3796 : | usage: @roles = $fig->roles_of_function($func) | ||
3797 : | |||
3798 : | Returns a list of the functional roles implemented by $func. | ||
3799 : | |||
3800 : | =cut | ||
3801 : | |||
3802 : | sub roles_of_function { | ||
3803 : | my($func) = @_; | ||
3804 : | |||
3805 : | return (split(/\s*[\/;]\s+/,$func),($func =~ /\d+\.\d+\.\d+\.\d+/g)); | ||
3806 : | } | ||
3807 : | |||
3808 : | =pod | ||
3809 : | |||
3810 : | =head1 seqs_with_role | ||
3811 : | |||
3812 : | usage: @pegs = $fig->seqs_with_role($role,$who) | ||
3813 : | |||
3814 : | Returns a list of the pegs that implement $role. If $who is not given, it | ||
3815 : | defaults to "master". The system returns all pegs with an assignment made by | ||
3816 : | either "master" or $who (if it is different than the master) that implement $role. | ||
3817 : | Note that this includes pegs for which the "master" annotation disagrees with that | ||
3818 : | of $who, the master's implements $role, and $who's does not. | ||
3819 : | |||
3820 : | =cut | ||
3821 : | |||
3822 : | sub seqs_with_role { | ||
3823 : | overbeek | 1.26 | my($self,$role,$who,$genome) = @_; |
3824 : | my($relational_db_response,$query); | ||
3825 : | efrank | 1.1 | |
3826 : | overbeek | 1.32 | my $roleQ = quotemeta $role; |
3827 : | |||
3828 : | efrank | 1.1 | $who = $who ? $who : "master"; |
3829 : | my $rdbH = $self->db_handle; | ||
3830 : | |||
3831 : | my $who_cond; | ||
3832 : | if ($who eq "master") | ||
3833 : | { | ||
3834 : | $who_cond = "( made_by = \'master\' OR made_by = \'unknown\' )"; | ||
3835 : | } | ||
3836 : | else | ||
3837 : | { | ||
3838 : | $who_cond = "( made_by = \'master\' OR made_by = \'$who\' OR made_by = \'unknown\')"; | ||
3839 : | } | ||
3840 : | overbeek | 1.26 | |
3841 : | if (! $genome) | ||
3842 : | { | ||
3843 : | overbeek | 1.32 | $query = "SELECT distinct prot FROM roles WHERE (( role = \'$roleQ\' ) AND $who_cond )"; |
3844 : | overbeek | 1.26 | } |
3845 : | else | ||
3846 : | { | ||
3847 : | overbeek | 1.32 | $query = "SELECT distinct prot FROM roles WHERE (( role = \'$roleQ\' ) AND $who_cond AND (org = \'$genome\'))"; |
3848 : | overbeek | 1.26 | } |
3849 : | efrank | 1.1 | return (($relational_db_response = $rdbH->SQL($query)) && (@$relational_db_response >= 1)) ? |
3850 : | map { $_->[0] } @$relational_db_response : (); | ||
3851 : | } | ||
3852 : | |||
3853 : | =pod | ||
3854 : | |||
3855 : | =head1 seqs_with_roles_in_genomes | ||
3856 : | |||
3857 : | usage: $result = $fig->seqs_with_roles_in_genomes($genomes,$roles,$made_by) | ||
3858 : | |||
3859 : | This routine takes a pointer to a list of genomes ($genomes) and a pointer to a list of | ||
3860 : | roles ($roles) and looks up all of the sequences that connect to those roles according | ||
3861 : | to either the master assignments or those made by $made_by. Again, you will get assignments | ||
3862 : | for which the "master" assignment connects, but the $made_by does not. | ||
3863 : | |||
3864 : | A hash is returned. The keys to the hash are genome IDs for which at least one sequence | ||
3865 : | was found. $result->{$genome} will itself be a hash, assuming that at least one sequence | ||
3866 : | was found for $genome. $result->{$genome}->{$role} will be set to a pointer to a list of | ||
3867 : | 2-tuples. Each 2-tuple will contain [$peg,$function], where $function is the one for | ||
3868 : | $made_by (which may not be the one that connected). | ||
3869 : | |||
3870 : | =cut | ||
3871 : | |||
3872 : | sub seqs_with_roles_in_genomes { | ||
3873 : | my($self,$genomes,$roles,$made_by) = @_; | ||
3874 : | my($genome,$role,$roleQ,$role_cond,$made_by_cond,$query,$relational_db_response,$peg,$genome_cond,$hit); | ||
3875 : | my $rdbH = $self->db_handle; | ||
3876 : | my $result = {}; # foreach $genome ($self->genomes) { $result->{$genome} = {} } | ||
3877 : | if (! $made_by) { $made_by = 'master' } | ||
3878 : | if ((@$genomes > 0) && (@$roles > 0)) | ||
3879 : | { | ||
3880 : | $genome_cond = "(" . join(" OR ",map { "( org = \'$_\' )" } @$genomes) . ")"; | ||
3881 : | $role_cond = "(" . join(" OR ",map { $roleQ = quotemeta $_; "( role = \'$roleQ\' )" } @$roles) . ")"; | ||
3882 : | $made_by_cond = ($made_by eq 'master') ? "(made_by = 'master')" : "(made_by = 'master' OR made_by = '$made_by')"; | ||
3883 : | $query = "SELECT distinct prot, role FROM roles WHERE ( $made_by_cond AND $genome_cond AND $role_cond )"; | ||
3884 : | if (($relational_db_response = $rdbH->SQL($query)) && (@$relational_db_response >= 1)) | ||
3885 : | { | ||
3886 : | foreach $hit (@$relational_db_response) | ||
3887 : | { | ||
3888 : | ($peg,$role) = @$hit; | ||
3889 : | $genome = $self->genome_of($peg); | ||
3890 : | push(@{ $result->{$genome}->{$role} },[$peg,scalar $self->function_of($peg,$made_by)]); | ||
3891 : | } | ||
3892 : | } | ||
3893 : | } | ||
3894 : | return $result; | ||
3895 : | } | ||
3896 : | |||
3897 : | =pod | ||
3898 : | |||
3899 : | =head1 largest_clusters | ||
3900 : | |||
3901 : | usage: @clusters = $fig->largest_clusters($roles,$user) | ||
3902 : | |||
3903 : | mkubal | 1.54 | This routine can be used to find the largest clusters containing some of the |
3904 : | efrank | 1.1 | designated set of roles. A list of clusters is returned. Each cluster is a pointer to |
3905 : | a list of pegs. | ||
3906 : | |||
3907 : | =cut | ||
3908 : | |||
3909 : | sub largest_clusters { | ||
3910 : | my($self,$roles,$user,$sort_by_unique_functions) = @_; | ||
3911 : | my($genome,$x,$role,$y,$peg,$loc,$contig,$beg,$end,%pegs,@pegs,$i,$j); | ||
3912 : | |||
3913 : | my $ss = $self->seqs_with_roles_in_genomes([$self->genomes],$roles,$user); | ||
3914 : | my @clusters = (); | ||
3915 : | |||
3916 : | foreach $genome (keys(%$ss)) | ||
3917 : | { | ||
3918 : | my %pegs; | ||
3919 : | $x = $ss->{$genome}; | ||
3920 : | foreach $role (keys(%$x)) | ||
3921 : | { | ||
3922 : | $y = $x->{$role}; | ||
3923 : | foreach $peg (map { $_->[0] } @$y) | ||
3924 : | { | ||
3925 : | if ($loc = $self->feature_location($peg)) | ||
3926 : | { | ||
3927 : | ($contig,$beg,$end) = &FIG::boundaries_of($loc); | ||
3928 : | $pegs{$peg} = [$peg,$contig,int(($beg + $end) / 2)]; | ||
3929 : | } | ||
3930 : | } | ||
3931 : | } | ||
3932 : | |||
3933 : | @pegs = sort { ($pegs{$a}->[1] cmp $pegs{$b}->[1]) or ($pegs{$a}->[2] <=> $pegs{$b}->[2]) } keys(%pegs); | ||
3934 : | $i = 0; | ||
3935 : | while ($i < $#pegs) | ||
3936 : | { | ||
3937 : | for ($j=$i+1; ($j < @pegs) && &close_enough_locs($pegs{$pegs[$j-1]},$pegs{$pegs[$j]}); $j++) {} | ||
3938 : | if ($j > ($i+1)) | ||
3939 : | { | ||
3940 : | push(@clusters,[@pegs[$i..$j-1]]); | ||
3941 : | } | ||
3942 : | $i = $j; | ||
3943 : | } | ||
3944 : | } | ||
3945 : | if ($sort_by_unique_functions) | ||
3946 : | { | ||
3947 : | @clusters = sort { $self->unique_functions($b,$user) <=> $self->unique_functions($a,$user) } @clusters; | ||
3948 : | } | ||
3949 : | else | ||
3950 : | { | ||
3951 : | @clusters = sort { @$b <=> @$a } @clusters; | ||
3952 : | } | ||
3953 : | return @clusters; | ||
3954 : | } | ||
3955 : | |||
3956 : | sub unique_functions { | ||
3957 : | my($self,$pegs,$user) = @_; | ||
3958 : | my($peg,$func,%seen); | ||
3959 : | |||
3960 : | foreach $peg (@$pegs) | ||
3961 : | { | ||
3962 : | if ($func = $self->function_of($peg,$user)) | ||
3963 : | { | ||
3964 : | $seen{$func} = 1; | ||
3965 : | } | ||
3966 : | } | ||
3967 : | return scalar keys(%seen); | ||
3968 : | } | ||
3969 : | |||
3970 : | sub close_enough_locs { | ||
3971 : | my($x,$y) = @_; | ||
3972 : | |||
3973 : | return (($x->[1] eq $y->[1]) && (abs($x->[2] - $y->[2]) < 5000)); | ||
3974 : | } | ||
3975 : | |||
3976 : | overbeek | 1.59 | sub candidates_for_role { |
3977 : | my($self,$role,$genome,$cutoff,$user) = @_; | ||
3978 : | my($peg); | ||
3979 : | overbeek | 1.64 | |
3980 : | overbeek | 1.59 | $user = $user ? $user : "master"; |
3981 : | |||
3982 : | my @cand = map { $_->[0] } | ||
3983 : | sort { $a->[1] <=> $b->[1] } | ||
3984 : | map { $peg = $_; [$peg,$self->crude_estimate_of_distance($genome,&FIG::genome_of($peg))] } | ||
3985 : | $self->seqs_with_role($role,$user); | ||
3986 : | |||
3987 : | overbeek | 1.64 | return $self->candidates_for_role_from_known($genome,$cutoff,\@cand); |
3988 : | } | ||
3989 : | |||
3990 : | sub candidates_for_role_from_known { | ||
3991 : | my($self,$genome,$cutoff,$known) = @_; | ||
3992 : | my($peg); | ||
3993 : | |||
3994 : | my @cand = @$known; | ||
3995 : | overbeek | 1.59 | my $hits = {}; |
3996 : | my $seen = {}; | ||
3997 : | overbeek | 1.68 | my $how_many = (@cand > 10) ? 9 : scalar @cand; |
3998 : | &try_to_locate($self,$genome,$hits,[@cand[0..$how_many]],$seen,$cutoff); | ||
3999 : | overbeek | 1.59 | if (keys(%$hits) == 0) |
4000 : | { | ||
4001 : | splice(@cand,0,$how_many+1); | ||
4002 : | &try_to_locate($self,$genome,$hits,\@cand,$seen,$cutoff); | ||
4003 : | } | ||
4004 : | return sort {$hits->{$a}->[0] <=> $hits->{$b}->[0]} keys(%$hits); | ||
4005 : | } | ||
4006 : | |||
4007 : | sub try_to_locate { | ||
4008 : | my($self,$genome,$hits,$to_try,$seen,$cutoff) = @_; | ||
4009 : | my($prot,$id2,$psc,$id2a,$x,$sim); | ||
4010 : | |||
4011 : | if (! $cutoff) { $cutoff = 1.0e-5 } | ||
4012 : | |||
4013 : | foreach $prot (@$to_try) | ||
4014 : | { | ||
4015 : | if (! $seen->{$prot}) | ||
4016 : | { | ||
4017 : | if (($prot =~ /^fig\|(\d+\.\d+)/) && ($1 eq $genome)) | ||
4018 : | { | ||
4019 : | $hits->{$prot} = [0,$prot]; | ||
4020 : | } | ||
4021 : | else | ||
4022 : | { | ||
4023 : | foreach $sim ($self->sims($prot,1000,$cutoff,"raw",0)) | ||
4024 : | { | ||
4025 : | $id2 = $sim->id2; | ||
4026 : | $psc = $sim->psc; | ||
4027 : | foreach $id2a (map { $_->[0] } $self->mapped_prot_ids($id2)) | ||
4028 : | { | ||
4029 : | if (($id2a =~ /^fig\|(\d+\.\d+)/) && ($1 eq $genome)) | ||
4030 : | { | ||
4031 : | $x = $hits->{$id2a}; | ||
4032 : | if ((! $x) || ($x->[0] > $psc)) | ||
4033 : | { | ||
4034 : | $hits->{$id2a} = [$sim->psc,$prot]; | ||
4035 : | } | ||
4036 : | } | ||
4037 : | elsif ($psc < 1.0e-20) | ||
4038 : | { | ||
4039 : | { | ||
4040 : | $seen->{$id2a} = 1; | ||
4041 : | } | ||
4042 : | } | ||
4043 : | } | ||
4044 : | |||
4045 : | } | ||
4046 : | } | ||
4047 : | } | ||
4048 : | } | ||
4049 : | } | ||
4050 : | |||
4051 : | overbeek | 1.65 | |
4052 : | =pod | ||
4053 : | |||
4054 : | =head1 best_bbh_candidates | ||
4055 : | |||
4056 : | usage: @candidates = $fig->best_bbh_candidates($genome,$cutoff,$requested,$known) | ||
4057 : | |||
4058 : | This routine returns a list of up to $requested candidates from $genome. A candidate is a BBH | ||
4059 : | against one of the PEGs in @$known. Each entry in the list is a 3-tuple: | ||
4060 : | |||
4061 : | [CandidatePEG,KnownBBH,Pscore] | ||
4062 : | |||
4063 : | =cut | ||
4064 : | |||
4065 : | sub best_bbh_candidates { | ||
4066 : | overbeek | 1.71 | my($self,$genome,$cutoff,$requested,$known,$frac_match) = @_; |
4067 : | overbeek | 1.64 | my($i,$j,$k,$sim,@sims,$peg,$id2,$genome2,$sim_back); |
4068 : | overbeek | 1.67 | my($bbh,%seen,%computed_sims,$genome1); |
4069 : | overbeek | 1.64 | |
4070 : | overbeek | 1.71 | $frac_match = defined($frac_match) ? $frac_match : 0.7; |
4071 : | overbeek | 1.64 | my @got = (); |
4072 : | my @cand = $self->candidates_for_role_from_known($genome,$cutoff,$known); | ||
4073 : | if (@cand > 0) | ||
4074 : | { | ||
4075 : | overbeek | 1.67 | my %genomes = map { $genome1 = &FIG::genome_of($_); $genome1 => 1 } @$known; |
4076 : | overbeek | 1.64 | my %pegs = map { $_ => 1 } @$known; |
4077 : | for ($i=0; (@got < $requested) && ($i < @cand); $i++) | ||
4078 : | { | ||
4079 : | $peg = $cand[$i]; | ||
4080 : | undef %seen; | ||
4081 : | @sims = grep { $genomes{&FIG::genome_of($_->id2)} } $self->sims($peg,1000,$cutoff,"fig"); | ||
4082 : | $bbh = 0; | ||
4083 : | for ($j=0; (! $bbh) && ($j < @sims); $j++) | ||
4084 : | { | ||
4085 : | $sim = $sims[$j]; | ||
4086 : | $id2 = $sim->id2; | ||
4087 : | $genome2 = &FIG::genome_of($id2); | ||
4088 : | if (! $seen{$genome2}) | ||
4089 : | { | ||
4090 : | if ($pegs{$id2}) | ||
4091 : | { | ||
4092 : | if (! defined($sim_back = $computed_sims{$id2})) | ||
4093 : | { | ||
4094 : | my @sims_back = $self->sims($id2,1000,$cutoff,"fig"); | ||
4095 : | for ($k=0; ($k < @sims_back) && (&FIG::genome_of($sims_back[$k]->id2) ne $genome); $k++) {} | ||
4096 : | if ($k < @sims_back) | ||
4097 : | { | ||
4098 : | $sim_back = $computed_sims{$id2} = $sims_back[$k]; | ||
4099 : | } | ||
4100 : | else | ||
4101 : | { | ||
4102 : | $sim_back = $computed_sims{$id2} = 0; | ||
4103 : | } | ||
4104 : | } | ||
4105 : | if ($sim_back) | ||
4106 : | { | ||
4107 : | overbeek | 1.71 | if ($self->ok_match($sim_back,$frac_match)) |
4108 : | overbeek | 1.64 | { |
4109 : | overbeek | 1.65 | $bbh = [$id2,$sim_back->psc]; |
4110 : | overbeek | 1.64 | } |
4111 : | } | ||
4112 : | } | ||
4113 : | $seen{$genome2} = 1; | ||
4114 : | } | ||
4115 : | } | ||
4116 : | |||
4117 : | if ($bbh) | ||
4118 : | { | ||
4119 : | overbeek | 1.65 | push(@got,[$peg,@$bbh]); |
4120 : | overbeek | 1.64 | } |
4121 : | } | ||
4122 : | } | ||
4123 : | return @got; | ||
4124 : | } | ||
4125 : | |||
4126 : | |||
4127 : | sub ok_match { | ||
4128 : | overbeek | 1.71 | my($self,$sim,$frac_match) = @_; |
4129 : | overbeek | 1.64 | |
4130 : | my $ln1 = $sim->ln1; | ||
4131 : | my $ln2 = $sim->ln2; | ||
4132 : | my $b1 = $sim->b1; | ||
4133 : | my $e1 = $sim->e1; | ||
4134 : | my $b2 = $sim->b2; | ||
4135 : | my $e2 = $sim->e2; | ||
4136 : | |||
4137 : | overbeek | 1.71 | return (((($e1 - $b1) / $ln1) >= $frac_match) && |
4138 : | ((($e2 - $b2) / $ln2) >= $frac_match)) | ||
4139 : | } | ||
4140 : | |||
4141 : | sub external_calls { | ||
4142 : | my($self,$pegs) = @_; | ||
4143 : | overbeek | 1.72 | my($peg,$func); |
4144 : | overbeek | 1.71 | |
4145 : | open(TMP,">/tmp/pegs.$$") || die "could not open /tmp/pegs.$$"; | ||
4146 : | foreach $peg (@$pegs) | ||
4147 : | { | ||
4148 : | print TMP "$peg\n"; | ||
4149 : | } | ||
4150 : | close(TMP); | ||
4151 : | open(TMP,">/tmp/parms.$$") || die "could not open /tmp/parms.$$"; | ||
4152 : | print TMP "no_fig\t1\n"; | ||
4153 : | close(TMP); | ||
4154 : | |||
4155 : | overbeek | 1.72 | my %call = map { chop; ($peg,$func) = split(/\t/,$_) } |
4156 : | `$FIG_Config::bin/auto_assign /tmp/parms.$$ < /tmp/pegs.$$ 2> /dev/null | $FIG_Config::bin/make_calls`; | ||
4157 : | overbeek | 1.71 | unlink("/tmp/pegs.$$","/tmp/parms.$$"); |
4158 : | overbeek | 1.72 | return map { $call{$_} ? [$_,$call{$_}] : [$_,"hypothetical protein"] } @$pegs; |
4159 : | overbeek | 1.71 | } |
4160 : | |||
4161 : | use SameFunc; | ||
4162 : | |||
4163 : | sub same_func { | ||
4164 : | my($self,$f1,$f2) = @_; | ||
4165 : | |||
4166 : | return &SameFunc::same_func($f1,$f2); | ||
4167 : | overbeek | 1.64 | } |
4168 : | |||
4169 : | efrank | 1.1 | ################################# DNA sequence Stuff #################################### |
4170 : | |||
4171 : | =pod | ||
4172 : | |||
4173 : | =head1 extract_seq | ||
4174 : | |||
4175 : | usage: $seq = &FIG::extract_seq($contigs,$loc) | ||
4176 : | |||
4177 : | This is just a little utility routine that I have found convenient. It assumes that | ||
4178 : | $contigs is a hash that contains IDs as keys and sequences as values. $loc must be of the | ||
4179 : | form | ||
4180 : | Contig_Beg_End | ||
4181 : | |||
4182 : | where Contig is the ID of one of the sequences; Beg and End give the coordinates of the sought | ||
4183 : | subsequence. If Beg > End, it is assumed that you want the reverse complement of the subsequence. | ||
4184 : | This routine plucks out the subsequence for you. | ||
4185 : | |||
4186 : | =cut | ||
4187 : | |||
4188 : | sub extract_seq { | ||
4189 : | my($contigs,$loc) = @_; | ||
4190 : | my($contig,$beg,$end,$contig_seq); | ||
4191 : | my($plus,$minus); | ||
4192 : | |||
4193 : | $plus = $minus = 0; | ||
4194 : | my $strand = ""; | ||
4195 : | my @loc = split(/,/,$loc); | ||
4196 : | my @seq = (); | ||
4197 : | foreach $loc (@loc) | ||
4198 : | { | ||
4199 : | if ($loc =~ /^\S+_(\d+)_(\d+)$/) | ||
4200 : | { | ||
4201 : | if ($1 < $2) | ||
4202 : | { | ||
4203 : | $plus++; | ||
4204 : | } | ||
4205 : | elsif ($2 < $1) | ||
4206 : | { | ||
4207 : | $minus++; | ||
4208 : | } | ||
4209 : | } | ||
4210 : | } | ||
4211 : | if ($plus > $minus) | ||
4212 : | { | ||
4213 : | $strand = "+"; | ||
4214 : | } | ||
4215 : | elsif ($plus < $minus) | ||
4216 : | { | ||
4217 : | $strand = "-"; | ||
4218 : | } | ||
4219 : | |||
4220 : | foreach $loc (@loc) | ||
4221 : | { | ||
4222 : | if ($loc =~ /^(\S+)_(\d+)_(\d+)$/) | ||
4223 : | { | ||
4224 : | ($contig,$beg,$end) = ($1,$2,$3); | ||
4225 : | if (($beg < $end) || (($beg == $end) && ($strand eq "+"))) | ||
4226 : | { | ||
4227 : | $strand = "+"; | ||
4228 : | push(@seq,substr($contigs->{$contig},$beg-1,($end+1-$beg))); | ||
4229 : | } | ||
4230 : | else | ||
4231 : | { | ||
4232 : | $strand = "-"; | ||
4233 : | push(@seq,&reverse_comp(substr($contigs->{$contig},$end-1,($beg+1-$end)))); | ||
4234 : | } | ||
4235 : | } | ||
4236 : | } | ||
4237 : | return join("",@seq); | ||
4238 : | } | ||
4239 : | |||
4240 : | =pod | ||
4241 : | |||
4242 : | =head1 contig_ln | ||
4243 : | |||
4244 : | usage: $n = $fig->contig_ln($genome,$contig) | ||
4245 : | |||
4246 : | Returns the length of $contig from $genome. | ||
4247 : | |||
4248 : | =cut | ||
4249 : | |||
4250 : | sub contig_ln { | ||
4251 : | my($self,$genome,$contig) = @_; | ||
4252 : | my($rdbH,$relational_db_response); | ||
4253 : | |||
4254 : | $rdbH = $self->db_handle; | ||
4255 : | if (defined($genome) && defined($contig)) | ||
4256 : | { | ||
4257 : | if (($relational_db_response = $rdbH->SQL("SELECT len FROM contig_lengths WHERE ( genome = \'$genome\' ) and ( contig = \'$contig\' )")) && | ||
4258 : | |||
4259 : | (@$relational_db_response == 1)) | ||
4260 : | { | ||
4261 : | return $relational_db_response->[0]->[0]; | ||
4262 : | } | ||
4263 : | } | ||
4264 : | return undef; | ||
4265 : | } | ||
4266 : | |||
4267 : | =pod | ||
4268 : | |||
4269 : | =head1 dna_seq | ||
4270 : | |||
4271 : | usage: $seq = dna_seq($genome,@locations) | ||
4272 : | |||
4273 : | Returns the concatenated subsequences described by the list of locations. Each location | ||
4274 : | must be of the form | ||
4275 : | |||
4276 : | Contig_Beg_End | ||
4277 : | |||
4278 : | where Contig must be the ID of a contig for genome $genome. If Beg > End the location | ||
4279 : | describes a stretch of the complementary strand. | ||
4280 : | |||
4281 : | =cut | ||
4282 : | |||
4283 : | sub dna_seq { | ||
4284 : | my($self,$genome,@locations) = @_; | ||
4285 : | my(@pieces,$loc,$contig,$beg,$end,$ln,$rdbH); | ||
4286 : | |||
4287 : | @pieces = (); | ||
4288 : | foreach $loc (@locations) | ||
4289 : | { | ||
4290 : | if ($loc =~ /^(\S+)_(\d+)_(\d+)$/) | ||
4291 : | { | ||
4292 : | ($contig,$beg,$end) = ($1,$2,$3); | ||
4293 : | $ln = $self->contig_ln($genome,$contig); | ||
4294 : | |||
4295 : | if (! $ln) { | ||
4296 : | print STDERR "$genome/$contig: could not get length\n"; | ||
4297 : | return ""; | ||
4298 : | } | ||
4299 : | |||
4300 : | if (&between(1,$beg,$ln) && &between(1,$end,$ln)) | ||
4301 : | { | ||
4302 : | if ($beg < $end) | ||
4303 : | { | ||
4304 : | push(@pieces, $self->get_dna($genome,$contig,$beg,$end)); | ||
4305 : | } | ||
4306 : | else | ||
4307 : | { | ||
4308 : | push(@pieces, &reverse_comp($self->get_dna($genome,$contig,$end,$beg))); | ||
4309 : | } | ||
4310 : | } | ||
4311 : | } | ||
4312 : | } | ||
4313 : | return join("",@pieces); | ||
4314 : | } | ||
4315 : | |||
4316 : | sub get_dna { | ||
4317 : | my($self,$genome,$contig,$beg,$end) = @_; | ||
4318 : | my $relational_db_response; | ||
4319 : | |||
4320 : | my $rdbH = $self->db_handle; | ||
4321 : | my $indexpt = int(($beg-1)/10000) * 10000; | ||
4322 : | if (($relational_db_response = $rdbH->SQL("SELECT startN,fileno,seek FROM contig_seeks WHERE ( genome = \'$genome\' ) AND ( contig = \'$contig\' ) AND ( indexpt = $indexpt )")) && | ||
4323 : | (@$relational_db_response == 1)) | ||
4324 : | { | ||
4325 : | my($startN,$fileN,$seek) = @{$relational_db_response->[0]}; | ||
4326 : | my $fh = $self->openF($self->N2file($fileN)); | ||
4327 : | if (seek($fh,$seek,0)) | ||
4328 : | { | ||
4329 : | my $chunk = ""; | ||
4330 : | read($fh,$chunk,int(($end + 1 - $startN) * 1.03)); | ||
4331 : | $chunk =~ s/\s//g; | ||
4332 : | my $ln = ($end - $beg) + 1; | ||
4333 : | if (length($chunk) >= $ln) | ||
4334 : | { | ||
4335 : | return substr($chunk,(($beg-1)-$startN),$ln); | ||
4336 : | } | ||
4337 : | } | ||
4338 : | } | ||
4339 : | return undef; | ||
4340 : | } | ||
4341 : | |||
4342 : | overbeek | 1.36 | ################################# Taxonomy #################################### |
4343 : | |||
4344 : | =pod | ||
4345 : | |||
4346 : | =head1 taxonomy_of | ||
4347 : | |||
4348 : | usage: $gs = $fig->taxonomy_of($genome_id) | ||
4349 : | |||
4350 : | Returns the taxonomy of the specified genome. Gives the taxonomy down to | ||
4351 : | genus and species. | ||
4352 : | |||
4353 : | =cut | ||
4354 : | |||
4355 : | sub taxonomy_of { | ||
4356 : | my($self,$genome) = @_; | ||
4357 : | my($ans); | ||
4358 : | my $taxonomy = $self->cached('_taxonomy'); | ||
4359 : | |||
4360 : | if (! ($ans = $taxonomy->{$genome})) | ||
4361 : | { | ||
4362 : | my $rdbH = $self->db_handle; | ||
4363 : | my $relational_db_response = $rdbH->SQL("SELECT genome,taxonomy FROM genome"); | ||
4364 : | my $pair; | ||
4365 : | foreach $pair (@$relational_db_response) | ||
4366 : | { | ||
4367 : | $taxonomy->{$pair->[0]} = $pair->[1]; | ||
4368 : | } | ||
4369 : | $ans = $taxonomy->{$genome}; | ||
4370 : | } | ||
4371 : | return $ans; | ||
4372 : | } | ||
4373 : | |||
4374 : | =pod | ||
4375 : | |||
4376 : | =head1 is_bacterial | ||
4377 : | |||
4378 : | usage: $fig->is_bacterial($genome) | ||
4379 : | |||
4380 : | Returns true iff the genome is bacterial. | ||
4381 : | |||
4382 : | =cut | ||
4383 : | |||
4384 : | sub is_bacterial { | ||
4385 : | my($self,$genome) = @_; | ||
4386 : | |||
4387 : | mkubal | 1.53 | return ($self->taxonomy_of($genome) =~ /^Bacteria/) ? 1 : 0; |
4388 : | overbeek | 1.36 | } |
4389 : | |||
4390 : | |||
4391 : | =pod | ||
4392 : | |||
4393 : | =head1 is_archaeal | ||
4394 : | |||
4395 : | usage: $fig->is_archaeal($genome) | ||
4396 : | |||
4397 : | Returns true iff the genome is archaeal. | ||
4398 : | |||
4399 : | =cut | ||
4400 : | |||
4401 : | sub is_archaeal { | ||
4402 : | my($self,$genome) = @_; | ||
4403 : | |||
4404 : | mkubal | 1.53 | return ($self->taxonomy_of($genome) =~ /^Archaea/) ? 1 : 0; |
4405 : | overbeek | 1.36 | } |
4406 : | |||
4407 : | |||
4408 : | =pod | ||
4409 : | |||
4410 : | =head1 is_prokaryotic | ||
4411 : | |||
4412 : | usage: $fig->is_prokaryotic($genome) | ||
4413 : | |||
4414 : | Returns true iff the genome is prokaryotic | ||
4415 : | |||
4416 : | =cut | ||
4417 : | |||
4418 : | sub is_prokaryotic { | ||
4419 : | my($self,$genome) = @_; | ||
4420 : | |||
4421 : | mkubal | 1.53 | return ($self->taxonomy_of($genome) =~ /^(Archaea|Bacteria)/) ? 1 : 0; |
4422 : | overbeek | 1.36 | } |
4423 : | |||
4424 : | |||
4425 : | =pod | ||
4426 : | |||
4427 : | =head1 is_eukaryotic | ||
4428 : | |||
4429 : | usage: $fig->is_eukaryotic($genome) | ||
4430 : | |||
4431 : | Returns true iff the genome is eukaryotic | ||
4432 : | |||
4433 : | =cut | ||
4434 : | |||
4435 : | sub is_eukaryotic { | ||
4436 : | my($self,$genome) = @_; | ||
4437 : | |||
4438 : | mkubal | 1.53 | return ($self->taxonomy_of($genome) =~ /^Eukaryota/) ? 1 : 0; |
4439 : | overbeek | 1.36 | } |
4440 : | |||
4441 : | =pod | ||
4442 : | |||
4443 : | =head1 sort_genomes_by_taxonomy | ||
4444 : | |||
4445 : | usage: @genomes = $fig->sort_genomes_by_taxonomy(@list_of_genomes) | ||
4446 : | |||
4447 : | This routine is used to sort a list of genome IDs to put them | ||
4448 : | into taxonomic order. | ||
4449 : | |||
4450 : | =cut | ||
4451 : | |||
4452 : | sub sort_genomes_by_taxonomy { | ||
4453 : | my($self,@fids) = @_; | ||
4454 : | |||
4455 : | return map { $_->[0] } | ||
4456 : | sort { $a->[1] cmp $b->[1] } | ||
4457 : | map { [$_,$self->taxonomy_of($_)] } | ||
4458 : | @fids; | ||
4459 : | } | ||
4460 : | |||
4461 : | =pod | ||
4462 : | |||
4463 : | =head1 crude_estimate_of_distance | ||
4464 : | |||
4465 : | usage: $dist = $fig->crude_estimate_of_distance($genome1,$genome2) | ||
4466 : | |||
4467 : | There are a number of places where we need estimates of the distance between | ||
4468 : | two genomes. This routine will return a value between 0 and 1, where a value of 0 | ||
4469 : | means "the genomes are essentially identical" and a value of 1 means | ||
4470 : | "the genomes are in different major groupings" (the groupings are archaea, bacteria, | ||
4471 : | euks, and viruses). The measure is extremely crude. | ||
4472 : | |||
4473 : | =cut | ||
4474 : | |||
4475 : | sub crude_estimate_of_distance { | ||
4476 : | my($self,$genome1,$genome2) = @_; | ||
4477 : | my($i,$v,$d,$dist); | ||
4478 : | |||
4479 : | if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) } | ||
4480 : | |||
4481 : | my $relational_db_response; | ||
4482 : | my $rdbH = $self->db_handle; | ||
4483 : | |||
4484 : | if (($relational_db_response = $rdbH->SQL("SELECT dist FROM distances WHERE ( genome1 = \'$genome1\' ) AND ( genome2 = \'$genome2\' ) ")) && | ||
4485 : | (@$relational_db_response == 1)) | ||
4486 : | { | ||
4487 : | return $relational_db_response->[0]->[0]; | ||
4488 : | } | ||
4489 : | return $self->crude_estimate_of_distance1($genome1,$genome2); | ||
4490 : | } | ||
4491 : | |||
4492 : | sub crude_estimate_of_distance1 { | ||
4493 : | my($self,$genome1,$genome2) = @_; | ||
4494 : | my($i,$v,$d,$dist); | ||
4495 : | |||
4496 : | if ($genome1 > $genome2) { ($genome1,$genome2) = ($genome2,$genome1) } | ||
4497 : | $dist = $self->cached('_dist'); | ||
4498 : | if (! $dist->{"$genome1,$genome2"}) | ||
4499 : | { | ||
4500 : | my @tax1 = split(/\s*;\s*/,$self->taxonomy_of($genome1)); | ||
4501 : | my @tax2 = split(/\s*;\s*/,$self->taxonomy_of($genome2)); | ||
4502 : | |||
4503 : | $d = 1; | ||
4504 : | for ($i=0, $v=0.5; ($i < @tax1) && ($i < @tax2) && ($tax1[$i] eq $tax2[$i]); $i++, $v = $v/2) | ||
4505 : | { | ||
4506 : | $d -= $v; | ||
4507 : | } | ||
4508 : | $dist->{"$genome1,$genome2"} = $d; | ||
4509 : | } | ||
4510 : | return $dist->{"$genome1,$genome2"}; | ||
4511 : | } | ||
4512 : | |||
4513 : | =pod | ||
4514 : | |||
4515 : | =head1 sort_fids_by_taxonomy | ||
4516 : | |||
4517 : | usage: @sorted_by_taxonomy = $fig->sort_fids_by_taxonomy(@list_of_fids) | ||
4518 : | |||
4519 : | Sorts a list of feature IDs based on the taxonomies of the genomes that contain the features. | ||
4520 : | |||
4521 : | =cut | ||
4522 : | |||
4523 : | sub sort_fids_by_taxonomy { | ||
4524 : | my($self,@fids) = @_; | ||
4525 : | |||
4526 : | return map { $_->[0] } | ||
4527 : | sort { $a->[1] cmp $b->[1] } | ||
4528 : | map { [$_,$self->taxonomy_of(&genome_of($_))] } | ||
4529 : | @fids; | ||
4530 : | } | ||
4531 : | |||
4532 : | sub build_tree_of_complete { | ||
4533 : | my($self,$min_for_label) = @_; | ||
4534 : | my(@last,@tax,$i,$prefix,$lev,$genome,$tax); | ||
4535 : | |||
4536 : | $min_for_label = $min_for_label ? $min_for_label : 10; | ||
4537 : | open(TMP,">/tmp/tree$$") || die "could not open /tmp/tree$$"; | ||
4538 : | print TMP "1. root\n"; | ||
4539 : | |||
4540 : | @last = (); | ||
4541 : | |||
4542 : | |||
4543 : | foreach $genome (grep { $_ !~ /^99999/ } $self->sort_genomes_by_taxonomy($self->genomes("complete"))) | ||
4544 : | { | ||
4545 : | $tax = $self->taxonomy_of($genome); | ||
4546 : | @tax = split(/\s*;\s*/,$tax); | ||
4547 : | push(@tax,$genome); | ||
4548 : | for ($i=0; ((@last > $i) && (@tax > $i) && ($last[$i] eq $tax[$i])); $i++) {} | ||
4549 : | while ($i < @tax) | ||
4550 : | { | ||
4551 : | $lev = $i+2; | ||
4552 : | $prefix = " " x (4 * ($lev-1)); | ||
4553 : | print TMP "$prefix$lev\. $tax[$i]\n"; | ||
4554 : | $i++; | ||
4555 : | } | ||
4556 : | @last = @tax; | ||
4557 : | } | ||
4558 : | close(TMP); | ||
4559 : | my $tree = &tree_utilities::build_tree_from_outline("/tmp/tree$$"); | ||
4560 : | $tree->[0] = 'All'; | ||
4561 : | &limit_labels($tree,$min_for_label); | ||
4562 : | unlink("/tmp/tree$$"); | ||
4563 : | return ($tree,&tips_of_tree($tree)); | ||
4564 : | } | ||
4565 : | |||
4566 : | sub limit_labels { | ||
4567 : | my($tree,$min_for_label) = @_; | ||
4568 : | |||
4569 : | my($children) = &tree_utilities::node_pointers($tree); | ||
4570 : | if (@$children == 1) | ||
4571 : | { | ||
4572 : | return 1; | ||
4573 : | } | ||
4574 : | else | ||
4575 : | { | ||
4576 : | my $n = 0; | ||
4577 : | my $i; | ||
4578 : | for ($i=1; ($i < @$children); $i++) | ||
4579 : | { | ||
4580 : | $n += &limit_labels($children->[$i],$min_for_label); | ||
4581 : | } | ||
4582 : | if ($n < $min_for_label) | ||
4583 : | { | ||
4584 : | $tree->[0] = ""; | ||
4585 : | } | ||
4586 : | return $n; | ||
4587 : | } | ||
4588 : | } | ||
4589 : | |||
4590 : | sub taxonomic_groups_of_complete { | ||
4591 : | my($self,$min_for_labels) = @_; | ||
4592 : | |||
4593 : | my($tree,undef) = $self->build_tree_of_complete($min_for_labels); | ||
4594 : | return &taxonomic_groups($tree); | ||
4595 : | } | ||
4596 : | |||
4597 : | sub taxonomic_groups { | ||
4598 : | my($tree) = @_; | ||
4599 : | |||
4600 : | my($groups,undef) = &taxonomic_groups_and_children($tree); | ||
4601 : | return $groups; | ||
4602 : | } | ||
4603 : | |||
4604 : | sub taxonomic_groups_and_children { | ||
4605 : | my($tree) = @_; | ||
4606 : | my($ids1,$i,$groupsC,$idsC); | ||
4607 : | |||
4608 : | my $ptrs = &tree_utilities::node_pointers($tree); | ||
4609 : | my $ids = []; | ||
4610 : | my $groups = []; | ||
4611 : | |||
4612 : | if (@$ptrs > 1) | ||
4613 : | { | ||
4614 : | $ids1 = []; | ||
4615 : | for ($i=1; ($i < @$ptrs); $i++) | ||
4616 : | { | ||
4617 : | ($groupsC,$idsC) = &taxonomic_groups_and_children($ptrs->[$i]); | ||
4618 : | if (@$groupsC > 0) | ||
4619 : | { | ||
4620 : | push(@$groups,@$groupsC); | ||
4621 : | } | ||
4622 : | push(@$ids1,@$idsC); | ||
4623 : | } | ||
4624 : | |||
4625 : | if ($tree->[0]) | ||
4626 : | { | ||
4627 : | push(@$groups,[$tree->[0],$ids1]); | ||
4628 : | } | ||
4629 : | push(@$ids,@$ids1); | ||
4630 : | } | ||
4631 : | elsif ($tree->[0]) | ||
4632 : | { | ||
4633 : | push(@$ids,$tree->[0]); | ||
4634 : | } | ||
4635 : | |||
4636 : | return ($groups,$ids); | ||
4637 : | } | ||
4638 : | |||
4639 : | overbeek | 1.39 | ################################# Subsystems #################################### |
4640 : | |||
4641 : | sub exportable_subsystem { | ||
4642 : | my($self,$ssa) = @_; | ||
4643 : | my(%seqs,@genomes); | ||
4644 : | |||
4645 : | my $spreadsheet = []; | ||
4646 : | my $notes = []; | ||
4647 : | |||
4648 : | $ssa =~ s/ /_/g; | ||
4649 : | if (open(SSA,"<$FIG_Config::data/Subsystems/$ssa/spreadsheet")) | ||
4650 : | { | ||
4651 : | overbeek | 1.42 | my $version = $self->subsystem_version($ssa); |
4652 : | my $exchangable = $self->is_exchangable_subsystem($ssa); | ||
4653 : | overbeek | 1.49 | push(@$spreadsheet,"$ssa\n$version\n$exchangable\n"); |
4654 : | my @curation = `head -1 $FIG_Config::data/Subsystems/$ssa/curation.log`; | ||
4655 : | push(@$spreadsheet,$curation[0],"//\n"); | ||
4656 : | overbeek | 1.42 | |
4657 : | overbeek | 1.39 | while (defined($_ = <SSA>) && ($_ !~ /^\/\//)) |
4658 : | { | ||
4659 : | push(@$spreadsheet,$_); | ||
4660 : | } | ||
4661 : | push(@$spreadsheet,"//\n"); | ||
4662 : | |||
4663 : | while (defined($_ = <SSA>) && ($_ !~ /^\/\//)) | ||
4664 : | { | ||
4665 : | push(@$spreadsheet,$_); | ||
4666 : | } | ||
4667 : | push(@$spreadsheet,"//\n"); | ||
4668 : | |||
4669 : | while (defined($_ = <SSA>)) | ||
4670 : | { | ||
4671 : | push(@$spreadsheet,$_); | ||
4672 : | golsen | 1.44 | chomp; |
4673 : | overbeek | 1.39 | my @flds = split(/\t/,$_); |
4674 : | my $genome = $flds[0]; | ||
4675 : | push(@genomes,$genome); | ||
4676 : | my($i,$id); | ||
4677 : | for ($i=2; ($i < @flds); $i++) | ||
4678 : | { | ||
4679 : | if ($flds[$i]) | ||
4680 : | { | ||
4681 : | my @entries = split(/,/,$flds[$i]); | ||
4682 : | foreach $id (@entries) | ||
4683 : | { | ||
4684 : | $seqs{"fig\|$genome\.peg.$id"} = 1; | ||
4685 : | } | ||
4686 : | } | ||
4687 : | } | ||
4688 : | } | ||
4689 : | push(@$spreadsheet,"//\n"); | ||
4690 : | |||
4691 : | my $peg; | ||
4692 : | foreach $peg (sort { &FIG::by_fig_id($a,$b) } keys(%seqs)) | ||
4693 : | { | ||
4694 : | my @aliases = grep { $_ =~ /^(sp\||gi\||pirnr\||kegg\||N[PGZ]_)/ } $self->feature_aliases($peg); | ||
4695 : | push(@$spreadsheet,join("\t",($peg,join(",",@aliases),$self->genus_species($self->genome_of($peg)),scalar $self->function_of($peg))) . "\n"); | ||
4696 : | } | ||
4697 : | push(@$spreadsheet,"//\n"); | ||
4698 : | |||
4699 : | foreach $peg (sort { &FIG::by_fig_id($a,$b) } keys(%seqs)) | ||
4700 : | { | ||
4701 : | my $aliases = $self->feature_aliases($peg); | ||
4702 : | my $seq = $self->get_translation($peg); | ||
4703 : | push(@$spreadsheet,">$peg $aliases\n"); | ||
4704 : | my($i,$ln); | ||
4705 : | $ln = length($seq); | ||
4706 : | for ($i=0; ($i < $ln); $i += 60) | ||
4707 : | { | ||
4708 : | if (($ln - $i) > 60) | ||
4709 : | { | ||
4710 : | push(@$spreadsheet,substr($seq,$i,60) . "\n"); | ||
4711 : | } | ||
4712 : | else | ||
4713 : | { | ||
4714 : | push(@$spreadsheet,substr($seq,$i) . "\n"); | ||
4715 : | } | ||
4716 : | } | ||
4717 |